ó
¿ênQc        	   @   sÑ  d  Z  d d l Z d d l Z d d l Z d d l Z d d l m Z d d l m Z m	 Z	 m
 Z
 d d l m Z m Z m Z d Z d Z d Z d	 Z d
 Z e
 d d ƒ Z e
 d d ƒ Z d d „ Z d „  Z e d „ Z d „  Z d „  Z d „  Z d „  Z e e d „ Z e d „ Z d „  Z  d „  Z! d d d „ Z" d „  Z# d „  Z$ d d d „ Z% d „  Z& e e d  „ Z' d! „  Z( d" „  Z) e* d# k rÍy e j+ d$ Z, e, d% Z- e j. j/ e- ƒ sØe j0 d& e, d' e1 ƒn  e2 e- ƒ Z3 e e j+ d( ƒ Z4 e e j+ d ƒ Z5 e2 e j+ d) d* ƒ Z6 Wn e7 k
 r@Z8 e e8 ƒ n Xxl e3 D]d Z9 e9 j: ƒ  d(  \ Z; Z< e; e< e4 e5 f Z= e) e= ƒ Z> x' e> D] Z? e e? e6 d+ d d, d ƒq‰WqHWe6 j@ ƒ  e j+ d Gd- GHn  d S(.   s[   Create read islands from RNA-Seq splice and coverage data

Created: 2011-03-16
Author: ian
iÿÿÿÿN(   t	   Tabixfile(   t   defaultdictt   dequet
   namedtuple(   t   GFF3Exont   GFF3mRNAt
   GFF3Recordi   id   i2   g333333ã?gš™™™™™¹?t
   Coverblocks   seqID start end deptht   Splices"   seqID start end strand score countc         C   s?   |  r t  j |  IJn  t  j d t  j d IJt  j d ƒ d  S(   NsN   Usage: python %s genome_seq.fa  observed.juncs.gz coverage.wig.gz  output.gff3i    i   (   t   syst   stderrt   argvt   exit(   t   msg(    (    sD   /home/ian/PycharmProjects/RNA-Seq-Simulator/lib/find_read_islands.pyt   usage   s    c      	   C   sø   g  } yÅ t  | j |  j ƒ  |  j ƒ  |  j ƒ  d ƒ ƒ } x | D]… } | j ƒ  j d ƒ } t | d t |  j ƒ  t	 | d ƒ d ƒ t
 |  j ƒ  d t	 | d ƒ ƒ t	 | d ƒ ƒ } | j | ƒ qA WWn& t k
 ró } t j d I| IJn X| S(   Ni   s   	i    i   i   s$   [find_read_islands.get_cover_blocks](   t   listt   fetcht	   get_seqIDt	   get_startt   get_endt   stript   splitR   t   maxt   intt   mint   appendt
   ValueErrorR	   R
   (   t   exont   coverage_tabixt   covert   wiglinest   linet   fieldst   blockt   ve(    (    sD   /home/ian/PycharmProjects/RNA-Seq-Simulator/lib/find_read_islands.pyt   get_cover_blocks   s    1]c      	   C   s›  g  } d  } ybt | j |  t | ƒ t | ƒ ƒ ƒ } x
| D]} | j ƒ  j d ƒ }	 t |	 ƒ d k  rp q= n  t |	 d t t | ƒ t |	 d ƒ d ƒ t	 t | ƒ t |	 d ƒ ƒ t |	 d ƒ ƒ }
 |
 j
 | k  r| r?| d | d k r| j | ƒ n  d  } q?q= | r$|
 j | d <q= |
 d |
 d |
 d g } q= W| rm| d | d k rm| j | ƒ n  Wn& t k
 r–} t j d I| IJn X| S(   Ns   	i   i    i   i   i   s$   [find_read_islands:contiguous_cover](   t   NoneR   R   R   R   R   t   lenR   R   R   t   depthR   t   endt	   ExceptionR	   R
   (   t   scaffoldt   startR'   R   t	   min_covert   islandst   curr_islandR   R   R    R!   R"   (    (    sD   /home/ian/PycharmProjects/RNA-Seq-Simulator/lib/find_read_islands.pyt   contiguous_cover+   s,    'Yc      
   C   sR   t  |  d d |  d |  d d d d d d | ƒ } t |  ƒ } | j | ƒ | S(   Ni    s   RNA-Seqi   i   t   .s   ID=%s(   R   t   make_exon_from_islandt   add_exon(   t   islandt   idt
   transcriptR   (    (    sD   /home/ian/PycharmProjects/RNA-Seq-Simulator/lib/find_read_islands.pyt   make_transcript_from_islandE   s    5c      	   C   s`   |  d |  d d k  r1 t  d t |  ƒ ƒ ‚ n  t |  d d |  d |  d d d d d d ƒ S(   Ni   i   s4   [find_read_islands.make_exon_from_island] Too short!i    s   RNA-SeqR/   t    (   R   t   strR   (   R2   (    (    sD   /home/ian/PycharmProjects/RNA-Seq-Simulator/lib/find_read_islands.pyR0   K   s    c         C   sÔ  |  j  ƒ  } |  j ƒ  } | j t | j ƒ d ƒ yM | j ƒ  d k rd | j t | j ƒ d ƒ n | j t | j ƒ d ƒ Wn n X| j | d ƒ |  j ƒ  } | j	 t | j
 ƒ ƒ yE | j ƒ  d k rê | j t | j
 ƒ ƒ n | j t | j
 ƒ ƒ Wn n X| j | d ƒ x« |  j ƒ  D] } | j ƒ  t | j ƒ k rZ| j | ƒ q)| j ƒ  t | j
 ƒ k r…| j | ƒ q)t | | ƒ \ } } | r°| j | ƒ n  | r)| j | ƒ q)q)W| | f S(   Ni   t   -t   Ut   D(   t   get_IDt   clonet   set_endR   R*   t
   get_strandt   set_CDS_startt   set_CDS_stopt   set_IDt	   set_startR'   t	   get_exonsR   R1   R   t   break_exon_here(   R4   t   spliceR3   t   upstreamt
   downstreamR   t   exon_5t   exon_3(    (    sD   /home/ian/PycharmProjects/RNA-Seq-Simulator/lib/find_read_islands.pyt   break_transcript_hereP   s<    c         C   sš   d  } d  } |  j ƒ  t | j ƒ k  rP |  j ƒ  } | j t | j ƒ d ƒ n  |  j ƒ  t | j ƒ k r |  j ƒ  } | j t | j ƒ ƒ n  | | f S(   Ni   (	   R$   R   R   R*   R<   R=   R   R'   RB   (   R   RE   RH   RI   (    (    sD   /home/ian/PycharmProjects/RNA-Seq-Simulator/lib/find_read_islands.pyRD   s   s    c         C   s  xû|  D]ó} yÇt  g  | j ƒ  D] } t | ƒ ^ q  ƒ } xg | j ƒ  D]Y } | j d ƒ sH | j d ƒ rˆ | j d | j d ƒ ƒ q¡ t d | j ƒ  ƒ ‚ qH qH W| | k rÓt g  | j ƒ  D] } t	 | j d ƒ ƒ ^ qÁ ƒ } | | k rÓg  } | j ƒ  }	 | j
 | ƒ d | |	 |	 f }
 d |	 |	 f } | j d j g  | j | j d | j | j | j | j | j |
 g	 D] } t | ƒ ^ qvƒ ƒ | j ƒ  } g  | D] } | j ƒ  | f ^ q¥} g  t | ƒ D] } | d ^ qÐ} d } x¾ | D]¶ } t | ƒ } d	 |	 d | | | f } | j
 t	 | j d ƒ ƒ ƒ | j d j g  | j | j d
 | j | j | j | j | j | | g	 D] } t | ƒ ^ qƒƒ ƒ | | 7} qóW| j d ƒ | d j | ƒ IJqÓn  Wq t k
 rù} t j | IJ‚  q Xq Wd  S(   Nt   covsT   [find_read_islands.filter_and_output_contigs.l133] Fragment %s lacks a value for covs   length=%d;Name=%s;ID=%ss   Name=%s;Parent=%ss   	t   contigi   i    s   ;Target=%s %d %d +t   matchs   ###s   
(   t   sumRC   R%   t   has_attributet   add_attributet   get_attributeR   R;   R   t   floatt	   set_scoreR   t   joint   _seqIDt   _sourcet   _startt   _endt   _scoret   _strandt   _phaseR7   R   t   sortedR	   R
   (   t	   fragmentst   outputt
   min_lengthR+   t   fragmentR   t   f_lent   f_covt   linesR3   t   trans_attrst
   exon_attrst   ft   exonst   et   to_sortt   tupt   offsett   e_lent   targetR"   (    (    sD   /home/ian/PycharmProjects/RNA-Seq-Simulator/lib/find_read_islands.pyt   filter_and_output_contigs~   s@    +4_%#cc      
   C   së  g  } g  } |  r‰|  d } t  | j ƒ } d } x? t | ƒ D]% \ } } t  | j ƒ | k r> Pq> q> Wt d ƒ ‚ x7 | d k r¬ t  | | j ƒ | k  rŸ Pn  | d 8} qv W| d 7} t  | | j ƒ }	 t | | j d |	 d | d d d d d ƒ }
 | j |
 ƒ |
 j	 | j
 ƒ t |  ƒ d k r‰xZ |  d D]K } t | d j d t  | j ƒ d t  | j ƒ d d | j
 d d ƒ } q4Wq‰n  | rçt j | d ƒ } | j | ƒ | j d t | ƒ ƒ x | D] } | j | ƒ qÍWn  | S(   s  Create one or more transcripts from a list of splice sites and coverage records
    Input:
        splice_list - list of Splice, already filtered for adequate coverage
        coverage_list - list of Coverblock

    Output:
        list of GFF3mRNA instances
    i    s*   End of 1st exon not found in coverage listi   s   RNA-SeqR/   s   ID=exon0s   t%s(   R   R*   t	   enumerateR'   R   R&   R   t   seqIDR   t
   set_strandt   strandR%   R   t
   fromRecordRA   R1   (   t   splice_listt   coverage_listR+   t   transcriptsRg   t   intron0t   end0t   iR!   t   start0t   exon0t   intron1t   exon1R4   R   (    (    sD   /home/ian/PycharmProjects/RNA-Seq-Simulator/lib/find_read_islands.pyt   make_transcripts¡   s:    	

0Oc         C   sz   d } } xO |  D]G } d t  | j ƒ t  | j ƒ } | | 7} | | t  | j ƒ 7} q W| d k  rl d St | ƒ | S(   Ni    i   (   R   R'   R*   R&   RR   (   t   blockst   areat   lengthR!   t   block_length(    (    sD   /home/ian/PycharmProjects/RNA-Seq-Simulator/lib/find_read_islands.pyt   get_mean_depthÏ   s    
 
c         C   sZ   d } xM |  D]E } d t  | j ƒ t  | j ƒ } | | t  | j ƒ | d 7} q W| S(   Ng        i   i   (   R   R'   R*   R&   (   R   t   meant   ssR!   R‚   (    (    sD   /home/ian/PycharmProjects/RNA-Seq-Simulator/lib/find_read_islands.pyt   get_sum_of_squared_deviationsÙ   s
     #i    c         C   sà  g  |  D]& } d t  | j ƒ t  | j ƒ ^ q } g  |  D] } t  | j ƒ ^ q: } t  |  d j ƒ } d } d } d }	 d }
 t d | ƒ } t d | ƒ } t | ƒ } t g  t | | ƒ D] \ } } | | ^ q½ ƒ } | | } } xÁ t t | | ƒ ƒ D]ª \ } \ } } | | } |	 | 7}	 | | 8} |
 | 7}
 | | 8} |	 | k r | | k r |
 |	 } | | } t | | ƒ } | | k rª| } | } | } | } qªq q W| d k rÐt  |  | j ƒ } n  | | | | f S(   Ni   i    iÿÿÿÿ(	   R   R'   R*   R&   R   RN   t   zipRo   t   abs(   R   t   left_margint   right_marginR!   t   lengthst   depthst   dividert   max_it   max_difft   left_lent	   left_areat   left_offsett   right_offsett	   right_lenR   R&   t
   right_areat   extreme_left_meant   extreme_right_meanRy   R€   t	   left_meant
   right_meant   diff(    (    sD   /home/ian/PycharmProjects/RNA-Seq-Simulator/lib/find_read_islands.pyt   find_extreme_partitionà   s<    3"2(






c      	   C   s¾   g  } | j  t |  d j d t |  d j ƒ t |  d j ƒ d ƒ ƒ xp t |  d  |  d ƒ D]W \ } } | j  t | j d t | j ƒ t | j ƒ t | j ƒ t | j ƒ ƒ ƒ q_ W| S(   Ni    iÿÿÿÿi   (   R   R   Rp   R   R'   R*   R‡   R&   (   R   t   jumpst	   prevblockt	   nextblock(    (    sD   /home/ian/PycharmProjects/RNA-Seq-Simulator/lib/find_read_islands.pyt   extract_depth_changes   s
    A$Oc         C   sˆ   t  |  d j d |  d j |  d j ƒ g } xT |  d D]H } | j t  | j | d j | d j | j | d j | j ƒ ƒ q8 W| S(   Ni    i   iÿÿÿÿ(   R   Rp   R'   R&   R   (   Rœ   t	   work_copyt   jump(    (    sD   /home/ian/PycharmProjects/RNA-Seq-Simulator/lib/find_read_islands.pyt   apply_depth_changes  s    -Fc         C   sª   t  | ƒ } g  } d } x t |  ƒ D]s }	 t j | ƒ t | ƒ }
 t |
 | | ƒ \ } } } } | j | ƒ | | k r% | d 7} | | k r˜ Pq˜ q% q% W| j ƒ  | S(   Ni    i   (   RŸ   t   xranget   randomt   shuffleR¢   R›   R   t   sort(   t   max_iterR   t	   thresholdt   critical_tail_countR‰   RŠ   Rœ   t   diffst
   tail_countRy   R    R   R   R–   R—   (    (    sD   /home/ian/PycharmProjects/RNA-Seq-Simulator/lib/find_read_islands.pyt   sample_extreme_partitions  s    

c         C   se   xH t  d t | ƒ ƒ D]% } | | |  k  r | d } Pq q Wt | ƒ } t | ƒ t | ƒ } | S(   Ni   (   t   rangeR%   RR   (   t   xt   distributionRy   t   rankt   p(    (    sD   /home/ian/PycharmProjects/RNA-Seq-Simulator/lib/find_read_islands.pyt   calculate_tail_area  s    
c   
      C   sb   t  |  | | ƒ \ } } } } d t } t d |  | | | | ƒ } t | | ƒ }	 | |	 | | f S(   Niè  (   R›   t   ALPHAR¬   R²   (
   R   R‰   RŠ   R   R   R–   R—   R©   R¯   R±   (    (    sD   /home/ian/PycharmProjects/RNA-Seq-Simulator/lib/find_read_islands.pyt   find_1_jump'  s
    
c   
      C   sJ  g  } t  ƒ  } x4|  rE|  j ƒ  } t g  | j ƒ  D] } t | ƒ ^ q4 ƒ } | d k  ra q n  x | j ƒ  D]q } t | ƒ d k  r¢ t j d It | ƒ IJn  | j d ƒ sn t	 | | ƒ } t
 | ƒ } | j d | ƒ qn qn Wt g  | j ƒ  D]% } t | ƒ t | j d ƒ ƒ ^ qó ƒ | }	 | j d |	 ƒ | j | ƒ q W| S(   Ni   s5   [find_read_islands.break_at_depth_jumps] Short exon:
RK   (   R   t   popleftRN   RC   R%   R	   R
   R7   RO   R#   Rƒ   RP   RR   RQ   R   (
   t   transcript_dequeR   Rœ   R]   t   current_transcriptR   t   current_transcript_lenR   t   exon_mean_deptht   current_transcript_cov(    (    sD   /home/ian/PycharmProjects/RNA-Seq-Simulator/lib/find_read_islands.pyt   break_at_depth_jumps.  s$    		+Bc      	   C   sà  |  \ } } } } d  } d } d } g  } g  }	 yg  | j | ƒ D]" }
 t j |
 j d ƒ d  ƒ ^ qC } g  | D]6 } t | j ƒ t k rr t | j	 ƒ t
 k rr | ^ qr } | d g } xs | d D]g } t | j ƒ t | d j ƒ k rþ | j | ƒ qÆ t | j	 ƒ t | d j	 ƒ k rÆ | | d <qÆ qÆ WWn, t k
 r`} t j d I| IJg  } n XyO| r´| d t | d j ƒ f } y]t d | d t
 | Œ } | r1t | ƒ d k r1xu | d  D]f } y4 | d 7} t | d	 | | f ƒ } | j | ƒ WqÄt k
 r)} t j d
 g I| IJqÄXqÄWn  | râyv | d 7} t | d d	 | | f ƒ } | j | d j ƒ | d d t | d j ƒ k r¬| j | ƒ d  } n  Wqât k
 rÞ} t j d g I| IJd  } qâXn  Wn) t k
 r} t j d I| IJ‚  n Xt | ƒ d k r¤x€t | d  | d ƒ D]d\ } } t | j ƒ t | j ƒ k rŽt j d I| I| j I| j IJq9n  | t | j ƒ t | j ƒ f } yÂt d | d t
 | Œ } | rp| röt | d ƒ } | j | ƒ n' | d 7} t | d d	 | | f ƒ } | j | j ƒ t | ƒ d k rÿx½ | d D]® } | j | ƒ | d 7} t | d	 | | f ƒ } xo | D]g } y5 t | g ƒ } t | | ƒ } t | t d d ƒ Wq‡t k
 rí} t j d I| IJ‚  q‡Xq‡Wg  } qJWn  | j | j k rpt | ƒ d k rm| j | ƒ | d 7} t | d d	 | | f ƒ } | j | j ƒ qmqpn  Wq9t k
 rœ} t j d I| IJ‚  q9Xq9Wn  | d j } n  y–t | ƒ t | ƒ k  rIt | | | d | d t
 ƒ} | rI| rSy | j t | d ƒ ƒ Wn& t k
 r<} t j d I| IJn X| j | ƒ d  } n~ yU | d 7} t | d d	 | | f ƒ } | rš| j | d j ƒ n  | j | ƒ Wn& t k
 rÐ} t j d I| IJn Xxr | d D]c } y4 | d 7} t | d	 | | f ƒ } | j | ƒ WqÜt k
 r>} t j d I| IJqÜXqÜWqIn  Wn& t k
 rr} t j d I| IJn Xx9 | D]1 } t | g ƒ } t | | ƒ } |	 j | ƒ qzWWn) t k
 rÛ} t j d I| IJ‚  n X|	 S(   Ni   i    s   	i   iÿÿÿÿs   [find_read_islands.l372]R   R+   s   %s.Island%06ds   find_read_islands.l386i   s   find_read_islands.l397s   [find_read_islands.l400]s   Overlapping splices!s   [find_read_islands.l429]s   [find_read_islands.l440]s   [find_read_islands.l455]s   [find_read_islands.l465]s   [find_read_islands.l473]s   [find_read_islands.l476]s   [find_read_islands.l483](   R$   R   R   t   _makeR   RR   t   scoret   MAX_READTHROUGHR   t   countt   MINIMUM_COVERR*   R'   R   R(   R	   R
   R.   R%   R5   R   Rq   Rr   R‡   R0   R1   R   R»   Rn   R^   (   t   argsR)   t   scaffold_lent   juncs_tabixt   cover_tabixt   curr_transcriptt   search_startt
   island_numt   island_transcriptst   resultt   jt   all_splicest   st	   s_splicest   splicesRh   t   inter_splice0R,   R2   R4   R"   t   splice1t   splice2t   inter_spliceR   t   t_dequeR]   (    (    sD   /home/ian/PycharmProjects/RNA-Seq-Simulator/lib/find_read_islands.pyt   get_scaffold_islands[  sð    8C""

!
!$%!


	

%t   __main__i   s   .fais   samtools faidx %st   shelli   i   t   wR_   R+   s   done.(A   t   __doc__R	   t   osR¤   t
   subprocesst   pysamR    t   collectionsR   R   R   t
   gff3RecordR   R   R   RÀ   t   MIN_TRANSCRIPT_LENGTHt   MIN_EXON_LENGTHR¾   R³   R   R   R$   R   R#   R.   R5   R0   RJ   RD   Rn   R~   Rƒ   R†   R›   RŸ   R¢   R¬   R²   R´   R»   RÔ   t   __name__R   t
   genome_seqt   fai_patht   patht   isfilet   callt   Truet   opent	   scaffoldsRÃ   RÄ   R^   R(   Rh   R   R   R)   RÂ   RÁ   t   scaffold_islandsR]   t   close(    (    (    sD   /home/ian/PycharmProjects/RNA-Seq-Simulator/lib/find_read_islands.pyt   <module>	   sd   0				#	#.	
	 			
	-	†
!
