ó
ËdTc           @   sâ   d  d l  Z  d  d l Z d  d l m Z d  d l m Z d  d l m Z m Z d  d l	 m	 Z	 d  d l
 m Z d  d l m Z d  d l Z d f  d	 „  ƒ  YZ e d
 k rÞ d GHd  d l Z e j e j j ƒ  d d ƒ e _ n  d S(   iÿÿÿÿN(   t   deepcopy(   t   Wig(   t   rt   FloatVector(   t   time(   t   randint(   t   logt   readsc           B   s+  e  Z d  d d e d d d „ Z d d d „ Z d „  Z d d d	 „ Z d  d d d
 „ Z d  d d d „ Z d  d d d „ Z	 d  d d d „ Z
 d  d d d „ Z d  d d d „ Z d  d d d „ Z d  d d d „ Z d „  Z d d „ Z d „  Z d „  Z d „  Z d d d d d „ Z d d „ Z RS(   t    i    i
   g»½×Ùß|Û=t   bedc         C   s  i  |  _  | |  _ | |  _ | d k r[ | d k r[ | r[ |  j d | d |  j d | ƒ n¨| d k r• | d k r• |  j d | d |  j d | ƒ nn| d k rÕ | d k rÕ | rÕ |  j d | d |  j d | ƒ n.| d k r| d k r|  j d | d |  j d | ƒ nô | d k rO| d k rO| rO|  j d | d |  j d | ƒ n´ | d k r‰| d k r‰|  j d | d |  j d | ƒ nz | d k rÉ| d k rÉ| rÉ|  j	 d | d |  j d | ƒ n: | d k r| d k r|  j
 d | d |  j d | ƒ n  d	 S(
   s]  
        Description:
            Initialize an instance of the class "reads".
        Parameters:
            rlen: read length
            step: each chrosome will present as an vector, each value in the vector represent a short region of the chrosome, the step value is the size of the short region
            paired: is the reads paired-end
            cut: the cutoff for removing clonal reads, could be P value larger than 0 and small than 1, or count as a positive integer.
            format: the file format of the reads data, can be 'bed','bam',or 'sam'
        Value:
            None
        R   R	   t   filet   stept   cutt   bamt   samt   bowtieN(   t   datat   rlenR   t   loadBedPairedt   loadBedt   loadBamPairedt   loadBamt   loadSamPairedt   loadSamt   loadBowtiePairedt
   loadBowtie(   t   selfR
   R   R   t   pairedR   t   format(    (    sE   /oak/stanford/groups/akundaje/marinovg/programs/danpos-2.2.2/reads.pyt   __init__   s&    			 " " " " " " " i,  c         C   s¾  |  j  } | | :} | | d } d GHi  } x,t | | ƒ D]} d | | <d } xî |  j D]ã } |  j | d j | }	 |  j | d |	  |  j | d | |	 | !}
 | | c |	 |
 j ƒ  |  j | d |	  j ƒ  |  j | d | |	 | !j ƒ  |  j | d |	  j ƒ  |  j | d | |	 | !j ƒ  7<| |	 7} q\ W| | c | d :<q< W| j ƒ  } | j ƒ  t | d ƒ } x7 | D]/ } | j	 t
 | ƒ d t
 | | ƒ d	 ƒ q‡W| S(
   s²  
        Description:
            Calculate correlation between the two strands allowing shift distances.
            
        Parameters:
            ofile: the output file used to write the result to
            minsize: minimal shift distance
            maxsize: maximal shift distance
        
        Value:
            an dictionary with shift distance as key and correlation coefficient  correspondint to the distance
        i   s   calculating ...i    t   +t   -g      ð?t   ws   	s   
(   R   t   rangeR   t   sizet   meant   stdt   keyst   sortt   opent   writet   str(   R   t   ofilet   minsizet   maxsizeR   t   dict   it   szsumt   chrt   szt   ct   kst   fot   k(    (    sE   /oak/stanford/groups/akundaje/marinovg/programs/danpos-2.2.2/reads.pyt   autocorrelation#   s(    	

/Œ
 -c         C   sé   xâ |  j  D]× } d } x’ |  j  | D]ƒ } t j |  j  | | ƒ } | d k rV q$ n  x9 |  j  | | | d d k r‘ | d k r‘ | d 8} qY W| | k  r$ | } q$ q$ Wx3 |  j  | D]$ } |  j  | | j | d d ƒq¹ Wq
 Wd S(   s°   
        Description:
            remove zero values at the end of each chrosome
        
        Parameters:
            None
        
        Value:
            None
        i    i   t   refcheckN(   R   t   numpyR"   t   resize(   R   R0   t
   final_sizeR)   R"   (    (    sE   /oak/stanford/groups/akundaje/marinovg/programs/danpos-2.2.2/reads.pyt   clearEmptyEndD   s     .   c         C   s	  d } |  j  } | | :} | | d } |  j ƒ  |  j  } t d ƒ } d t | ƒ t d ƒ } t | d ƒ } xG d t t | | | ƒ ƒ j ƒ  d ƒ t d ƒ | k  r¹ | d 7} qs W| d k  rÏ d } n  d GHi  } x! t | | ƒ D] }	 d | |	 <qê Wx|  j	 D]ú }
 |  j	 |
 d	 j
 | } | d k r8qn  t |  j	 |
 ƒ } xc | j ƒ  D]U } | | c | 8<| | d
 d | | d
 | | <|  j	 |
 | | | | | <qXWxN t | | ƒ D]= }	 | d	 |  | d |	 | |	 !} | |	 c | j ƒ  7<qÁWqWg  } t | j ƒ  ƒ } | d k  r3d } n  d GHxœ t | | ƒ D]‹ }	 d } x3 t d t d | |	 | ƒ ƒ D] } | d 7} qvW| Gt |	 | ƒ d Gt | |	 ƒ GH| |	 | d k rH| j |	 ƒ qHqHWt } x0 t | | d ƒ D] }	 |	 | k rñt } qñqñWx0 t | d | ƒ D] }	 |	 | k r$t } q$q$W| rQd GHn  d \ } } x. | D]& } | | | | 7} | | | 7} qdWt | | d ƒ } | | d k  rºd GHn  | | d k  rÒd GHn  d Gx | D] } | | GqÝWd GHd G| | GH| | S(   so  
        Description:
            Calculate most probable size of DNA fragments from which a set of sequencing reads are sequenced.
            
        Parameters:
            minsize: minimal shift distance
            maxsize: maximal shift distance
        
        Value:
            An interger representing the most probable fragment size
            
        g»½×Ùß|Û=i   s?   function(q,avg){return(ppois(q,avg,lower.tail=FALSE,log=TRUE))}i    i
   g      à?iÿÿÿÿs   calculating fragment size ...R   i   R   s   sizes distribution:R   id   t   bpgffffffî?i   s‚   warning: the probilities of calculated size and up/bottom sizes are too close, we suggest to change up/bottom limit and try again!g        s|   warning: the calculated fragment size seems too close the the bottom limit, we suggest to change bottom limit and try again!sl   warning: the calculated fragment size seems too close the the up limit, we to change up limit and try again!s   potential size:s   calculated fragment size:(   g        g        (   R   R#   R   R   t   intt   floatR)   t   splitR!   R   R"   R    R%   t   sumt   maxt   valuest   appendt   Falset   True(   R   R+   R,   R   R   t   avgt   ppoist   lgpcutR-   R.   R0   R1   t   tchrt   straR2   t   pt   mt   olinet   jt   warningt   upvt   dpvt	   finalsizet   t(    (    sE   /oak/stanford/groups/akundaje/marinovg/programs/danpos-2.2.2/reads.pyt   fragSizeDisW   s€    	
<  	  "!! 	( %       c         C   sµ  d G| Gd GHd } i  } d } t  | ƒ } | d d k rM t j  | ƒ } n  x°| D]¨} yQ | d  j d ƒ }	 |	 d	 t |	 d
 ƒ |	 d }
 } } | t |	 d ƒ } Wn | j ƒ  GHqT n X| d 7} | d d k ré | Gd GHn  | d k r| | } n | d k r| | } n  |  j j |
 ƒ smi t j d g ƒ d 6t j d g ƒ d 6|  j |
 <d | |
 <n  | | |
 k rÒ| d | |
 <|  j |
 d j	 | |
 d d ƒ|  j |
 d j	 | |
 d d ƒn  | d k rT |  j |
 | | c d 7<qT qT W|  j
 ƒ  xx |  j D]m }
 t |  j |
 d j |  j |
 d j ƒ } |  j |
 d j	 | d d ƒ|  j |
 d j	 | d d ƒqW| d k r¤|  j d | ƒ n  d G| Gd GHd S(   s  
        Description:
            load single-end reads data from a file in '.bed' format
        
        parameter:
            file: a path to the file containing the sequencing reads
            step: each chrosome will present as an vector, each value in the vector represent a short region of the chrosome, the step value is the size of the short region
            cut: the cutoff for removing clonal reads, could be P value larger than 0 and small than 1, or count as a positive integer.
        Value:
            None
        s   
parsing from bowtie files   ...R   i    iþÿÿÿt   gziÿÿÿÿs   	i   i   i   i   i@B s   reads parsedR   R   g        iè  R7   g      ð?R   s   parsing finished,N(   R'   t   gzipR?   R=   t   lenR   t   has_keyR8   t   arrayR9   R;   RA   R"   t   rvClonal(   R   R
   R   R   t   oldchrt   sizest   numt   infilet   linet   colR0   t   startRJ   t   endt   midt   lth(    (    sE   /oak/stanford/groups/akundaje/marinovg/programs/danpos-2.2.2/reads.pyR   œ   sN     &
   3"% "
+" c         C   sÅ  d G| Gd GHd } i  } i  } d } d" \ } }	 t  | ƒ }
 | d d k r_ t j  | ƒ }
 n  g  g  } } x³|
 D]«} y,| d  j d ƒ } | d	 t | d
 ƒ | d d | d f \ } } } } } | j ƒ  } t | ƒ d	 k rü | d | d } n  | t | d ƒ } t | ƒ d k  rf| | | | | | g } | d 7} | d d k r`| Gd GHn  ws n> | | | | | | g } | d 7} | d d k r¤| Gd GHn  Wn d G| d  GHqs n X| d
 d  | d
 d  k rï| d d k r6| d d k r6| d | d | d	 d	 | | d	 | d } } } nŠ | d d k r‘| d d k r‘| d | d	 | d d	 | | d | d	 } } } n/ d G| Gd G| Gd GH| d 7} g  g  } } qs | j | ƒ sÜd | | <q| | c d 7<n& d G| Gd GH|	 d 7}	 | } g  } qs |  j j | ƒ sgi t j d g ƒ d 6t j d g ƒ d 6|  j | <d | | <n  | | | k rÌ| d | | <|  j | d j	 | | d d ƒ|  j | d j	 | | d d ƒn  | d k r|  j | d | c d 7<|  j | d | c d 7<n  g  g  } } qs W|  j
 ƒ  xx |  j D]m } t |  j | d j |  j | d j ƒ } |  j | d j	 | d d ƒ|  j | d j	 | d d ƒq6Wd G| Gd GHt | j ƒ  ƒ | j ƒ  } } | j ƒ  d# \ } } g  } x„ | D]| } | | | k r| j | ƒ n  | | | | 7} | | | 7} d | | | } | d k röd | G| G| | GHqöqöWd G| d | GHd G| GH| Gd GH|	 Gd GH| d k rÁ|  j d  | ƒ n  d! S($   sþ  
        Description:
            load paired-end reads data from a file in '.bed' format, the "name" fields of each reads pair must be the same except for the last one character in each field, which should be either '1' or '2', e.g. 'reads/1' or 'reads/2', each pair of reads must be arranged in two neighboring lines.
        
        parameter:
            file: a path to the file containing the sequencing reads
            step: each chrosome will present as an vector, each value in the vector represent a short region of the chrosome, the step value is the size of the short region
            cut: the cutoff for removing clonal reads, could be P value larger than 0 and small than 1, or count as a positive integer.
        Value:
            None
        s   
parsing from bowtie files   ...R   i    iþÿÿÿRU   iÿÿÿÿs   	i   i   i   i   i@B s   reads parseds   wrong line:i   R   R   s'   pair error --- reads from same strand:
s   
s!   pair error --- single end reads:
g        iè  R7   g      ð?s   parsing finished,id   s   average fragment size:s   most enriched fragment size:s-   pairs failed due to locations on same strandss   reads have no mate readsR   N(   i    i    (   i    i    (   R'   RV   R?   R=   RW   RX   R   R8   RY   R9   R;   RA   R"   RB   R%   R&   RC   RZ   (   R   R
   R   R   R[   R\   t	   fragsizesR]   t   serrt   nerrR^   t   end1t   end2R_   R`   R0   Ra   t   namet   scoreRJ   t   tnamesRb   Rc   t   fragsizeRd   t   maxvt   lthst   tltht   countt   maxltht   dcount(    (    sE   /oak/stanford/groups/akundaje/marinovg/programs/danpos-2.2.2/reads.pyR   Ú   s¦     : 
 
  ; ;
 
3"%
+"
  			 c         C   s°  d G| Gd GHd } i  } d } t  | ƒ } | d d k rM t j  | ƒ } n  x«| D]£} yH | j ƒ  }	 |	 d d !\ }
 } } } } } t | ƒ t | ƒ } } Wn d G| j ƒ  GHqT n X| d	 7} | d
 d k rä | Gd GHn  | d k rý | | } n | d k r| | } n  |  j j |
 ƒ shi t j d g ƒ d 6t j d g ƒ d 6|  j |
 <d | |
 <n  | | |
 k rÍ| d | |
 <|  j |
 d j | |
 d d ƒ|  j |
 d j | |
 d d ƒn  | d k rT |  j |
 | | c d 7<qT qT W|  j	 ƒ  xx |  j D]m }
 t
 |  j |
 d j |  j |
 d j ƒ } |  j |
 d j | d d ƒ|  j |
 d j | d d ƒqW| d k rŸ|  j d | ƒ n  d G| Gd GHd S(   s  
        Description:
            load single-end reads data from a file in '.bed' format
        
        parameter:
            file: a path to the file containing the sequencing reads
            step: each chrosome will present as an vector, each value in the vector represent a short region of the chrosome, the step value is the size of the short region
            cut: the cutoff for removing clonal reads, could be P value larger than 0 and small than 1, or count as a positive integer.
        Value:
            None
        s   
parsing from bed files   ...R   i    iþÿÿÿRU   i   s   wrong format:i   i@B s   reads parsedR   R   g        iè  R7   g      ð?R   s   parsing finished,N(   R'   RV   R?   R=   R   RX   R8   RY   R9   R;   RA   R"   RZ   (   R   R
   R   R   R[   R\   R]   R^   R_   R`   R0   Ra   Rb   Rj   Rk   RJ   Rc   Rd   (    (    sE   /oak/stanford/groups/akundaje/marinovg/programs/danpos-2.2.2/reads.pyR   8  sN     
   3"% "
+" c         C   sf  d G| Gd GHd } i  } i  } d } d \ } }	 t  | ƒ }
 | d d k r_ t j  | ƒ }
 n  g  g  } } xT|
 D]L} yï | d  j d ƒ } | d	 j ƒ  } t | ƒ d
 k rÎ | d | d	 d | d	 <n  t | d ƒ t | d
 ƒ | d <| d
 <t | ƒ d k  r;| } | d 7} | d d k r5| Gd GHn  ws n, | } | d 7} | d d k rg| Gd GHn  Wn d G| d  GHqs n X| d	 d  | d	 d  k r| d d k rù| d d k rù| d | d | d
 d
 | | d
 | d } } } nu | d d k rT| d d k rT| d | d
 | d d
 | | d | d
 } } } n | d 7} g  g  } } qs | j | ƒ sŠd | | <q¶| | c d 7<n |	 d 7}	 | } g  } qs |  j j | ƒ si t j d g ƒ d 6t j d g ƒ d 6|  j | <d | | <n  | | | k rm| d | | <|  j | d j	 | | d d ƒ|  j | d j	 | | d d ƒn  | d k r²|  j | d | c d 7<|  j | d | c d 7<n  g  g  } } qs W|  j
 ƒ  xx |  j D]m } t |  j | d j |  j | d j ƒ } |  j | d j	 | d d ƒ|  j | d j	 | d d ƒq×Wd G| Gd GHt | j ƒ  ƒ | j ƒ  } } | j ƒ  d  \ } } g  } x„ | D]| } | | | k r½| j | ƒ n  | | | | 7} | | | 7} d | | | } | d k r—d | G| G| | GHq—q—Wd G| d | GHd G| GH| Gd GH|	 Gd GH| d k rb|  j d | ƒ n  d S(!   sþ  
        Description:
            load paired-end reads data from a file in '.bed' format, the "name" fields of each reads pair must be the same except for the last one character in each field, which should be either '1' or '2', e.g. 'reads/1' or 'reads/2', each pair of reads must be arranged in two neighboring lines.
        
        parameter:
            file: a path to the file containing the sequencing reads
            step: each chrosome will present as an vector, each value in the vector represent a short region of the chrosome, the step value is the size of the short region
            cut: the cutoff for removing clonal reads, could be P value larger than 0 and small than 1, or count as a positive integer.
        Value:
            None
        s   
parsing from bed files   ...R   i    iþÿÿÿRU   iÿÿÿÿs   	i   i   i   i@B s   reads parseds   wrong line:i   R   R   g        iè  R7   g      ð?s   parsing finished,id   i   s   average fragment size:s   most enriched fragment size:s-   pairs failed due to locations on same strandss   reads have no mate readsR   N(   i    i    (   i    i    (   R'   RV   R?   RW   R=   RX   R   R8   RY   R9   R;   RA   R"   RB   R%   R&   RC   RZ   (   R   R
   R   R   R[   R\   Re   R]   Rf   Rg   R^   Rh   Ri   R_   R`   Rl   R0   Rc   Rm   Rd   Rn   Ro   Rp   Rq   Rr   Rs   (    (    sE   /oak/stanford/groups/akundaje/marinovg/programs/danpos-2.2.2/reads.pyR   u  s       )
 
  ; ;
 
3"%
+"
  			 c         C   sÜ  d G| Gd GHd } i  } d } t  j d | ƒ } xò| D]ê} y“ | j d ƒ }	 d |	 d k rg w9 n  |	 d |	 d	 d
 t |	 d ƒ t |	 d ƒ f \ }
 } } } } d |	 d k rÀ d } n  | | d } } Wn | j ƒ  GHq9 n X| d 7} | d d k r| Gd GHn  | d
 k r)| | } n | d k rB| | } n  |  j j | ƒ s”i t j d g ƒ d
 6t j d g ƒ d 6|  j | <d | | <n  | | | k rù| d | | <|  j | d
 j	 | | d d ƒ|  j | d j	 | | d d ƒn  | d k r9 |  j | | | c d 7<q9 q9 W|  j
 ƒ  xx |  j D]m } t |  j | d
 j |  j | d j ƒ } |  j | d
 j	 | d d ƒ|  j | d j	 | d d ƒq;W| d k rË|  j d | ƒ n  d G| Gd GHd S(   s  
        Description:
            load single-end reads data from a file in '.sam' format
        
        parameter:
            file: a path to the file containing the sequencing reads
            step: each chrosome will present as an vector, each value in the vector represent a short region of the chrosome, the step value is the size of the short region
            cut: the cutoff for removing clonal reads, could be P value larger than 0 and small than 1, or count as a positive integer.
        Value:
            None
        s   
parsing from sam files   ...R   i    s   samtools view  -XS s   	t   ui   i   R   i   i	   R   R   t   1i@B s   reads parsedg        iè  R7   g      ð?R   s   parsing finished,N(   t   ost   popenR?   R=   RW   R   RX   R8   RY   R9   R;   RA   R"   RZ   (   R   R
   R   R   R[   R\   R]   R^   R_   R`   Rj   R0   RJ   Ra   R   Rb   Rk   Rc   Rd   (    (    sE   /oak/stanford/groups/akundaje/marinovg/programs/danpos-2.2.2/reads.pyR   Ñ  sR     @ 	
   3"% "
+" c         C   s8  d } i  } i  } d } t  j d | ƒ } d G| Gd GHxl| D]d}	 y?|	 j d ƒ }
 d |
 d k rm w? n  t |
 d	 ƒ t |
 d
 ƒ t |
 d ƒ |
 d	 <|
 d
 <|
 d <|
 d |
 d d d |
 d	 |
 d
 |
 d d | f \ } } } } } d |
 d k rd } n  |
 d
 |
 d	 k r=|
 d
 |
 d	 |
 d } n |
 d	 |
 d
 |
 d } | j | ƒ ssd | | <n | | c d 7<Wn |	 j ƒ  GHq? n X| d 7} | d d k rÂ| Gd GHn  |  j j | ƒ si t j d g ƒ d 6t j d g ƒ d 6|  j | <d | | <n  | | | k ry| d | | <|  j | d j	 | | d d ƒ|  j | d j	 | | d d ƒn  | d k r? |  j | | | c d 7<q? q? W|  j
 ƒ  xx |  j D]m } t |  j | d j |  j | d j ƒ } |  j | d j	 | d d ƒ|  j | d j	 | d d ƒq»W| d k rK|  j d | ƒ n  d G| Gd GHt | j ƒ  ƒ | j ƒ  } } | j ƒ  d \ } } g  } x„ | D]| } | | | k rÀ| j | ƒ n  | | | | 7} | | | 7} d | | | } | d k ršd | G| G| | GHqšqšWd G| d | GHd G| GHd S(   s  
        Description:
            load paired-end reads data from a file in '.sam' format.
        
        parameter:
            file: a path to the file containing the sequencing reads
            step: each chrosome will present as an vector, each value in the vector represent a short region of the chrosome, the step value is the size of the short region
            cut: the cutoff for removing clonal reads, could be P value larger than 0 and small than 1, or count as a positive integer.
        Value:
            None
        R   i    s   samtools view -XS s   
parsing from sam files   ...s   	t   Pi   i   i   i	   i   R   Ru   R   R   i@B s   reads parsedg        iè  R7   g      ð?R   s   parsing finished,id   i   s   average fragment size:s   most enriched fragment size:N(   i    i    (   Rv   Rw   R?   R=   RW   RX   R   R8   RY   R9   R;   RA   R"   RZ   RB   R%   R&   RC   (   R   R
   R   R   R[   R\   Re   R]   R^   R_   R`   Rj   R0   RJ   Rk   Rc   Rm   Rd   Rn   Ro   Rp   Rq   Rr   Rs   (    (    sE   /oak/stanford/groups/akundaje/marinovg/programs/danpos-2.2.2/reads.pyR      st     >H 	  
 3"% "
+" 
  c         C   sÜ  d G| Gd GHd } i  } d } t  j d | ƒ } xò| D]ê} y“ | j d ƒ }	 d |	 d k rg w9 n  |	 d |	 d	 d
 t |	 d ƒ t |	 d ƒ f \ }
 } } } } d |	 d k rÀ d } n  | | d } } Wn | j ƒ  GHq9 n X| d 7} | d d k r| Gd GHn  | d
 k r)| | } n | d k rB| | } n  |  j j | ƒ s”i t j d g ƒ d
 6t j d g ƒ d 6|  j | <d | | <n  | | | k rù| d | | <|  j | d
 j	 | | d d ƒ|  j | d j	 | | d d ƒn  | d k r9 |  j | | | c d 7<q9 q9 W|  j
 ƒ  xx |  j D]m } t |  j | d
 j |  j | d j ƒ } |  j | d
 j	 | d d ƒ|  j | d j	 | d d ƒq;W| d k rË|  j d | ƒ n  d G| Gd GHd S(   s  
        Description:
            load single-end reads data from a file in '.bam' format
        
        parameter:
            file: a path to the file containing the sequencing reads
            step: each chrosome will present as an vector, each value in the vector represent a short region of the chrosome, the step value is the size of the short region
            cut: the cutoff for removing clonal reads, could be P value larger than 0 and small than 1, or count as a positive integer.
        Value:
            None
        s   
parsing from Bam files   ...R   i    s   samtools view -X s   	Rt   i   i   R   i   i	   R   R   Ru   i@B s   reads parsedg        iè  R7   g      ð?R   s   parsing finished,N(   Rv   Rw   R?   R=   RW   R   RX   R8   RY   R9   R;   RA   R"   RZ   (   R   R
   R   R   R[   R\   R]   R^   R_   R`   Rj   R0   RJ   Ra   R   Rb   Rk   Rc   Rd   (    (    sE   /oak/stanford/groups/akundaje/marinovg/programs/danpos-2.2.2/reads.pyR   >  sR     @ 	
   3"% "
+" c         C   s8  d G| Gd GHd } i  } i  } d } t  j d | ƒ } xl| D]d}	 y?|	 j d ƒ }
 d |
 d k rm w? n  t |
 d	 ƒ t |
 d
 ƒ t |
 d ƒ |
 d	 <|
 d
 <|
 d <|
 d |
 d d d |
 d	 |
 d
 |
 d d | f \ } } } } } d |
 d k rd } n  |
 d
 |
 d	 k r=|
 d
 |
 d	 |
 d } n |
 d	 |
 d
 |
 d } | j | ƒ ssd | | <n | | c d 7<Wn |	 j ƒ  GHq? n X| d 7} | d d k rÂ| Gd GHn  |  j j | ƒ si t j d g ƒ d 6t j d g ƒ d 6|  j | <d | | <n  | | | k ry| d | | <|  j | d j	 | | d d ƒ|  j | d j	 | | d d ƒn  | d k r? |  j | | | c d 7<q? q? W|  j
 ƒ  xx |  j D]m } t |  j | d j |  j | d j ƒ } |  j | d j	 | d d ƒ|  j | d j	 | d d ƒq»W| d k rK|  j d | ƒ n  d G| Gd GHt | j ƒ  ƒ | j ƒ  } } | j ƒ  d \ } } g  } x„ | D]| } | | | k rÀ| j | ƒ n  | | | | 7} | | | 7} d | | | } | d k ršd | G| G| | GHqšqšWd G| d | GHd G| GHd S(   s  
        Description:
            load paired-end reads data from a file in '.bam' format
        
        parameter:
            file: a path to the file containing the sequencing reads
            step: each chrosome will present as an vector, each value in the vector represent a short region of the chrosome, the step value is the size of the short region
            cut: the cutoff for removing clonal reads, could be P value larger than 0 and small than 1, or count as a positive integer.
        Value:
            None
        s   
parsing from bam files   ...R   i    s   samtools view  -X s   	Rx   i   i   i   i	   i   R   Ru   R   R   i@B s   reads parsedg        iè  R7   g      ð?R   s   parsing finished,id   i   s   average fragment size:s   most enriched fragment size:N(   i    i    (   Rv   Rw   R?   R=   RW   RX   R   R8   RY   R9   R;   RA   R"   RZ   RB   R%   R&   RC   (   R   R
   R   R   R[   R\   Re   R]   R^   R_   R`   Rj   R0   RJ   Rk   Rc   Rm   Rd   Rn   Ro   Rp   Rq   Rr   Rs   (    (    sE   /oak/stanford/groups/akundaje/marinovg/programs/danpos-2.2.2/reads.pyR   n  st     >H 	  
 3"% "
+" 
  c         C   s   |  j  ƒ  d |  j ƒ  S(   s±   
        Description:
            calculate the average reads count per nucleotide
        
        Parameter:
            None
        
        Value:
            None
        g      ð?(   R@   R"   (   R   (    (    sE   /oak/stanford/groups/akundaje/marinovg/programs/danpos-2.2.2/reads.pyR#   ­  s    c         C   s
  t  ƒ  } t | ƒ } | d k r% d Sd GH|  j ƒ  |  j } | d k rí | d k  rí t d ƒ } d t | ƒ t d ƒ } t | d ƒ } xG d t t | | | ƒ ƒ j ƒ  d ƒ t d ƒ | k  rÔ | d 7} qŽ W| d k  rí d } qí n  d	 G| Gd
 G| GH| |  j 9} d } d } d } t	 j
 d g ƒ }	 |  j ƒ  }
 d G|
 Gd GHx} |  j D]r } xi |  j | j ƒ  D]T } t |  j | | ƒ }	 |	 | 8}	 |	 d d |	 d }	 |  j | | c |	 8<qlWqRW|  j ƒ  } d G| Gd GH|
 | d |
 Gd GHd Gt  ƒ  | GHd S(   sÓ  
        Description:
            Remove clonal reads, not that this process may change the difference between samples.
            If dno't want to change the difference, transfer all samples to wigs and use the rvClonal function of the Wigs class.
            
        Parameter:
            cut: the cutoff for removing clonal reads, could be P value larger than 0 and small than 1, or count as a positive integer.
        
        Value:
            None
        i    Ns   removing clonal reads ...i   s?   function(q,avg){return(ppois(q,avg,lower.tail=FALSE,log=TRUE))}i
   g      à?iÿÿÿÿs%   whole genome average reads density iss   use cutoff:g        s   before removing:R   i   s   after removing:id   s   percent removed.s
   time cost:(   R   R>   R#   R   R   R   R=   R)   R?   R8   RY   R@   R   R%   R    (   R   R   t   ssRF   RG   RH   t   cnumt   rreadst   treadst   tchrvt   beforeR0   RJ   t   after(    (    sE   /oak/stanford/groups/akundaje/marinovg/programs/danpos-2.2.2/reads.pyRZ   ¹  s@    	 <  
c         C   sU   d } xA |  j  D]6 } x- |  j  | D] } | |  j  | | j 7} q$ Wq W| |  j S(   sÅ   
        Description:
            Calculate the total genome size.
        
        Parameter:
            None
        
        value:
            Interger value representing genome size.
        i    (   R   R"   R   (   R   Rd   R0   RJ   (    (    sE   /oak/stanford/groups/akundaje/marinovg/programs/danpos-2.2.2/reads.pyR"   ã  s
     c         C   s¼   i  } x? t  | ƒ D]1 } | j ƒ  } t | d ƒ |  j | | d <q Wxm |  j D]b } | j | ƒ sz |  j j | ƒ n  x7 |  j | D]( } |  j | | j | | d d ƒqˆ WqR Wd S(   sr  
        Description:
            Adjust the size of each chrosome.
        
        Parameter:
            gfile: path to the file containing the size of each chrosome, each line in the file would be in the format "chrosome_name size", in which size is an integer value, and chrosome_name should contain no empty space
        
        value:
            None.
        i   i    R7   N(   R'   R?   R=   R   R   RX   t   popR9   (   R   t   gfileR\   R_   R`   R0   R)   (    (    sE   /oak/stanford/groups/akundaje/marinovg/programs/danpos-2.2.2/reads.pyt
   sizeAdjustó  s    # c         C   sQ   d } xD |  j  D]9 } x0 |  j  | D]! } | |  j  | | j ƒ  7} q$ Wq W| S(   sÙ   
        Description:
            calculate the total reads in the whole genome
        
        Parameter:
            None
        
        Value:
            an interger value representing the reads count.
        i    (   R   R@   (   R   t   vR0   RJ   (    (    sE   /oak/stanford/groups/akundaje/marinovg/programs/danpos-2.2.2/reads.pyR@     s
    #c      
   C   sc  |  j  } | d k r0 |  j d | d | ƒ } n  | d k rE | } n  d G| GH| } | d | | d | } } t d | ƒ } d GHx¿|  j D]´}	 t d | d	 | d	 ƒ }
 |  j |	 d
 j |
 k  rê |  j |	 d
 j |
 d d ƒn  |  j |	 d j |
 k  r%|  j |	 d j |
 d d ƒn  | j |	 ƒ |  j |	 d
 j } | j	 |	 | | ƒ |  j |	 d
 d | | !|  j |	 d
 | | +x) t
 | ƒ D] } d |  j |	 d
 | <q•W|  j |	 d
 d | | c !|  j |	 d | | !7+xX t
 | | d ƒ D]B } | j |	 | | | c !|  j |	 d
 | | | | | !7+qýWq W| j | d | j  ƒ | S(   s½  
        Description:
            Calculate nucleosome occupancy from the reads data
        
        Parameter:
            fs: average size of fragments that are subject to sequencing and generate the reads, only for signgle-end reads. When this value is not given, a fs value will be infered by the program. For paired-end reads loaded buy the function loadBedPaired(), set fs to 0.
            extend: a interger value, each read will be extend to this length.
            mifrsz: the minimal estimated average fragment size, only for single-end reads 
            mafrsz: the maximal estimated average fragment size, only for single-end reads
        
        Value: a Wig class instance
        R+   R,   i    s	   extend toi   R   s   generating wig ...iè  i   R   R7   R   i   g      ð?N(   R   t   NoneRT   R   R   RA   R"   R9   t   addChrt	   resizeChrR!   t
   foldChange(   R   t   fst   extendt   mifrszt   mafrszR   t
   old_extendRm   t   wgR0   t   tmaxRd   R.   RK   (    (    sE   /oak/stanford/groups/akundaje/marinovg/programs/danpos-2.2.2/reads.pyt   toWig  s6    	  		 ! !. 4 Dg      ð?c         C   sê  | d k r | d } n  |  } t  j d g ƒ } x²| j D]§} xž| j | D]} | GH| j t | j | | j ƒ  ƒ d d ƒ| d | j | | j d d f \ } } } } x‰ | | k  r?t | | j | | | ƒ }	 |	 | j k r| j |	 d d d ƒn  x$ | |	 k  r1| | | <| d 7} qW| d 7} q· Wd t | j | | j ƒ  | ƒ | } }
 } | d k r•| j | | c d 9<n  xF | |
 k  rÝ| d 7} | j | | | t d | d ƒ c d 7<q˜WqO Wq; Wd S(   s  
        Description:
            Randomly sample the reads count to a specified fold
            
        Parameter:
            fold: randomly sample the total reads count to the original reads count multiply by this fold value
        
        Value:
            None
        i   g        R7   i    iè  N(   R8   RY   R   R9   R=   R@   R"   R   (   R   t   foldt   wigt   tarrayR0   R)   t   cszR.   t   tszt   newtszt   tnum(    (    sE   /oak/stanford/groups/akundaje/marinovg/programs/danpos-2.2.2/reads.pyt   foldSampling:  s.     *0 
/ 
N(   t   __name__t
   __module__RD   R   R6   R;   RT   R   R   R   R   R   R   R   R   R#   RZ   R"   R‚   R@   R„   R   R—   (    (    (    sE   /oak/stanford/groups/akundaje/marinovg/programs/danpos-2.2.2/reads.pyR   
   s&   !	E>^=\/>0?	*			"t   __main__R   R    i    (   R8   Rv   t   copyR    R‘   R   t   rpy2.robjectsR   R   R   t   randomR   t   mathR   RV   R   R˜   t   syst   fdopent   stdoutt   fileno(    (    (    sE   /oak/stanford/groups/akundaje/marinovg/programs/danpos-2.2.2/reads.pyt   <module>   s   ÿ ÿ ÿ T