ó
QŽfXc           @   s¨  d  d l  Z  d  d l Z d  d l Z d  d l Z d  d l Z d  d l Z d  d l m Z d  d l Z d  d l	 Z	 y d  d l
 Z
 Wn e k
 r  e j d ƒ n Xe e j ƒ e j j d ƒ rÙ d  d l m Z m Z n+ e j j d ƒ rd  d l m Z m Z n  d Z e e	 j Z d Z d	 e f d
 „  ƒ  YZ d „  Z d „  Z d e f d „  ƒ  YZ d „  Z e d „ Z d e f d „  ƒ  YZ  d „  Z! e d „ Z" d „  Z# d S(   iÿÿÿÿN(   t
   addsitedirsI   Can't process Lucigen NxMate reads! Python module regex is not installed!s   2.(   t   Parallelt   delayeds   3.i¨a  i   t
   ParseFastQc           B   sD   e  Z d  Z d d g d „ Z d „  Z d „  Z d „  Z d „  Z RS(   s@   Returns a read-by-read fastQ parser analogous to file.readline()t   @t   +c         C   sL   | j  d ƒ r$ t j | ƒ |  _ n t | d ƒ |  _ d |  _ | |  _ d S(   s  Returns a read-by-read fastQ parser analogous to file.readline().
        Exmpl: parser.next()
        -OR-
        Its an iterator so you can do:
        for rec in parser:
            ... do something with rec ...

        rec is tuple: (seqHeader,seqStr,qualHeader,qualStr)
        s   .gzt   rUi    N(   t   endswitht   gzipt   opent   _filet   _currentLineNumbert   _hdSyms(   t   selft   filePatht   headerSymbols(    (    sr   /oak/stanford/groups/akundaje/marinovg/programs/SPAdes-3.13.0-Linux/share/spades/spades_pipeline/lucigen_nxmate.pyt   __init__=   s
    
	c         C   s   |  S(   N(    (   R   (    (    sr   /oak/stanford/groups/akundaje/marinovg/programs/SPAdes-3.13.0-Linux/share/spades/spades_pipeline/lucigen_nxmate.pyt   __iter__N   s    c         C   s
   |  j  ƒ  S(   N(   t   __next__(   R   (    (    sr   /oak/stanford/groups/akundaje/marinovg/programs/SPAdes-3.13.0-Linux/share/spades/spades_pipeline/lucigen_nxmate.pyt   nextQ   s    c         C   sƒ  g  } x^ t  d ƒ D]P } |  j j ƒ  } |  j d 7_ | rV | j | j d ƒ ƒ q | j d ƒ q Wg  | D] } t | ƒ ^ qn j t	 ƒ } | j d ƒ } | d k r³ t
 ‚ n  | d k sÒ t d |  j ƒ ‚ | d j |  j d ƒ st d |  j d |  j f ƒ ‚ | d j |  j d ƒ sFt d |  j d |  j f ƒ ‚ t | d ƒ t | d	 ƒ k syt d
 |  j ƒ ‚ t | ƒ S(   s{   Reads in next element, parses, and does minimal verification.
        Returns: tuple: (seqHeader,seqStr,qualHeader,qualStr)i   i   s   
s¥   ** ERROR: It looks like I encountered a premature EOF or empty line.
            Please check FastQ file near line number %s (plus or minus ~4 lines) and try again**i    s¢   ** ERROR: The 1st line in fastq element does not start with '%s'.
            Please check FastQ file near line number %s (plus or minus ~4 lines) and try again**i   s¢   ** ERROR: The 3rd line in fastq element does not start with '%s'.
            Please check FastQ file near line number %s (plus or minus ~4 lines) and try again**i   s»   ** ERROR: The length of Sequence data and Quality data of the last record aren't equal.
               Please check FastQ file near line number %s (plus or minus ~4 lines) and try again**N(   t   rangeR
   t   readlineR   t   appendt   stript   Nonet   boolt   countt   Truet   StopIterationt   AssertionErrort
   startswithR   t   lent   tuple(   R   t   elemListt   it   linet   xt   truest   nones(    (    sr   /oak/stanford/groups/akundaje/marinovg/programs/SPAdes-3.13.0-Linux/share/spades/spades_pipeline/lucigen_nxmate.pyR   T   s0    (	#c         C   s   |  j  r |  j  j ƒ  n  d  S(   N(   R
   t   close(   R   (    (    sr   /oak/stanford/groups/akundaje/marinovg/programs/SPAdes-3.13.0-Linux/share/spades/spades_pipeline/lucigen_nxmate.pyR'   }   s    	(   t   __name__t
   __module__t   __doc__R   R   R   R   R'   (    (    (    sr   /oak/stanford/groups/akundaje/marinovg/programs/SPAdes-3.13.0-Linux/share/spades/spades_pipeline/lucigen_nxmate.pyR   :   s   			)c         C   sW   xP t  |  | ƒ D]? \ } } x0 | D]( } x | D] } | j | d ƒ q0 Wq# Wq Wd  S(   Ns   
(   t   zipt   write(   t   file_handlerst   record_listst   file_handlert   record_listt   recordR#   (    (    sr   /oak/stanford/groups/akundaje/marinovg/programs/SPAdes-3.13.0-Linux/share/spades/spades_pipeline/lucigen_nxmate.pyt   write_to_files‚   s    c         C   sm   t  |  ƒ t | ƒ } g  } d } xD | t  |  ƒ k  rh | j |  t | ƒ t | | ƒ !ƒ | | 7} q% W| S(   Ng        (   R   t   floatR   t   int(   t   lt   nt   avgt   outt   last(    (    sr   /oak/stanford/groups/akundaje/marinovg/programs/SPAdes-3.13.0-Linux/share/spades/spades_pipeline/lucigen_nxmate.pyt   split_into_chunks‰   s    $t
   CleanStatsc           B   s   e  Z d  „  Z d „  Z RS(   c         C   sU   d |  _  d |  _ d |  _ d |  _ d d d d d d d d d d d d g |  _ d  S(   Ni    (   t   readcountert   matecountert   TOTALmatecountert   slagcountert
   csscounter(   R   (    (    sr   /oak/stanford/groups/akundaje/marinovg/programs/SPAdes-3.13.0-Linux/share/spades/spades_pipeline/lucigen_nxmate.pyR   ”   s
    				c         C   s   |  j  | j  7_  |  j | j 7_ |  j | j 7_ |  j | j 7_ g  t |  j | j ƒ D] \ } } | | ^ q^ |  _ |  S(   N(   R<   R=   R>   R?   R+   R@   (   R   t   otherR$   t   y(    (    sr   /oak/stanford/groups/akundaje/marinovg/programs/SPAdes-3.13.0-Linux/share/spades/spades_pipeline/lucigen_nxmate.pyt   __add__›   s    5(   R(   R)   R   RC   (    (    (    sr   /oak/stanford/groups/akundaje/marinovg/programs/SPAdes-3.13.0-Linux/share/spades/spades_pipeline/lucigen_nxmate.pyR;   “   s   	c         C   sZ  t  ƒ  } g  } g  } g  } g  } x |  D]\ } }	 | j d 7_ xút | ƒ D]ì\ }
 } t j | | d ƒ } | |
 } t j | |	 d ƒ } | rÌ| rÌ| j d 7_ | j |
 c d 7<| j ƒ  } | d } | d } | d } | | } | d } | | } | j ƒ  } | d } | d } |	 d } | | } |	 d } | | } t | ƒ t	 k rÈt | ƒ t	 k rÈ| j
 d 7_
 | j | d | | d | g ƒ | j |	 d | |	 d | g ƒ n  PqP |
 d k rP | j d 7_ | j | d | d | d | d g ƒ | j |	 d |	 d |	 d |	 d g ƒ qP qP Wq( W| | | | g | f S(   Ni   i   i   i    i   i   (   R;   R<   t	   enumeratet   regext   searchR>   R@   t   spanR   t   minseqR=   R   R?   (   t   readst   csslist1t   csslist2t   statst   processed_out1t   processed_out2t   processed_slag1t   processed_slag2t   recR1t   recR2t   cssindext   css1t   mt   css2R6   t	   R1matchest   mendt   mySeqt   myR1t   myQual1t   myR1Qualt	   R2matchest   nendt   mySeq2t   myR2t   myQual2t   myR2Qual(    (    sr   /oak/stanford/groups/akundaje/marinovg/programs/SPAdes-3.13.0-Linux/share/spades/spades_pipeline/lucigen_nxmate.pyt   chimera_clean_process_batch¤   sL    	












$!$)4c   !         s²  t  j  ƒ  } t j j |  ƒ } t j j | ƒ d d k rS t j j | ƒ d } n  t j j | ƒ } t j j | ƒ d d k rš t j j | ƒ d } n  t j j | d | ƒ } t | d ƒ }	 t j j | d | ƒ }
 t |
 d ƒ } t j j | d | ƒ } t | d ƒ } t j j | d | ƒ } t | d ƒ } d d d	 d
 d d d d d d d d g ‰  d d d d d d d d d d d d g ‰ t |  ƒ } t | ƒ } t ƒ  } t	 j
 } x‹t rEt t j | t ƒ ƒ } t t j | t ƒ ƒ } t | ƒ t | ƒ k r2t j d t | ƒ t | ƒ f | ƒ n  | s<Pn  t t t | | ƒ ƒ | ƒ } t d  | ƒ ‡  ‡ f d! †  | Dƒ ƒ } g  | D] } | d ^ q‰g  | D] } | d ^ q£} } x@ t | | ƒ D]/ \ } } t |	 | | | g | ƒ | | 7} qÍW| s»| j d" | j t  j d# t  j t  j  ƒ  | ƒ ƒ f ƒ q»q»W| j ƒ  | j ƒ  |	 j ƒ  | j ƒ  | j ƒ  | j ƒ  | j | j | j k r®t j d$ | ƒ n  | j d k rÐt j d% | ƒ n  | s¨d& | j | j } d& | j | j } | j d' ƒ | j d( | j | j | | j | f ƒ | j | j } | j d) | ƒ t  j d# t  j t  j  ƒ  | ƒ ƒ }  | j d* |  ƒ | j d+ t | j  ƒ ƒ n  | | f S(,   Ni   s   .gzi    t   mates_ICC4_t   ws   non-mates_ICC4_s   (TGGACTCCACTGTG){e<=1}s   (ACTTCGCCACTGTG){e<=1}s   (TGAGTCCCACTGTG){e<=1}s   (TGACTGCCACTGTG){e<=1}s   (TCAGGTCCACTGTG){e<=1}s   (ATGTCACCACTGTG){e<=1}s   (GTATGACCACTGTG){e<=1}s   (GTCTACCCACTGTG){e<=1}s   (GTTGGACCACTGTG){e<=1}s   (CGATTCCCACTGTG){e<=1}s   (GGTTACCCACTGTG){e<=1}s   (TCACCTCCACTGTG){e<=1}s   (TCCAGACCAATGTG){e<=1}s   (ACATCACCAATGTG){e<=1}s   (TCACGACCAATGTG){e<=1}s   (TAGCACCCAATGTG){e<=1}s   (AACCTCCCAATGTG){e<=1}s   (ACAACTCCAATGTG){e<=1}s   (GTCTAACCAATGTG){e<=1}s   (TACACGCCAATGTG){e<=1}s   (GAGAACCCAATGTG){e<=1}s   (GAGATTCCAATGTG){e<=1}s   (GACCTACCAATGTG){e<=1}s   (AGACTCCCAATGTG){e<=1}sg   lucigen_nxmate.py, chimera_clean: number of left reads (%d) is not equal to number of right reads (%d)!t   n_jobsc         3   s'   |  ] } t  t ƒ | ˆ  ˆ ƒ Vq d  S(   N(   R   Rc   (   t   .0RI   (   RJ   RK   (    sr   /oak/stanford/groups/akundaje/marinovg/programs/SPAdes-3.13.0-Linux/share/spades/spades_pipeline/lucigen_nxmate.pys	   <genexpr>  s   sB   ==== chimera_clean progress: reads processed: %d, time elapsed: %ss   %H:%M:%SsU   lucigen_nxmate.py, chimera_clean: error in the script somewhere! Unequal read counts!sV   lucigen_nxmate.py, chimera_clean: error in input data! Number of processed reads is 0!g      Y@s-   ==== chimera_clean info: processing finished!sn   ==== chimera_clean info: %d reads processed, %d true mate reads (%.2f %%) and %d non-mates/chimeras (%.2f %%).sB   ==== chimera_clean info: %d mates too short to keep after trimmings)   ==== chimera_clean info: time elapsed: %ss   ==== chimera_clean info: (!   t   timet   ost   patht   basenamet   splitextt   joinR	   R   R;   t   options_storaget   threadsR   t   listt	   itertoolst   islicet   READS_PER_BATCHR   t   supportt   errorR:   R+   R   R2   t   infoR<   t   strftimet   gmtimeR'   R>   R?   R=   t   strR@   (!   t   infilename1t   infilename2t   dstt   logt   silentt	   starttimet	   basename1t	   basename2t   outfilename1t   outfile1t   slagfilename1t	   slagfile1t   outfilename2t   outfile2t   slagfilename2t	   slagfile2t   parserR1t   parserR2t	   all_statsRf   t   reads1t   reads2t   chunkst   outputsR$   t   resultsRL   t   resultt   statt   percentmatest   percentslagt
   shortmatest   elapsedtime(    (   RJ   RK   sr   /oak/stanford/groups/akundaje/marinovg/programs/SPAdes-3.13.0-Linux/share/spades/spades_pipeline/lucigen_nxmate.pyt   chimera_clean×   s‚    				 ;	7





	 %t   JunctionStatsc           B   s   e  Z d  „  Z d „  Z RS(   c         C   sC   d |  _  d |  _ d |  _ d |  _ d |  _ d |  _ d |  _ d  S(   Ni    (   R<   t
   jctcountert   splitcountert   bothjctcountert   R1jctcountert   R2jctcountert   R1R2jctcounter(   R   (    (    sr   /oak/stanford/groups/akundaje/marinovg/programs/SPAdes-3.13.0-Linux/share/spades/spades_pipeline/lucigen_nxmate.pyR   /  s    						c         C   s‚   |  j  | j  7_  |  j | j 7_ |  j | j 7_ |  j | j 7_ |  j | j 7_ |  j | j 7_ |  j | j 7_ |  S(   N(   R<   Rš   R›   Rœ   R   Rž   RŸ   (   R   RA   (    (    sr   /oak/stanford/groups/akundaje/marinovg/programs/SPAdes-3.13.0-Linux/share/spades/spades_pipeline/lucigen_nxmate.pyRC   8  s    (   R(   R)   R   RC   (    (    (    sr   /oak/stanford/groups/akundaje/marinovg/programs/SPAdes-3.13.0-Linux/share/spades/spades_pipeline/lucigen_nxmate.pyR™   .  s   		c         C   s™  t  ƒ  } g  } g  } g  } xh|  D]`\ } } | j d 7_ t j | | d ƒ } t j | | d ƒ }	 | rŠ|	 rŠ| j d 7_ | j ƒ  }
 |
 d } | d } | |  } | d } | |  } |	 j ƒ  } | d } | d } | |  } | d } | |  } t | ƒ t k r‚t | ƒ t k r‚| j d 7_ | j	 d 7_	 | j
 | d | | d | g ƒ | j
 | d | | d | g ƒ q‚q" |	 r]|	 j ƒ  } | d } | d } | |  } | d } | |  } t | ƒ t k r‚| j d 7_ | j
 | d | | d | g ƒ | j
 | d | d | d | d g ƒ | j d 7_ | j d 7_ q‚q" | r0| j ƒ  }
 |
 d } | d } | |  } | d } | |  } t | ƒ t k r‚| j d 7_ | j
 | d | | d | g ƒ | j
 | d | d | d | d g ƒ | j d 7_ | j d 7_ q‚q" | j
 | d | d | d | d g ƒ | j
 | d | d | d | d g ƒ q" W| | | g | f S(   Ni   i    i   i   (   R™   R<   RE   RF   Rœ   RG   R   RH   R›   RŸ   R   Rš   Rž   R   (   RI   t   jctstrRL   t   processed_split1t   processed_split2t   processed_unsplitRQ   RR   RU   R6   t   matchest   startRY   t   myLeftt   myQualt
   myLeftQualt   nmatchest   nstartR_   t   myRight2Ra   t   myRightQual2(    (    sr   /oak/stanford/groups/akundaje/marinovg/programs/SPAdes-3.13.0-Linux/share/spades/spades_pipeline/lucigen_nxmate.pyt   nx_seq_junction_process_batchC  sp    	









$!'




!)




!))-c             s…  t  j  ƒ  } t j j |  ƒ } t j j | ƒ d d k rS t j j | ƒ d } n  t j j | ƒ } t j j | ƒ d d k rš t j j | ƒ d } n  t j j | d | ƒ } t | d ƒ }	 t j j | d | ƒ }
 t |
 d ƒ } t j j | d | j d d	 ƒ ƒ } t | d ƒ } d
 ‰  t |  ƒ } t | ƒ } t	 ƒ  } t
 j } x…t rÕt t j | t ƒ ƒ } t t j | t ƒ ƒ } t | ƒ t | ƒ k rÈt j d t | ƒ t | ƒ f | ƒ n  | sÒPn  t t t | | ƒ ƒ | ƒ } t d | ƒ ‡  f d †  | Dƒ ƒ } g  | D] } | d ^ qg  | D] } | d ^ q6} } x= t | | ƒ D], \ } } t |	 | | g | ƒ | | 7} q`W| sQ| j d | j t  j d t  j t  j  ƒ  | ƒ ƒ f ƒ qQqQW| j ƒ  | j ƒ  |	 j ƒ  | j ƒ  | j ƒ  | j d k r*t j d | ƒ n  | j d k rLt j d | ƒ n  | sdd | j | j } d | j | j } d | j | j } d | j  | j } | j d ƒ | j d | j ƒ | j d | j | f ƒ | j d | j | f ƒ | j d | j | f ƒ | j d | j  | f ƒ t  j d t  j t  j  ƒ  | ƒ ƒ } | j d | ƒ n  | j ƒ  | j ƒ  | |
 | f S(   Ni   s   .gzi    t   R1_IJS7_Re   t   R2_IJS7_t   unsplit_IJS7_t   _R1_t   _R1R2_s"   (GTTCATCGTCAGGCCTGACGATGAAC){e<=4}si   lucigen_nxmate.py, nx_seq_junction: number of left reads (%d) is not equal to number of right reads (%d)!Rf   c         3   s$   |  ] } t  t ƒ | ˆ  ƒ Vq d  S(   N(   R   R­   (   Rg   RI   (   R    (    sr   /oak/stanford/groups/akundaje/marinovg/programs/SPAdes-3.13.0-Linux/share/spades/spades_pipeline/lucigen_nxmate.pys	   <genexpr>­  s   sD   ==== nx_seq_junction progress: reads processed: %d, time elapsed: %ss   %H:%M:%SsX   lucigen_nxmate.py, nx_seq_junction: error in input data! Number of processed reads is 0!sT   lucigen_nxmate.py, nx_seq_junction: error in input data! Number of split pairs is 0!id   s/   ==== nx_seq_junction info: processing finished!s-   ==== nx_seq_junction info: %d reads processedsM   ==== nx_seq_junction info: %d total split pairs (%.2f %% of processed reads))sW   ==== nx_seq_junction info: %d junctions in both R1 and R2 (%.2f %% of split junctions))sX   ==== nx_seq_junction info: %d split junctions are in Read1 (%.2f %% of split junctions))sX   ==== nx_seq_junction info: %d split junctions are in Read2 (%.2f %% of split junctions))s+   ==== nx_seq_junction info: time elapsed: %s(!   Rh   Ri   Rj   Rk   Rl   Rm   R	   t   replaceR   R™   Rn   Ro   R   Rp   Rq   Rr   Rs   R   Rt   Ru   R:   R+   R   R2   Rv   R<   Rw   Rx   R'   R›   RŸ   R   Rž   (    Rz   R{   R|   R}   R~   R   R€   R   t   splitfilenameleftt
   splitfile1t   splitfilenamerightt
   splitfile2t   unsplitfilenamet   unsplitfileRŠ   R‹   RŒ   Rf   R   RŽ   R   R   R$   R‘   RL   R’   R“   t   percentsplitt   percentR1R2t	   percentR1t	   percentR2R—   (    (   R    sr   /oak/stanford/groups/akundaje/marinovg/programs/SPAdes-3.13.0-Linux/share/spades/spades_pipeline/lucigen_nxmate.pyt   nx_seq_junction„  s‚    %				 ;	7




				%

c   	      C   sƒ   | j  d |  d t j j | ƒ d | d ƒ t |  | | | d t ƒ\ } } t | | | | d t ƒ\ } } } | | | f S(   Ns$   == Processing Lucigen NxMate reads (s    and s    (results are in s    directory)R~   (   Rv   Ri   Rj   Rk   R˜   t   FalseR¾   (	   t   left_reads_fpatht   right_reads_fpathR|   R}   t   cleaned_filename1t   cleaned_filename2t   split_filename1t   split_filename2t   unsplit_filename(    (    sr   /oak/stanford/groups/akundaje/marinovg/programs/SPAdes-3.13.0-Linux/share/spades/spades_pipeline/lucigen_nxmate.pyt   process_readsÙ  s
    +!$($   Ri   Rh   Rt   R   Rq   t   syst   siteR    t   spades_initRn   RE   t   ImportErrorRu   t   ext_python_modules_homet   versionR   t   joblib2R   R   t   joblib3t   READS_PER_THREADRo   Rs   RH   t   objectR   R2   R:   R;   Rc   R   R˜   R™   R­   R¾   RÇ   (    (    (    sr   /oak/stanford/groups/akundaje/marinovg/programs/SPAdes-3.13.0-Linux/share/spades/spades_pipeline/lucigen_nxmate.pyt   <module>   s<   H		
	3W	AU