ó
ƒå˜[c           @` s¨   d  d l  m Z m Z m Z m Z d  d l Z d  d l Z d  d l Z d  d l Z d  d l	 Z	 d  d l
 Z
 d  d l Z d  d l Z e j e ƒ Z d d d „  ƒ  YZ d S(   i    (   t   absolute_importt   divisiont   print_functiont   unicode_literalsNt   Trinity_fasta_parserc           B` s)   e  Z d  Z d „  Z d „  Z d „  Z RS(   u¡  
    Parses a Trinity.fasta file and stores the transcript name, sequence, and node path info.

    Instance member:

        trinity_gene_to_isoform_seqs : (defaultdict(list)) stores key,val of transcript_name,path_struct

        where path_struct has structure:
             {
                 'transcript_name' : accession,
                 'path' : path_str,
                 'seq' : sequence
             }
    c         C` sÇ   t  j t ƒ |  _ t | ƒ £ } d } d } xn | D]f } | j ƒ  } | d d k r | d k r | d k r |  j | | ƒ n  | } d } q4 | | 7} q4 W| d k r½ |  j | | ƒ n  Wd  QXd  S(   Nu    i    u   >(   t   collectionst   defaultdictt   listt   trinity_gene_to_isoform_seqst   opent   rstript   add_trinity_seq_entry(   t   selft   trinity_fasta_filenamet   fht   headert   sequencet   line(    (    s„   /oak/stanford/groups/akundaje/marinovg/programs/trinityrnaseq-Trinity-v2.8.4/Analysis/SuperTranscripts/pylib/Trinity_fasta_parser.pyt   __init__"   s    	c   	      C` sğ   t  j d | ƒ } | s0 t d j | ƒ ƒ ‚ n  | j d ƒ } t  j d | ƒ } | so t d j | ƒ ƒ ‚ n  | j d ƒ } t  j d d | ƒ } | | k r· t d j | ƒ ƒ ‚ n  |  j | } i | d	 6| d
 6| d 6} | j | ƒ d S(   uU  
        entry looks like so:
        >TRINITY_DN16_c0_g1_i2 len=266 path=[1:0-48 27:49-49 28:50-50 27:51-51 28:52-52 27:53-53 28:54-54 27:55-55 28:56-56 27:57-57 28:58-58 27:59-59 28:60-60 27:61-61 29:62-265] [-1, 1, 27, 28, 27, 28, 27, 28, 27, 28, 27, 28, 27, 28, 27, 29, -2]
        CTGTTGTGTGGGGGGTGCGCTTGTTTTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTC
        TCAAGTTGATTCCTCCATGTTGCTTTACAGAGACCTGCCAACTACCCAGGAATGTAAAAG
        CATTCATAGTATTTGTCTAGTAGAGATGCTGTATGAAAAATGCCAAAACCAAAAAGAGAA
        AGAAGGAAAGAGAGATAGATAGATGACATAGATGACGGATGGATGGGTGGGTGGGTGGAT
        GGATGGATGGATGGATGGAGGGGGGC
        u   ^>(\S+)u-   Error, cannot parse accession from header: {}i   u   path=\[([^\]]+)\]u5   Error, cannot parse path info from header of line: {}u   _i\d+$u    u<   Error, couldn't remove isoform ID from Trinity accession: {}u   transcript_nameu   pathu   seqN(   t   ret   searcht   RuntimeErrort   formatt   groupt   subR   t   append(	   R   R   R   t   mt	   accessiont   path_strt   gene_idt   isoform_listt
   iso_struct(    (    s„   /oak/stanford/groups/akundaje/marinovg/programs/trinityrnaseq-Trinity-v2.8.4/Analysis/SuperTranscripts/pylib/Trinity_fasta_parser.pyR   ;   s     

c         C` s   |  j  S(   N(   R   (   R   (    (    s„   /oak/stanford/groups/akundaje/marinovg/programs/trinityrnaseq-Trinity-v2.8.4/Analysis/SuperTranscripts/pylib/Trinity_fasta_parser.pyt    get_trinity_gene_to_isoform_info`   s    (   t   __name__t
   __module__t   __doc__R   R   R    (    (    (    s„   /oak/stanford/groups/akundaje/marinovg/programs/trinityrnaseq-Trinity-v2.8.4/Analysis/SuperTranscripts/pylib/Trinity_fasta_parser.pyR      s   		%(    (   t
   __future__R    R   R   R   t   ost   sysR   t   loggingt   argparseR   t   numpyt   timet	   getLoggerR!   t   loggerR   (    (    (    s„   /oak/stanford/groups/akundaje/marinovg/programs/trinityrnaseq-Trinity-v2.8.4/Analysis/SuperTranscripts/pylib/Trinity_fasta_parser.pyt   <module>   s   "$