ó
ƒå˜[c           @` s  d  d l  m Z m Z m Z m Z d  d l Z d  d l Z d  d l Z d  d l Z d  d l	 Z	 d  d l
 Z
 d  d l Z d  d l Z d  d l Z d  d l Z d  d l Z d  d l Z d  d l m Z d  d l Z d  d l Z e j e ƒ Z e j e j ƒ  ƒ d d d „  ƒ  YZ d S(   i    (   t   absolute_importt   divisiont   print_functiont   unicode_literalsN(   t   GraphCycleExceptiont   Gene_splice_modelerc           B` sƒ   e  Z d  Z d „  Z d „  Z d „  Z d „  Z d „  Z d „  Z e	 d „  ƒ Z
 e	 d „  ƒ Z e	 d	 „  ƒ Z e	 d
 d „ ƒ Z RS(   uŒ   
    Builds supertranscipts.

    object instance members:

        gene_id : str

        alignments : list of Node_alignment objects

    c         C` sn   | |  _  t ƒ  |  _ t j d j | ƒ ƒ x< | D]4 } | j ƒ  } t j j | ƒ } |  j j	 | ƒ q2 Wd S(   u?  
        initialize alignments list with simple single 'alignment' objects with
        each path as an individual alignment with just its path nodes.

        params:

        gene_id : str

        node_path_obj_list : list of Node_path objects, each Node_path corresponding to an individual Trinity isoform

        u   Gene_splice_modeler inputs: {}N(
   t   gene_idt   listt
   alignmentst   loggert   debugt   formatt   get_transcript_namet   Node_alignmentt   get_single_seq_node_alignmentt   append(   t   selfR   t   node_path_obj_listt   node_path_objt   transcript_namet   alignment_obj(    (    sƒ   /oak/stanford/groups/akundaje/marinovg/programs/trinityrnaseq-Trinity-v2.8.4/Analysis/SuperTranscripts/pylib/Gene_splice_modeler.pyt   __init__&   s    	c         C` s   |  j  S(   N(   R   (   R   (    (    sƒ   /oak/stanford/groups/akundaje/marinovg/programs/trinityrnaseq-Trinity-v2.8.4/Analysis/SuperTranscripts/pylib/Gene_splice_modeler.pyt   get_gene_idA   s    c         C` sF   |  j  ƒ  s8 y |  j ƒ  SWqB t k
 r4 |  j ƒ  SXn
 |  j ƒ  Sd S(   u'  
        method to construct the super transcript.

        Tries 2 approaches:
            a.  If there isn't an obvious repetitive node structure and so the graph formas a DAG,
                we build a splice graph and perform topological sorting of the nodes.
            b.  If there is some repetitive structure, we resort to performing a multiple alignment-based method to
                organize relationships among nodes in isoforms, and the multiple alignment produces the linear ordering
                for the supertranscript.

        N(   t   alignment_contains_repeat_nodet   topological_order_splice_modelR   t   multiple_alignment_splice_model(   R   (    (    sƒ   /oak/stanford/groups/akundaje/marinovg/programs/trinityrnaseq-Trinity-v2.8.4/Analysis/SuperTranscripts/pylib/Gene_splice_modeler.pyt   build_splice_modelE   s    c         C` sy   xr |  j  D]g } t ƒ  } xU t d | j ƒ  ƒ D]> } | j | ƒ } | j ƒ  } | | k r` t S| j | ƒ q/ Wq
 Wt S(   Ni    (	   R   t   sett   ranget   widtht   get_representative_column_nodet
   get_loc_idt   Truet   addt   False(   R   t	   alignmentt   loc_idst   it   node_objt   loc_id(    (    sƒ   /oak/stanford/groups/akundaje/marinovg/programs/trinityrnaseq-Trinity-v2.8.4/Analysis/SuperTranscripts/pylib/Gene_splice_modeler.pyR   `   s    	c         C` sé  t  j d ƒ |  j ƒ  } t j | ƒ } xg|  j D]\} t  j d t | ƒ ƒ | j ƒ  d } | j ƒ  d } t  j d t | ƒ ƒ xt d t	 | ƒ ƒ D]î } | | } | j
 ƒ  } | j | | | j ƒ  ƒ }	 t  j d t |	 ƒ ƒ | d k r5| | d }
 | j | |
 j
 ƒ  |
 j ƒ  ƒ } |	 j | ƒ n  | t	 | ƒ d k  rœ | | d } | j | | j
 ƒ  | j ƒ  ƒ } |	 j | ƒ qœ qœ Wq2 Wt  j d t | ƒ ƒ t j j | j ƒ  ƒ } t  j d t | ƒ ƒ t ƒ  } x7 t d t	 | ƒ ƒ D]  } | | j
 ƒ  } | | | <q÷Wt ƒ  } t ƒ  } x‰ |  j D]~ } | j | j ƒ  d ƒ g  | D] } d
 ^ q[} x8 | j ƒ  d D]& } | j
 ƒ  } | | } | | | <q~W| j | ƒ q7Wt j | | | ƒ } t  j d	 t | ƒ ƒ | S(   uW   
        Build supertranscript using simpler topological sorting of the nodes.
        u    	using topological sort method.
u1   topological_order_splice_model, input alignment: i    u+   topological_order_splice_model, node list: u   generic node: i   u   Before sorting nodes: u   Topologically sorted nodes: u   Splice graph model: N(   R	   R
   R   t   TGraphR   t   strt   get_aligned_nodest   get_transcript_namesR   t   lenR   t   get_nodet   get_seqt   add_prev_nodet   add_next_nodet   Topological_sortt   topologically_sortt   get_all_nodest   dictR   R   t   NoneR   (   R   R   t   graphR#   t	   node_listR   R%   R&   R'   t   generic_nodet   prev_node_objt   prev_generic_nodet   next_node_objt   next_generic_nodet   topologically_sorted_nodest   aligned_loc_id_post   new_alignmentst   transcript_idst   new_alignmentt   nodet   new_idxt   splice_graph_model(    (    sƒ   /oak/stanford/groups/akundaje/marinovg/programs/trinityrnaseq-Trinity-v2.8.4/Analysis/SuperTranscripts/pylib/Gene_splice_modeler.pyR   n   sR    
!!			
c         C` sØ  t  j d ƒ |  j } t | ƒ d k r0 | d St j |  j ƒ } t  j d t | ƒ ƒ xDt | ƒ d k rŸx+ t d t | ƒ ƒ D] } d | | | <q„ Wt t	 j
 | ƒ ƒ } t | ƒ } t | | ƒ } | | } | | } | | }	 t j | |	 ƒ }
 t ƒ  } xC t d t | ƒ ƒ D], } | | | f k r| j | | ƒ qqW| j |
 ƒ | } t  j d t | ƒ ƒ t j | ƒ } t  j d t | ƒ ƒ q\ Wt | ƒ d k rÐt d j t | ƒ ƒ ƒ ‚ n  | d S(   uî   
        Multiple alignment algorithm for dealing with repeat nodes:
        For each best matching pair of transcripts (or aligned transcripts),
        perform alignment, and replace aligned pair with a single alignment object.
        u   	using mult alignment method.
i   i    u   Similarity matrix:
iÿÿÿÿu   
Updated alignments:
uH   Error, should only have one alignment but have {} alignments after merge(   R	   R
   R   R,   R   t   compute_similarity_matrixR)   R   t   intt   numpyt   argmaxt   merge_alignmentsR   R   t   RuntimeErrorR   (   R   R   t   similarity_matrixR%   t   best_pair_idxt   num_alignmentst   best_pair_idx_1t   best_pair_idx_2t   align_at   align_bt   align_mergedt   new_alignment_list(    (    sƒ   /oak/stanford/groups/akundaje/marinovg/programs/trinityrnaseq-Trinity-v2.8.4/Analysis/SuperTranscripts/pylib/Gene_splice_modeler.pyR   ¬   s8    	


	c   	      C` sª   t  |  ƒ } t j | | f d d ƒ} x| t d | d ƒ D]g } |  | } xT t | d | ƒ D]? } |  | } t j j | | ƒ } t  | ƒ } | | | | <q_ Wq; W| S(   uc   
        similarity matrix indicates number of shared nodes between each pair of isoforms.
        t   dtypeu   int_i    i   (   R,   RG   t   zerosR   R   t   compute_number_common_nodes(	   t   alignments_listRM   t
   sim_matrixR%   t   align_it   jt   align_jt   common_nodest   num_common_nodes(    (    sƒ   /oak/stanford/groups/akundaje/marinovg/programs/trinityrnaseq-Trinity-v2.8.4/Analysis/SuperTranscripts/pylib/Gene_splice_modeler.pyRE   ç   s    

c         C` s  t  j d j |  | ƒ ƒ t |  j ƒ  ƒ } t | j ƒ  ƒ } t j | | ƒ sj t d j | | ƒ ƒ ‚ n  |  j ƒ  } | j ƒ  } t j j	 | | ƒ } x- t
 d | d ƒ D] } d | | d d <q« Wx- t
 d | d ƒ D] } d | d | d <qÛ WxEt
 d | d ƒ D]0} x't
 d | d ƒ D]} t j |  | d | | d ƒ }	 | | d | d d |	 }
 | | | d d } | | d | d } |	 d k rà|
 | k rà|
 | k rà|
 | | | d <d	 | | | d <q%| | k r| | | | d <d | | | d <q%| | | | d <d | | | d <q%WqW| } | } | } | } t ƒ  } xU| d k s{| d k r·| | | } |  j | d ƒ } | j | d ƒ } t ƒ  } | d } | d	 k rï| d 8} | d 8} | | } n¸ | d k r?| d 8} | | 7} x• t
 d t | ƒ ƒ D] } | j d ƒ q%Wnh | d k r| d 8} x* t
 d t | ƒ ƒ D] } | j d ƒ qkW| | 7} n t d
 j | | ƒ ƒ ‚ | j | ƒ qcW| j ƒ  t  j d t | ƒ ƒ |  j ƒ  | j ƒ  } t ƒ  } xU t
 d t | ƒ ƒ D]> } t ƒ  } x | D] } | j | | ƒ q$W| j | ƒ qWt  j d t | ƒ ƒ t j |  j ƒ  | | ƒ } t  j d t | ƒ ƒ | S(   uò   
        Computes a mismatch-free multiple alignment (just matches and gaps) between two Node_alignment objects

        returns single Node_alignment object containing the contents of aligned align_a and align_b as aligned.
        
        u   Merging alignments {} and {}uE   Error, transcripts in alignments to merge are not disjoint: {} and {}i   u   DEL_Bi    u   btu   DEL_Au   scoreu   DIAGu   bt: ({},{}), bt_dir not definedu   Merged alignment nodes list: u   merged alignment node matrix:
u   merged alignment obj:
N(   R	   R
   R   R   R+   t
   isdisjointRJ   R   t	   DP_matrixt   build_DP_matrixR   R   t   get_match_scoreR   t   get_node_LIST_at_column_posR,   R   R5   t   reverseR)   R   R   (   RP   RQ   t   transcript_names_align_At   transcript_names_align_Bt   width_at   width_bt	   dp_matrixR%   RZ   t   score_cell_matcht
   score_diagt   score_del_at   score_del_bt   max_it   max_jt   all_merged_alignment_nodes_listt   score_structt   nodes_align_at   nodes_align_bt   align_nodest   bt_dirt   xt   merged_transcript_name_listt   node_obj_matrixt   rowt   node_obj_listt   merged_alignment_obj(    (    sƒ   /oak/stanford/groups/akundaje/marinovg/programs/trinityrnaseq-Trinity-v2.8.4/Analysis/SuperTranscripts/pylib/Gene_splice_modeler.pyRI   ÿ   s„    	 $		






		c         C` s`   |  j  | ƒ } | j  | ƒ } t j j | ƒ } t j j | ƒ } t j | | ƒ rX d Sd Sd S(   u]   
        just determines if indices in two transcripts have the same node identifier
        i   i    N(   t   get_node_set_at_column_posR   t   get_node_loc_idsR   t   intersection(   RP   t   idx_aRQ   t   idx_bt
   node_set_at
   node_set_b(    (    sƒ   /oak/stanford/groups/akundaje/marinovg/programs/trinityrnaseq-Trinity-v2.8.4/Analysis/SuperTranscripts/pylib/Gene_splice_modeler.pyRa   •  s    id   c   
      C` s»   t  | j ƒ  ƒ } t | | d ƒ } d } d } xi | | k  r xB | D]: } | | | t | | | ƒ !}	 | | d |	 d 7} qH W| d 7} | | 7} q5 W| j d j |  | ƒ ƒ d S(   uW   
        writes the multiply aligned isoform sequences to an output filehandle
        i    u    u   	u   
u
   // {}

{}
N(   R   t   keysR,   t   mint   writeR   (
   t	   gene_namet   malign_dictt   ofht   align_widtht   transcript_namest   alignment_lengtht   align_startt
   align_textR   t   align_region(    (    sƒ   /oak/stanford/groups/akundaje/marinovg/programs/trinityrnaseq-Trinity-v2.8.4/Analysis/SuperTranscripts/pylib/Gene_splice_modeler.pyt   write_malign§  s    
(   t   __name__t
   __module__t   __doc__R   R   R   R   R   R   t   staticmethodRE   RI   Ra   RŽ   (    (    (    sƒ   /oak/stanford/groups/akundaje/marinovg/programs/trinityrnaseq-Trinity-v2.8.4/Analysis/SuperTranscripts/pylib/Gene_splice_modeler.pyR      s   					>	;–(    (   t
   __future__R    R   R   R   t   ost   syst   ret   loggingt   argparset   collectionsRG   t   timeR(   t   TNodet	   Node_pathR   R   R1   R_   t	   getLoggerR   R	   t
   addHandlert   NullHandlerR   (    (    (    sƒ   /oak/stanford/groups/akundaje/marinovg/programs/trinityrnaseq-Trinity-v2.8.4/Analysis/SuperTranscripts/pylib/Gene_splice_modeler.pyt   <module>   s    "$