ó
4+•Qc           @   sü  d  Z  d d l Z d d l Z d d l Z d d l Z d d l Z d d l m Z d d l m	 Z	 d d l
 Z
 d d l m Z d d l Z d d l m Z d d l m Z m Z m Z m Z d d l m Z e j j e ƒ Z e j j e ƒ Z e j j e ƒ d d	 l m Z d d
 l m Z  d d l! m" Z" d d l# m$ Z$ d d l% m& Z& d d l' m( Z( d Z) g  e* e) d ƒ D] Z+ e, e+ d ƒ ^ qwZ- g  e* e) d ƒ D] Z+ e, e+ d ƒ ^ q¤Z. e/ e0 e- e. ƒ ƒ Z1 d „  Z2 d „  Z3 d „  Z4 d „  Z5 d „  Z6 d „  Z7 d „  Z8 d „  Z9 d „  Z: d „  Z; d „  Z< d „  Z= e> d k røyÁe= ƒ  \ Z? Z@ e9 e@ d  ƒ ZA e3 e@ d ƒ ZB eC g  e3 e@ d ƒ D] \ ZD ZE eE ^ q“ƒ ZF e? jG rÄeF d! :ZF n  e3 e@ d ƒ ZB e jH e@ d! d" ƒ ZI e j jJ e@ d# ƒ ZK e j j eK ƒ ZL e j jM eK ƒ ZM eK d$ ZN eH eN d% ƒ ZO e j jJ e@ d& ƒ ZP e2 eP ƒ \ ZQ ZR ZS ZT ZU e@ d' ZV e jW e jX eH eV ƒ d( ƒ ƒ ZY e? jZ e[ eQ ƒ k r½e\ d) ƒ ‚ n  e? j] e? jZ k rÞe\ d* ƒ ‚ n  e? jG re? j^ d! e? jZ k  re j_ d+ IJn  Wn- e` k
 r=Za e j_ ea IJe jb d ƒ n XeK d, Zc e jd d- ec d. e je d/ d% ƒ e jf d0 e? jZ ƒ e jf d1 e? jG ƒ e jf d2 e? j] ƒ e jf d3 e? j^ ƒ e jf d4 e? jg ƒ e jf d5 e? jh ƒ e jf d6 e? ji ƒ e jf d7 e? jj ƒ e jf d8 e? jk ƒ e jf d9 e j jJ e@ d  ƒ ƒ e jf d: e j jJ e@ d ƒ ƒ e jf d; e? jG rud< n d= eF f ƒ e jf d> e j jJ e@ d! ƒ ƒ e jf d? eN ƒ e jf d@ e j jl eP dA ƒ ƒ e jf dB e j jl eP dC ƒ ƒ e jf dD e j jl eP dE ƒ ƒ e jf dF e j jl eP dG ƒ ƒ e jf dH e j jl eP dI ƒ ƒ e jf dJ ƒ e jf dK ƒ e6 e? jj e? jZ eQ eR eS ƒ \ ZQ ZR ZS eV dL Zm e j jn em ƒ rÙe j jo em ƒ d  k rÙe; em ƒ Zp n e< eV ƒ Zp eO ep IJg  eq d eF d ƒ D] Zr es er ƒ ^ qZt e
 ju et ƒ e ƒ  Zv d  Zw d  Zx xÿeB D]÷\ ZD ZE y¨eD GeE GHeD eA k rueA eD Zy n' e j_ dM IeD IJe jz dN eD ƒ wDe[ ey ƒ d k r,ey d  j{ dO ƒ r,eC g  ey D] Z| e} e| j~ dO ƒ ƒ ^ qËƒ Z g  ey D]- Z| e€ dP eE e} e| j~ dO ƒ ƒ e ƒ ^ qöZ n	 eE g Z x¿e0 ey e ƒ D]®\ Z‚ ZE e8 e‚ eY e? eI ƒ Zƒ eƒ d k rxqEn  ev j… e‚ ƒ eE d  k rEe? jG r¦
e& j† eƒ eE e? jZ e? j] e? j^ e? jg e? jh e? ji ƒ Z‡ d  Zr e jf dQ e[ e‡ ƒ d! eƒ jˆ e[ eƒ ƒ eE f ƒ xÏe‡ D]’Z‰ yixF e‰ D]> ZŠ dR eŠ j‹ k rN	e? jZ dS g g eŠ j‹ dR <n  eŠ j‹ dR ZŒ q	WdT e‰ d  j‹ Z g  ZŽ x‚ e* d! ƒ D]t Z e‰ e ZŠ e? jj d  k rÆ	e4 eŠ eQ eR eS eT eU e? jk ƒ ZŠ n e- dU e[ eŠ ƒ eŠ j dV <eŽ j e7 eŠ ev ƒ ƒ qƒ	We: eŽ e‰ d  j‹ dW ƒ ZŽ xO e* d! ƒ D]A Z eŽ e j‘ ƒ  eŽ e j’ ƒ  et ex eŽ e _“ eO eŽ e IJq"
Wex d 7Zx er d 7Zr Wq	e\ k
 rž
Z” e jz e” ƒ q	Xq	Wn2e& j• eƒ eE e? jZ e? j] e? j^ e? jg e? jh e? ji ƒ Z– d  Zr e jf dX e[ e– ƒ eƒ jˆ e[ eƒ ƒ f ƒ xÊ e– D]Â ZŠ y™ e? jj d  k rNe4 eŠ eQ eR eS eT eU e? jk ƒ ZŠ n e- dU e[ eŠ ƒ eŠ j dV <dT eŠ j‹ Z e7 eŠ ev ƒ ZŽ et ex eŽ _“ eO eŽ IJer d 7Zr ex d 7Zx Wqe\ k
 rÓZ” e jz e” ƒ qXqWew er 7Zw dY j— ew ƒ GHqEqEWWqDe` k
 r:Za e jz ea ƒ e jz dZ j— eD ƒ ƒ e j˜ ƒ  qDXqDWe? jG r`d[ j— ew e? jZ ƒ Z™ n d\ j— ew e? jZ ƒ Z™ e jf e™ ƒ He™ GHe& jš rªe} e& j› ƒ e& jš Zœ n d  Zœ d] j— eœ ƒ Z e jf e ƒ e GHeO jž ƒ  e j_ e jŸ d  Id^ IJn  d S(_   s‰  Output a set of simulated RNA-Seq short reads from a set of transcripts.

Positional arguments:
gene_id-copy_numbers.txt
transcripts.fasta
frag_prob_dict.shelve
transcript.coords
output_filename.fastq
probability_files_dir

Created on Oct 8, 2009
@author: ian
Modified to use Stranded_Read_Initiation_Probability_Vectors 2012-12-07

The relative probability of a substitution error at each position in a read was estimated empirically
from counts of correct and incorrect bases in a set of reads mapped to a genome, and the relative probability
of each quality code for correct and incorrect bases at each position was estimated from the same data.
iÿÿÿÿN(   t   GzipFile(   t   defaultdict(   t   OptionParser(   t   bisect(   t   SeqIOt	   SeqRecordt   Seqt   Alphabet(   t   reverse_complement(   t   transcript_coords(   t   SAMText(   t   GFF3Iterator(   t   doRepackageSimulation(   t   generateShortReads(   t+   Stranded_Read_Initiation_Probability_Vectori>   i   i@   i!   c         C   s  t  t j j |  d ƒ ƒ } t  t j j |  d ƒ ƒ } t  t j j |  d ƒ ƒ } t  t j j |  d ƒ ƒ } t  t j j |  d ƒ ƒ } g  } x1 | D]) } | j t | j ƒ  j ƒ  d ƒ ƒ q” W| j ƒ  g  } x1 | D]) } | j t | j ƒ  j ƒ  d ƒ ƒ qØ W| j ƒ  g  }	 x1 | D]) } |	 j t | j ƒ  j ƒ  d ƒ ƒ qW| j ƒ  g  }
 xG | D]? } g  | j ƒ  j ƒ  D] } t | ƒ ^ qy} |
 j | ƒ q`W| j ƒ  g  } xG | D]? } g  | j ƒ  j ƒ  D] } t | ƒ ^ qÓ} | j | ƒ qºW| j ƒ  | | |	 |
 | f S(   Ns   error_probabilities.txts   BBB.error_probabilities.txts   BBB.init_probabilities.txts5   Correct_reads.quality_scores.cumulative_frequency.txts7   Incorrect_reads.quality_scores.cumulative_frequency.txtiÿÿÿÿ(	   t   opent   ost   patht   joint   appendt   floatt   stript   splitt   close(   t   probability_files_dirt   error_prob_filet   BBB_error_prob_filet   BBB_init_prob_filet   correct_qual_freq_filet   incorrect_qual_freq_filet
   error_probt   linet   BBB_error_probt   BBB_init_probt   correct_qual_freqt   ft   freqst   incorrect_qual_freq(    (    s>   /home/ian/PycharmProjects/RNA-Seq-Simulator/simulateRNA_Seq.pyt   load_probability_tables6   s<    '
'
'
+
+
c         c   sv   t  |  ƒ } xc | D][ } | j ƒ  j d ƒ } y) | d } t | d ƒ } | | f VWq t k
 rm q q Xq Wd  S(   Ns   	i    i   (   R   R   R   t   intt	   Exception(   t   filenamet   linesR   t   fieldst   gene_idt   copies(    (    s>   /home/ian/PycharmProjects/RNA-Seq-Simulator/simulateRNA_Seq.pyt   gene_copy_iterator\   s    
c         C   s  t  t |  j ƒ ƒ } d } t }	 x!t t | ƒ ƒ D]}
 |	 pS t j ƒ  | |
 k }	 |	 rf | |
 n | |
 } t j ƒ  | k rt j ƒ  | k r§ d | |
 <d } q3t j d ƒ } x# | | |
 k rÛ t j d ƒ } q¹ W| | |
 <|	 rò d n t | |
 t j ƒ  ƒ } n% |	 rd n t | |
 t j ƒ  ƒ } | t	 | 7} q4 Wt
 j
 t j d j | ƒ t j ƒ |  j ƒ } | | j d <|  j | _ | S(   Nt    t   Ni   t   ACGTt   phred_quality(   t   listt   strt   seqt   Falset   ranget   lent   randomt   choiceR   t   illumina_quality_codeR   R   R   R   t   generic_dnat   idt   letter_annotationst   annotations(   t   sequence_recordR   R    R!   R"   R%   t   N_ratet   new_sequencet   qualst   bbbt   it   substitution_probabilityt   qualt   new_baset
   new_record(    (    s>   /home/ian/PycharmProjects/RNA-Seq-Simulator/simulateRNA_Seq.pyt   introduceSubstitutionsh   s*    
	
(%-c         C   sr   d } d } xR t  |  ƒ D]D } | | | d | 7} d | | | | | | } | | 7} q W| t j :} | S(   Ni    i   (   R7   t   optionst   read_length(   RL   t   error_prob_vectort   BBB_error_prob_vectort   BBB_init_prob_vectort   err_ratet
   bbb_rate_iRE   t
   err_rate_i(    (    s>   /home/ian/PycharmProjects/RNA-Seq-Simulator/simulateRNA_Seq.pyt    estimate_substitution_error_rate…   s    c   	      C   s   t  | | | | ƒ } |  | } t j d | ƒ g  | D] } | | ^ q7 } g  | D] } | | ^ qT } t  | | | | ƒ } t j d | ƒ | | | f S(   Ns#   Error rate adjustment factor = %.3fs   Adjusted error rate  = %.3f(   RS   t   loggingt   info(	   t   target_rateRL   RM   RN   RO   RP   t   adjustment_factort   et   adjusted_err_rate(    (    s>   /home/ian/PycharmProjects/RNA-Seq-Simulator/simulateRNA_Seq.pyt   adjust_error_rate   s    	
	c         C   sb  |  j  d } t |  j ƒ } t |  j  d ƒ } t |  j  d ƒ } |  j  d } t | j d ƒ ƒ d k r… d | j d ƒ d } n | } | j | | ƒ } | j | | ƒ }	 |	 d | d k  rÉ d	 n d
 }
 | d | _ t | d |	 d ƒ | _	 t
 |  j ƒ | _ d j g  |  j d D] } t | ^ qƒ | _ | j | ƒ } | j } |
 d	 k r”t | j ƒ | _ | j d  d  d … | _ | j ƒ  n  t | d |	 d ƒ t |  ƒ k r"| j | | | ƒ } | r"xå| D]Ý\ } } t t | j | ƒ | j | ƒ ƒ | j | j	 ƒ ƒ d } | d	 k r:| d 8} n  t | | ƒ d } t |  ƒ | } d } x_ t t | ƒ ƒ D]K } | | d d k rw| | | d | k r­Pn  | | | d 7} qwqwW| | } | | | d | } | d k  sþ| d k  r(t d t
 |  ƒ d t
 | j ƒ ƒ ‚ n  | | d } | | | d <| d t | ƒ k  r‘| j | d | | g ƒ | j | d | d g ƒ qÚ| j | d g ƒ | j | | g ƒ qÚW| | k rÍd
 n d	 } | j j d | ƒ g  | D] } | d d k rî| ^ qî} d } xr | t | ƒ d k  rŠ| | d | | d d k r}| | d c | | d d 7<| | d =q| d 7} qWd } x{ | t | ƒ d k  r| | d d k r| d k sà| t | ƒ d k r| j	 | | d 7_	 | | =q”| d 7} q”W| |  j  d <q"n  d j g  |  j  d D] \ } } d | | f ^ q6ƒ | _ | S(   Nt
   cigar_listt   startt   endt   transcript_idt   |i   t   mRNAi   t   -t   +i    R/   R2   iÿÿÿÿt   MIs&   Negative segment length in SAM cigar: t    R0   s   XS:A:%st   Ms   %d%s(   R?   t   SAMReadR=   R'   R8   R   t   get_genome_coordt   rnamet   mint   posR4   R5   R   R>   t   illumina2phred_quality_codeRG   t   get_scaffold_coordst   strandR   t   set_reverse_strand_flagt   abst   get_donor_acceptor_pairst   get_spliced_coordR7   t
   ValueErrort   _dictt   insertR   t   tagst   cigar(   t   readt   coordsR[   t   samR\   R]   R,   R^   t   genome_startt
   genome_endRm   t   qt   trans_coordst   transcript_strandt   intronst   donort   acceptort   ex1t   gapt   ex2t   cursorRE   t   new_mt   new_m2t   lettert
   intron_dirt   st   filtered_cigart   length(    (    s>   /home/ian/PycharmProjects/RNA-Seq-Simulator/simulateRNA_Seq.pyt   generateSamž   s„     0	$)
*)  6
<c         C   s‚  |  j  | ƒ } | j j ƒ  | _ |  j ƒ  | _ |  j ƒ  | _ |  j ƒ  | j d <t	 | ƒ } | | j
 k  r± t j d | j | | j
 f IJt j d | j | | j
 f ƒ d	 S| j | k rÐ | | j } n; t j d I| j IJt j |  IJt j d | j ƒ d	 St | j ƒ d k  r8t j d | j ƒ d	 St	 | ƒ | k  r•| j j d g | t	 | j ƒ ƒ | j j d g | t	 | j ƒ ƒ n  yƒ |  j ƒ  d k rï| j d	 d	 d
 … |  | j d <| j d	 d	 d
 … |  | j d <n( | j |  | j d <| j |  | j d <Wnc t k
 r}} t j | IJt j | j Id I| Id It	 | ƒ IJt j d | j ƒ d	 SX| S(   s˜   Create a Bio.SeqRecord corresponding to a transcript isoform and annotate it with info
    needed for simulated read generation

    :param transcript:
    :type transcript: GFF3mRNA
    :param genome_index:
    :type genome_index: Bio.SeqIO.dict
    :param options:
    :type options: Namespace
    :param origin_prob_dict:
    :type origin_prob_dict: shelve
    :return:
    :rtype: SeqRecord or None
    Rm   s   %s is too short: %d < %ds&   No origin probability vector found fors'   No origin probability vector found for gš™™™™™©?s   Zero origin probability for i    Ra   Niÿÿÿÿt   origin_probability_plust   origin_probability_minuss   transcript length:s   origin_rate_vector length:s&   origin_rate_vector length error for %s(   t   get_transcript_sequenceR5   t   uppert   get_IDR=   t   get_namet   namet
   get_strandR?   R8   t
   size_uppert   syst   stderrRT   t   warnt   Nonet   errort   sumt   plust   extendt   minusR>   t	   TypeError(   t
   transcriptt   genome_indexRK   t   origin_prob_dictt   transcript_rect   seq_lent   origin_rate_vectort   te(    (    s>   /home/ian/PycharmProjects/RNA-Seq-Simulator/simulateRNA_Seq.pyt   get_transcript_recordç   sF      $'!$%	c         C   sg   t  t ƒ } xT t t |  ƒ ƒ j ƒ  D]: } | j ƒ  } x% | j ƒ  D] } | | j | ƒ qD Wq% W| S(   sÜ   Make lists of isoforms for each gene

    :param gene_filename: Name of a .gff3 file describing genes, transcripts, and exons
    :type gene_filename: String
    :return: transcript_map
    :rtype: defaultdict(list)
    (   R   R3   R   R   t   genesR“   t   get_transcriptsR   (   t   gene_filenamet   transcript_mapt   genet	   gene_nameR¡   (    (    s>   /home/ian/PycharmProjects/RNA-Seq-Simulator/simulateRNA_Seq.pyt   load_transcripts!  s    c         C   sò   |  d j  ƒ  r! |  d j ƒ  n |  d j ƒ  |  d j |  d _ |  d j |  d _ |  d j |  d _ |  d j |  d _ |  d j ƒ  |  d j ƒ  |  d j |  d j k  rÓ | |  d _ | |  d _ n | |  d _ | |  d _ |  S(   Ni    i   (	   t   is_reversedt   set_mate_reverse_strand_flagRh   t   mrnmRj   t   mpost   set_first_of_pair_flagt   set_second_of_pair_flagt   isize(   t   sam_pairt   insert_size(    (    s>   /home/ian/PycharmProjects/RNA-Seq-Simulator/simulateRNA_Seq.pyt   add_sam_pair_info1  s    c         C   s}   g  } t  |  ƒ } xQ | D]I } | j ƒ  j ƒ  } | d } | d } d | | f } | j | ƒ q W| j ƒ  d j | ƒ S(   Ni    i   s   @SQ	SN:%s	LN:%ss   
(   R   R   R   R   R   R   (   t   fai_filenamet   header_linest   faiR   R+   t   ref_namet
   ref_lengtht   hdr_line(    (    s>   /home/ian/PycharmProjects/RNA-Seq-Simulator/simulateRNA_Seq.pyt   generate_SAM_header_from_faiE  s    


c         C   so   g  } t  j |  d ƒ } xG t | j ƒ  ƒ D]3 } t | | ƒ } d | | f } | j | ƒ q+ Wd j | ƒ S(   Nt   fastas   @SQ	SN:%s	LN:%ss   
(   R   t   indext   sortedt   keysR8   R   R   (   t   fasta_filenameR»   R¢   R½   R¾   R¿   (    (    s>   /home/ian/PycharmProjects/RNA-Seq-Simulator/simulateRNA_Seq.pyt   generate_SAM_header_from_fastaR  s    c          C   sá  d }  t  d |  ƒ } | j d d d d d d d	 d
 d d ƒ| j d d d d d d d	 t d d ƒ| j d d d d d d ƒ| j d d d d d d	 d d d ƒ| j d d d d d d	 d d d ƒ| j d d d d d d	 d  d d! ƒ| j d" d d# d d d	 d$ d d% ƒ| j d& d' d d( d d) d	 d* d d+ ƒ| j d, d- d d. d d) d	 d* d d/ ƒ| j d0 d1 d d2 d d) d	 d* d d3 ƒ| j ƒ  \ } } t | ƒ d4 k r×| j d5 ƒ | j ƒ  t d5 ƒ ‚ n  | | f S(6   Ns“   Usage: %prog  [options] gene_models.gff3 gene_id-copy_numbers.txt origin_prob_dict.shelf output_filename_base probability_files_directory genome.fat   usages   -ls   --lengtht   destRL   t   typeR'   t   defaulti&   t   helps#   length of a read; default: %defaults   -1s   --singlet   pairst   actiont   store_falses'   single or paired reads; default: paireds   -2s   --pairt
   store_trues	   --minsizet   size_mini–   s.   lower bound for size filter; default: %defaults	   --lowsizet
   size_loweri¯   s:   lower end of pass range for size filter; default: %defaults
   --highsizeR–   iú   s:   upper end of pass range for size filter; default: %defaults	   --maxsizet   size_maxi,  s.   upper bound for size filter; default: %defaults   -is   --indelratet
   indel_rateR   i    s@   probability of an indel error at any position; default: %defaults   -ss	   --subratet
   subst_ratesF   probability of a substitution error at any position; default: %defaults   -Ns   --NrateRA   sF   probability that a substitution will introduce an N; default: %defaulti   s   Wrong number of arguments(   R   t
   add_optiont   Truet
   parse_argsR8   R›   t
   print_helpt   RuntimeError(   RÇ   t   parserRK   t   args(    (    s>   /home/ian/PycharmProjects/RNA-Seq-Simulator/simulateRNA_Seq.pyt   get_command_line\  s6    !!!!!
t   __main__i    i   t   ri   s   .true_mappings.samt   wi   i   RÁ   s:   Error probability vector must be at least as long as readss5   Read length must be less than minimum fragment lengths”   
            WARNING: Read length is too close to target fragment length!
             Some read pairs will have negative insert size.
             s   _simulateRNA-Seq.logR)   t   levelt   filemodes   read_length = %ds
   pairs = %ss   size_min = %ds   size_lower = %ds   size_upper = %ds   size_max = %ds   indel_rate = %fs   subst_rate = %fs   N_rate = %fs   transcripts file = %ss!   transcript copy numbers file = %ss!   predicted total number of %s = %ds
   read pairst   readss   origin probability file = %ss   output file = %ss   error probability file = %ss   error_probabilities.txts   BBB error probability file = %ss   BBB.error_probabilities.txts0   BBB conditional transition probability file = %ss   BBB.init_probabilities.txts   Correct qualities file = %ss5   Correct_reads.quality_scores.cumulative_frequency.txts   Incorrect qualities file = %ss7   incorrect_reads.quality_scores.cumulative_frequency.txts   
s   	Reads	ID	Lengths   .fais   No transcript found fors   No transcript found for t   depthg      à?s   	%d	%s	%d	%dR[   Re   s0   %(transcript_id)s[%(strand)s]:%(start)d..%(end)di&   R2   R¶   s	   	%d	%s	%ds   Total: {:n}s   in transcript {}s.   Generated {0:n} pairs of reads of length {1:d}s%   Generated {0:n} reads of length {1:d}s"   Size filter accept rate was {:.2%}s   done. (    t   __doc__R—   RT   R   t	   tracebackt   tempfilet   gzipR    t   collectionsR   R9   t   optparseR   t   shelveR   t   BioR   R   R   R   t   Bio.SeqR   R   t   dirnamet   __file__t   this_dirt   srcR   t   lib.transcript_coordsR	   t   lib.samTextR
   Rf   t   lib.gff3IteratorR   t   lib.repackageSimulationR   t   libR   t(   calcTranscriptFragmentationProbabilitiesR   t   MAX_ILLUMINA_QUALITYR7   R|   t   chrR;   t   phred_quality_codet   dictt   zipRk   R&   R.   RJ   RS   RZ   R   R¨   R¯   R¹   RÀ   RÆ   RÜ   t   __name__RK   RÛ   R¬   t   gene_copiesRœ   R,   t   copy_numbert   total_copiesRÌ   R   R£   t   abspatht   output_filename_baset   working_dirt   basenamet   output_file_namet   outputR   R   R    R!   R"   R%   t
   genome_seqt   to_dictt   parseR¢   RL   R8   Rr   RÐ   RÑ   R˜   R(   RX   t   exitt   LOG_FILENAMEt   basicConfigt   DEBUGRU   R–   RÒ   RÓ   RÔ   RA   R   t   fai_patht   isfilet   getsizet   headert   xrangeRE   R4   t   codest   shuffleRx   t   totalt
   code_indext   isoform_listR›   t   has_attributet   isoformR   t   get_attributet   total_heightR'   t   isoform_copiesR¡   R¤   Rš   t   addt   generatePairedReadSett   readpairsetR=   t   readpairRw   R?   R[   t   read_idRy   t   jR>   t   set_paired_flagt   set_proper_pair_flagt   qnamet   vet   generateUnpairedReadSett   readsett   formatt	   print_exct   summaryt
   candidatest   acceptst   accept_ratet
   accept_pctR   t   argv(    (    (    s>   /home/ian/PycharmProjects/RNA-Seq-Simulator/simulateRNA_Seq.pyt   <module>   s  "--	&					I	:				
	/	

!
!!&!  
*
,		%.:			'

)


		
