ó
ÀênQc           @   sé   d  Z  d d l Z d d l Z d d l Z d d l m Z d d l m Z d d l m	 Z	 d d l
 Z
 d d l m Z d a d a d „  Z d	 „  Z d
 „  Z d d d „ Z d „  Z d „  Z e d d „ Z d d „ Z d d „ Z d S(   s¡    From a known transcript, simulate cleavage, size filtering of fragments, and generation of short reads from fragment ends.

Created on Oct 8, 2009
@author: ian
iÿÿÿÿN(   t   Alphabet(   t   Seq(   t	   SeqRecord(   t   bisect_lefti    c         C   sç   |  j  j ƒ  } d |  j k r3 |  j j d d ƒ n |  j j d d ƒ } t | | ƒ } |  j j ƒ  | _ |  j d | j d <|  j d | j d <|  j d d k rª d n d | j d <d | j k rã t | j d ƒ | j d <n  | S(   sÏ   
    Custom version of reverse-complement that adjusts special annotations as well as sequence
    :param sequence_record:
    :type sequence_record: Bio.SeqRecord
    :return:
    :rtype: Bio.SeqRecord
    s   [+]t   +t   -t   endt   startt   strandt
   cigar_list(   t   seqt   reverse_complementt   idt   replaceR   t   annotationst   copyt   reversed(   t   sequence_recordt   rc_seqt   rc_idt   rc(    (    s\   /woldlab/castor/data00/home/georgi/programs/RNA-Seq-Simulator.v1.0/lib/generateShortReads.pyR      s    9&c         C   s˜   | |  k o | k n r” | |  k o3 | k n r< t  S|  | k  rc t | |  ƒ | | } n t | |  ƒ | | } t j ƒ  | k r” t  Sn  t S(   s1  Return True or False for acceptance of a fragment with given length.

    Accepts lengths between lower and upper with probability 1. The probability of acceptance decreases
    linearly from lower toward minimum and from upper toward maximum. Lengths <= minimum or >= maximum
    are never accepted.
    (   t   Truet   floatt   randomt   False(   t   lengtht   minimumt   lowert   uppert   maximumt   fraction(    (    s\   /woldlab/castor/data00/home/georgi/programs/RNA-Seq-Simulator.v1.0/lib/generateShortReads.pyt   fuzzySizeFilter(   s    c         C   sw  | d k r, t  |  ƒ d g g |  j d <|  Sg  } d } d } xÇ t |  j ƒ D]¶ } | d k rí t j ƒ  | k rí | j | d g ƒ d } t j ƒ  d k  r× | j | ƒ | d 7} | j | ƒ | j d d g ƒ q| j d d g ƒ qN | j | ƒ | d 7} qN W| d k r*| j | d g ƒ n  t t j d j | ƒ t	 j
 ƒ |  j ƒ } |  j j ƒ  | _ | | j d <| S(	   sÌ   Randomly double or delete nucleotides in a DNA sequence.

    sequence_record is a SeqRecord.
    indel_rate is the probability of insertion or deletion at each position.
    Returns a new SeqRecord.
    i    t   MR	   t    g      à?i   t   It   D(   t   lenR   t   strR
   R   t   appendR   R   t   joinR    t   generic_dnaR   R   (   R   t
   indel_ratet   new_sequencet   mR	   t   ct
   new_record(    (    s\   /woldlab/castor/data00/home/georgi/programs/RNA-Seq-Simulator.v1.0/lib/generateShortReads.pyt   introduceIndels;   s0    
*i€„ c         C   so  | j  d } | j  d }	 g  }
 g  } d } x% | D] } | | 7} |
 j | ƒ q3 W|
 d } g  |
 D] } | | ^ qe }
 d } x% |	 D] } | | 7} | j | ƒ qˆ W| d } g  | D] } | | ^ qº } t | j ƒ } i | j d 6d d 6} d } g  } x5t | ƒ |  k  r9| | k  r9t |
 t j ƒ  ƒ } x9 | t | ƒ | k rst |
 t j ƒ  ƒ } | d 7} q;W| | | } | t | | d t | ƒ d ƒ | } | d	 k  rÇ| d 7} qn  t | | | t j ƒ  ƒ } | d 7} xK t	 | | d | | | | ƒ s;t | | | t j ƒ  ƒ } | d 7} qñW| | k  rNqn  | | | d !} | j
 ƒ  } | j d d
 k r‹| | d <n
 | | d <| | d <t | ƒ | d <t t | t j ƒ d d | j | | f d | ƒ} | d k rt | | ƒ } n  t j ƒ  d k  r#| j | ƒ q| j t | ƒ ƒ qWd j t | ƒ | ƒ GHt | 7a t t | ƒ 7a | S(   sÎ  Return a set of random, size-filtered fragments from a transcript.

    transcript is a DNA SeqRecord including letter_annotations specifying
    the probability of cleavage at each position.
    size_min, size_lower, size_upper, size_max are the parameters of the size filter
    indel_rate is the probability of insertion or deletion at each position.
    max_tries is a positive integer to guard against infinite looping
    Returns a list of SeqRecords.
    t   origin_probability_plust   origin_probability_minusi    iÿÿÿÿt   transcript_idR   R   i   gü©ñÒMb`?R   R   R   t   isizeR   s   %s[+]:%d..%dR   g      à?s'   {:n} fragments generated in {:n} tries.(   t   letter_annotationsR&   R%   R
   R   R$   R   R   t   minR   R   R   R   R   R    R(   R.   R   t   formatt
   candidatest   accepts(   t   frag_targett
   transcriptt   size_mint
   size_lowert
   size_uppert   size_maxR)   t	   max_triest   rate_vector_plust   rate_vector_minust   cum_probs_plust   cum_probs_minust   cum_probt   fpt   normalizing_factort   cpt   mrnat   annotation_commont   triest   fragst	   initiate0t	   min_rightt
   size_ranget	   initiate1t   seq_fragmentR   t   fragment(    (    s\   /woldlab/castor/data00/home/georgi/programs/RNA-Seq-Simulator.v1.0/lib/generateShortReads.pyt   generatePCRfrags]   sl    



!)

#

4
c         C   s"   t  |  | ƒ t  t |  ƒ | ƒ g S(   N(   t   get5primeReadR   (   t   sequencet   read_length(    (    s\   /woldlab/castor/data00/home/georgi/programs/RNA-Seq-Simulator.v1.0/lib/generateShortReads.pyt   getReadPairª   s    c   	      C   s   |  |  } |  j  j ƒ  | _  | j  d d k rD | j  d | d n | j  d | d | j  d <d | j  k r| j  d } g  } d } } x} t | ƒ D]o \ } } | | d | k r× | j | ƒ | | d 7} q™ | | } | d k r| j | | d g ƒ n  Pq™ W| | j  d <n  | S(   NR   R   R   i   R   R	   i    (   R   R   t	   enumerateR&   (	   RS   RT   t   readt   frag_cigar_listt   read_cigar_listt   cursort   it   segmentt   residual(    (    s\   /woldlab/castor/data00/home/georgi/programs/RNA-Seq-Simulator.v1.0/lib/generateShortReads.pyRR   ­   s"    
D

c	         C   s•   g  }	 t  |  ƒ | k  r g  St | |  | | | | | d d | ƒ}
 | ro g  |
 D] } t | | ƒ ^ qQ } n" g  |
 D] } t | | ƒ ^ qv } | S(   NR>   i
   (   R$   RQ   RU   RR   (   R9   t   copy_numberRT   R:   R;   R<   R=   t   pairedR)   t   readsRJ   t   fragt   readset(    (    s\   /woldlab/castor/data00/home/georgi/programs/RNA-Seq-Simulator.v1.0/lib/generateShortReads.pyt   generateReadSetÂ   s    (%"c      
   C   s"   t  |  | | | | | | t | ƒ	 S(   N(   Rc   R   (   R9   R^   RT   R:   R;   R<   R=   R)   (    (    s\   /woldlab/castor/data00/home/georgi/programs/RNA-Seq-Simulator.v1.0/lib/generateShortReads.pyt   generateUnpairedReadSetÍ   s    c      
   C   s&   t  |  | d | | | | | t | ƒ	 S(   Ni   (   Rc   R   (   R9   R^   RT   R:   R;   R<   R=   R)   (    (    s\   /woldlab/castor/data00/home/georgi/programs/RNA-Seq-Simulator.v1.0/lib/generateShortReads.pyt   generatePairedReadSetÐ   s    (   t   __doc__R   t   syst   ost   BioR    t   Bio.SeqR   t   Bio.SeqRecordR   t   matht   bisectR   R6   R7   R   R   R.   RQ   RU   RR   R   Rc   Rd   Re   (    (    (    s\   /woldlab/castor/data00/home/georgi/programs/RNA-Seq-Simulator.v1.0/lib/generateShortReads.pyt   <module>	   s"   $			"M		