
VNc           @   s;  d  d l  Z  d  d l Z d  d l Z d  d l m Z d  d l m Z d  d l m Z d  d l Z d  d l	 m
 Z
 d  d l m Z d  d l Z d Z d Z d	 e f d
     YZ d   Z d   Z d   Z e d  Z e d  Z e e d  Z g  d  Z d d  Z d d  Z d d  Z d d  Z d   Z d d e e e d d d e e  e d d  Z! d d e e e d d d e e  e d d  Z" d   Z# d   Z$ d e e d d  d!  Z% e d  d"  Z& d#   Z' d$   Z( i  e g  e d% d&  Z) d d e  i  e e d e e  d' 	 Z* d(   Z+ g  d) e  d  d  d*  Z, d S(+   iN(   t   strftime(   t   array(   t   defaultdict(   t
   geneinfoDB(   t   Genomegffffff@g       @t   ErangeErrorc           B   s   e  Z RS(    (   t   __name__t
   __module__(    (    (    sE   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/commoncode.pyR      s   c         C   s1   i d d 6d d 6d d 6d d 6d d 6} | |  S(   Nt   Tt   At   Ct   Gt   N(    (   t   baset   revComp(    (    sE   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/commoncode.pyt   getReverseComplement   s    

c         C   s7   t  t  } x |  D] } | | c d 7<q W| j   S(   Ni   (   R   t   intt   items(   t   listToCheckt   tallyt   item(    (    sE   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/commoncode.pyt   countDuplicatesInList#   s    c         C   sp   y t  |   } Wn  t k
 r2 t  |  d  } n Xt  |  d  } | j d t d  | | f  | j   d S(   sZ    create a log file to write a message from a messenger or append to an existing file.
    t   wt   as   %s: [%s] %s
s   %Y-%m-%d %H:%M:%SN(   t   opent   IOErrort
   writelinesR    t   close(   t   logFilet	   messengert   messaget   logfile(    (    sE   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/commoncode.pyt   writeLog+   s     c         C   sF   t  d |  } |  d k r3 | j |  d d } n | j |   } | S(   Nt   cachet   dmelanogastert   infoKeyt   locus(   R   t   getallGeneInfo(   t   genomeR!   t   idbt   geneinfoDict(    (    sE   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/commoncode.pyt   getGeneInfoDict9   s
    c         C   s   t  |  d d | S(   Nt    t   inRAM(   t   getExtendedGeneAnnotDict(   R&   R+   (    (    sE   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/commoncode.pyt   getGeneAnnotDictC   s    c         C   sD   t  |  d | } | d k r4 | j | d | n  | j   } | S(   NR+   R*   t   replace(   R   t   extendFeaturest   allAnnotInfo(   t
   genomeNamet   extendGenomet   replaceModelsR+   R&   t   geneannotDict(    (    sE   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/commoncode.pyR,   G   s
    c         C   sS   d t  j j d  g } x |  D] } | j |  q Wt j   } | j |  | S(   Ns   erange.configs   ~/.erange.config(   t   ost   patht
   expandusert   appendt   ConfigParsert   SafeConfigParsert   read(   t   fileListt   configFilest   filenamet   config(    (    sE   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/commoncode.pyt   getConfigParserQ   s    c         C   s@   y |  j  | |  } Wn# t j t j f k
 r; | } n X| S(   N(   t   getR9   t   NoSectionErrort   NoOptionError(   t   parsert   sectiont   optiont   defaultt   setting(    (    sE   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/commoncode.pyt   getConfigOption\   s
    
c         C   s@   y |  j  | |  } Wn# t j t j f k
 r; | } n X| S(   N(   t   getintR9   RB   RC   (   RD   RE   RF   RG   RH   (    (    sE   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/commoncode.pyt   getConfigIntOptione   s
    
c         C   s@   y |  j  | |  } Wn# t j t j f k
 r; | } n X| S(   N(   t   getfloatR9   RB   RC   (   RD   RE   RF   RG   RH   (    (    sE   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/commoncode.pyt   getConfigFloatOptionn   s
    
c         C   sC   y |  j  | |  } Wn& t j t j t f k
 r> | } n X| S(   N(   t
   getbooleanR9   RB   RC   t
   ValueError(   RD   RE   RF   RG   RH   (    (    sE   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/commoncode.pyt   getConfigBoolOptionw   s
    
c         C   s4   y |  j  |  } Wn t j k
 r/ g  } n X| S(   N(   R   R9   RB   (   RD   RE   RH   (    (    sE   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/commoncode.pyt   getAllConfigSectionOptions   s
    
i  i    i   i   c         C   sV   t  |   } | j   } t | | | | | | | | | |	 |
 | |  } | j   | S(   s   returns a dictionary containing a list of merged overlapping regions by chromosome; 
    can optionally filter regions that have a scoreField fewer than minHits.
    Can also optionally return the label of each region, as well as the
    peak, if supplied (peakPos and peakHeight should be the last 2 fields).
    Can return the top regions based on score if higher than minHits.
    (   R   t	   readlinest   getMergedRegionsFromListR   (   t   regionfilenamet   maxDistt   minHitst   verboset	   keepLabelt	   fullChromt
   chromFieldt
   scoreFieldt   padt   compactt   doMerget   keepPeakt	   returnTopt   infilet   linest   regions(    (    sE   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/commoncode.pyt   getMergedRegions   s    

c   (      C   s  i  } d } d } d | k  o/ t  |   k  n r g  } x |  D]~ } | d d k r d | k rl d } n  d | k rA d } qA qA n  | j   j d  } t | | j    } | j |  qA W| j   d | } | | } | | k r | } q n  d } t |  } d } x|  D]} | d d k r^d | k rCd } n  d | k rd } qqn  | j   j d  } | d k ry t | | j    } Wn t t f k
 rqn X| | k  rqqn  |	 r| | j d  \ } } | j d	  \ } } t |  } t |  } n | d k ryt	 j
 | |  d  } | | } t | | d  | } t | | d
  | } n< | d } | d } t | d
  | } t | d  | } | s| d } n  | rt | d | |  } t | d | |  } n  | | k rg  | | <n  t }  |
 rt  | |  d k rxdt t  | |   D]I}! | | |! }" |" j }# |" j }$ t | | |# |$  st | | |# |$ |  rS| |# k  r| }# n  |$ | k  r| }$ n  | r|" j }% |" j }& | |& k r| }& | }% qn  |# | | |! _ |$ | | |! _ t |$ |#  | | |! _ | r]| | | |! _ n  | r|% | | |! _ |& | | |! _ n  | d 7} t }  PqSqSWn  |  st j | |  }" | r| |" _ n  | r| |" _ | |" _ n  | | j |"  | d 7} n  | r| d d k r| GHqqWd }' x9 | D]1 } |' t  | |  7}' | | j d d    q5W| rd | GHd |' GHn  | S(   s   returns a dictionary containing a list of merged overlapping regions by chromosome; 
    can optionally filter regions that have a scoreField fewer than minHits.
    Can also optionally return the label of each region, as well as the
    peak, if supplied (peakPos and peakHeight should be the last 2 fields).
    Can return the top regions based on score if higher than minHits.
    i    t   #t   pvaluei   t	   readShifts   	it   :t   -i   i   ii t   cmpc         S   s   t  |  j | j  S(   N(   Rj   t   start(   t   xt   y(    (    sE   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/commoncode.pyt   <lambda>'  s    s   merged %d timess   returning %d regions(   t   lent   stript   splitt   floatR8   t   sortR   t
   IndexErrorRO   t   stringt   joint   Falset   rangeRk   t   stopt   regionsOverlapt   regionsAreWithinDistancet   peakPost
   peakHeightt   abst   lengtht   labelt   Truet   Region((   t
   regionListRU   RV   RW   RX   RY   RZ   R[   R\   R]   R^   R_   R`   Rc   t	   hasPvaluet   hasShiftt   scorest   regionEntryt   fieldst   hitst   minScoret
   mergeCountt   countt   chromt   post   frontt   backRk   Ry   R   R|   R}   t   mergedt   indext   regiont   rstartt   rstopt   rpeakPost   rpeakHeightt   regionCount(    (    sE   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/commoncode.pyRS      s    	"		


			


		-				
		c         C   s   |  | k r | |  }  } n  | | k r8 | | } } n  | |  k oO | k n p | | k ok | k n p |  | k o | k n p |  | k o | k SS(   N(    (   Rk   Ry   R   R   (    (    sE   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/commoncode.pyRz   0  s
    c         C   sb   |  | k r | |  }  } n  | | k r8 | | } } n  t  | |  | k pa t  | |   | k S(   N(   R~   (   Rk   Ry   R   R   RU   (    (    sE   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/commoncode.pyR{   :  s
    i   iK   c         C   s  | d k r- t  |  | | d | d | } n  t |  | | | | | |  \ } }	 }
 } t d d g |  } xi t d | d  D]T } | | d d | | d d | | d | | d | | d d	 | | <q Wt | |  } t j | |
 | | d
 | } | rd } | d } x^ |	 D]V } | r<| d } n d } | d | } | | k r#| d d k r#| | 7} q#q#W| | _ n  | S(   s   find the peak in a list of reads (hitlist) in a region
    of a given length and absolute start point. returns a
    list of peaks, the number of hits, a triangular-smoothed
    version of hitlist, and the number of reads that are
    forward (plus) sense.
    If doWeight is True, weight the reads accordingly.
    If leftPlus is True, return the number of plus reads left of
    the peak, taken to be the first TopPos position.
    t   autot	   useWeightt   maxShiftt   fg        i   i   i   g      "@t   shifti    t   weightg      ?Rk   t   senset   +(   t   getBestShiftForRegiont   findPeakSequenceArrayR   Rx   t   getPeakPositionListt   Peakt   numLeftPlus(   t   hitListRk   R   t   readlent   doWeightt   leftPlusR   t   maxshiftt   seqArrayt   regionArrayt   numHitst   numPlust   smoothArrayR   t   topPost   peakR   t   maxPosR;   R   t
   currentPos(    (    sE   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/commoncode.pyt   findPeakD  s(    !*R
c         C   s8  d } d } x%t  | d  D]} t d d g |  } x |  D] }	 |	 d | }
 |	 d d k rq |
 | 7}
 n
 |
 | 8}
 |
 d k  s@ |
 | k r q@ n  | r |	 d	 } n d
 } |	 d d k r | |
 c | 7<q@ | |
 c | 8<q@ Wd } x | D] } | t |  7} q W| GH| | k  r | } | } q q W| S(   Ni    I    i   R   g        Rk   R   R   R   g      ?(   t   xrangeR   R~   (   t   readListRk   R   R   R   t	   bestShiftt   lowestScoret	   testShiftt
   shiftArrayR;   t
   currentposR   t   currentScoret   score(    (    sE   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/commoncode.pyR   o  s2    
c         C   sf  t  d d g |  } d } d }	 g  }
 x+|  D]#} | d | } | d d k r` | | 7} n
 | | 8} | d | k  s/ | | k r q/ n  | r | d } n d } | | 7} | r |
 j |  n  d	 } x$ | d	 k  r | d 7} | d 7} q Wx@ | | k  r4| | k  r4| | c | 7<| d 7} | d 7} q W| d d k r/ |	 | 7}	 q/ q/ W| |
 | |	 f S(
   NR   g        Rk   R   R   i   R   g      ?i    (   R   R8   (   R   Rk   R   R   R   R   R   R   R   R   R   R;   R   R   t   hitIndex(    (    sE   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/commoncode.pyR     s8    



c         C   sm   d } g  } xZ t  |  D]L } | |  | k  rE |  | } | g } q | |  | k r | j |  q q W| S(   Ni    (   R   R8   (   R   R   t   topNucleotidet   peakListR   (    (    sE   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/commoncode.pyR     s    
ic   !      C   sO  |  j    } t } t |  d k r- t } n  t |  d k rg  } x | D] }	 x | |	 D] }
 |
 j } | | k r | j |  n  | | k r g  | | <d } n | | d d } | | j d |	 |
 j |
 j | f  q] WqL Wx( | D] } | | j d d    q Wn  i  } x| D]} | r=| | k r=qn  | | } g  } t } xk| D]c\ } }	 } } } | d k rt } n  | | | f | k rZt } g  } xf | D]^ \ } } } | | k r| | k rt } n  | | k  r| | k r| j | | f  qqWt |  d k rg  } t } xc | D][ \ } } } | | f | k r3| j | | | f  | | k r| | k rt } qq3q3W| } n  | r| j | | | f  qqZqZW| r| rqn  |	 | k rg  | |	 <n  x7 | D]/ \ } } } | |	 j | | | | | f  qWqWx | D] }	 | |	 j   q1W| rGi  } d } x | D] }	 g  | |	 <t | |	  } | d k rbd } xu t	 |  D]g } | |	 | d }  | d 7} | |  k  r| |	 j | |  d	 | d
 d f  n  | |	 | d } qW| }  | |	 j | |  d	 | d
 d f  qbqbW| | f S| Sd S(   s   return a dictionary of cistematic gene features. Requires
    cistematic, obviously. Can filter-out pseudogenes. Will use
    additional regions dict to supplement gene models, if available.
    Can restrict output to a list of GIDs.
    If regionComplement is set to true, returns the regions *outside* of the
    calculated boundaries, which is useful for retrieving intronic and
    intergenic regions. maxStop is simply used to define the uppermost
    boundary of the complement region.
    i    R   it   customRj   c         S   s   t  |  d | d  S(   Ni   (   Rj   (   Rl   Rm   (    (    sE   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/commoncode.pyRn     s    t   PSEUDOi   s	   nonExon%dt   Ft   nonExonN(
   t   getallGeneFeaturesRw   Ro   R   R   R8   Rk   Ry   Rs   Rx   (!   t   genomeObjectt   additionalRegionsDictt   ignorePseudot   restrictListt   regionComplementt   maxStopt   featuresDictt   restrictGIDt   sortListR   R   R   R   t   gidt   featuresByChromDictt   featureListt   newFeatureListt   isPseudot   ftypeRk   Ry   t   notContainedt   containedListt   fstartt   fstopt   ftype2t   newFListt   complementByChromDictt   complementIndext
   listLengtht   currentStartR   t   currentStop(    (    sE   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/commoncode.pyt   getFeaturesByChromDict  s    		
	.
			 (

'+
c
         C   s  i  }
 | d k r. | d k r. | r. d GH|
 S| d k rV | d k rV | rV d GH|
 S| d k rr | rr d GH|
 S| r | d k r d GH|
 S| d k r | d k r d GH|
 S| d k  r | d k r d GH|
 S|  j  } t |  |  } x| D]} | | } g  } x0 | D]( \ } } } } } | j | | f  qW| rO| d k rOq n  | j   | d d	 } | d d } | d	 d
 } t | |  } | d k r_| r| d k r| r| | | d k r| | d } qq| } n | r| d k r| } n  | d k r| r$| d } n | } |	 rl|  j | | f | d  } | | d k  rl| d } qln  t | d
  } | | 8} n  | d k r| } |	 r|  j | | f | d  } | | d k  r| d } qn  t | d
  } | | 7} n  d | k  o| k  n r)| d d | } n  | d k  rt |  | k  r| d	 d
 | } qn| r| d k r| r| | | d k  r| | d } qq| } n | r| d k r| } n  | d k rH| r| d } n | } |	 r,|  j | | f | d  } | | d k  r,| d } q,n  t | d
  } | | 7} n  | d k r| } |	 r|  j | | f | d  } | | d k  r| d } qn  t | d
  } | | 8} n  d | k  o| k  n r| d	 d	 | } n  | d k  rt |  | k  r| d d | } n  | |
 k r5g  |
 | <n  | r^|
 | j | | | | | f  q |
 | j | | | | f  q Wx |
 D] } |
 | j   qW|
 S(   s   return a dictionary of gene loci. Can be used to retrieve additional
    sequence upstream or downstream of gene, up to the next gene. Requires
    cistematic, obviously.
    Can filter-out pseudogenes and use additional regions outside of existing
    gene models. Use upstreamSpanTSS to overlap half of the upstream region
    over the TSS.
    If lengthCDS > 0 bp, e.g. X, return only the starting X bp from CDS. If
    lengthCDS < 0bp, return only the last X bp from CDS.
    i    sA   getLocusByChromDict: asked for no sequence - returning empty dictsR   getLocusByChromDict: asked for only upstream and downstream - returning empty dictsP   getLocusByChromDict: asked for partial CDS but not useCDS - returning empty dictsR   getLocusByChromDict: asked for TSS spanning and partial CDS - returning empty dictsi   getLocusByChromDict: asked for discontinuous partial CDS from start and downstream - returning empty dictsf   getLocusByChromDict: asked for discontinuous partial CDS from stop and upstream - returning empty dictR   ii   R   i   (   R&   t   getGeneFeaturesR8   Rs   R~   t   leftGeneDistancet   maxt   rightGeneDistance(   R&   t   upstreamt
   downstreamt   useCDSR   R   t   upstreamSpanTSSt	   lengthCDSt	   keepSenset   adjustToNeighbort   locusByChromDictR1   R   R   R   R   R   R   Rk   Ry   R   t   gstartt   gstopt   glent   distancet   nextGene(    (    sE   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/commoncode.pyt   getLocusByChromDict3  s    	

				#!c   	      C   s   |  j    } t |  d k r g  } x | D] } x | | D] } | j } | | k rg | j |  n  | | k r g  | | <d } n | | d d } | | j d | | j | j | f  q< Wq+ Wx( | D] } | | j d d    q Wn  | S(   Ni    R   iR   Rj   c         S   s   t  |  d | d  S(   Ni   (   Rj   (   Rl   Rm   (    (    sE   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/commoncode.pyRn     s    (   R   Ro   R   R8   Rk   Ry   Rs   (	   R&   R   R   R   R   R   R   R   R   (    (    sE   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/commoncode.pyR     s     	
	.g      ?c	   "      C   s  d }	 i  }
 i  } | r3 d } d } d } d } n d } d } d } d } d } d GHt  |  d k r xc | D] } d g | |
 | <qo WnA x> |  D]6 } x- |  | D]! } | | } d g | |
 | <q Wq Wx| D]} | |  k r q n  x* |  | D] } | | } | | | | <q Wd | GHd } x| | D]} | d	 } | d
 } |	 d 7}	 |	 d d k rsd |	 Gn  | | } | d k  rd } n  x?|  | | D]/} | | } | | } | | } | | } y | | } Wn t k
 rd } n X| | k r| d 7} qn  | | k r-| d 8} Pn  | | k oD| k n r| d k  rb| | } n | } | | } | d k r| | }  | d k r| | k  rd }  n | d k rd }  n  |  | k r| d }  n  y |
 | |  c | | 7<Wqt k
 rd | t |   f GHqXn | | }! |! | }  | d k rT|! | k  rTd }  n | d k rid }  n  |  | k r| d }  n  y |
 | |  c | | 7<Wn& t k
 rd | t |   f GHn X| } qqWq4Wq W|
 | f S(   s    returns 2 dictionaries of bin counts and region lengths, given a dictionary of predefined regions,
        a dictionary of reads, a number of bins, the length of reads, and optionally a list of regions
        or a different weight / tag.
    i    i   i   i   i   s   entering computeRegionBinsg        s   %s
Rk   R   i s   read %d R   i
   s   %s %s(   Ro   Rt   t   KeyErrort   str("   t   regionsByChromDictt   hitDictt   binsR   R   t   normalizedTagt   defaultRegionFormatt   fixedFirstBint	   binLengthR   t   regionsBinst
   regionsLent   regionIDFieldt
   startFieldt	   stopFieldt   lengthFieldt
   senseFieldt   readIDR   t   regionTuplet   regionIDt   startRegionR;   t   tagStartR   t	   stopPointRk   Ry   t   rlent   rsenset   regionBinLengtht	   startdistt   binIDt   rdist(    (    sE   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/commoncode.pyt   computeRegionBins  s    	

	



	








		

		(-   R9   R5   Ru   t   timeR    R   t   collectionsR   R   t   cistematic.core.geneinfoR   t   cistematic.genomesR   R   t   commoncodeVersiont   currentRDSversiont	   ExceptionR   R   R   R    Rw   R)   R-   R,   R@   t   NoneRI   RK   RM   RP   RQ   R   Rd   RS   Rz   R{   R   R   R   R   R   R   R   R  (    (    (    sE   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/commoncode.pyt   <module>   s\   			

							
	
	*%	(	i				