
VNc           @   s  d  d d d g Z  d d l Z d d l m Z m Z d d l Z d d l Z d d l Z d d l m Z e j	 d  r e j	 d  Z
 n d	 Z
 e
 e _ i  Z i  Z i  a d
   Z d d  Z d   Z d d  Z d d  Z d d  Z d d  Z d d d d  Z d d d d  Z d d d  Z d   Z d d  Z d d d  Z d d d  Z d d d  Z e d d  Z d d d  Z  d  d d!  Z! e e d d d"  Z" e e e d d d#  Z# e e e d d d$  Z$ d e e d d%  Z% d&   Z& d'   Z' d d(  Z( d d)  Z) d d*  Z* d d+  Z+ d d,  Z, d d-  Z- d d.  Z. d d/  Z/ d0 d1  Z0 d d d d d e d2  Z1 d d d d d d e d3  Z2 d S(4   t   motift   homologyt   geneinfot   proteiniN(   t   Genomet   geneDB(   t   environt   CISTEMATIC_TEMPs   /tmpc         C   sb   |  t  k rT y2 d t j   } t j t |  |  | t  |  <Wq^ d |  GHq^ Xn
 t  |  } | S(   s?    save a copy of a genome's gene database to a local cache.
    s   %s.dbs   could not cache genome %s(   t   cachet   tempfilet   mktempt   shutilt   copyfileR   (   t   genomet   tempgen(    (    sS   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/cistematic/core/__init__.pyt   cacheGeneDB2   s    
t    c         C   s   |  t  k rB y t j t  |   Wn d t  |  GHn Xt  |  =nC x: t  D]2 } y t j t  |  WqI d t  | GHqI XqI Wi  a  d S(   s8    remove the local copy of a genome's gene database.
    s   could not delete %sN(   R   t   ost   remove(   R   t   gen(    (    sS   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/cistematic/core/__init__.pyt   uncacheGeneDBB   s    
c           C   s
   t  j   S(   sF    return lists of genomes with a gene database in the local cache.
    (   R   t   keys(    (    (    sS   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/cistematic/core/__init__.pyt   cachedGenomesW   s    c         C   s)   | d k r% |  t  k r% t  |  } n  | S(   sT    helper function to use genome's gene database from the local cache if present.
    R   (   R   (   R   t   dbfile(    (    sS   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/cistematic/core/__init__.pyt   chooseDB]   s    c         C   s(   t  |  | d t |  |  } | j   S(   s+    return sequence for entire chromosome
    t   dbFile(   R   R   t   getChromosomeSequence(   R   t   chromt   dbt   aGenome(    (    sS   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/cistematic/core/__init__.pyt   readChromosomeg   s    c         C   sP   | d k r% |  t  k r% t  |  } n  t |  d t |  |  } |  | j   f S(   s-    return the entries for a given genome. 
    R   R   (   R   R   R   t   allGIDs(   R   R   R   (    (    sS   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/cistematic/core/__init__.pyt   getGenomeEntriesn   s    c         C   sJ   | d k r% |  t  k r% t  |  } n  t |  d t |  |  } | j   S(   s-    return the entries for a given genome. 
    R   R   (   R   R   R   R   (   R   R   R   (    (    sS   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/cistematic/core/__init__.pyt   getGenomeGeneIDsy   s    c         C   s=   |  \ } } t  | | d t | |  } | j | | |  S(   s0    return the entries for a given chromosome.
    R   (   R   R   t   chromGeneEntries(   t
   chromosomet
   lowerboundt
   upperboundR   R   t   chromIDR   (    (    sS   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/cistematic/core/__init__.pyt   getChromoGeneEntries   s    i   i    c         C   s+   t  |  d t |  |  } | j | |  S(   s0    return the chromosomes for a given genome.
    R   (   R   R   t   allChromNames(   R   R   t	   partitiont   sliceR   (    (    sS   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/cistematic/core/__init__.pyt   getChromosomeNames   s    t   1c         C   s5   |  d } t  | d t | |  } | j |  |  S(   sD    returns (chrom, start, stop, length, sense) for a given geneID
    i    R   (   R   R   t   geneInfo(   t   geneIDR   t   versionR   R   (    (    sS   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/cistematic/core/__init__.pyt	   geneEntry   s    
c         C   s   i d d 6d d 6d d 6d d 6d d 6d d 6d d 6d d 6d	 d
 6d
 d	 6d d 6d d 6d d 6d d 6d d 6d d 6d d 6d d 6d d 6d d 6d d 6} | j  |  d  S(   s3    returns the complementary basepair to base nt
    t   Tt   At   Ct   Gt   St   Wt   Yt   Rt   Kt   Mt   Dt   Ht   Vt   Bt   Nt   tt   at   ct   gt   nt   z(   t   get(   t   ntt   compDict(    (    sS   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/cistematic/core/__init__.pyt   compNT   s    
c         C   s   d } t  |   } | | k s* | d k  rV t |   } | j   d j t t |   SxR t | d | | d d  D]2 } y | t |  |  7} Wqu | d 7} qu Xqu W| S(   s-    returns the complement of the sequence.
    R   i    i   iR?   (   t   lent   listt   reverset   joint   mapRI   t   range(   t   sequencet   lengtht   newSeqt	   seqLengtht   seqListt   index(    (    sS   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/cistematic/core/__init__.pyt
   complement   s    
%c         C   s   | } |  d } t  | d t | |  } yp | j |   r | j |  |  \ } } }	 }
 } | d k r | j |  | |  } q | j |  | |  } n  Wn n X| S(   s3    return distance to gene immediately upstream.
    i    R   t   F(   R   R   t	   checkGeneR-   t   leftGeneDistancet   rightGeneDistance(   R.   t   radiusR/   R   t   upstreamR   R   R   t   startt   stopRQ   t   sense(    (    sS   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/cistematic/core/__init__.pyt   upstreamToNextGene   s    
!c         C   s   | } |  d } t  | d t | |  } yp | j |   r | j |  |  \ } } }	 }
 } | d k r | j |  | |  } q | j |  | |  } n  Wn n X| S(   s5    return distance to gene immediately downstream.
    i    R   RW   (   R   R   RX   R-   RZ   RY   (   R.   R[   R/   R   t
   downstreamR   R   R   R]   R^   RQ   R_   (    (    sS   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/cistematic/core/__init__.pyt   downstreamToNextGene   s    
!c         C   s   |  \ } } | \ } } t  | d  t  |  } | d k  rG d } n  t | | d t | |  }	 |	 j | | d t  |  |  }
 |
 S(   s/    return the features around a given match.
    i    R   i   (   t   intR   R   t   getFeaturesIntersecting(   t   matchR[   t   featureTypeR   R#   t   hitR   R&   t   lowerboundHitR   t   results(    (    sS   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/cistematic/core/__init__.pyt   retrieveFeatures   s    	"c         C   s  g  } |  \ } } t  | d t | |  }	 t rd }
 d } |	 j |   r|	 j |   \ } } } } } | | k  r | } | } | } n  | d k r9| d k r | r |	 j |  |  } n  | | }
 |
 d k  r d }
 | } n  | } n  | d k r| d k r| }
 n  | | 7} n  | d k re| rC|	 j |  |  } n  | d k rX| }
 n  | | 7} n  |	 j | |
 | d  } x?| D] } | \ } } } } } } } | | k  r| } | } n  | |
 } | d k  rd } n  | |
 } | | k r| } n  | | | | f | k r| j | | | | f  qqWq| d k rs| r`|	 j |  |  } n  | | }
 | } n  | d k r| d k r| }
 n  | | 7} n  | d k r| r|	 j |  |  } n  | d k r| }
 n  | | 7} n  |	 j | |
 | | d  } x | D] } | \ } } } } } } } | | k  rI| } | } n  |
 | } | d k  rhd } n  |
 | } | | k r| } n  | | | | f | k r| j | | | | f  qqWqn  | S(   s    retrieve CDS features upstream, all or none of the cds, and downstream of a geneID.
        Feature positions are normalized and truncated to local sequence coordinates.
    R   i    RW   t   CDS(	   R   R   t   TrueRX   R-   RY   RZ   Rd   t   append(   R.   R\   t   cdsRa   t   boundToNextGeneR   Ri   R   t   gIDR   t   seqstartt   seqlenR   R]   R^   RQ   R_   t   post
   allresultst   entryt   fnamet   fversiont   fchromosomet   fstartt   fstopt   forientationt   ftypet   forstartt   forstopt   revstartt   revstop(    (    sS   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/cistematic/core/__init__.pyt   retrieveSeqFeatures   s    	
					
	
	#
				
	
	&Rk   c         C   s1   t  |  d t |  |  } | j | | | |  S(   sF    return features of type ftype that fall within the given region.
    R   (   R   R   Rd   (   R   R   R]   RQ   R   R|   R   (    (    sS   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/cistematic/core/__init__.pyRd   h  s    RW   c         C   s   d } t  | |  d } y t |  d t |  |  } | d k r | d k  rY d }	 n
 | d }	 | j | |	 |  }
 |
 } n | d }	 | j | |	 |  } Wn) t k
 r d |  | | | | f GHn X| S(   sM    retrieve a sequence given a genome, chromosome, start, stop, and sense.
    R   i   R   RW   i    s)   Couldn't retrieve sequence %s %s %s %s %s(   t   absR   R   RP   t   IOError(   R   R   R]   R^   R_   R   t   entrySeqRQ   R   t   seqStartRP   (    (    sS   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/cistematic/core/__init__.pyt   retrieveSequenceo  s    	
	
c         C   s   d } |  d } t  | d t | |  } y. | j |   rX | j |  | | |  } n  Wn  t k
 r{ d t |   GHn X| S(   sA    retrieveCDS() - retrieve a sequence given a gene identifier
    R   i    R   s   Could not find %s (   R   R   RX   t   geneSeqR   t   str(   R.   t   maskCDSt	   maskLowerR   R/   R   R   R   (    (    sS   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/cistematic/core/__init__.pyt   retrieveCDS  s    
c         C   sL  d } |  d } t  | d t | |  }	 y |	 j |   r*|	 j |  |  \ }
 } } } } | d k r | r |	 j |  | |  } n  | | d k r | | d } | } q d } | } n* | r |	 j |  | |  } n  | } | } |	 j |
 | | | |  } | d k r| } q*t | |  } n  Wn t k
 rGd G|  GHn X| S(   sR    retrieve sequence 5' of cds of length upstream for a given a gene identifier
    R   i    R   RW   i   s   Couldn't find (	   R   R   RX   R-   RY   RZ   RP   RV   R   (   R.   R\   R   R   Ro   R   R/   R   R   R   R   R]   R^   RQ   R_   R   RS   RP   (    (    sS   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/cistematic/core/__init__.pyt   retrieveUpstream  s2    
!			c         C   s8  d } |  d } t  | d t | |  }	 t r4|	 j |   r4|	 j |  |  \ }
 } } } } | d k r | r |	 j |  | |  } n  | d } | d } nM | r |	 j |  | |  } n  | | d k r | | } | } n d } | } |	 j |
 | | | |  } | d k r| } q1t | |  } q4n  | S(   sT    retrieve sequence 3' of CDS of length downstream for a given a gene identifier
    R   i    R   RW   i   (	   R   R   Rl   RX   R-   RZ   RY   RP   RV   (   R.   Ra   R   R   Ro   R   R/   R   R   R   R   R]   R^   RQ   R_   R   RS   RP   (    (    sS   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/cistematic/core/__init__.pyt   retrieveDownstream  s.    
!

		c   
   	   C   s   d } t  |  d k r! t }	 n t }	 | d k rX | t |  | |	 | | | |  7} n  | d k r | t |  |	 | | |  7} n  | d k r | t |  | |	 | | | |  7} n  t |  d k r d |  d |  d | | f GHn  | S(   sK    retrieve upstream, all or none of the cds, and downstream of a geneID
    R   i   i    sW   retrieveSeq Warning: retrieved null sequence for %s: %s (splice form %s) from geneDB %si   (   Rc   Rl   t   FalseR   R   R   RJ   (
   R.   R\   Rn   Ra   R   R   Ro   R/   R   R   (    (    sS   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/cistematic/core/__init__.pyt   retrieveSeq  s    	%% c         C   st   t  | d  } xT | D]L } d G| GH| j d |  |  | f } | j d t | | d |   q W| j   d S(   sp    retrieve set of upstream and downstrean sequences for a list of genes in a genome and save them to a file.
    t   ws   Processing s   > %s 
s   %s
i    N(   t   opent   writeR   t   close(   R   t   genesR\   Ra   t   outputFilePatht   outFilet   geneR.   (    (    sS   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/cistematic/core/__init__.pyt   retrieveAll  s    	$c         C   s   d |  d |  d | f } | S(   s;    write a fasta formated seq with geneID in the header.
    s   > %s-%s
%s
i    i   (    (   R.   t   seqt   fastaString(    (    sS   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/cistematic/core/__init__.pyt   fasta  s    c         C   sD   t  |  d t |  |  } |  t j   k r@ | j   t |  <n  d S(   s     load GO for a given genome
    R   N(   R   R   t   goDictR   t	   allGOInfo(   R   R   R   (    (    sS   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/cistematic/core/__init__.pyt
   loadGOInfo  s    c         C   sG   |  \ } } t  | d t | |  } y | j |   SWn g  SXd S(   s!    retrieve GO info for geneID
    R   N(   R   R   t   goInfo(   R.   R   R   t   locusR   (    (    sS   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/cistematic/core/__init__.pyt	   getGOInfo  s    c         C   s;   t  |  d t |  |  } y | j |  SWn g  SXd S(   s5    retrieve count of genes with a particular GOID.
    R   N(   R   R   t   getGOIDCount(   R   t   GOIDR   R   (    (    sS   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/cistematic/core/__init__.pyR   "  s
    c         C   s8   t  |  d t |  |  } y | j   SWn g  SXd S(   s    return all GO Terms.
    R   N(   R   R   t
   allGOterms(   R   R   R   (    (    sS   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/cistematic/core/__init__.pyt
   allGOTerms,  s
    c         C   s8   t  |  d t |  |  } y | j   SWn g  SXd S(   s    return all GO Info.
    R   N(   R   R   t	   allGoInfo(   R   R   R   (    (    sS   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/cistematic/core/__init__.pyt   getAllGOInfo6  s
    c         C   sD   t  |  d t |  |  } |  t j   k r@ | j   t |  <n  d S(   s)    load Annotations for a given genome
    R   N(   R   R   t	   annotDictR   t   allAnnotInfo(   R   R   R   (    (    sS   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/cistematic/core/__init__.pyt   loadAnnotInfo@  s    c         C   sG   |  \ } } t  | d t | |  } y | j |   SWn g  SXd S(   s-    retrieve Annotations for a given geneID
    R   N(   R   R   t	   annotInfo(   R.   R   R   R   R   (    (    sS   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/cistematic/core/__init__.pyt   getAnnotInfoH  s    c         C   s8   t  |  d t |  |  } y | j   SWn g  SXd S(   s!    return all Annotation Info.
    R   N(   R   R   R   (   R   R   R   (    (    sS   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/cistematic/core/__init__.pyt   getAllAnnotInfoS  s
    i   c   	      C   s  t  |   } t |  j    } | d } | d d } xt | |  D]} |  | | | !j   } | j d  | k s | j d  | k s | j d  | k s | j d  | k r x3 t |  D]" } | | | j   | | | <q Wn  | j d  | k s| j d  | k s| j d	  | k s| j d
  | k s| j d  | k s| j d  | k s| j d  | k s| j d  | k s| j d  | k s| j d  | k rG x3 t |  D]" } | | | j   | | | <qWqG qG Wd j |  S(   s    make sure that very simple repeats are out of the sequence. 
        Soft-mask any window that has windowSize - 2 of mononucleotides 
        and (windowSize / 2) - 1 non-GC dinucleotides.     
    i   i   R2   R3   R4   R1   t   ACt   AGt   ATt   CTt   GTt   TAt   TCt   TGt   GAt   CAR   (   RJ   RK   t   upperRO   t   countt   lowerRM   (	   t   inSeqt
   windowSizeRr   t   outSeqt   winmin2t   winhalfRs   t   windowRU   (    (    sS   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/cistematic/core/__init__.pyt   sanitize]  s    
T#'c
      
   C   sz  i  }
 | d k rI t  |  d t |  |  d t } | j | d |	 n t  |  d t |  |  } | j | | | |  } t |  d k  s t |  d k  r |
 S| j   } x | D] \ } } g  } | | k r q n  xo | | D]c \ } } } } } } } | | | k  s | | | k r'q n  | j | | | | | | | f  q Wt |  d k r | |
 | | f <q q W|
 S(   s    returns a dictionary of matching features to positions of the double form (chromosome, position).
        Only positions with features within radius are returned.
    R   R   t   inRAMt   replacei   i    (   R   R   Rl   t   extendFeaturest   getFeaturesRJ   R   Rm   (   R   t   posListR[   R|   t   nameR   R/   R   t	   extendGent
   replaceModt
   resultDictR   t   featurest	   chromListRs   t   tempListR#   R]   R^   t   orientationt   atype(    (    sS   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/cistematic/core/__init__.pyt   featuresIntersectings  s(    !$& &c	      	   C   so  i  }	 | d k rI t  |  d t |  |  d t }
 |
 j | d | n t  |  d t |  |  }
 |
 j | | |  } t |  d k  s t |  d k  r |	 S| j   } x | D] \ } } g  } | | k r q n  xg | | D][ \ } } } } } | | | k o| | k n r | j | d | | | | f  q q Wt |  d k r | |	 | | f <q q W|	 S(   s    returns a dictionary of matching genes to positions of the double form (chromosome, position).
        Only positions with features within radius are returned.
    R   R   R   R   i   t	   noversioni    (   R   R   Rl   R   t   getallGeneInfoRJ   R   Rm   (   R   R   R   R   R/   R   t   flankR   R   R   R   R   R   Rs   R   R#   R]   R^   R   (    (    sS   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/cistematic/core/__init__.pyt   genesIntersecting  s&    !$ $&(3   t   __all__t
   cistematict   cistematic.genomesR   R   R   R	   R   R   RF   t   cisTempt   tempdirR   R   R   R   R   R   R   R   R    R!   R'   R+   R0   RI   RV   R`   Rb   Rj   R   R   Rd   R   R   R   R   R   R   R   R   R   R   R   R   R   R   R   R   R   R   (    (    (    sS   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/cistematic/core/__init__.pyt   <module>   s\   $			
	o'$		



!