
VNc           @   s  y d  d l  m Z Wn d  d l m Z n Xd  d l Z d  d l Td  d l m Z e j d  ru e j d  Z	 n d Z	 d d d	 d
 d d d d d d d d d d d d d d g Z
 d d d d d d d d	 d
 d d d d d d d d d g Z i  Z i  Z i  Z i  Z i  Z i  Z i  Z i  Z i  Z i  Z i  Z d   Z d d  Z d f  d     YZ i  d  Z d i  d  d!  Z x4 e D], Z d" e Z e d Ue d# e  e e <qWd S($   i(   t   dbapi2N(   t   *(   t   environt   CISTEMATIC_ROOTs   /proj/genomet   scerevisiaet	   athalianat   celeganst	   cbriggsaet	   cbrennerit   cremaneit   dmelanogastert	   mmusculust   hsapienst   rnorvegicust   spurpuratust   ggallust   cfamiliarist
   mdomesticat   xtropicalist   btaurust   dreriot	   ecaballusc         C   s   i d d 6d d 6d d 6d d 6d d 6d d 6d d 6d d 6d	 d
 6d
 d	 6d d 6d d 6d d 6d d 6d d 6d d 6d d 6d d 6d d 6d d 6d d 6} | j  |  d  S(   s3    returns the complementary basepair to base nt
    t   Tt   At   Ct   Gt   St   Wt   Yt   Rt   Kt   Mt   Dt   Ht   Vt   Bt   Nt   tt   at   ct   gt   nt   z(   t   get(   t   ntt   compDict(    (    sV   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/cistematic/genomes/__init__.pyt   compNTD   s    
i    c         C   s   d } t  |   } | | k rJ t |   } | j   d j t t |   SxR t | d | | d d  D]2 } y | t |  |  7} Wqi | d 7} qi Xqi W| S(   s-    returns the complement of the sequence.
    t    i   iR$   (   t   lent   listt   reverset   joint   mapR.   t   range(   t   sequencet   lengtht   newSeqt	   seqLengtht   seqListt   index(    (    sV   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/cistematic/genomes/__init__.pyt
   complementY   s    
%t   Genomec           B   s  e  Z d  Z d  Z d  Z d  Z e Z e Z e Z	 d  d  d  e d  Z
 d   Z d   Z d   Z d d  Z d  d  d  d  Z d d d	  Z d d d
  Z d   Z d   Z d  d  Z d   Z d   Z d   Z d   Z d   Z d   Z d   Z d d d  Z e e d d  Z d  d d  Z d   Z d  d  Z  d  d  d  d  Z! d  d  Z" d   Z# e e d  Z$ d  d   Z% d  d! d! d"  Z& d  e d#  Z' e d$  Z( d  d% d&  Z) d  d'  Z* d(   Z+ d  d  d  d  d  d  d)  Z, d* d+  Z- d,   Z. e d-  Z/ e d.  Z0 e d/  Z1 e d0  Z2 e d1  Z3 e d2  Z4 d3 e d4  Z5 e e d5  Z6 e d6  Z7 e d7  Z8 RS(8   R/   c         C   s  | |  _  | d k r! | |  _ n  | d k r9 | |  _ n  | d k rQ | |  _ n  | t k r | d k r t | |  _ t |  _ n  | rst |  _ t	 j
 d  |  _ |  j d t  |  j j   } yl | j d  | j d |  j  x7 d d d d	 d
 d g D] } | j d | | f  q W| j d  Wn% |  j d k rVd |  j GHqVn X| j   |  j d t  n t |  _ d |  _ d  S(   NR/   s   :memory:t   inMemorys"   PRAGMA DEFAULT_CACHE_SIZE = 500000s   ATTACH '%s' as diskdbt   gene_entriest   gene_annotationt   gene_ontologyt	   sequencest   chromosomest   sequence_featuress&   insert into %s select * from diskdb.%ss   DETACH diskdbs   could not import %s(   t   genomet
   chromosomet   versiont   dbFilet   supportedGenomest   geneDBt   Truet	   supportedt   memoryBackedt   sqlitet   connectt   memoryDBt   createGeneDBt   cursort   executet   closet   createIndicest   False(   t   selfRE   t   chromRG   RH   t   inRAMt   sqlt   table(    (    sV   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/cistematic/genomes/__init__.pyt   __init__v   s8    		
	c         C   s   | |  _  t |  _ d  S(   N(   RH   RV   RL   (   RW   RH   (    (    sV   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/cistematic/genomes/__init__.pyt	   setGeneDB   s    	c         C   s   | |  _  d  S(   N(   RF   (   RW   RX   (    (    sV   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/cistematic/genomes/__init__.pyt   setChromosome   s    c         C   s`   | \ } } | |  j  k r t Sy3 d | } |  j |  } t |  d k rQ t SWn n Xt S(   sM    returns True if the geneID matches an entry in the genome database.
        s7   SELECT chromosome  from gene_entries where name = '%s' i    (   RE   RV   t   queryDBR0   RK   (   RW   t   geneIDRE   t   gIDt   stmtt   res(    (    sV   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/cistematic/genomes/__init__.pyt	   checkGene   s    
t   1c         C   s   | \ } } d } | |  j  k r% t Sy d | | f } |  j |  } t |  d k r | d } t | d  }	 t | d  }
 |	 |
 k r |
 } |	 }
 | }	 n  t | d  } | d } | |	 |
 | | f } n  Wn n X| S(   NR/   sk   SELECT chromosome, start, stop, length, orientation from gene_entries where name = '%s' and version = '%s' i    i   i   i   i   (   RE   RV   R_   R0   t   int(   RW   R`   RG   RE   Ra   t   resultRb   Rc   RX   t   startt   stopt   tempR7   t   orientation(    (    sV   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/cistematic/genomes/__init__.pyt   geneInfo   s*    
	
c         C   s   i  } g  } d } |  j  | d t } x | D] } | \ } }	 }
 } } | | k rb g  | | <n  |	 | k r g  | |	 <| j |	  n  | |	 j | |	 t |
  t |  | f  q. W| S(   NsY   select name, chromosome, start, stop, orientation from gene_entries order by name, start t   fetchall(   R_   RK   t   appendRf   (   RW   t   nameRX   RG   t
   resultDictt	   chromListRb   Rc   t   entryRF   Rh   Ri   Rk   (    (    sV   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/cistematic/genomes/__init__.pyt   getallGeneInfo   s    
0iP  c         C   sA  | } |  j  | |  } | d k r=| \ } } } }	 }
 | | k rZ | } | } | } n  d | | | | | | | f } |  j | d t } x | D] } t | d  } t | d  } | | k r | } | } | } n  | | } | | d k r| | | k  r| | } n  | d k r | | k  r | } q q Wn  | S(   NR/   s   SELECT name, start, stop, length, orientation from gene_entries where chromosome = '%s' and ((start > %d and start < %d) or (stop > %d and stop < %d)) Rm   i   i   i    (   Rl   R_   RK   Rf   (   RW   R`   t   radiusRG   Rg   Rc   RX   Rh   Ri   R7   Rk   Rj   Rb   Rr   t   rstartt   rstopt	   thelength(    (    sV   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/cistematic/genomes/__init__.pyt   leftGeneDistance   s.    	!	
 c         C   sA  | } |  j  | |  } | d k r=| \ } } } }	 }
 | | k rZ | } | } | } n  d | | | | | | | f } |  j | d t } x | D] } t | d  } t | d  } | | k r | } | } | } n  | | } | | d k r| | | k  r| | } n  | d k r | | k  r | } q q Wn  | S(   NR/   s   SELECT name, start, stop, length, orientation from gene_entries where chromosome = '%s' and ((start > %d and start < %d) or (stop > %d and stop < %d)) Rm   i   i   i    (   Rl   R_   RK   Rf   (   RW   R`   Rt   RG   Rg   Rc   RX   Rh   Ri   R7   Rk   Rj   Rb   Rr   Ru   Rv   Rw   (    (    sV   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/cistematic/genomes/__init__.pyt   rightGeneDistance  s.    	!	
 c         C   s   | \ } } g  } | |  j  k r% t Sd | } |  j | d t } t |  d k r{ x" | D] } | j | d  q] Wn  | S(   Ns9   SELECT description from gene_annotation where name = '%s'Rm   i    (   RE   RV   R_   RK   R0   Rn   (   RW   R`   RE   Ra   Rg   Rb   Rc   Rr   (    (    sV   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/cistematic/genomes/__init__.pyt	   annotInfo   s    
c         C   s   | \ } } g  } | |  j  k r% t Sd | } |  j | d t } t |  d k r x* | D] } | j t j | d   q] Wn  | S(   Ns[   SELECT GOID, objType, objName, isNot, GOterm, evidence from gene_ontology where name = '%s'Rm   i    s   	(   RE   RV   R_   RK   R0   Rn   t   stringR3   (   RW   R`   RE   Ra   Rg   Rb   Rc   Rr   (    (    sV   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/cistematic/genomes/__init__.pyt   goInfo/  s    
 c         C   s\   | d k r' |  j  d k r' |  j  } n  d | } |  j |  } d | d | d f } | S(   NR/   sD   SELECT sequenceName, storageType from chromosomes where name = '%s' s   %s	%si    i   (   RF   R_   (   RW   RX   Rb   Rc   Rg   (    (    sV   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/cistematic/genomes/__init__.pyt	   chromInfo>  s    
c         C   sG   g  } d } |  j  | d t } x | D] } | j | d  q( W| S(   s*    returns [ list of all orf names]
        s&   SELECT distinct name from gene_entriesRm   i    (   R_   RK   Rn   (   RW   Rg   Rb   Rc   Rr   (    (    sV   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/cistematic/genomes/__init__.pyt   allGIDsI  s    c         C   sK   g  } d | } |  j  | d t } x | D] } | j | d  q, W| S(   sG    returns [ list of all orf names] that match a particular GOID
        s:   SELECT distinct name from gene_ontology where GOID = '%s' Rm   i    (   R_   RK   Rn   (   RW   t   GOIDRg   Rb   Rc   Rr   (    (    sV   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/cistematic/genomes/__init__.pyt   allGIDsbyGOIDU  s    
c         C   sG   g  } d } |  j  | d t } x | D] } | j | d  q( W| S(   s2    returns the list of GOID's in the genome
        s'   SELECT distinct GOID from gene_ontologyRm   i    (   R_   RK   Rn   (   RW   Rg   Rb   Rc   Rr   (    (    sV   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/cistematic/genomes/__init__.pyt   allGOIDsa  s    c         C   sT   i  } d } |  j  | d t } x, | D]$ } t | d  | t | d  <q( W| S(   sO    returns the list of GOID's and their associated GO term in the genome
        s/   SELECT distinct GOID, GOterm from gene_ontologyRm   i   i    (   R_   RK   t   str(   RW   Rg   Rb   Rc   Rr   (    (    sV   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/cistematic/genomes/__init__.pyt
   allGOtermsm  s    "c         C   s)   d | } |  j  | d t } t |  S(   s7    returns the match count for a particular GOID
        s:   SELECT distinct name from gene_ontology where GOID = '%s' Rm   (   R_   RK   R0   (   RW   R   Rb   Rc   (    (    sV   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/cistematic/genomes/__init__.pyt   getGOIDCounty  s    
c         C   s   i  } d } |  j  | d t } x\ | D]T } |  j | d f } | | k rZ g  | | <n  | |  j | d f j | d  q( W| S(   Ns-   SELECT name, description from gene_annotationRm   i    i   (   R_   RK   RE   Rn   (   RW   Rg   Rb   Rc   Rr   R`   (    (    sV   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/cistematic/genomes/__init__.pyt   allAnnotInfo  s    &c         C   s   i  } d } |  j  | d t } xh | D]` } |  j | d f } | | k rZ g  | | <n  | |  j | d f j t j | d d   q( W| S(   Nse   SELECT name, GOID, objType, objName, isNot, GOterm, evidence, other from gene_ontology order by name Rm   i    i   s   	(   R_   RK   RE   Rn   R{   R3   (   RW   Rg   Rb   Rc   Rr   R`   (    (    sV   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/cistematic/genomes/__init__.pyt	   allGoInfo  s    2i   i    c   	      C   sz   g  } d } |  j  | d t } t |  } xF t |  D]8 } | | | d k r: | | } | j | d  q: q: W| S(   Ns%   SELECT distinct name from chromosomesRm   i    (   R_   RK   R0   R5   Rn   (	   RW   t	   partitiont   sliceRg   Rb   Rc   t   reslenR;   Rr   (    (    sV   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/cistematic/genomes/__init__.pyt   allChromNames  s    
c         C   s^   |  j  | |  \ } } } } }	 |  j | | | | |  }
 |	 d k rZ t |
 |  }
 n  |
 S(   NR   (   Rl   R6   R<   (   RW   Ra   t   maskCDSt	   maskLowerRG   RX   Rh   Ri   R7   Rk   t   seq(    (    sV   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/cistematic/genomes/__init__.pyt   geneSeq  s
    !c         C   s   g  } d } | \ } } t  |  d k r7 d | } n  d | t |  | f } |  j | d t }	 x? |	 D]7 }
 |
 \ } } } } } | j | | | | | f  ql W| S(   NR/   i    s    and type = "%s" s   select type, chromosome, start, stop, orientation from sequence_features where name = "%s" and version = "%s"  %s order by start Rm   (   R0   R   R_   RK   Rn   (   RW   Ra   t   typeRG   t   resultst   featureClauseRE   t   geneidRb   Rc   Rr   RF   Rh   Ri   Rk   (    (    sV   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/cistematic/genomes/__init__.pyt   getGeneFeatures  s     c         C   s   i  } d } |  j  | d t } x_ | D]W } | \ } } } } }	 }
 | | k r_ g  | | <n  | | j | | | |	 |
 f  q( W| S(   Nsd   select name, type, chromosome, start, stop, orientation from sequence_features order by name, start Rm   (   R_   RK   Rn   (   RW   Rp   Rb   Rc   Rr   Ro   R   RF   Rh   Ri   Rk   (    (    sV   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/cistematic/genomes/__init__.pyt   getallGeneFeatures  s    $c         C   s   g  } d } t  } d | k r' t } n  t |  d k rM | rM d | } n t |  d k rl d | } n  d | } |  j | d t } x | D] } | j | d  q W| S(   s    Returns the distinct feature types available in the sequence_features
            tables. Can optionally limit by feature type; the wild-card % can be 
            used to search feature substrings.
        R/   t   %i    s   where type = "%s" s   where type LIKE "%s" s.   select distinct type from sequence_features %sRm   (   RV   RK   R0   R_   Rn   (   RW   t   ftypeR   t   whereClauset   useLikeRb   Rc   Rr   (    (    sV   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/cistematic/genomes/__init__.pyt   getFeatureTypes  s    	
c      	   C   sa  i  } g  } d } d } d }	 d }
 d | k r9 d }
 n  t  |  d k ry d | k r` d } n d } d | | f } n  t  |  d k r d | } n  t  |  d k r d | }	 n  d	 |
 | | | |	 f } |  j | d
 t } xu | D]m } | \ } } } } } } } | | k r3g  | | <| j |  n  | | j | | | | | | | f  q W| S(   sl   Get features stored in sequence_features that match a feature type, 
            optionally restricted by name/value, chromosome, or version. Will 
            search for substrings when ftype and/or name are given with a % to 
            indicate the location of the wildcard. Returns a dictionary of features 
            with chromosomes as the keys.
        R/   t   =R   t   LIKEi    s    and name %s "%s" s    and chromosome = "%s" s    and version = "%s" s   select name, version, chromosome, start, stop, orientation, type from sequence_features where type %s "%s" %s %s %s order by typeRm   (   R0   R_   RK   Rn   (   RW   R   Ro   RX   RG   R   Rq   t
   nameClauset   chromClauset   versionClauseR   t   nameLikeRb   Rc   Rr   RF   Rh   Ri   Rk   t   atype(    (    sV   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/cistematic/genomes/__init__.pyt   getFeatures  s4    		
*c      	   C   s-  g  } d } | | } t  } d | k r1 t } n  t |  d k rW | rW d | } n t |  d k rv d | } n  d | | | | f }	 |  j |	 d t }
 xK |
 D]C } | \ } } } } } } } | j | | | | | | | f  q Wd | | | | f }	 |  j |	 d t }
 xo |
 D]g } | \ } } } } } } } | | | | | | | f | k r!| j | | | | | | | f  q!q!Wd	 | | | | f }	 |  j |	 d t }
 xo |
 D]g } | \ } } } } } } } | | | | | | | f | k r| j | | | | | | | f  qqW| S(
   s    Return features that are on a particular stretch of the genome. Can optionally
            limit by feature type; the wild-card % can  be used to search feature substrings.
        R/   R   i    s    and type = "%s" s    and type LIKE "%s" s   select chromosome, start, stop, orientation, name, version, type from sequence_features where chromosome = "%s" and start < %d and stop > %d %s order by startRm   s   select chromosome, start, stop, orientation, name, version, type from sequence_features where chromosome = "%s" and stop >= %d and stop <= %d %s order by starts   chromosome, start, stop, orientation, name, version, type from sequence_features where chromosome = "%s" and start >= %d and start <= %d %s order by start(   RV   RK   R0   R_   Rn   (   RW   RX   t   qstartt   qlengthR   R   R   t   qstopR   Rb   Rc   Rr   RF   Rh   Ri   Rk   Ro   RG   R   (    (    sV   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/cistematic/genomes/__init__.pyt   getFeaturesIntersecting  s8    
	&!)!)c      	   C   s  g  } | | } d } d | | | f } |  j  | d t } xH | D]@ }	 |	 \ }
 } } } } } | j | | |
 | | | | f  qE Wd | | | f } |  j  | d t } xi | D]a }	 |	 \ }
 } } } } } | | |
 | | | f | k r | j | | |
 | | | | f  q q Wd | | | f } |  j  | d t } xi | D]a }	 |	 \ }
 } } } } } | | |
 | | | f | k rL| j | | |
 | | | | f  qLqLW| S(   s    Return features that are on a ptarticular stretch of the genome. Can optionally
            limit by feature type; the wild-card % can  be used to search feature substrings.
        t   models   select chromosome, start, stop, orientation, name, version from sequence_features where chromosome = "%s" and start < %d and stop > %d order by startRm   s   select chromosome, start, stop, orientation, name, version from sequence_features where chromosome = "%s" and stop >= %d and stop <= %d order by starts   select chromosome, start, stop, orientation, name, version from sequence_features where chromosome = "%s" and start >= %d and start <= %d order by start(   R_   RK   Rn   (   RW   RX   R   R   R   R   R   Rb   Rc   Rr   RF   Rh   Ri   Rk   Ro   RG   (    (    sV   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/cistematic/genomes/__init__.pyt   getGenesIntersecting?  s*    
&))c         C   sa  d } | d k r- |  j  d k r- |  j  } n  d | } |  j |  } | d }	 | d }
 |
 d k r d |	 } |  j |  } | d | | | !} d } n{ t d t |	 f d  } t | j   d d	 t } t | j   | j   d	 t } | j |  | j	 |  } | j
   | r| | d } t |  } |  j | | | d
  } x | D] } | \ } } } } } } } | | k  r| } n  | | k r| } n  | | } | | d } x& t | |  D] } d | | | <qWqWWt j | d  } n  | r]t |  } x7 t t |   D]# } | | d k r!d | | <q!q!Wt j | d  } n  | S(   NR/   sD   select sequenceName, storageType from chromosomes where name = '%s' i    i   t   dbs1   select sequence from sequences where name = '%s' s   %s%st   rt   accesst   CDSR$   R&   R'   R(   R%   (   R&   R'   R(   R%   (   RF   R_   t   opent   cisRoott   mmapt   filenot   ACCESS_READt   sizet   seekt   readRT   R1   R   R5   R{   R3   R0   (   RW   RX   Rh   R7   R   R   R   Rb   Rc   t   seqNamet   seqTypet	   chromFilet   mymapRi   t   seqArrayt   featuresRr   Ro   R`   RF   t   fstartt   fstopt   forientationR   t   nstartt   nstopt   posR;   (    (    sV   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/cistematic/genomes/__init__.pyR6   `  sP    



	!
		
c         C   s   d } | d k r- |  j  d k r- |  j  } n  d | } |  j |  } | d  k r_ d | GHd S| d } | d } | d k r |  j d |  } | d } d } n/ t d t | f d	  } | j   } | j   | S(
   NR/   sD   select sequenceName, storageType from chromosomes where name = '%s' s   Could not find chromosome %si    i   R   s0   select sequence from sequences where name = "%s"s   %s%sR   (   RF   R_   t   NoneR   R   t   readlineRT   (   RW   RX   R   Rb   Rc   R   R   R   (    (    sV   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/cistematic/genomes/__init__.pyt   getChromosomeSequence  s$    
	


	
ic   
      C   sw  g  } | d k r- |  j  d k r- |  j  } n  d | } |  j | d t } | d k r | d k r x| D] } t | d  } t | d  }	 |	 | k  r |	 | } }	 n  |	 | k  sk | | k r qk n  | j | |	 | d |  j | d f f  qk Wnw xt | D]l } t | d  } t | d  }	 |	 | k  rE|	 | } }	 n  | j | |	 | d |  j | d f f  qW| S(   NR/   sY   select distinct start, stop, orientation, name from gene_entries where chromosome = '%s' Rm   i    i   i   i   (   RF   R_   RK   Rf   Rn   RE   (
   RW   RX   t
   lowerboundt
   upperboundRg   Rb   Rc   Rr   Rh   Ri   (    (    sV   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/cistematic/genomes/__init__.pyt   chromGeneEntries  s*    
1.c         C   s   t  |  d k r | |  _ n  i d d 6d d 6d d 6d d	 6d
 d 6d d 6} x; | j   D]- } d | | | f } |  j | d | q[ Wd  S(   Ni    s   ID INTEGER PRIMARY KEY, name varchar, version varchar, chromosome varchar, start varchar, stop varchar, length varchar, orientation varchar, feature varcharR?   s9   ID INTEGER PRIMARY KEY, name varchar, description varcharR@   s   ID INTEGER PRIMARY KEY, name varchar, GOID varchar, objType varchar, objName varchar, isNot varchar, GOterm varchar, evidence varchar, other varcharRA   sd   ID INTEGER PRIMARY KEY, name varchar, sequenceLength varchar, sequenceType varchar, sequence varcharRB   sO   ID INTEGER PRIMARY KEY, name varchar, sequenceName varchar, storageType varcharRC   s   ID INTEGER PRIMARY KEY, name varchar, version varchar, chromosome varchar, start int, stop int, length varchar, orientation varchar, type varcharRD   s   create table %s(%s)t	   useMemory(   R0   RH   t   keyst   writeDB(   RW   RH   R>   t	   tableDictR[   Rb   (    (    sV   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/cistematic/genomes/__init__.pyRQ     s    

c         C   s   i	 d d 6d d 6d d 6d d	 6d d 6d d 6d d 6d d 6d d 6} xJ | j    D]< } | | \ } } d | | | f } |  j | d | qR Wd  S(    NR?   Ro   t
   nameIndex1R@   t
   nameIndex2RA   t
   nameIndex3R   t	   goidIndexRB   t
   nameIndex4RC   t
   nameIndex5RD   s
   name, typet   geneIDIndexs   chromosome, start, stop, typet   posIndexR   t	   typeIndexs   CREATE INDEX %s on %s(%s)R   (   s   gene_entriess   name(   s   gene_annotations   name(   s   gene_ontologys   name(   s   gene_ontologys   GOID(   s	   sequencess   name(   s   chromosomess   name(   s   sequence_featuress
   name, type(   s   sequence_featuress   chromosome, start, stop, type(   s   sequence_featuress   type(   R   R   (   RW   R>   t	   indexDictt	   indexNamet	   tableNamet   fieldsRb   (    (    sV   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/cistematic/genomes/__init__.pyRU     s    

g      ?c      	   C   sk   | \ } }	 t  t t |  t |   d  }
 d |	 t  |  | | | |
 | | f } |  j |  d  S(   Ni   s   insert into gene_entries(ID, name, version, chromosome, start, stop, length, orientation, feature) values (NULL, '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s') (   R   t   absRf   R   (   RW   R`   RX   Rh   Ri   Rk   t   featureRG   RE   Ra   R7   Rb   (    (    sV   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/cistematic/genomes/__init__.pyt   addGeneEntry  s    &(c      	   C   se   | \ } }	 t  t t |  t |   d  }
 d | |	 | | | |
 | | f } |  j |  d  S(   Ni   s   insert into sequence_features(ID, name, geneID, chromosome, start, stop, length, orientation, type) values (NULL, '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s') (   R   R   Rf   R   (   RW   Ro   R`   RX   Rh   Ri   Rk   R   RE   Ra   R7   Rb   (    (    sV   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/cistematic/genomes/__init__.pyt   addFeatureEntry  s    &"c         C   s-   | \ } } d | | f } |  j  |  d  S(   NsM   insert into gene_annotation(ID, name, description) values (NULL, '%s', '%s') (   R   (   RW   R`   t   descriptionRE   Ra   Rb   (    (    sV   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/cistematic/genomes/__init__.pyt   addAnnotation  s    c	      	   C   sE   | \ }	 }
 d |
 t  |  | | | | | | f } |  j |  d  S(   Ns   insert into gene_ontology(ID, name, GOID, objType, objName, isNot, GOterm, evidence, other) values (NULL, '%s', '%s',  '%s', '%s', '%s', '%s', '%s', '%s') (   R   R   (   RW   R`   R   t   objTypet   objNamet   isNott   GOtermt   evidencet   otherRE   Ra   Rb   (    (    sV   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/cistematic/genomes/__init__.pyt	   addGoInfo  s    (s   -1c         C   s`   | \ } } t  |  d k  r3 t t |   } n  d | t |  | | f } |  j |  d  S(   Ni    sm   insert into sequences(ID, name, sequenceLength, sequenceType, sequence) values (NULL, '%s', '%s', '%s', '%s')(   Rf   R   R0   R   (   RW   R`   R   R   t   seqLenRE   Ra   Rb   (    (    sV   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/cistematic/genomes/__init__.pyt   addSequence  s
    c         C   s$   d | | | f } |  j  |  d  S(   Ns]   insert into chromosomes(ID, name, sequenceName, storageType)  values (NULL, '%s', '%s', '%s')(   R   (   RW   t   chromoR   t   storageTypeRb   (    (    sV   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/cistematic/genomes/__init__.pyt   addChromosomeEntry  s    c         C   s   g  } d } xr | D]j } | \ } } } }	 | \ }
 } t  |	  d k  r^ t t |   }	 n  | j | t |	  | | f  q W|  j | |  d  S(   Nsa   insert into sequences(ID, name, sequenceLength, sequenceType, sequence) values (NULL, ?, ?, ?, ?)i    (   Rf   R   R0   Rn   t   writeBatchDB(   RW   t
   entryArrayR>   t	   stmtArrayRb   Rr   R`   R   R   R   RE   Ra   (    (    sV   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/cistematic/genomes/__init__.pyt   addSequenceBatch  s    #c         C   s    d } |  j  | | d | d  S(   NsT   insert into chromosomes(ID, name, sequenceName, storageType)  values (NULL, ?, ?, ?)R   (   R   (   RW   R   R>   Rb   (    (    sV   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/cistematic/genomes/__init__.pyt   addChromosomeEntryBatch$  s    c      
   C   s   g  } d } x | D]~ } | \ } } } }	 }
 } } | \ } } t  t t |  t |	   d  } | j | t  |  | | |	 | |
 | f  q W|  j | | d | d  S(   Ns   insert into gene_entries(ID, name, version, chromosome, start, stop, length, orientation, feature) values (NULL, ?, ?, ?, ?, ?, ?, ?, ?) i   R   (   R   R   Rf   Rn   R   (   RW   R   R>   R   Rb   Rr   R`   RX   Rh   Ri   Rk   R   RG   RE   Ra   R7   (    (    sV   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/cistematic/genomes/__init__.pyt   addGeneEntryBatch)  s    &/c      
   C   s   g  } d } x | D] } | \ } } } }	 }
 } } | \ } } t  t t |	  t |
   d  } | j | | | t |	  t |
  | | | f  q W|  j | | d | d  S(   Ns   insert into sequence_features(ID, name, version, chromosome, start, stop, length, orientation, type) values (NULL, ?, ?, ?, ?, ?, ?, ?, ?) i   R   (   R   R   Rf   Rn   R   (   RW   R   R>   R   Rb   Rr   R`   RG   RX   Rh   Ri   Rk   R   RE   Ra   R7   (    (    sV   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/cistematic/genomes/__init__.pyt   addFeatureEntryBatch5  s    &5c   
      C   sb   g  } d } x9 | D]1 } | \ } } | \ } }	 | j  |	 | f  q W|  j | | d | d  S(   NsG   insert into gene_annotation(ID, name, description) values (NULL, ?, ?) R   (   Rn   R   (
   RW   R   R>   R   Rb   Rr   R`   R   RE   Ra   (    (    sV   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/cistematic/genomes/__init__.pyt   addAnnotationBatchA  s    c      
   C   s   g  } d } xc | D][ } | \ } } } }	 }
 } } } | \ } } | j  | t |  | |	 |
 | | | f  q W|  j | | d | d  S(   Ns   insert into gene_ontology(ID, name, GOID, objType, objName, isNot, GOterm, evidence, other) values (NULL, ?, ?,  ?, ?, ?, ?, ?, ?) R   (   Rn   R   R   (   RW   R   R>   R   Rb   Rr   R`   R   R   R   R   R   R   R   RE   Ra   (    (    sV   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/cistematic/genomes/__init__.pyt   addGoInfoBatchL  s    /t
   cistematicc      
   C   s  g  } g  } d } d } d } d }	 d }
 d } i d d 6d d 6d d 6} t  |  } | d k rx| D]4} | d	 d
 k r qj n  | j   j   } | d k r | d	 } | d }	 t | d  } | | d }
 n  | d	 | k rI| j |  j | f |	 | | |
 d d f  | d	 } | d }	 t | d  } | | d }
 n  t | d  } t | d  } | d } | j |  j | f d |	 | | |
 | f  qj Wn| d k rx|| D]^} | d	 d
 k rqn  | j d  } t | d  } | d j d  } | d j d  } | d d }	 | | d }
 t | d  d } t | d  d } d | d f } xt |  D]} t | |  d } t | |  d } | | k r| | k r| j | d |	 | | |
 d f  qz| | k r1|
 d k rd } n d } | j | d |	 | | |
 | f  qz| | k r}|
 d k rRd } n d } | j | d |	 | | |
 | f  qz| | k r| | k r|
 d k rd } n d } | j | d |	 | | |
 d f  | j | d |	 | d | |
 | f  qz| | k  ry| | k ry|
 d k r(d } n d } | j | d |	 | | d |
 | f  | j | d |	 | | |
 d f  qz|
 d k rd } d } n d } d } | j | d |	 | | d |
 | f  | j | d |	 | | |
 d f  | j | d |	 | d | d |
 | f  qzWqWn d GH| j   d  S| j   | t k ro|  j	 d d  t |  j	 d! d  t n  |  j
 | d" t |  j | d" t d  S(#   NR/   it   Ft   +R   t   -t   .R   i    t   #i   i   i   t
   TranscriptRe   i   i   t   UCSCs   	i   i	   t   ,i
   i   i   t   genericR   t   5UTRt   3UTRs   %s format not supported yets   delete from sequence_featuresR   s   delete from gene_entriesR>   (   R   t   stript   splitRf   Rn   RE   R5   RT   RK   R_   R   R   (   RW   t   featureFilet   fileTypet   replacet   geneEntryListt   geneFeatureListt   currentGenet   gstartt   gstopt   gchromt   gsenseR   t
   senseArrayt   featfilet   lineR   t   lstartt
   geneFieldst   exonNumt
   exonStartst	   exonStopsR`   R;   t   estartt   estopt   fTypet   fType1t   fType2(    (    sV   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/cistematic/genomes/__init__.pyt   extendFeaturesW  s    



+


2%	%	%	")	&%	&"5

c         C   s   | s |  j  r |  j } n t j |  j d d } | j   } | j |  | ra | j   } n | j   } | j	   | p |  j  s | j	   n  | S(   Nt   timeouti<   (
   RM   RP   RN   RO   RH   RR   RS   Rm   t   fetchoneRT   (   RW   Rb   Rm   R   R   RZ   R   (    (    sV   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/cistematic/genomes/__init__.pyR_     s    
c         C   sn   | r |  j  } n t j |  j d d } | j   } | j |  | j   | j   | sj | j   n  d  S(   NR  i<   (   RP   RN   RO   RH   RR   RS   t   commitRT   (   RW   Rb   R   R   RZ   (    (    sV   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/cistematic/genomes/__init__.pyR     s    

c         C   s   | r |  j  } n t j |  j d d } | j   } y | j | |  Wn d | GHn X| j   | j   | s | j   n  d  S(   NR  i<   s   writeBatchDB: problem with %s(   RP   RN   RO   RH   RR   t   executemanyR  RT   (   RW   Rb   R   R   R   RZ   (    (    sV   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/cistematic/genomes/__init__.pyR     s    

(9   t   __name__t
   __module__RE   RF   RG   RH   RV   RL   t   oldStylet   customAnnotationsR\   R]   R^   Rd   Rl   Rs   Rx   Ry   Rz   R|   R}   R~   R   R   R   R   R   R   R   R   R   R   R   R   R   R   R6   R   R   RQ   RU   R   R   R   R   R   R   R   R   R   R   R   R   R  R_   R   R   (    (    (    sV   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/cistematic/genomes/__init__.pyR=   l   sl   &															+*	!4			pc   
      C   s  i  } t  |   } d } x d | k r9 | j   } q q W| j   } | d } | i  k r d | d <d | d <d | d <d	 | d
 <n d } xv | D]n } | j   } | d | j   k r | | d d k r | d }	 n | | d }	 | | |	 <n  | d 7} q W| j   | | f S(   s2   process a UCSC formatted .sql file to identify the table name and the position of key fields. Specifying the 
        name of important fields in keyFields is highly recommended. A key in keyFields represents the sql column 
        name in the file, while the corresponding value corresponds to the feature name. Blank values mean that the 
        same field name will be used. For example, one possible keyFields dict would be:
        keyFields = {'chromStart':'start', 'chromEnd':'stop', 'chrom':'', 'name':'', 'score':'value', 'strand':'orientation'}
    R/   s   CREATE TABLEi   i   RX   Rh   i   Ri   i   Ro   i    (   R   R   R   R   RT   (
   t   sqlFilet	   keyFieldsR   t   infileR  t	   tabFieldst   tabNameR;   t
   lineFieldst   outfield(    (    sV   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/cistematic/genomes/__init__.pyt
   processSql  s0    




t   MISCRe   c      	   C   s  g  } | i  k r= d | d <d | d <d | d <d | d <n  t  |  } x2| D]*} | j   j d	  } d | k r | | d }	 n | }	 d
 | k r |	 d | | d
 7}	 n  t | | d  }
 t | | d  } | | d d } | j d d  d } d | k r/| | d d k r/d } q/n  d | k rL| | d } n  | j d |	 f | | |
 | | | j   f  qP W|  d k r|  j |  n | GHd S(   s   process data for a UCSC track, given an instantiated genome object, the data, the name for the feature 
        type in sequence_features, the dataFields in the format returned by processSql(), and a version.
        If genomeObject is the empty string '', then processTrack() will simply print out the aggregate records,
        rather than use addFeatureEntryBatch() to record the added features.

        Note that numberic values are overloaded into the name field using @ as a delimiter.
    i   RX   i   Rh   i   Ri   i   Ro   s   	t   valuet   @t   _randomt   randR   Rk   R   R   RG   t   placeholderR/   N(   R   R   R   Rf   R  Rn   t   upperR   (   t   genomeObjectt   dataFilet   typeNamet
   dataFieldsRG   t   recordsR"  R  R   t   recNameRh   Ri   RX   Rk   (    (    sV   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/cistematic/genomes/__init__.pyt   processTrack#  s8    


2s	   import %ss	   %s.geneDB(   t	   pysqlite2R    RN   t   sqlite3R{   R   t   osR   R+   R   t   __all__RI   RJ   t	   chromDictt	   chromRootR   Rd   Rl   t   getAllGenesRz   R|   R   t	   allGOInfoR.   R<   R=   R'  R5  RE   t   importStringt   eval(    (    (    sV   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/cistematic/genomes/__init__.pyt   <module>   sP   
	   (/
