
3%Nc           @   s   d  d l  Z d  d l Z d  d l Z d  d l Z d  d l Z d  d l m Z d  d l m Z m	 Z	 m
 Z
 d Z d e f d     YZ d d	 d     YZ d S(
   iN(   t   array(   t   getReverseComplementt   getConfigParsert   getConfigOptions   2.1t   ReadDatasetErrorc           B   s   e  Z RS(    (   t   __name__t
   __module__(    (    (    s=   /woldlab/castor/data00/home/georgi/erange-4.0a/ReadDataset.pyR      s   t   ReadDatasetc           B   s  e  Z d  Z e d e e e d  Z d   Z d   Z d   Z d   Z	 d   Z
 d   Z d	   Z d
   Z d   Z d d d d  Z d d  Z d   Z d   Z d   Z d   Z d d  Z d d  Z d   Z d   Z d e d  Z e e e d  Z d d  Z e e e d d e e e e e e e e e e d d d d e e d e d  Z d e d   Z e d!  Z d"   Z e e d d e e e e e e e e e e d d d d#  Z  d d d e e e e d$ d%  Z! d d d d&  Z" d d d d e d d'  Z# d d d d e d(  Z$ d d d d e d)  Z% d d d d e d*  Z& e e e e d d+  Z' dD e e d,  Z) d d e e d- e d$ d. d/  Z* d0   Z+ d d1  Z, d2   Z- d3   Z. d4   Z/ d5   Z0 e e e d$ d6  Z1 e e e d7  Z2 e e e d8  Z3 d9   Z4 d: d;  Z5 e d<  Z6 e d=  Z7 d>   Z8 d d?  Z9 d@   Z: d e dA  Z; d dB  Z< d dC  Z= RS(E   sC   Class for storing reads from experiments. Assumes that custom scripts
    will translate incoming data into a format that can be inserted into the
    class using the insert* methods. Default class subtype ('DNA') includes
    tables for unique and multireads, whereas 'RNA' subtype also includes a
    splices table.
    t   DNAc   	      C   s  d |  _  d |  _ d |  _ t |  _ t |  _ d |  _ d |  _ d |  _	 | ri | d k ri t
 d   n  | r | r} d GHn  |  j |  |  j	 } n | } t j |  |  _  t j |  j  _ |  j  j d  | r | |  _ |  j |  j   n |  j d  } | d |  _ y  |  j d  } | d |  _ WnI y  |  j d t t  f g  Wqwt k
 rrd	 GHd
 |  _ qwXn X| r|  j | | |  n  d S(   s    creates an rds datafile if initialize is set to true, otherwise
        will append to existing tables. datasetType can be either 'DNA' or 'RNA'.
        t    R   t   RNAs8   failed to initialize: datasetType must be 'DNA' or 'RNA's   caching ....s   PRAGMA temp_store = MEMORYt   dataTypet
   rdsVersions&   could not add rdsVersion - read-only ?s   pre-1.0N(   s   DNAs   RNA(   t   dbcont   memconR   t   currentRDSVersionR   t   Falset	   memBackedt   memChromt	   memCursort   cachedDBFileR   t   cacheDBt   sqlitet   connectt   Rowt   row_factoryt   executet   initializeTablest   getMetadatat   insertMetadatat   floatt   IOErrort   printRDSInfo(	   t   selft   datafilet
   initializet   datasetTypet   verboset   cachet   reportCountt   dbFilet   metadata(    (    s=   /woldlab/castor/data00/home/georgi/erange-4.0a/ReadDataset.pyt   __init__   sF    									 c         C   sN   |  j    } | |  j   7} |  j d k r> | |  j   7} n  t |  } | S(   s;    return the number of usable reads in the dataset.
        R
   (   t   getUniqsCountt   getMultiCountR   t   getSplicesCountt   int(   R!   t   total(    (    s=   /woldlab/castor/data00/home/georgi/erange-4.0a/ReadDataset.pyt   __len__J   s    c         C   s    |  j  d k r |  j   n  d S(   s2    cleanup copy in local cache, if present.
        R	   N(   R   t	   uncacheDB(   R!   (    (    s=   /woldlab/castor/data00/home/georgi/erange-4.0a/ReadDataset.pyt   __del__X   s    c         C   s   | r d | GHn	 d | GH|  j    } d GH| j   } | j   x# | D] } d | d | | GHqI W| r | r |  j   n  d |  j   GH|  j   r d GHn d GHd  S(   Ns   INITIALIZED dataset %ss
   dataset %ss	   metadata:s   	s   default cache size is %d pagess   found indexs   not indexed(   R   t   keyst   sortt   printReadCountst   getDefaultCacheSizet   hasIndex(   R!   R"   R'   R#   R)   t	   pnameListt   pname(    (    s=   /woldlab/castor/data00/home/georgi/erange-4.0a/ReadDataset.pyR    _   s    	
c         C   sm   |  j    } |  j   } |  j d k r9 d | | f GHn0 |  j d k ri |  j   } d | | | f GHn  d  S(   NR   s"   
%d unique reads and %d multireadsR
   s4   
%d unique reads, %d spliced reads and %d multireads(   R+   R,   R   R-   (   R!   t   ucountt   mcountt   scount(    (    s=   /woldlab/castor/data00/home/georgi/erange-4.0a/ReadDataset.pyR5   v   s    c         C   sT   t    } t | d d d d } | t _ d t j   |  _ t j | |  j  d S(   s+    copy geneinfoDB to a local cache.
        t   generalt   cistematic_tempt   defaults   /tmps   %s.dbN(   R   R   t   tempfilet   tempdirt   mktempR   t   shutilt   copyfile(   R!   t   filenamet   configParsert   cisTemp(    (    s=   /woldlab/castor/data00/home/georgi/erange-4.0a/ReadDataset.pyR      s
    		c         C   s   t  j |  j |  d S(   s+    copy geneinfoDB to a local cache.
        N(   RC   RD   R   (   R!   RE   (    (    s=   /woldlab/castor/data00/home/georgi/erange-4.0a/ReadDataset.pyt   saveCacheDB   s    c         C   sI   |  j  d k rE y t j |  j   Wn d |  j  GHn Xd |  _ n  d S(   s-    delete geneinfoDB from local cache.
        R	   s   could not delete %sN(   R   t   ost   removet   cachedDB(   R!   (    (    s=   /woldlab/castor/data00/home/georgi/erange-4.0a/ReadDataset.pyR1      s    c         C   s!   d | | f } |  j  |  d S(   s:    attach another database file to the readDataset.
        s   attach '%s' as %sN(   R   (   R!   RE   t   dbNamet   stmt(    (    s=   /woldlab/castor/data00/home/georgi/erange-4.0a/ReadDataset.pyt   attachDB   s    c         C   s   d | } |  j  |  d S(   s4    detach a database file to the readDataset.
        s	   detach %sN(   R   (   R!   RL   RM   (    (    s=   /woldlab/castor/data00/home/georgi/erange-4.0a/ReadDataset.pyt   detachDB   s    
t   *R	   c         C   sG   d | | | | | f } | d k r6 | d | 7} n  |  j  |  d S(   s    import into current RDS the table (with columns destcolumns,
            with default all columns) from the database file asname,
            using the column specification of ascolumns (default all).
        s&   insert into %s %s select %s from %s.%sR	   s    where flag = '%s' N(   t   executeCommit(   R!   RL   t   tablet	   ascolumnst   destcolumnst   flaggedRM   (    (    s=   /woldlab/castor/data00/home/georgi/erange-4.0a/ReadDataset.pyt   importFromDB   s    c         C   st   g  } |  j    } | d k r+ d | } n  d | } | j |  | j   } x | D] } | j | d  qU W| S(   sB    get a list of table names in a particular database file.
        R	   s   %s.s3   select name from %ssqlite_master where type='table't   name(   t   getSqlCursorR   t   fetchallt   append(   R!   RL   t
   resultListt   sqlRM   t   resultst   row(    (    s=   /woldlab/castor/data00/home/georgi/erange-4.0a/ReadDataset.pyt	   getTables   s    
c         C   s(   |  j  r |  j   } n |  j   } | S(   N(   R   t   getMemCursort   getFileCursor(   R!   R\   (    (    s=   /woldlab/castor/data00/home/georgi/erange-4.0a/ReadDataset.pyRX      s    	c         C   s   |  j  j   S(   s]    returns a cursor to memory database for low-level (SQL)
        access to the data.
        (   R   t   cursor(   R!   (    (    s=   /woldlab/castor/data00/home/georgi/erange-4.0a/ReadDataset.pyR`      s    c         C   s   |  j  j   S(   s[    returns a cursor to file database for low-level (SQL)
        access to the data.
        (   R   Rb   (   R!   (    (    s=   /woldlab/castor/data00/home/georgi/erange-4.0a/ReadDataset.pyRa      s    c         C   s3   d } t  |  j | d t d d  } | d k S(   s=    return True if the RDS file has at least one index.
        s5   select count(*) from sqlite_master where type='index't   returnResultsi    (   R.   R   t   True(   R!   RM   t   count(    (    s=   /woldlab/castor/data00/home/georgi/erange-4.0a/ReadDataset.pyR7      s    #i c         C   s   | j  d |  | j  d  | j  d |  j  d } d | } | j  d |  | j  d |  |  j d k r d	 } d | } | j  d
 |  d	 } d | } | j  d |  n  | j   d S(   s}    creates table schema in a database connection, which is
        typically a database file or an in-memory database.
        s   PRAGMA DEFAULT_CACHE_SIZE = %ds3   create table metadata (name varchar, value varchar)s,   insert into metadata values('dataType','%s')s   start int, stop intsw   (ID INTEGER PRIMARY KEY, readID varchar, chrom varchar, %s, sense varchar, weight real, flag varchar, mismatch varchar)s   create table uniqs %ss   create table multi %sR
   s,   startL int, stopL int, startR int, stopR ints   create table splices %ss   create table multisplices %sN(   R   R   t   commit(   R!   t   dbConnectionR&   t   positionSchemat   tableSchema(    (    s=   /woldlab/castor/data00/home/georgi/erange-4.0a/ReadDataset.pyR      s    


c         C   s   d } i  } | d k r% d | } n  |  j    } | j d |  | j   } x | D] } | d } | d } | | k r | | | <qU t }	 d }
 xQ |	 r t j | t |
  g d  } | | k r | | | <t }	 n  |
 d 7}
 q WqU W| S(	   s+    returns a dictionary of metadata.
        R	   s    where name='%s's#   select name, value from metadata %sRW   t   valuei   t   :i   (   RX   R   RY   Rd   t   stringt   joint   strR   (   R!   t	   valueNamet   whereClauset   resultsDictR\   R]   R^   t   parameterNamet   parameterValuet   tryingt   indext   newName(    (    s=   /woldlab/castor/data00/home/georgi/erange-4.0a/ReadDataset.pyR     s*    

	
	c         C   s   |  j    } d | k r' t d   nT | d } d | k rP | j   d } n  t |  } | d k  rw t d   n  | Sd S(   s2    returns readsize if defined in metadata.
        t   readsizes   no readsize parameter definedt   importi    s   readsize is negativeN(   R   R   t   splitR.   (   R!   R)   t   readSize(    (    s=   /woldlab/castor/data00/home/georgi/erange-4.0a/ReadDataset.pyt   getReadSize"  s    
c         C   s!   t  |  j d d t d d  S(   s)    returns the default cache size.
        s   PRAGMA DEFAULT_CACHE_SIZERc   i    (   R.   R   Rd   (   R!   (    (    s=   /woldlab/castor/data00/home/georgi/erange-4.0a/ReadDataset.pyR6   4  s    t   uniqsc         C   s   d | } |  j    } | j |  g  } x} | D]u } | rc | d | k r | j | d  q q0 | d d } t | j    d k r0 | | k r0 | j |  q0 q0 W| j   | S(   sA    returns a sorted list of distinct chromosomes in table.
        s   select distinct chrom from %st   chromi   i    (   RX   R   RZ   t   lent   stripR4   (   R!   RR   t	   fullChromt	   statementR\   R]   R^   t	   shortName(    (    s=   /woldlab/castor/data00/home/georgi/erange-4.0a/ReadDataset.pyt   getChromosomes:  s    
$
c         C   s   d } | r! |  j  | d  } n  | rQ |  j  | d d d } t | |  } n  | r{ |  j  | d  } t | |  } n  | S(   sI    returns the maximum coordinate for reads on a given chromosome.
        i    R|   t   splicest
   startFieldt   startRt   multi(   t   getMaxStartCoordinateInTablet   max(   R!   R}   t   doUniqst   doMultit	   doSplicest   maxCoordt	   spliceMaxt   multiMax(    (    s=   /woldlab/castor/data00/home/georgi/erange-4.0a/ReadDataset.pyt   getMaxCoordinateP  s    t   startc         C   sg   d } d | | | f } |  j    } y+ | j |  t | j   d d  } Wn d | GHn X| S(   Ni    s)   select max(%s) from %s where chrom = '%s's,   couldn't retrieve coordMax for chromosome %s(   RX   R   R.   RY   (   R!   R}   RR   R   R   t   sqlStatementR\   (    (    s=   /woldlab/castor/data00/home/georgi/erange-4.0a/ReadDataset.pyR   d  s    ic   )   	   C   sx  |  j  | | | | | | | |  } | r3 d } n |  j d | | | | |  } |  j | | |  } | r | d | | g } | r | j d  | j |  | j d  | j |  | j |  q n | d | g } | r| r d } n  | rD| d | g } | rS| j d  | j |  | j d  | j |  qSn | d | g } t j |  } | rqd } n  | d | d	 | d
 | d g } n  | r|  j rd |  j _	 n d |  j
 _	 | j d  n# | r| j d  n | j d  |  j   } t j |  } | j |  i  }  | rg  | D]9 }! i t |! d  d 6|! d d 6t |! d  d 6^ q*|  | <|  j rt j |  j _	 qtt j |  j
 _	 nd }" d }# d }$ x| D]}! |! d }% | r|! d } n |! d d } | r| |" k rg  |  | <| }" | }& n | r|% }' d |% k r>|% j d  d }' n  d |' k rh| rh|% j d  \ }' }$ n  |' |# k rg  |  |' <|' }# |' }& qn  i t |! d  d 6}( | rt |! d  |( d <n  | s|! d |( d <n  | rt |! d  |( d <n  | r|! d |( d <n  | r&|! d |( d <n  |	 r9|% |( d <n  |
 rL| |( d <n  | r_|$ |( d <n  |  |& j |(  qW|  S(    s    returns a dictionary of reads in a variety of formats
        and which can be restricted by chromosome or custom-flag.
        Returns unique reads by default, but can return multireads
        with doMulti set to True.

        s    select start, sense, sum(weight)s   select ID, chrom, start, readIDs
   from uniqss	   UNION ALLs
   from multis"   select start, sense, weight, chroms	   union alls   from (sL   ) group by chrom,start having ( count(start) > 1 and count(chrom) > 1) unions   from(sG   ) group by chrom, start having ( count(start) = 1 and count(chrom) = 1)s   order by starts   order by readID, starts   order by chrom, starti    R   i   t   sensei   t   weightR	   t   readIDR}   i   s   ::t   /t   stopt   flagt   mismatcht   pairIDN(   t   getReadWhereQueryt   getReadSelectQueryt   getReadGroupQueryRZ   Rl   Rm   R   t   NoneR   R   R   RX   R   R.   R   R   R   Ry   ()   R!   t   bothEndst   noSenseR   R}   R   t
   withWeightt   withFlagt   withMismatcht   withIDt	   withChromt
   withPairIDR   R   t   findallOptimizet
   readIDDictt   readLikeR   R   t   limitt   hasMismatcht   flagLiket   strandt	   combine5pt
   whereQueryt   selectQueryt
   groupQueryRM   t	   subSelectt   sqlStmtR\   t   sqlQueryRq   R^   t   currentChromt   currentReadIDR   R   t   dictKeyt	   theReadIDt   newrow(    (    s=   /woldlab/castor/data00/home/georgi/erange-4.0a/ReadDataset.pyt   getReadsDictq  s    $				J	

	
c
         C   s  |	 r d }
 d } n d }
 d } g  } | d k rV | |  j  k rV | j d |  n  | d k r | r t j d | d g d  } | j |  q | j d	 |  n  | d
 k r | j d |
 | f  n  | d
 k r | j d | | f  n  t |  d k r0t j d | d g d  } | j |  n  | rF| j d  n  | d k rf| j d |  n  t |  d k rt j | d  } d | } n d } | S(   Nt   startLt   stopRR   R   R	   s   chrom = '%s's   flag LIKE "%s   %"s   flag = '%s'is   %s > %ds   %s < %di    s   readID LIKE  's   %'s   mismatch != ''t   +t   -s   sense = '%s's    and s   where %s(   R   R   (   R   RZ   Rl   Rm   R~   (   R!   R}   R   R   R   R   R   R   R   t   splicet	   startTextt   stopTextRp   t   flagLikeClauset   readIDClauset   whereStatementR   (    (    s=   /woldlab/castor/data00/home/georgi/erange-4.0a/ReadDataset.pyR     s:    	c   	      C   s   | g } | r | j  d  n  | s5 | j  d  n  | rK | j  d  n  | ra | j  d  n  | rw | j  d  n  t j | d  } | S(   NR   R   R   R   R   t   ,(   RZ   Rl   Rm   (	   R!   t
   baseSelectR   R   R   R   R   t   selectClauseR   (    (    s=   /woldlab/castor/data00/home/georgi/erange-4.0a/ReadDataset.pyR   "  s    	c         C   sR   g  } | r d g } n  | d k r? | r? | j  d |  n  t j |  } | S(   Ns   GROUP BY start, sensei    s   LIMIT %d(   RZ   Rl   Rm   (   R!   R   R   R   t   groupByR   (    (    s=   /woldlab/castor/data00/home/georgi/erange-4.0a/ReadDataset.pyR   9  s    c   "   
   C   s  |  j  | | | | | | | d t } d } |  j | | | | |  } |  j   } d | | f } | j |  d } d } i  } x| D]} d } | d } | r | d } n | d d } | r | | k r g  | | <| } | } nX | rEd	 | k r| j d	  \ } } n | } | | k rEg  | | <| } | } qEn  i t | d
  d
 6} t | d  | d <t | d  | d <t | d  | d <| s| d | d <n  | rt | d  | d <n  | r| d | d <n  | r| d | d <n  | r| | d <n  |	 r | | d <n  |
 r3| | d <n  | r| j   }  |  d =|  d =| }! |! d
 =|! d =| | j	 |   | | j	 |!  q | | j	 |  q W| S(   s    returns a dictionary of spliced reads in a variety of
        formats and which can be restricted by chromosome or custom-flag.
        Returns unique spliced reads for now.
        R   s6   select ID, chrom, startL, stopL, startR, stopR, readIDs)   %s from splices %s order by chrom, startLR	   i    R   R}   i   R   R   t   stopLR   R   R   R   R   R   R   (
   R   Rd   R   RX   R   Ry   R.   R   t   copyRZ   ("   R!   R   R   R}   R   R   R   R   R   R   R   R   t	   splitReadR   R   R   R   R   R   R   R   R\   RM   R   R   Rq   R^   R   R   R   R   R   t   leftDictt	   rightDict(    (    s=   /woldlab/castor/data00/home/georgi/erange-4.0a/ReadDataset.pyt   getSplicesDictF  sp    	'

	
t   bothc	         C   s  d }	 d }
 d } d } | d k r1 d | } n  | rl y" t  |  j | | | |   }	 Wql d }	 ql Xn  | r y" t  |  j | | | |   }
 Wq d }
 q Xn  | r y" t  |  j | | | |   } Wq d } q Xn  | r |	 |
 | } | S|	 |
 | f Sd S(   s0    return read counts for a given region.
        i    R	   R   R   s    sense ='%s' N(   R   R   (   R   R+   R,   R-   (   R!   R}   t   rmint   rmaxR|   R   R   t   reportCombinedR   R:   R;   R<   t   restrictR/   (    (    s=   /woldlab/castor/data00/home/georgi/erange-4.0a/ReadDataset.pyt	   getCounts  s2    """c         C   s1   |  j  | | | d t d t d t d t d d S(   s0    return read counts for a given region.
        R|   R   R   R   R   R   (   R   Rd   (   R!   R}   R   R   (    (    s=   /woldlab/castor/data00/home/georgi/erange-4.0a/ReadDataset.pyt   getTotalCounts  s    c         C   s\  g  } d }	 | d k r7 | |  j  k r7 d | g } n  | d k rc | j d | t |  f  n  | d k r | j d | t |  f  n  | d k r | j |  n  t |  d k r t j | d  }
 d |
 } n d } |  j   } | r| j d | | | f  n | j d	 | | f  | j   } y t	 | d  }	 Wn d }	 n X|	 S(
   s;    returns the number of row in the specified table.
        i    R	   s
   chrom='%s's   %s >= %ss   %s <= %ss    and s   where %ss0   select count(distinct chrom+%s+sense) from %s %ss   select sum(weight) from %s %s(
   R   RZ   Rn   R~   Rl   Rm   RX   R   t   fetchoneR.   (   R!   RR   R}   R   R   R   t   distinctR   Rp   Re   R   R   R\   t   result(    (    s=   /woldlab/castor/data00/home/georgi/erange-4.0a/ReadDataset.pyt   getTableEntryCount  s0      
c      	   C   s"   |  j  d | | | | | d d S(   s9    returns the number of row in the splices table.
        R   R   R   (   R   (   R!   R}   R   R   R   R   (    (    s=   /woldlab/castor/data00/home/georgi/erange-4.0a/ReadDataset.pyR-     s    c         C   s   |  j  d | | | | |  S(   sD    returns the number of distinct readIDs in the uniqs table.
        R|   (   R   (   R!   R}   R   R   R   R   (    (    s=   /woldlab/castor/data00/home/georgi/erange-4.0a/ReadDataset.pyR+     s    c         C   s   |  j  d | | | | |  S(   sA    returns the total weight of readIDs in the multi table.
        R   (   R   (   R!   R}   R   R   R   R   (    (    s=   /woldlab/castor/data00/home/georgi/erange-4.0a/ReadDataset.pyR,     s    c         C   s  g  } | r | j  d  n  | r2 | j  d  n  | rH | j  d  n  t |  d k ro t j | d  } n d } d } | d k r d | } n  d | | f }	 |  j   }
 |
 j |	  |
 j   } | r g  | D] } | d j d	  d ^ q Sg  | D] } | d ^ q Sd
 S(   s    get readID's.
        s   select readID from uniqss   select readID from multis   select readID from splicesi    s    union R	   s   LIMIT %ds   %s group by readID %sR   N(   RZ   R~   Rl   Rm   RX   R   RY   Ry   (   R!   R|   R   R   t   pairedR   RM   t
   selectPartt	   limitPartR   R\   R   t   x(    (    s=   /woldlab/castor/data00/home/georgi/erange-4.0a/ReadDataset.pyt
   getReadIDs  s(    (c         C   s  |  j    } | r | g } n |  j   } | j   i  } x| D]}} | rY d | GHn  g  | | <| rP|  j d k rP|  j d t d | d t d t d t  } | j   }	 x|	 D]}
 | |
 d } | d	 } | d
 } | d } | d } | d } | j d  } x<| D]4} d | k r)qn  t |  } | d k rs| d } | | d } t	 | d | d ! } nX | d k rt
 | d g  } t
 | | d g  } | t	 | d | d ! d } n  t	 |  t	 |  d } d } t	 |  t	 |  k r| | d } n | | } | | } | | j | | | | g  qWq Wn  |  j d t d | d t d t  } | | j   k rqA n  x/| | D]#} | d } | d } | d } | j d  } x | D] } d | k rqn  t |  } | d k r3| d } | | d } t	 | d | d ! } nX | d k rt
 | d g  } t
 | | d g  } | t	 | d | d ! d } n  | | d } | | j | | | | g  qWqWqA W| S(   sB    returns the uniq and spliced mismatches in a dictionary.
        s%   getting mismatches from chromosome %sR
   R   R}   R   R   R   i    R   R   R   R   R   R   t   NR   i   R   R   (   R{   R   R4   R   R   Rd   R3   Ry   R~   R.   R   RZ   R   (   R!   t   mischromR%   t
   useSplicest   readlent   hitChromListt   snpDictt   achromt
   spliceDictt   spliceIDListt   spliceIDt   spliceEntryt   startpost   lefthalft
   rightstartR   t
   mismatchest   spMismatchListR   t
   change_lent   change_fromt   change_baset
   change_post	   firsthalft
   secondhalft	   change_att   hitDictt	   readEntryR   t   mismatchList(    (    s=   /woldlab/castor/data00/home/georgi/erange-4.0a/ReadDataset.pyt   getMismatches  s~    

*





"

($



")g      ?i    c
         C   s  |  j    }
 y t |
 d  } Wn t k
 r9 d } n X|
 d } d | } i  } t |	  | d <d t |	  | d <| d k r |  j | d | d	 | | } n | | | | d } t d
 d g |  } |  j d t d | d t d | d | d | d t  } | d k  rd } n  x | | D] } | d } | d } | d } | | | | } x t | | |  D]| } yi | s| d k r| d k r| | c | | 7<n/ | d k r| d k r| | c | | 8<n  WqrqrqrXqrWq(W~ | r| d k r| d k rD|  j d t d | d t d | d |  } n |  j d t d | d t  } | | k rxr| | D]c} | d } | d } | d } | d } | d } | | | k  ryx t t	 | |   D]q } | | | } | s| d k r| d k r| | c | 7<q| d k r| d k r| | c | 8<qqWx t t	 | |   D]q } | | | } | s| d k r| d k r| | c | 7<qd| d k rd| d k rd| | c | 8<qdqdWqyqyWn  ~ n  | S(   s   return a profile of the chromosome as an array of per-base read coverage....
            keepStrand = 'both', 'plusOnly', or 'minusOnly'.
            Will also shift position of unique and multireads (but not splices) if shift is a natural number
        Rw   i    R   g      ?R   iR   R   R   t   fg        R   R}   R   R   R   R   R   R   t	   minusOnlyt   plusOnlyR
   R   R   R   R   R   (
   R   R.   t   KeyErrorR   R    R   Rd   t   rangeR   t   abs(   R!   t
   chromosomet   cstartt   cstopt   useMultiR   t   normalizationFactort   trackStrandt
   keepStrandt
   shiftValueR)   R   R   t   scalet   shiftt   lastNTt
   chromModelR   R   t   hstartR   R   t
   currentposR   R   t   Lstartt   Lstopt   Rstartt   Rstopt   rsenseRu   (    (    s=   /woldlab/castor/data00/home/georgi/erange-4.0a/ReadDataset.pyt   getChromProfileo  sr    


"6	


-




!c         C   s$   |  j  j d |  |  j  j   d S(   sL    inserts a list of (pname, pvalue) into the metadata
        table.
        s.   insert into metadata(name, value) values (?,?)N(   R   t   executemanyRf   (   R!   t
   valuesList(    (    s=   /woldlab/castor/data00/home/georgi/erange-4.0a/ReadDataset.pyR     s    c         C   sZ   d t  |  | f } | d k r9 | d t  |  7} n  |  j j |  |  j j   d S(   sM    update a metadata field given the original value and the new value.
        s.   update metadata set value='%s' where name='%s'R	   s    and value='%s' N(   Rn   R   R   Rf   (   R!   R9   t   newValuet   originalValueRM   (    (    s=   /woldlab/castor/data00/home/georgi/erange-4.0a/ReadDataset.pyt   updateMetadata  s
    c         C   s$   |  j  j d |  |  j  j   d S(   su    inserts a list of (readID, chrom, start, stop, sense, weight, flag, mismatch)
        into the uniqs table.
        sn   insert into uniqs(ID, readID, chrom, start, stop, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?)N(   R   R  Rf   (   R!   R  (    (    s=   /woldlab/castor/data00/home/georgi/erange-4.0a/ReadDataset.pyt   insertUniqs  s    c         C   s$   |  j  j d |  |  j  j   d S(   su    inserts a list of (readID, chrom, start, stop, sense, weight, flag, mismatch)
        into the multi table.
        sn   insert into multi(ID, readID, chrom, start, stop, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?)N(   R   R  Rf   (   R!   R  (    (    s=   /woldlab/castor/data00/home/georgi/erange-4.0a/ReadDataset.pyt   insertMulti  s    c         C   s$   |  j  j d |  |  j  j   d S(   s    inserts a list of (readID, chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch)
        into the splices table.
        s   insert into splices(ID, readID, chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?,?,?)N(   R   R  Rf   (   R!   R  (    (    s=   /woldlab/castor/data00/home/georgi/erange-4.0a/ReadDataset.pyt   insertSplices  s    c         C   s$   |  j  j d |  |  j  j   d S(   s    inserts a list of (readID, chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch)
        into the multisplices table.
        s   insert into multisplices(ID, readID, chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?,?,?)N(   R   R  Rf   (   R!   R  (    (    s=   /woldlab/castor/data00/home/georgi/erange-4.0a/ReadDataset.pyt   insertMultisplices  s    c         C   s   d } | d k r d } n  | r; |  j  j d | |  n  | r[ |  j  j d | |  n  |  j d k r | r |  j  j d | |  |  j  j d | |  n  |  j  j   d	 S(
   s6   update reads on file database in a list region of regions for a chromosome to have a new flag.
            regionsList must have 4 fields per region of the form (flag, chrom, start, stop) or, with
            sense set to '+' or '-', 5 fields per region of the form (flag, chrom, start, stop, sense).
        R	   R   s    and sense = ? sG   UPDATE uniqs SET flag = ? where chrom = ? and start >= ? and start < ? sG   UPDATE multi SET flag = ? where chrom = ? and start >= ? and start < ? R
   s\   UPDATE splices SET flag = flag || ' L:' || ? where chrom = ? and startL >= ? and startL < ? s\   UPDATE splices SET flag = flag || ' R:' || ? where chrom = ? and startR >= ? and startR < ? N(   R   R  R   Rf   (   R!   t   regionsListR|   R   R   R   R   (    (    s=   /woldlab/castor/data00/home/georgi/erange-4.0a/ReadDataset.pyt	   flagReads  s    	c         C   sw   | r |  j  j d |  n  | r: |  j  j d |  n  |  j d k rf | rf |  j  j d |  n  |  j  j   d S(   s4    set the flag fields in the entire dataset.
        s   UPDATE uniqs SET flag = '%s's   UPDATE multi SET flag = '%s'R
   s   UPDATE splices SET flag = '%s'N(   R   R   R   Rf   (   R!   R   R|   R   R   (    (    s=   /woldlab/castor/data00/home/georgi/erange-4.0a/ReadDataset.pyt   setFlags  s    c         C   s   |  j  d | | |  d S(   sn    reset the flag fields in the entire dataset to clear. Useful for rerunning an analysis from scratch.
        R	   N(   R  (   R!   R|   R   R   (    (    s=   /woldlab/castor/data00/home/georgi/erange-4.0a/ReadDataset.pyt
   resetFlags  s    c         C   s   |  j  j d |  d  S(   NsI   UPDATE multi SET weight = ? where chrom = ? and start = ? and readID = ? (   R   R  (   R!   t   readList(    (    s=   /woldlab/castor/data00/home/georgi/erange-4.0a/ReadDataset.pyt   reweighMultireads  s    t   ONc         C   s/   y |  j  j d |  Wn d | GHn Xd  S(   Ns   PRAGMA SYNCHRONOUS = %ss-   warning: couldn't set PRAGMA SYNCHRONOUS = %s(   R   R   (   R!   Rj   (    (    s=   /woldlab/castor/data00/home/georgi/erange-4.0a/ReadDataset.pyt   setSynchronousPragma   s    c         C   s5   |  j  j d |  | r1 |  j  j d |  n  d  S(   Ns   PRAGMA CACHE_SIZE = %ds   PRAGMA DEFAULT_CACHE_SIZE = %d(   R   R   (   R!   R&   R?   (    (    s=   /woldlab/castor/data00/home/georgi/erange-4.0a/ReadDataset.pyt
   setDBcache'  s    c         C   s3   |  j    } | j |  | r/ | j   } | Sd  S(   N(   RX   R   RY   (   R!   R   Rc   R\   R   (    (    s=   /woldlab/castor/data00/home/georgi/erange-4.0a/ReadDataset.pyR   -  s
    c         C   s7   |  j  |  |  j r& |  j j   n |  j j   d  S(   N(   R   R   R   Rf   R   (   R!   R   (    (    s=   /woldlab/castor/data00/home/georgi/erange-4.0a/ReadDataset.pyRQ   6  s    	c         C   s   | |  j    k r" |  j |  n  |  j d  |  j j d  d GH|  j j d  d GH|  j j d  d GH|  j j d  d	 GH|  j d
 k r |  j j d  d GH|  j j d  d GH|  j j d  d GHn  |  j j   |  j d  d S(   s    Builds the file indeces for the main tables.
            Cache is the number of 1.5 kb pages to keep in memory.
            100000 pages translates into 150MB of RAM, which is our default.
        t   OFFs-   CREATE INDEX uPosIndex on uniqs(chrom, start)s   built uPosIndexs(   CREATE INDEX uChromIndex on uniqs(chrom)s   built uChromIndexs-   CREATE INDEX mPosIndex on multi(chrom, start)s   built mPosIndexs(   CREATE INDEX mChromIndex on multi(chrom)s   built mChromIndexR
   s0   CREATE INDEX sPosIndex on splices(chrom, startL)s   built sPosIndexs1   CREATE INDEX sPosIndex2 on splices(chrom, startR)s   built sPosIndex2s*   CREATE INDEX sChromIndex on splices(chrom)s   built sChromIndexR#  N(   R6   R%  R$  R   R   R   Rf   (   R!   R&   (    (    s=   /woldlab/castor/data00/home/georgi/erange-4.0a/ReadDataset.pyt
   buildIndex?  s(    c         C   s   y |  j  d  |  j j d  |  j j d  |  j j d  |  j j d  |  j d k r |  j j d  y |  j j d  Wn n X|  j j d	  n  |  j j   Wn d
 GHn X|  j  d  d S(   s5    drops the file indices for the main tables.
        R&  s   DROP INDEX uPosIndexs   DROP INDEX uChromIndexs   DROP INDEX mPosIndexs   DROP INDEX mChromIndexR
   s   DROP INDEX sPosIndexs   DROP INDEX sPosIndex2s   DROP INDEX sChromIndexs   problem dropping indexR#  N(   R$  R   R   R   Rf   (   R!   (    (    s=   /woldlab/castor/data00/home/georgi/erange-4.0a/ReadDataset.pyt	   dropIndex\  s"    	c         C   s  d |  _  t j d  |  _  |  j |  j   |  j j   } d } | d k rk d | GHd | } | |  _ n	 d |  _ |  j  j d  |  j  j d  |  j  j d  | j d  } g  } x) | D]! } | j | d	 | d
 f  q W|  j  j	 d |  |  j
 d |  |  j
 d |  |  j d k r7|  j |  n  | r| d k r|  j  j d  |  j  j d  |  j d k r|  j  j d  |  j  j d  qq|  j  j d  |  j  j d  |  j d k r|  j  j d  |  j  j d  qn  t |  _ t j |  j  _ |  j  j   d S(   s    makes a copy of the dataset into memory for faster access.
        Can be restricted to a "full" chromosome. Can also build the
        memory indices.
        R	   s   :memory:s
   memSync %ss    where chrom = '%s' s   PRAGMA temp_store = MEMORYs   PRAGMA CACHE_SIZE = 1000000s   delete from metadatas    select name, value from metadataRW   Rj   s.   insert into metadata(name, value) values (?,?)R|   R   R
   s&   CREATE INDEX uPosIndex on uniqs(start)s&   CREATE INDEX mPosIndex on multi(start)s*   CREATE INDEX sPosLIndex on splices(startL)s*   CREATE INDEX sPosRIndex on splices(startR)s-   CREATE INDEX uPosIndex on uniqs(chrom, start)s-   CREATE INDEX mPosIndex on multi(chrom, start)s1   CREATE INDEX sPosLIndex on splices(chrom, startL)s1   CREATE INDEX sPosRIndex on splices(chrom, startR)N(   R   R   R   R   R   Rb   R   R   RZ   R  t   copyDBEntriesToMemoryR   t   copySpliceDBEntriesToMemoryRd   R   R   R   Rf   (   R!   R}   Ru   Rb   t   whereclauseR]   t   results2R^   (    (    s=   /woldlab/castor/data00/home/georgi/erange-4.0a/ReadDataset.pyt   memSyncv  sJ    		
		c         C   s   |  j  j   } | j d | | f  } g  } x_ | D]W } | j | d | d t | d  t | d  | d | d | d | d	 f  q5 W|  j j d
 | |  d  S(   NsK   select chrom, start, stop, sense, weight, flag, mismatch, readID from %s %sR   R}   R   R   R   R   R   R   sk   insert into %s(ID, readID, chrom, start, stop, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?)(   R   Rb   R   RZ   R.   R   R  (   R!   RL   Rp   Rb   t   sourceEntriest   destinationEntriesR^   (    (    s=   /woldlab/castor/data00/home/georgi/erange-4.0a/ReadDataset.pyR)    s    Uc         C   s   |  j  j   } | j d |  } g  } xy | D]q } | j | d | d t | d  t | d  t | d  t | d  | d | d	 | d
 | d f
  q/ W|  j j d |  d  S(   Nsa   select chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch, readID from splices %sR   R}   R   R   R   R   R   R   R   R   s   insert into splices(ID, readID, chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?,?,?)(   R   Rb   R   RZ   R.   R   R  (   R!   Rp   Rb   R.  R/  R^   (    (    s=   /woldlab/castor/data00/home/georgi/erange-4.0a/ReadDataset.pyR*    s    O N(>   R   R   t   __doc__R   Rd   R*   R0   R2   R    R5   R   RH   R1   RN   RO   RV   R_   RX   R`   Ra   R7   R   R   R{   R6   R   R   R   R   R   R   R   R   R   R   R   R-   R+   R,   R   R   R   R  R   R  R  R  R  R  R  R  R   R"  R$  R%  R   RQ   R'  R(  R-  R)  R*  (    (    (    s=   /woldlab/castor/data00/home/georgi/erange-4.0a/ReadDataset.pyR      s   	1				
	
										!		,		N#)!P		M										4
(    (   t   sqlite3R   Rl   R@   RC   RI   R    t
   commoncodeR   R   R   R   t	   ExceptionR   R   (    (    (    s=   /woldlab/castor/data00/home/georgi/erange-4.0a/ReadDataset.pyt   <module>   s   