
Nc           @   s-  d  d l  Z d  d l Z d  d l Z d  d l Z d  d l Z d  d l Z d  d l Z d  d l Z d  d l	 m	 Z	 d  d l
 m Z m Z m Z d Z d e f d     YZ d f  d     YZ d	 e f d
     YZ d f  d     YZ d   Z d   Z d   Z d   Z d   Z d   Z d d d  Z d S(   iN(   t   array(   t   getReverseComplementt   getConfigParsert   getConfigOptions   3.0t   ReadDatasetErrorc           B   s   e  Z RS(    (   t   __name__t
   __module__(    (    (    sF   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/ReadDataset.pyR      s   t   ReadCounterc           B   sD   e  Z d  Z d Z d  Z i  d d  Z d   Z d   Z d   Z RS(   i    g        t   bothc         C   s   | |  _  | |  _ d  S(   N(   t   multiReadIDCountt   sense(   t   selfR	   R
   (    (    sF   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/ReadDataset.pyt   __init__   s    	c         C   sk   |  j  |  } | d k r7 |  j t d |  7_ n0 t | j  rX |  j d 7_ n |  j d 7_ d  S(   Ni   g      ?(   t   getReadMultiplicityt   multiReadCountt   floatt   isSpliceEntryt   cigart   spliceReadCountt   uniqReadCount(   R   t   readt   multiplicity(    (    sF   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/ReadDataset.pyt   __call__   s    c         C   sN   t  |  } d | j | f } y |  j | } Wn t k
 rI d } n X| S(   Ns   %s%si   (   t   getPairedReadNumberSuffixt   qnameR	   t   KeyError(   R   R   t   pairReadSuffixt   readIDR   (    (    sF   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/ReadDataset.pyR   '   s    
c         C   s/   t  |  } d | j | f } |  j j |  S(   Ns   %s%s(   R   R   R	   t   has_key(   R   R   R   R   (    (    sF   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/ReadDataset.pyt   isMulti2   s    (	   R   R   R   R   R   R   R   R   R   (    (    (    sF   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/ReadDataset.pyR      s   	
	t   MaxCoordFinderc           B   s#   e  Z d  Z d  Z d  Z d   Z RS(   i    c         C   s|   |  j  |  r* t |  j | j  |  _ nN t | j  r` t |  j t | j | j   |  _ n t |  j | j  |  _ d  S(   N(	   R   t   maxt   maxMultiStartt   posR   R   t   getSpliceRightStartt   maxSpliceStartt   maxUniqueStart(   R   R   (    (    sF   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/ReadDataset.pyR   >   s
    '(   R   R   R$   R    R#   R   (    (    (    sF   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/ReadDataset.pyR   9   s   t   ReadDatasetc           B   s  e  Z d  Z e d e e e d  Z d   Z d   Z d   Z d   Z	 d   Z
 d   Z d	   Z d
   Z d   Z d   Z d   Z d d d d  Z d d  Z d   Z d   Z d   Z d d  Z d d  Z d   Z d   Z d   Z e d  Z d   Z d   Z e e e d  Z d d   Z e e e d d e e e e e e e e e e d dM dM d! e e d e d"  Z! e d d e d d#  Z" e d$ d%  Z# d e d&  Z$ e d'  Z% d(   Z& e e d d e e e e e e e e e e dM dM d d)  Z' d*   Z( dM dM dM e e e e d+ d,  Z) d dM dM d+ d-  Z* d d d d.  Z+ d d d d e d d/  Z, dM dM dM d+ e d0  Z- dM dM dM d+ e d1  Z. d d d d e d2  Z/ e e e e d! d3  Z0 dM e e d4  Z1 d! d! e e d5 e d+ d6 d7  Z2 d8   Z3 d d9  Z4 d:   Z5 d;   Z6 d<   Z7 d=   Z8 e e e d+ d>  Z9 e e e d?  Z: d@   Z; e e e dA  Z< dB   Z= dC dD  Z> e dE  Z? e dF  Z@ dG   ZA d dH  ZB dI   ZC d e dJ  ZD d dK  ZE d dL  ZF RS(N   sC   Class for storing reads from experiments. Assumes that custom scripts
    will translate incoming data into a format that can be inserted into the
    class using the insert* methods. Default class subtype ('DNA') includes
    tables for unique and multireads, whereas 'RNA' subtype also includes a
    splices table.
    t   DNAc         C   sZ  d |  _  d |  _ d |  _ t |  _ t |  _ d |  _ d |  _ d |  _	 |  j
 | d  |  _ t j | d  |  _ d |  _ d |  _ i d d 6|  _ | r | d	 k r t d   n  | r | |  _ |  j |  j   n |  j d |  _ y |  j d |  _ Wn- t k
 r*|  j d t t  f g  n X|  j   |  _ | rV|  j | | |  n  d S(
   s    creates an rds datafile if initialize is set to true, otherwise
        will append to existing tables. datasetType can be either 'DNA' or 'RNA'.
        t    t   rbt   RNAt   dataTypeR&   s8   failed to initialize: datasetType must be 'DNA' or 'RNA't
   rdsVersionN(   R&   R)   (   t   dbcont   memconR*   t   currentRDSVersionR+   t   Falset	   memBackedt   memChromt	   memCursort   cachedDBFilet   getMultiReadIDCountst   multiReadIDCountst   pysamt   Samfilet   bamfilet   Nonet   readsizet   totalReadWeightt   metadataR   t   initializeTablesR   t   insertMetadataR   t   getFullReadCountt   fullReadCountst   printRDSInfo(   R   t   datafilet
   initializet   datasetTypet   verboset   cachet   reportCount(    (    sF   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/ReadDataset.pyR   Q   s4    											 c         C   s=   d } |  j  j } x$ | D] } | |  j  j |  7} q W| S(   s    return the number of reads in the bam file
            This is not the same as the original as the original returned total weight.
        i    (   R8   t
   referencest   count(   R   RI   RH   t	   reference(    (    sF   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/ReadDataset.pyt   __len__   s
    c   	      C   s   y t  j | |  } Wn# t k
 r; d GHt j d  n Xi  } xk | j d t  D]W } t |  } d | j | f } y | | c d 7<WqU t	 k
 r d | | <qU XqU Wx. | j
   D]  } | | d k r | | =q q W| S(   Ns   samfile index not foundi   t	   until_eofs   %s%s(   R6   R7   t
   ValueErrort   syst   exitt   fetcht   TrueR   R   R   t   keys(	   R   t   samFileNamet   fileModet   samfilet   readIDCountsR   R   t   readNameR   (    (    sF   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/ReadDataset.pyR4      s"    c         C   si   |  j  d k rb |  j d } | |  j d 7} |  j d k rP | |  j d 7} n  t |  |  _  n  |  j  S(   sB    return the total weight of usable reads in the bam file.
        t   uniqt   multiR)   t   spliceN(   R;   R9   R@   R*   t   int(   R   t   total(    (    sF   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/ReadDataset.pyR;      s    c         C   s    |  j  d k r |  j   n  d S(   s2    cleanup copy in local cache, if present.
        R'   N(   R3   t	   uncacheDB(   R   (    (    sF   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/ReadDataset.pyt   __del__   s    c         C   s   | r d | GHn	 d | GHd GH|  j  j   } | j   x, | D]$ } d | d t |  j  |  GHq@ W| r | r |  j   n  d |  j   GHd  S(   Ns   INITIALIZED dataset %ss
   dataset %ss	   metadata:s   	s   default cache size is %d pages(   R<   RR   t   sortt   strt   printReadCountst   getDefaultCacheSize(   R   RB   RG   RC   t	   pnameListt   pname(    (    sF   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/ReadDataset.pyRA      s    	
"c         C   sp   |  j  d } |  j  d } |  j d k r; d | | f GHn1 |  j d k rl |  j  d } d | | | f GHn  d  S(   NRX   RY   R&   s"   
%d unique reads and %d multireadsR)   RZ   s4   
%d unique reads, %d spliced reads and %d multireads(   R@   R*   (   R   t   ucountt   mcountt   scount(    (    sF   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/ReadDataset.pyRa      s    c         C   sT   t    } t | d d d d } | t _ d t j   |  _ t j | |  j  d S(   s+    copy geneinfoDB to a local cache.
        t   generalt   cistematic_tempt   defaults   /tmps   %s.dbN(   R   R   t   tempfilet   tempdirt   mktempR3   t   shutilt   copyfile(   R   t   filenamet   configParsert   cisTemp(    (    sF   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/ReadDataset.pyt   cacheDB   s
    		c         C   s   t  j |  j |  d S(   s+    copy geneinfoDB to a local cache.
        N(   Rn   Ro   R3   (   R   Rp   (    (    sF   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/ReadDataset.pyt   saveCacheDB   s    c         C   sI   |  j  d k rE y t j |  j   Wn d |  j  GHn Xd |  _ n  d S(   s-    delete geneinfoDB from local cache.
        R'   s   could not delete %sN(   R3   t   ost   removet   cachedDB(   R   (    (    sF   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/ReadDataset.pyR]      s    c         C   s!   d | | f } |  j  |  d S(   s:    attach another database file to the readDataset.
        s   attach '%s' as %sN(   t   execute(   R   Rp   t   dbNamet   stmt(    (    sF   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/ReadDataset.pyt   attachDB  s    c         C   s   d | } |  j  |  d S(   s4    detach a database file to the readDataset.
        s	   detach %sN(   Rx   (   R   Ry   Rz   (    (    sF   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/ReadDataset.pyt   detachDB  s    
t   *R'   c         C   sG   d | | | | | f } | d k r6 | d | 7} n  |  j  |  d S(   s    import into current RDS the table (with columns destcolumns,
            with default all columns) from the database file asname,
            using the column specification of ascolumns (default all).
        s&   insert into %s %s select %s from %s.%sR'   s    where flag = '%s' N(   t   executeCommit(   R   Ry   t   tablet	   ascolumnst   destcolumnst   flaggedRz   (    (    sF   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/ReadDataset.pyt   importFromDB  s    c         C   st   g  } |  j    } | d k r+ d | } n  d | } | j |  | j   } x | D] } | j | d  qU W| S(   sB    get a list of table names in a particular database file.
        R'   s   %s.s3   select name from %ssqlite_master where type='table't   name(   t   getSqlCursorRx   t   fetchallt   append(   R   Ry   t
   resultListt   sqlRz   t   resultst   row(    (    sF   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/ReadDataset.pyt	   getTables'  s    
c         C   s(   |  j  r |  j   } n |  j   } | S(   N(   R0   t   getMemCursort   getFileCursor(   R   R   (    (    sF   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/ReadDataset.pyR   :  s    	c         C   s   |  j  j   S(   s]    returns a cursor to memory database for low-level (SQL)
        access to the data.
        (   R-   t   cursor(   R   (    (    sF   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/ReadDataset.pyR   C  s    c         C   s   |  j  j   S(   s[    returns a cursor to file database for low-level (SQL)
        access to the data.
        (   R,   R   (   R   (    (    sF   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/ReadDataset.pyR   J  s    i c         C   s   | j  d |  | j  d  | j  d |  j  d } d | } | j  d |  | j  d |  |  j d k r d	 } d | } | j  d
 |  d	 } d | } | j  d |  n  | j   d S(   s}    creates table schema in a database connection, which is
        typically a database file or an in-memory database.
        s   PRAGMA DEFAULT_CACHE_SIZE = %ds3   create table metadata (name varchar, value varchar)s,   insert into metadata values('dataType','%s')s   start int, stop intsw   (ID INTEGER PRIMARY KEY, readID varchar, chrom varchar, %s, sense varchar, weight real, flag varchar, mismatch varchar)s   create table uniqs %ss   create table multi %sR)   s,   startL int, stopL int, startR int, stopR ints   create table splices %ss   create table multisplices %sN(   Rx   R*   t   commit(   R   t   dbConnectionRF   t   positionSchemat   tableSchema(    (    sF   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/ReadDataset.pyR=   \  s    


c         C   s   |  j  S(   N(   R<   (   R   t	   valueName(    (    sF   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/ReadDataset.pyt   getMetadata  s    c         C   sP   |  j  d k rI |  j j d t  } x" | D] } |  j | j  Pq+ Wn  |  j  S(   s    This is following the same model as the original where it assumes that all
            read have the same readsize and that this is reflected by the size of the
            first read.
        RL   N(   R:   R9   R8   RP   RQ   t   calculateReadsizeFromCigarR   (   R   t   bamFileIteratorR   (    (    sF   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/ReadDataset.pyt   getReadSize  s    c         C   s>   d } t  g  | D] \ } } | | k r | ^ q  |  _ d  S(   Ni    i   (   i    i   (   t   sumR:   (   R   R   t   taket   opt   length(    (    sF   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/ReadDataset.pyR     s    c         C   s   d S(   s6    returns 0 as cache is going to be deprecated
        i    (    (   R   (    (    sF   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/ReadDataset.pyRb     s    c         C   s;   | r t  |  j    } n t  |  j    } | j   | S(   sS    returns a sorted list of distinct chromosomes in bam file reference list.
        (   t   listt   getFullReferenceNamest   getShortReferenceNamesR_   (   R   t	   fullChromR   (    (    sF   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/ReadDataset.pyt   getChromosomes  s
    
c         C   s   t  |  j j  S(   s4    returns a set of bam file reference names.
        (   t   setR8   RH   (   R   (    (    sF   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/ReadDataset.pyR     s    c         C   s\   t    } |  j j } x@ | D]8 } | d } t | j    d k r | j |  q q W| S(   sV    returns a set of bam file reference names after removing first 3 characters.
        i   i    (   R   R8   RH   t   lent   stript   add(   R   R   RH   t
   chromosomet	   shortName(    (    sF   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/ReadDataset.pyR     s    	
c         C   st   d } t    } |  j j d | d |  | r: | j } n  | rU t | j |  } n  | rp t | j |  } n  | S(   sI    returns the maximum coordinate for reads on a given chromosome.
        i    RJ   t   callback(   R   R8   RP   R$   R   R#   R    (   R   t   chromt   doUniqst   doMultit	   doSplicest   maxCoordt   coordFinder(    (    sF   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/ReadDataset.pyt   getMaxCoordinate  s    	t   startc         C   sg   d } d | | | f } |  j    } y+ | j |  t | j   d d  } Wn d | GHn X| S(   Ni    s)   select max(%s) from %s where chrom = '%s's,   couldn't retrieve coordMax for chromosome %s(   R   Rx   R[   R   (   R   R   R   t
   startFieldR   t   sqlStatementR   (    (    sF   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/ReadDataset.pyt   getMaxStartCoordinateInTable,  s    ic   $      C   s  |  j  j d | d | d |  } i  } | r | d k r g  | D]B } t | j  s@ i t | j  d 6t | j  d 6d d 6^ q@ | | <nd } x| D]z} t |  } d	 | j	 | f } t | j  s |  j
 | | d
 | r q n  t | j  } | r|  j  j | j  } n |  j  j | j  d } | r| } d | k rb| j d  d } n  d | k r| r| j d  \ } } n  | }  n | }  i t | j  d 6}! | r|  j d k r|  j | j  n  t | j |  j  |! d <n  | s	| |! d <n  | rIy d |  j | |! d <WqIt k
 rEd |! d <qIXn  | rRn  | ry( | j d  }" t |" | j |  }# Wn t k
 rd }# n X|# |! d <n  |	 r| |! d <n  |
 r| |! d <n  | r| |! d <n  y | |  j |!  Wq t k
 r|! g | |  <q Xq W| S(   s    First pass of rewrite
            1) Leave as much in place as possible
            2) For now treat multis just like uniqs
        RJ   R   t   endR'   R
   g      ?t   weighti    s   %s%sR   i   s   ::t   /t   stopi   t   MDt   mismatchR   R   t   pairIDN(   R8   RP   R   R   R[   R!   t   getReadSenset
   is_reverseR   R   t   readDoesNotMeetCriteriat   getrnamet   rnamet   splitR:   R9   R   R5   R   t   optt   getMismatchest   seqR   ($   R   t   bothEndst   noSenseR   R   t   flagt
   withWeightt   withFlagt   withMismatcht   withIDt	   withChromt
   withPairIDR   R   t   findallOptimizet
   readIDDictt   readLikeR   R   t   limitt   hasMismatcht   flagLiket   strandt	   combine5pR   t   resultsDictR   R   R   R   R
   t	   theReadIDt   dictKeyt   newrowt   mismatchTagt
   mismatches(    (    sF   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/ReadDataset.pyt   getReadsDict  sl    C!V'	
c   
      C   s   |  j  | |  } | d k r] y" | | j d  k r? t } n  Wq] t k
 rY t } q] Xn  t |  d k rr n  | r y | j d  }	 Wq t k
 r t } q Xn  | d k r t | j  | k r t } n  | S(   NR'   t   ZFi    R   t   +t   -(   R   R   (   t   rejectMultiReadEntryR   RQ   R   R   R   R   (
   R   R   R   R   R   R   R   R   R   R   (    (    sF   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/ReadDataset.pyR   N  s"    !	i
   c         C   sE   t  } | |  j j   k rA |  j | | k s5 | rA t } qA n  | S(   N(   R/   R5   RR   RV   RQ   (   R   R   R   t	   thresholdt
   rejectRead(    (    sF   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/ReadDataset.pyR   m  s
    c
         C   s  |	 r d }
 d } n d }
 d } g  } | d k rV | |  j  k rV | j d |  n  | d k r | r t j d | d g d  } | j |  q | j d	 |  n  | d
 k r | j d |
 | f  n  | d
 k r | j d | | f  n  t |  d k r0t j d | d g d  } | j |  n  | rF| j d  n  | d k rf| j d |  n  t |  d k rt j | d  } d | } n d } | S(   Nt   startLt   stopRR   R   R'   s   chrom = '%s's   flag LIKE "%s   %"s   flag = '%s'is   %s > %ds   %s < %di    s   readID LIKE  's   %'s   mismatch != ''R   R   s   sense = '%s's    and s   where %s(   R   R   (   R1   R   t   stringt   joinR   (   R   R   R   R   R   R   R   R   R   RZ   t	   startTextt   stopTextt   whereClauset   flagLikeClauset   readIDClauset   whereStatementt
   whereQuery(    (    sF   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/ReadDataset.pyt   getReadWhereQueryw  s:    	c   	      C   s   | g } | r | j  d  n  | s5 | j  d  n  | rK | j  d  n  | ra | j  d  n  | rw | j  d  n  t j | d  } | S(   NR   R
   R   R   R   t   ,(   R   R   R   (	   R   t
   baseSelectR   R   R   R   R   t   selectClauset   selectQuery(    (    sF   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/ReadDataset.pyt   getReadSelectQuery  s    	c         C   sR   g  } | r d g } n  | d k r? | r? | j  d |  n  t j |  } | S(   Ns   GROUP BY start, sensei    s   LIMIT %d(   R   R   R   (   R   R   R   R   t   groupByt
   groupQuery(    (    sF   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/ReadDataset.pyt   getReadGroupQuery  s    c   #      C   s  |  j  j d | d | d |  } i  } d } x| D]} t | j  sO q4 n  t | j  } | j } | r |  j  j | j  } n |  j  j | j  d } | r | } d | k r |
 r | j	 d  \ } } n  | } n | } |  j
 d k r|  j | j  n  t | j |  j
 | j  \ } } } } i t |  d 6} t |  | d <t |  | d	 <t |  | d
 <| s| | d <n  | rd | d <n  | rn  | ry( | j d  } t | | j |  }  Wn t k
 rd }  n X|  | d <n  | r| | d <n  |	 r| | d <n  |
 r,| | d <n  | r| j   }! |! d	 =|! d
 =| }" |" d =|" d =y | | j |!  Wn t k
 r|! g | | <n X| | j |"  q4 y | | j |  Wq4 t k
 r| g | | <q4 Xq4 W| S(   s    First pass of rewrite
            1) Leave as much in place as possible
            2) For now treat multis just like regular splices
        RJ   R   R   i    i   R   R   t   stopLt   startRR   R
   g      ?R   R   R'   R   R   R   R   N(   R8   RP   R   R   R   R   R   R   R   R   R:   R9   R   t   getSpliceBoundsR!   R[   R   R   R   R   t   copyR   (#   R   R   R   R   R   R   R   R   R   R   R   R   t	   splitReadR   R   R   R   R   R   R   R   R   R
   R   R   R   R   R   R   R   R   R   R   t   leftDictt	   rightDict(    (    sF   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/ReadDataset.pyt   getSplicesDict  sx    !		'
c      
   C   s   i d d 6d d 6d d 6} xq |  j    D]c } |  j d | d t d t d t  \ } } } | d c | 7<| d c | 7<| d c | 7<q( W| S(   Ni    RX   RY   RZ   R   t   reportCombinedt   splices(   R   t	   getCountsR/   RQ   (   R   t   fullReadCountR   t	   uniqCountt
   multiCountt   spliceCount(    (    sF   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/ReadDataset.pyR?     s    

-R   c	      	   C   s   d }	 d }
 d } |  j  d | d | d | d |  } | rH | j }	 n  | rZ | j }
 n  | rl | j } n  | r |	 |
 | } | S|	 |
 | f Sd S(   s0    return read counts for a given region.
        i    R   t   rmint   rmaxR
   N(   t   getBamReadCounterR   R   R   (   R   R   R  R  t   uniqsRY   R   R   R
   Re   Rf   Rg   t   readCounterR\   (    (    sF   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/ReadDataset.pyR     s    $c      	   C   s;   t  |  j |  } |  j j d | d | d | d |  | S(   NRJ   R   R   R   (   R   R5   R8   RP   (   R   R   R  R  R
   R	  (    (    sF   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/ReadDataset.pyR    s    %c         C   s1   |  j  | | | d t d t d t d t d d S(   s0    return read counts for a given region.
        R  RY   R   R   R
   R   (   R   RQ   (   R   R   R  R  (    (    sF   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/ReadDataset.pyt   getTotalCounts  s    c         C   s\  g  } d }	 | d k r7 | |  j  k r7 d | g } n  | d k rc | j d | t |  f  n  | d k r | j d | t |  f  n  | d k r | j |  n  t |  d k r t j | d  }
 d |
 } n d } |  j   } | r| j d | | | f  n | j d	 | | f  | j   } y t	 | d  }	 Wn d }	 n X|	 S(
   s;    returns the number of row in the specified table.
        i    R'   s
   chrom='%s's   %s >= %ss   %s <= %ss    and s   where %ss0   select count(distinct chrom+%s+sense) from %s %ss   select sum(weight) from %s %s(
   R1   R   R`   R   R   R   R   Rx   t   fetchoneR[   (   R   R   R   R  R  t   restrictt   distinctR   R   RI   R   R   R   t   result(    (    sF   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/ReadDataset.pyt   getTableEntryCount  s0      
c      	   C   s+   |  j  d | d | d | d |  } | j S(   NR   R  R  R
   (   R  R   (   R   R   R  R  R  R  R	  (    (    sF   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/ReadDataset.pyt   getSplicesCount  s    $c      	   C   s+   |  j  d | d | d | d |  } | j S(   NR   R  R  R
   (   R  R   (   R   R   R  R  R  R  R	  (    (    sF   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/ReadDataset.pyt   getUniqsCount  s    $c      	   C   s+   |  j  d | d | d | d |  } | j S(   NR   R  R  R
   (   R  R   (   R   R   R  R  R  R  R	  (    (    sF   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/ReadDataset.pyt   getMultiCount)  s    $c         C   s  g  } | r | j  d  n  | r2 | j  d  n  | rH | j  d  n  t |  d k ro t j | d  } n d } d } | d k r d | } n  d | | f }	 |  j   }
 |
 j |	  |
 j   } | r g  | D] } | d j d	  d ^ q Sg  | D] } | d ^ q Sd
 S(   s    get readID's.
        s   select readID from uniqss   select readID from multis   select readID from splicesi    s    union R'   s   LIMIT %ds   %s group by readID %sR   N(   R   R   R   R   R   Rx   R   R   (   R   R  RY   R   t   pairedR   Rz   t
   selectPartt	   limitPartt   sqlQueryR   R  t   x(    (    sF   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/ReadDataset.pyt
   getReadIDs0  s(    (c         C   s  |  j    } | r | g } n |  j   } | j   i  } xy| D]q} | rY d | GHn  g  | | <| rJ|  j d k rJ|  j d t d | d t d t d t  } | j   }	 x|	 D]}
 | |
 d } | d	 } | d
 } | d } | d } | d } | j d  } x6| D].} d | k r)qn  t |  } | d k rs| d } | | d } t	 | d | d ! } nR | d k rt
 | d  } t
 | | d  } | t	 | d | d ! d } n  t	 |  t	 |  d } d } t	 |  t	 |  k r| | d } n | | } | | } | | j | | | | g  qWq Wn  |  j d t d | d t d t  } | | j   k rqA n  x)| | D]} | d } | d } | d } | j d  } x | D] } d | k rqn  t |  } | d k r-| d } | | d } t	 | d | d ! } nR | d k rt
 | d  } t
 | | d  } | t	 | d | d ! d } n  | | d } | | j | | | | g  qWqWqA W| S(   sB    returns the uniq and spliced mismatches in a dictionary.
        s%   getting mismatches from chromosome %sR)   R   R   R   R   R   i    R   R   R   R
   R   R   t   NR   i   R   R   (   R   R   R_   R*   R   RQ   RR   R   R   R[   R   R   R   (   R   t   mischromRE   t
   useSplicest   readlent   hitChromListt   snpDictR   t
   spliceDictt   spliceIDListt   spliceIDt   spliceEntryt   startpost   lefthalft
   rightstartR
   R   t   spMismatchListR   t
   change_lent   change_fromt   change_baset
   change_post	   firsthalft
   secondhalft	   change_att   hitDictt	   readEntryR   t   mismatchList(    (    sF   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/ReadDataset.pyR   R  s~    

*





"

($



")g      ?i    c
         C   s  |  j    }
 y t |
 d  } Wn t k
 r9 d } n X|
 d } d | } i  } t |	  | d <d t |	  | d <| d k r |  j | d | d	 | | } n | | | | d } t d
 d g |  } |  j d t d | d t d | d | d | d t  } | d k  rd } n  x | | D] } | d } | d } | d } | | | | } x t | | |  D]| } yi | s| d k r| d k r| | c | | 7<n/ | d k r| d k r| | c | | 8<n  WqrqrqrXqrWq(W~ | r| d k r| d k rD|  j d t d | d t d | d |  } n |  j d t d | d t  } | | k rxr| | D]c} | d } | d } | d } | d } | d } | | | k  ryx t t	 | |   D]q } | | | } | s| d k r| d k r| | c | 7<q| d k r| d k r| | c | 8<qqWx t t	 | |   D]q } | | | } | s| d k r| d k r| | c | 7<qd| d k rd| d k rd| | c | 8<qdqdWqyqyWn  ~ n  | S(   s   return a profile of the chromosome as an array of per-base read coverage....
            keepStrand = 'both', 'plusOnly', or 'minusOnly'.
            Will also shift position of unique and multireads (but not splices) if shift is a natural number
        R:   i    R*   g      ?R   iR   R   R   t   fg        R   R   R   R   R   R   R
   R   t	   minusOnlyt   plusOnlyR)   R   R   R   R   R   (
   R   R[   R   R   R    R   RQ   t   rangeR   t   abs(   R   R   t   cstartt   cstopt   useMultiR  t   normalizationFactort   trackStrandt
   keepStrandt
   shiftValueR<   R  R*   t   scalet   shiftt   lastNTt
   chromModelR.  R/  t   hstartR
   R   t
   currentposR  R"  t   Lstartt   Lstopt   Rstartt   Rstopt   rsenset   index(    (    sF   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/ReadDataset.pyt   getChromProfile  sr    


"6	


-




!c         C   s(   x! | D] \ } } | |  j  | <q Wd S(   sL    inserts a list of (pname, pvalue) into the metadata
        table.
        N(   R<   (   R   t
   valuesListRd   t   pvalue(    (    sF   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/ReadDataset.pyR>     s    c         C   sZ   d t  |  | f } | d k r9 | d t  |  7} n  |  j j |  |  j j   d S(   sM    update a metadata field given the original value and the new value.
        s.   update metadata set value='%s' where name='%s'R'   s    and value='%s' N(   R`   R,   Rx   R   (   R   Rd   t   newValuet   originalValueRz   (    (    sF   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/ReadDataset.pyt   updateMetadata  s
    c         C   s$   |  j  j d |  |  j  j   d S(   su    inserts a list of (readID, chrom, start, stop, sense, weight, flag, mismatch)
        into the uniqs table.
        sn   insert into uniqs(ID, readID, chrom, start, stop, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?)N(   R,   t   executemanyR   (   R   RJ  (    (    sF   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/ReadDataset.pyt   insertUniqs  s    c         C   s$   |  j  j d |  |  j  j   d S(   su    inserts a list of (readID, chrom, start, stop, sense, weight, flag, mismatch)
        into the multi table.
        sn   insert into multi(ID, readID, chrom, start, stop, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?)N(   R,   RO  R   (   R   RJ  (    (    sF   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/ReadDataset.pyt   insertMulti  s    c         C   s$   |  j  j d |  |  j  j   d S(   s    inserts a list of (readID, chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch)
        into the splices table.
        s   insert into splices(ID, readID, chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?,?,?)N(   R,   RO  R   (   R   RJ  (    (    sF   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/ReadDataset.pyt   insertSplices"  s    c         C   s$   |  j  j d |  |  j  j   d S(   s    inserts a list of (readID, chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch)
        into the multisplices table.
        s   insert into multisplices(ID, readID, chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?,?,?)N(   R,   RO  R   (   R   RJ  (    (    sF   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/ReadDataset.pyt   insertMultisplices+  s    c         C   s   d } | d k r d } n  | r; |  j  j d | |  n  | r[ |  j  j d | |  n  |  j d k r | r |  j  j d | |  |  j  j d | |  n  |  j  j   d	 S(
   s6   update reads on file database in a list region of regions for a chromosome to have a new flag.
            regionsList must have 4 fields per region of the form (flag, chrom, start, stop) or, with
            sense set to '+' or '-', 5 fields per region of the form (flag, chrom, start, stop, sense).
        R'   R   s    and sense = ? sG   UPDATE uniqs SET flag = ? where chrom = ? and start >= ? and start < ? sG   UPDATE multi SET flag = ? where chrom = ? and start >= ? and start < ? R)   s\   UPDATE splices SET flag = flag || ' L:' || ? where chrom = ? and startL >= ? and startL < ? s\   UPDATE splices SET flag = flag || ' R:' || ? where chrom = ? and startR >= ? and startR < ? N(   R,   RO  R*   R   (   R   t   regionsListR  RY   R   R
   R  (    (    sF   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/ReadDataset.pyt	   flagReads4  s    	c         C   sk   | r |  j  d |  n  | r2 |  j  d |  n  |  j d k rZ | rZ |  j  d |  n  |  j j   d S(   s4    set the flag fields in the entire dataset.
        R  RY   R)   R   N(   t   setFlagsInDBR*   R,   R   (   R   R   R  RY   R   (    (    sF   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/ReadDataset.pyt   setFlagsK  s    c         C   s   |  j  j d | | f  d S(   s8    set the flag field for every entry in a table.
        s   UPDATE %s SET flag = '%s'N(   R,   Rx   (   R   R   R   (    (    sF   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/ReadDataset.pyRV  [  s    c         C   s   |  j  d | | |  d S(   sn    reset the flag fields in the entire dataset to clear. Useful for rerunning an analysis from scratch.
        R'   N(   RW  (   R   R  RY   R   (    (    sF   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/ReadDataset.pyt
   resetFlagsb  s    c         C   s   |  j  j d |  d  S(   NsI   UPDATE multi SET weight = ? where chrom = ? and start = ? and readID = ? (   R,   RO  (   R   t   readList(    (    sF   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/ReadDataset.pyt   reweighMultireadsi  s    t   ONc         C   s/   y |  j  j d |  Wn d | GHn Xd  S(   Ns   PRAGMA SYNCHRONOUS = %ss-   warning: couldn't set PRAGMA SYNCHRONOUS = %s(   R,   Rx   (   R   t   value(    (    sF   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/ReadDataset.pyt   setSynchronousPragmam  s    c         C   s   d  S(   N(    (   R   RF   Rj   (    (    sF   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/ReadDataset.pyt
   setDBcache{  s    c         C   s3   |  j    } | j |  | r/ | j   } | Sd  S(   N(   R   Rx   R   (   R   t	   statementt   returnResultsR   R  (    (    sF   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/ReadDataset.pyRx     s
    c         C   s7   |  j  |  |  j r& |  j j   n |  j j   d  S(   N(   Rx   R0   R-   R   R,   (   R   R_  (    (    sF   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/ReadDataset.pyR~     s    	c         C   s   | |  j    k r" |  j |  n  |  j d  |  j j d  d GH|  j j d  d GH|  j j d  d GH|  j j d  d	 GH|  j d
 k r |  j j d  d GH|  j j d  d GH|  j j d  d GHn  |  j j   |  j d  d S(   s    Builds the file indeces for the main tables.
            Cache is the number of 1.5 kb pages to keep in memory.
            100000 pages translates into 150MB of RAM, which is our default.
        t   OFFs-   CREATE INDEX uPosIndex on uniqs(chrom, start)s   built uPosIndexs(   CREATE INDEX uChromIndex on uniqs(chrom)s   built uChromIndexs-   CREATE INDEX mPosIndex on multi(chrom, start)s   built mPosIndexs(   CREATE INDEX mChromIndex on multi(chrom)s   built mChromIndexR)   s0   CREATE INDEX sPosIndex on splices(chrom, startL)s   built sPosIndexs1   CREATE INDEX sPosIndex2 on splices(chrom, startR)s   built sPosIndex2s*   CREATE INDEX sChromIndex on splices(chrom)s   built sChromIndexR[  N(   Rb   R^  R]  R,   Rx   R*   R   (   R   RF   (    (    sF   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/ReadDataset.pyt
   buildIndex  s(    c         C   s   y |  j  d  |  j j d  |  j j d  |  j j d  |  j j d  |  j d k r |  j j d  y |  j j d  Wn n X|  j j d	  n  |  j j   Wn d
 GHn X|  j  d  d S(   s5    drops the file indices for the main tables.
        Ra  s   DROP INDEX uPosIndexs   DROP INDEX uChromIndexs   DROP INDEX mPosIndexs   DROP INDEX mChromIndexR)   s   DROP INDEX sPosIndexs   DROP INDEX sPosIndex2s   DROP INDEX sChromIndexs   problem dropping indexR[  N(   R]  R,   Rx   R*   R   (   R   (    (    sF   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/ReadDataset.pyt	   dropIndex  s"    	c         C   s  d |  _  t j d  |  _  |  j |  j   |  j j   } d } | d k rk d | GHd | } | |  _ n	 d |  _ |  j  j d  |  j  j d  |  j  j d  | j d  } g  } x) | D]! } | j | d	 | d
 f  q W|  j  j	 d |  |  j
 d |  |  j
 d |  |  j d k r7|  j |  n  | r| d k r|  j  j d  |  j  j d  |  j d k r|  j  j d  |  j  j d  qq|  j  j d  |  j  j d  |  j d k r|  j  j d  |  j  j d  qn  t |  _ t j |  j  _ |  j  j   d S(   s    makes a copy of the dataset into memory for faster access.
        Can be restricted to a "full" chromosome. Can also build the
        memory indices.
        R'   s   :memory:s
   memSync %ss    where chrom = '%s' s   PRAGMA temp_store = MEMORYs   PRAGMA CACHE_SIZE = 1000000s   delete from metadatas    select name, value from metadataR   R\  s.   insert into metadata(name, value) values (?,?)R  RY   R)   s&   CREATE INDEX uPosIndex on uniqs(start)s&   CREATE INDEX mPosIndex on multi(start)s*   CREATE INDEX sPosLIndex on splices(startL)s*   CREATE INDEX sPosRIndex on splices(startR)s-   CREATE INDEX uPosIndex on uniqs(chrom, start)s-   CREATE INDEX mPosIndex on multi(chrom, start)s1   CREATE INDEX sPosLIndex on splices(chrom, startL)s1   CREATE INDEX sPosRIndex on splices(chrom, startR)N(   R-   t   sqlitet   connectR=   R,   R   R1   Rx   R   RO  t   copyDBEntriesToMemoryR*   t   copySpliceDBEntriesToMemoryRQ   R0   t   Rowt   row_factoryR   (   R   R   RH  R   t   whereclauseR   t   results2R   (    (    sF   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/ReadDataset.pyt   memSync  sJ    		
		c         C   s   |  j  j   } | j d | | f  } g  } x_ | D]W } | j | d | d t | d  t | d  | d | d | d | d	 f  q5 W|  j j d
 | |  d  S(   NsK   select chrom, start, stop, sense, weight, flag, mismatch, readID from %s %sR   R   R   R   R
   R   R   R   sk   insert into %s(ID, readID, chrom, start, stop, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?)(   R,   R   Rx   R   R[   R-   RO  (   R   Ry   R   R   t   sourceEntriest   destinationEntriesR   (    (    sF   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/ReadDataset.pyRf    s    Uc         C   s   |  j  j   } | j d |  } g  } xy | D]q } | j | d | d t | d  t | d  t | d  t | d  | d | d	 | d
 | d f
  q/ W|  j j d |  d  S(   Nsa   select chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch, readID from splices %sR   R   R   R   R   R   R
   R   R   R   s   insert into splices(ID, readID, chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?,?,?)(   R,   R   Rx   R   R[   R-   RO  (   R   R   R   Rm  Rn  R   (    (    sF   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/ReadDataset.pyRg    s    O N(G   R   R   t   __doc__R/   RQ   R   RK   R4   R;   R^   RA   Ra   Rs   Rt   R]   R{   R|   R   R   R   R   R   R=   R   R   R   Rb   R   R   R   R   R   R9   R   R   R   R   R   R   R   R?   R   R  R
  R  R  R  R  R  R   RI  R>   RN  RP  RQ  RR  RS  RU  RW  RV  RX  RZ  R]  R^  Rx   R~   Rb  Rc  Rl  Rf  Rg  (    (    (    sF   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/ReadDataset.pyR%   H   s   	A						
	
								9					"
-	`		4"Q		V																4
c         C   s   |  r d } n d } | S(   NR   R   (    (   t   reverseR
   (    (    sF   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/ReadDataset.pyR     s    	c         C   s7   t  } x* |  D]" \ } } | d k r t } Pq q W| S(   Ni   (   R/   RQ   (   t   cigarTupleListt   isSplicet	   operationR   (    (    sF   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/ReadDataset.pyR     s    c         C   s[   d } xN | D]F \ } } | d k rI t  |  |  } t  | |  } | S| | 7} q Wd  S(   Ni    i   (   R[   (   R   Rq  t   offsetRs  R   R   R   (    (    sF   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/ReadDataset.pyR"   $  s    c   	      C   sv   d } xS | D]K \ } } | d k rN t  |  |  } t  | |  } d } q | | 7} q W| | } |  | | | f S(   Ni    i   (   R[   (	   R   R:   Rq  Rt  Rs  R   R   R   R   (    (    sF   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/ReadDataset.pyR   1  s    	
c         C   s>   d } t  |   s d S|  j r( d } n |  j r: d } n  | S(   NR'   s   /1s   /2(   t   isPairedReadt   is_read1t   is_read2(   R   t
   readSuffix(    (    sF   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/ReadDataset.pyR   A  s    				c         C   s   |  j  o |  j p |  j S(   N(   t   is_proper_pairRv  Rw  (   R   (    (    sF   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/ReadDataset.pyRu  N  s    R'   R   c         C   s,  g  } d } d } t  j d |   } t  j d |   } x t t |   D] } | | }	 | t | |  } t j |	 |  d k r qI n  yx | r | | }
 n d }
 | d k r t |	  }	 t |
  }
 n  t | d  } | j d |	 | |
 f  | d 7} WqI t	 k
 rd	 SXqI Wt j
 | d
  S(   Nt   ^i    s   \d+s   \d+([ACGTN]|\^[ACGTN]+)R  R   i   s   %s%d%sR'   R   (   t   ret   findallR4  R   R[   R   t   findR   R   t
   IndexErrorR   (   R   t   querySequenceR
   t   outputt   deletionMarkert   positiont   lengthst   mismatchSequencest   mismatchEntryR   t   genomicNucleotidet#   erange1BasedElandCompatiblePosition(    (    sF   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/ReadDataset.pyR   R  s.    
	(   t   sqlite3Rd  R   Rk   Rn   Ru   R{  RN   R6   R    t
   commoncodeR   R   R   R.   t	   ExceptionR   R   R   R%   R   R   R"   R   R   Ru  R   (    (    (    sF   /woldlab/castor/data00/home/georgi/code/erange-4.0a-BAM/ReadDataset.pyt   <module>   s6   &      			
				