Ñò
‘Mc           @   s~   d  d k  Z d  d k l Z d  d k l Z d  d k l Z d  d k l	 Z	 d  d k
 l Z d  d k Z g  d d d	 „ Z d S(
   iÿÿÿÿN(   t
   geneinfoDB(   t   pvalue(   t   hyperGeometric(   t   sqrt(   t   shufflei    g{®Gáz„?c   C   $   C   s  d | GH| d j o t  d | d ƒ } n t  d | d ƒ } t  d | d ƒ } t  d | d ƒ } t |  ƒ d j  o
 d	 GHd
 S|  d }	 |	 d }
 t i i |
 ƒ t d t ƒ } h  } h  } h  } g  } g  } h  } g  } g  } d GHt i i |
 ƒ } t i i |
 ƒ } d t | ƒ GHx1 | D]) } d | | <t i i	 |
 | ƒ | | <q%Wx9 |  D]1 } | | j o | | j o | i
 | ƒ qYqYWx¨ | D]  \ }
 } | | j o‡ | i
 | ƒ y | |
 | f } Wn
 q•n XxV | D]J } | i d ƒ } | d } | | j o d | | <n | | c d 7<qãWq•q•Wd GHx¤ | D]œ } y | | } Wn
 qEn Xxx | D]p } | i d ƒ } | d } | | j o g  | | <n | | | j o% | | i
 | ƒ | | c d 7<qmqmWqEWt | ƒ } h  } d GHd t | ƒ GHxD | D]< } | | }  |  | j o g  | |  <n | |  i
 | ƒ qW| i ƒ  }! |! i ƒ  |! i ƒ  g  }" x0 |! D]( }  x | |  D] } |" i
 | ƒ qWqWg  }# y t i i |
 ƒ }$ Wn d |
 GHn Xx, |$ D]$ }% |% | j o |# i
 |% ƒ qáqáW| i ƒ  }& h  }' h  }( | d j oéd GHh  }) xß t | ƒ D]Ñ }* d |* d GHh  |) |* <x |& D] } d |) |* | <qjWt |# ƒ |# |  }+ x~ |+ D]v }, y | |
 |, f }- Wn
 qn XxL |- D]D } | i d ƒ }. |. d } | |& j o |) |* | c d 7<qËqËWqWqFWd GHx†|& D]è } d |' | <d }/ x, t | ƒ D] }* |' | c |) |* | 7<qJWt |' | ƒ t | ƒ |' | <xB t | ƒ D]4 }* |/ |) |* | |' | |) |* | |' | 7}/ q—Wt |/ t | d ƒ ƒ |( | <| i d | |' | |( | f ƒ q'Wn“ t t |# ƒ ƒ }0 x} |& D]u } | | }1 | | j o |1 | | 8}1 n | |1 |0 |' | <t | |1 |0 |0 |1 |0 |0 | |0 d ƒ |( | <q0Wd GHxL|" D]D} | | }2 | | }1 | | j o |1 | | 8}1 n yàt |( | ƒ }3 |3 d j o& t |2 |' | ƒ t |( | ƒ }4 n d }4 |1 d j o t |2 ƒ d t |1 ƒ }5 n d }5 d }6 | d j o1 |4 d j  o t d |4 ƒ }6 qØt |4 ƒ }6 n* |2 |' | }4 t t |# ƒ |1 | |2 ƒ }6 d }7 |2 |' | d j  o
 d }7 n | i d | |2 |1 |5 |4 |6 | | f ƒ | i
 |4 | |2 |1 |5 |4 |6 | | f f ƒ t |" ƒ d j og |6 | t t |" ƒ ƒ j  oJ |1 d j o= |2 d j o0 | i
 |6 | |2 |1 |5 |6 |7 | | f f ƒ n Wqµ| i d  | |2 |1 | | f ƒ qµXqµW| i ƒ  d! GH| i ƒ  | i ƒ  x% | D] \ }4 } | i d | ƒ q'	W| i ƒ  d" GH| i ƒ  h  }8 x€| D]x\ }6 } | d } | i d# | ƒ t  d$ | | d% f d ƒ }9 g  }: | | j o d& | GHqn	n d' | t | | ƒ f GHxÃ| | D]·} d( }; d( }< d( }= d( }> | |8 j o |8 | \ }; }< }= n8y\ | i | ƒ }> |> d }; | i | ƒ }? | i | ƒ }@ t i |? ƒ }< t i |@ d) ƒ }= WnØ t | ƒ }; y» t | ƒ d* j oD | i | d | d ƒ }A t |A ƒ d j o | i |A ƒ }> qn t |> ƒ d j oM | d }; | i |A ƒ }? | i |A ƒ }@ t i |? ƒ }< t i |@ d) ƒ }= n WqrqrXn X|: i
 d+ |; |< |= f ƒ | |8 j o |; |< |= f |8 | <qù	qù	W|: i ƒ  x |: D] }B |9 i |B ƒ qÅW|9 i ƒ  qn	W| i ƒ  t i i |
 ƒ d
 S(,   s«   calculates GO enrichment (and depletion) statistics for a given list of 
        geneIDs (assumed to be all from the same genome) using either the 
        hypergeometric distribution or random sampling for roundsRandomization 
        rounds if greater than 0. Specific geneID's can be excluded from the 
        calculations if included in excludeIDList. 
 
        Results are saved in files with the given fileprefix. GO Terms that are 
        larger than 15 genes and that have pvalues < sigLevel / (#num GO categories) 
        are reported in fileprefix.gosig, whereas genes that matches the GO term are 
        listed in fileprefix.ZZZZZZZ where the Z's are the GOID.
    s   calculateGOStats: %si    s   %s.goexpt   ws	   %s.gostats   %s.gozscores   %s.gosigi   s    Need at the very least one gene!Nt   caches   Getting GO lists   len(goDesc) = %ds   	s   Getting GO list for locusLists   Arranging by sizes   numGenes = %ss!   could not get gene entries for %ss   Get Random samplings   Round %ds   Calculating statss	   %s	%f	%f
g      ð?s   Writing out gostatg        g      Y@g      ð¿t   enrichedt   depleteds"   %s	%d out of %d	%2.2f	%f	%3.3g	%s
i   s   %s	%d out of %d	 	%s
s   writing gozscores   writing gosigs"   %s	%d out of %d	%2.2f	%3.3g	%s	%s
s   %s.%si   s   could not find %ss   %s	%d genest    t   ;i   s	   %s	%s	%s
(    t   opent   lent
   cistematict   coret   cacheGeneDBR    t   Truet
   allGOTermst   getAllGOInfot   getGOIDCountt   appendt   splitt   strt   keyst   sortt   reverset   getGenomeGeneIDst   rangeR   t   floatR   t   writeR   R   t   closet   getGeneInfot   geneIDSynonymst   getDescriptiont   stringt   joint	   getGeneIDt   uncacheGeneDB(C   t
   geneIDListt
   fileprefixt   excludeIDListt   roundsRandomizationt   sigLevelt	   goexpfilet
   gostatfilet   gozscorefilet	   gosigfilet   firstgidt   genomet   idbt   goBint
   goPossiblet   foundt	   locusListt   excludeLocusListt   excludeGOBint   zListt   sigListt   goDesct   goInfot   GOIDt   entryt   gIDt   excludeGOTermst   excludeGOTermsFieldst   locust   locusGOTermst   locusGOTermsFieldst   numGenest   goSizet   goLent	   goLengthst   goListt   allGIDst   theGIDst   aGIDt   gokeyst   meant   standardDevt   samplet	   sampleNumt   sampleGenest   gidt   goarrayt   goarrayFieldst   sumofsquarest   Nt   possiblet   countt   divisort   zscoret
   percentaget   pvalt   statust
   annotCachet   goidsigfilet   goidsigfileListt
   geneSymbolt   outsynt   outdesct   geneInfot   synonymst   descriptiont	   newGeneIDt   line(    (    sN   /woldlab/castor/data00/home/georgi/erange-4.0a/cistematic/cisstat/analyzego.pyt   calculateGOStats(   sÒ   	

 
   
  
 


    
 

  
$ 
  2* 
8 

&
*,J4)


 

 
	 


 
(   t   cistematic.coreR   t   cistematic.core.geneinfoR    t   cistematic.cisstat.scoreR   t   cistematic.cisstat.helperR   t   mathR   t   randomR   R"   Ri   (    (    (    sN   /woldlab/castor/data00/home/georgi/erange-4.0a/cistematic/cisstat/analyzego.pyt   <module>    s   