Ñò
Xs*Jc           @   sE  d  d k  Z  d  d k Z d  d k Z d  d k Z d  d k Td  d k Td  d k l Z d  d k Z d  d k	 Z	 d  d k
 Z
 d  d k Z d  d k Z d  d k Z e i ƒ  Z d Z d Z e  i d ƒ Z e  i d ƒ Z d „  Z d	 „  Z d
 „  Z d „  Z d „  Z d „  Z d „  Z d „  Z d „  Z d d „ Z d „  Z d „  Z  d S(   iÿÿÿÿN(   t   *(   t   OptionParsers	   /bin/greps   /bin/cats   \+s   \-c         C   so   g  } g  } xV |  D]N } t  i | i ƒ o | i | ƒ q t i | i ƒ o | i | ƒ q q W| | f S(   sq   
	input: a list of bed6 object 
	Return: two lists of bed6 objects, one for plus strand, one for minus strand. 
	(   t   plust   matcht   strandt   appendt   minus(   t   bed_listt   plus_bed_listt   minus_bed_listt   b(    (    sM   /woldlab/castor/data00/home/georgi/SICER_v1.01/SICER/lib/bed_preprocessing.pyt   breakUpStrands(   s     c         C   sO  t  i |  ƒ d j p t ‚ d g d } t |  ƒ d j ot |  ƒ } |  d i } d } xž t d t |  ƒ ƒ D]‡ } |  | } | i | j o] t | ƒ d | j  o# | d g | t | ƒ d 7} n | | c d 7<| i } d } qr | d 7} qr Wt | ƒ d | j  o# | d g | t | ƒ d 7} n | | c d 7<n | S(   s·   
	Input:  
		sorted_bed_list: a list of sorted bed6 objects. Already assumed that 
		the tags are from one chromosome and in one direction. 
	Return: the histogram of the tag copies
	i   i    id   (   t   Utilityt	   is_sortedt   AssertionErrort   lent   startt   range(   t   sorted_bed_listt   unique_tag_histogramt   total_number_tagst   current_valuet   current_countt   indext   item(    (    sM   /woldlab/castor/data00/home/georgi/SICER_v1.01/SICER/lib/bed_preprocessing.pyt   find_read_copy_distribution9   s(     
#	
#c   	      C   s'  t  i |  ƒ d j p t ‚ g  } g  } t |  ƒ d j oç t |  ƒ } | i |  d ƒ |  d i } d } x’ t d t |  ƒ ƒ D]{ } |  | } | i | j oD | | j o | i | ƒ n | i } d } g  } | i | ƒ q‚ | d 7} | i | ƒ q‚ W| t j o | i | ƒ q#n | S(   sé   
	Input:  
		sorted_bed_list: a list of sorted bed6 objects. Already assumed that 
				the tags are from one chromosome and in one direction. 
		n: the copies for a read 
	Return: the list of BED6 reads with copy number equal to n.
	i   i    (	   R   R   R   R   R   R   R   t   extendt	   threshold(	   R   t   nt   n_copy_read_listt	   temp_listR   R   R   R   R   (    (    sM   /woldlab/castor/data00/home/georgi/SICER_v1.01/SICER/lib/bed_preprocessing.pyt   find_n_copy_readsW   s.    	 
	
c   	      C   s'  g  } g  } t  i |  ƒ d j p t ‚ t |  ƒ d j oç t |  ƒ } |  d i } | i |  d ƒ d } x’ t d t |  ƒ ƒ D]{ } |  | } | i | j oD | | j o | i | ƒ n | i } d } g  } | i | ƒ q‚ | d 7} | i | ƒ q‚ W| | j o | i | ƒ q#n | S(   s	  
	Input: 	
		sorted_bed_list: a list of sorted bed6 objects. Already assumed that 
						the tags are from one chromosome and in one direction. 
		threshold:	the threshold for read copy
	Return: the list of BED6 reads with copy number above or equal to threshold.
	i   i    (   R   R   R   R   R   R   R   R   (	   R   R   t   multiple_copy_read_listR   R   R   R   R   R   (    (    sM   /woldlab/castor/data00/home/georgi/SICER_v1.01/SICER/lib/bed_preprocessing.pyt   find_multi_copy_reads~   s.     
	
c         C   sU  t  i |  ƒ d j p t ‚ d } t |  ƒ d j ot | d ƒ } t |  ƒ } |  d i } d } |  d } xª t d t |  ƒ ƒ D]“ }	 |  |	 }
 |
 i | j oA | | j o t | | ƒ | d 7} n |
 i } d } |
 } q„ | | j o t | | ƒ | d 7} n | d 7} q„ W| | j o t | | ƒ | d 7} n | i ƒ  n | S(   s  
	A read has n copies in the sorted_bed_list. If n<=cutoff, all the copies are retained.
	If n>cutoff, only cutoff number of copies of the read are retained.  
	
	Output: write bed objects with the extra redundant copies filtered out.
	Return: the number of reads remained
	i   i    t   w(	   R   R   R   R   t   openR   R   t   writet   close(   R   t   cutofft   outfilet   counter2t   outR   R   R   t   current_tagR   R   (    (    sM   /woldlab/castor/data00/home/georgi/SICER_v1.01/SICER/lib/bed_preprocessing.pyt   filter_reads§   s6    
 
	
c         C   s8  d } t  |  ƒ d j ot | d ƒ } t  |  ƒ } |  d i } d } |  d } xª t d t  |  ƒ ƒ D]“ }	 |  |	 }
 |
 i | j oA | | j o t | | ƒ | d 7} n |
 i } d } |
 } qg | | j o t | | ƒ | d 7} n | d 7} qg W| | j o t | | ƒ | d 7} n | i ƒ  n | S(   sf  
	A read has n copies in the sorted_bed_list. If n<=cutoff, all the copies are retained.
	If n>cutoff, only cutoff number of copies of the read are retained.  
	
	Output: write bed objects with the extra redundant copies filtered out. This function 
			uses out = open(outfile, 'a') instead of out = open(outfile, 'w')
	Return: the number of reads remained
	i    t   ai   (   R   R#   R   R   R$   R%   (   R   R&   R'   R(   R)   R   R   R   R*   R   R   (    (    sM   /woldlab/castor/data00/home/georgi/SICER_v1.01/SICER/lib/bed_preprocessing.pyt   filter_reads_addÍ   s4    	
 
	
c         C   sm   |  i  d t |  i ƒ d t |  i ƒ d |  i d t t |  i ƒ ƒ d |  i d } | i | ƒ d S(   sV   
	write one line into outfile. The file openning and closing is handled by outside. 
	s   	s   
N(	   t   chromt   strR   t   endt   namet   intt   scoreR   R$   (   R   R)   t   outline(    (    sM   /woldlab/castor/data00/home/georgi/SICER_v1.01/SICER/lib/bed_preprocessing.pyR$   ò   s    \c         C   s"   x |  D] } t  | | ƒ q Wd S(   s[   
	write a bed6_list into outfile. The file openning and closing is handled from outside. 
	N(   R$   (   t	   bed6_listR)   R   (    (    sM   /woldlab/castor/data00/home/georgi/SICER_v1.01/SICER/lib/bed_preprocessing.pyt
   write_listú   s     c         C   s•   g  } t  |  ƒ t  | ƒ j  o; | } xi t t  |  ƒ ƒ D] } | | c |  | 7<q8 Wn8 |  } x. t t  | ƒ ƒ D] } | | c | | 7<qs W| S(   N(   R   t   xrange(   R,   R
   t   tR   (    (    sM   /woldlab/castor/data00/home/georgi/SICER_v1.01/SICER/lib/bed_preprocessing.pyt   combine_histogram  s      i    c         C   s<   d } x/ t  | t |  ƒ ƒ D] } | | |  | 7} q W| S(   sF   
	Threshold serves at the starting value for integration, inclusive.
	i    (   R   R   (   t	   histogramR   t   totalR   (    (    sM   /woldlab/castor/data00/home/georgi/SICER_v1.01/SICER/lib/bed_preprocessing.pyt   find_total_in_histogram  s
     c         C   s~   t  | d ƒ } x^ t t |  ƒ ƒ D]J } |  | d j o3 t | ƒ d t |  | ƒ d } | i | ƒ q" q" W| i ƒ  d  S(   NR"   i    s   	s   
(   R#   R7   R   R/   R$   R%   (   R,   R'   R)   R   R4   (    (    sM   /woldlab/castor/data00/home/georgi/SICER_v1.01/SICER/lib/bed_preprocessing.pyt   write_histogram  s     "c         C   s¾   t  i |  } g  } t i |  | d d ƒ } xŒ | D]„ } | | i ƒ  j ok | | i d t i d ƒ ƒ } t | ƒ \ } } t | ƒ }	 t	 |	 | ƒ } t | ƒ }
 t	 |
 | ƒ } q2 q2 W| S(   s\   
	file_name is for the raw tag file. 
	need BED6 to split the positive and negative tags. 
	t   BED6i    t   keyR   (
   t
   GenomeDatat   species_chromst   BEDt   keyst   sortt   operatort
   attrgetterR   R   R9   (   t   speciest	   file_namet   chromsR:   t   bed_valsR.   R   R   R	   t   plus_histogramt   minus_histogram(    (    sM   /woldlab/castor/data00/home/georgi/SICER_v1.01/SICER/lib/bed_preprocessing.pyt   combine_read_copy_distribution   s     (!   t   ret   ost   syst   shutilt   matht   stringt   optparseR   RE   RB   t   UCSCt   bisectR@   R   t   getcwdt   Dirt   grept   catt   compileR   R   R   R   R   R!   R+   R-   R$   R6   R9   R<   R=   RM   (    (    (    sM   /woldlab/castor/data00/home/georgi/SICER_v1.01/SICER/lib/bed_preprocessing.pyt   <module>   s4   0

			'	)	&	%						