o
    á-d–.  ã                   @   sd   d dl Zd dlZddd„Zddd„Zddd„Zdd
d„Zddd„Zddd„Z	ddd„Z
ddd„ZdS )é    Né   c                 C   s.   t  | ¡}|dd…|f d |dd…|f< |S )aâ  Converts BED format positions to position format.
        
        Parameters
        ----------
        data : array-like, shape (n_peaks, n_features)
        A numpy array containing peak data in BED format.
        
        startidx : int
        The start index of the array. Default: 1
        
        Returns
        -------
        posdata : array-like, shape (n_peaks, n_features)
        Returns a copy of the array incrementing the start
        position by one.
        Nr   ©ÚnpÚcopy)ÚdataÚstartidxZposdata© r   úY/oak/stanford/groups/akundaje/atwang/gp_mouse_sc_analyses/resources/amulet/peakoverlap.pyÚconvertToPositionFormatFromBED   ó   
 r
   c                 C   s.   t  | ¡}|dd…|f d |dd…|f< |S )aç  Converts position format positions to BED format.
        
        Parameters
        ----------
        data : array-like, shape (n_peaks, n_features)
        A numpy array containing peak data in position format.
        
        startidx : int
        The start index of the array. Default: 1
        
        Returns
        -------
        beddata : array-like, shape (n_peaks, n_features)
        Returns a copy of the array decrementing the start
        position by one.
        Nr   r   )r   r   Zbeddatar   r   r	   ÚconvertToBEDFormatFromPosition   r   r   c           	      C   s¾   t  | dd…|f ¡}tƒ }tdt|ƒƒD ]G}t  | dd…|f || k¡d }| |dd…f }t j|dd…|f dd}t jt  |||f t j	 ¡t  || t j	 ¡fdd||| < q|S )a0  Returns a dictionary by chromosome where each element in
        the dictionary contains an array containing start positions
        sorted in ascending order and the position of the original
        data element containing the start position.
        
        Time complexity: O(nlogn), n = # of peaks
        
        Parameters
        ----------
        data : array-like, shape (n_peaks, n_features)
        A numpy array containing peak data in position format.
        
        chridx : int
        The chromsome index of the array. Default: 0
        
        startidx : int
        The start index of the array. Default: 1
        
        Returns
        -------
        rv : dict
        A dictionary mapping each chromosome to an array:
        [0] = start position [1] = index in original data.
        Nr   Ú	mergesort)Úkindr   )Úaxis)
r   ÚuniqueÚdictÚrangeÚlenÚwhereÚargsortÚconcatenateÚ	transposeÚnewaxis)	r   Úchridxr   ZallchrÚrvÚiÚidxZchrdataÚsidxr   r   r	   ÚgetChrStartSorted8   s   "@r   é   c                 C   s  z||  }W n   g }Y d}t |ƒ}|| dkrAt||| d  ƒ}	||	df }
|
|k r0|	}n|
|kr7|	}n|	}|	}|| dksg }|}|t |ƒk r„|||df kr„||df }||df }|||f }||krr||krr| |¡ |d }|t |ƒk r„|||df ksSt|ƒS )aÀ  Returns the index of all regions that overlap the given
        chromosome, start, and end position.
        
        Time Complexity: O(logn), n = # of elements in the region list
        
        Parameters
        ----------
        chrom : str
        Chromsome of the position.
        
        start : int
        The start position.
        
        end : int
        The end position
        
        chrstartsorted : dict
        The chromosome start sorted dictionary of the regions
        to identify whether the given coordinates overlap.
        
        data : array-like, shape (n_peaks, n_features)
        The data corresponding to the sorted dictionary in
        positon format.
        
        eidx : int
        The end position indec within the data parameter. Default: 2
        
        Returns
        -------
        rv : tuple
        A tuple containing the index positions of data that
        overlap the given position.
        r   r   r   )r   ÚintÚappendÚtuple)ZchromÚstartÚendZchrstartsortedr   ZeidxZstartsortedÚsÚeÚmiZmstartr   r   ZdidxZcstartZcendr   r   r	   ÚgetOverlappingRegions[   s6   "÷
úr(   c              	   C   s|   t |||ƒ}tjt| ƒtd}	tdt| ƒƒD ]%}
| |
|f }| |
|f }| |
|f }tt||||||ƒƒdkr;d|	|
< q|	S )aå  Returns a boolean vector indicating whether or not the peak
        in the list overlaps a set of peaks.
        
        Time Complexity: O(nlogn), n = # of elements in the region list
        
        Parameters
        ----------
        data : array-like, shape (n_peaks, n_features)
        A numpy array containing peak data in position format.
        
        
        peakset : array-like, shape (n_peaks, n_features)
        A numpy array containing peak data in position format.
        
        
        chridx : int
        The chromsome index of the data parameter. Default: 0
        
        startidx : int
        The start index of the data parameter. Default: 1
        
        endidx : int
        The end index of the data parameter. Default: 2
        
        setchridx : int
        The chromosome index of the peakset parameter. Default: 0
        
        setstartidx : int
        The start index of the peakset parameter. Default: 1
        
        setendidx : int
        The end index of the peakset parameter. Default: 2
        
        Returns
        -------
        rv : arraylike, shape (n_peaks,)
        A boolean vector indicating whether the peaks in data overlap
        with the peaks in the peakset.
        ©Údtyper   T)r   r   Úzerosr   Úboolr   r(   )r   Zpeaksetr   r   ÚendidxZ	setchridxZsetstartidxZ	setendidxZsortedconsensusr   r   ÚcurchrÚcurstartÚcurendr   r   r	   ÚgetOverlapIndexš   s   (€r1   c           	   	   C   sv   t  t| ƒ¡}t  t| ƒt|ƒf¡}tdt|ƒƒD ]}t| || |||||ƒ t¡}||dd…|f< || }q||fS )aö  Counts how many peaks & in which dataset the current peak
        list overlaps over a set of peak lists.
        
        Time Complexity: O(m*nlogn)
        n = # of elements in the region list
        m = # of datasets
        
        Parameters
        ----------
        countdataset : array-like, shape (n_peaks, n_features)
        A numpy array containing peak data in position format.
        
        datasets : tuple
        A tuple containing multiple peak datasets in position format.
        
        chridx : int
        The chromsome index of the data parameter. Default: 0
        
        startidx : int
        The start index of the data parameter. Default: 1
        
        endidx : int
        The end index of the data parameter. Default: 2
        
        Returns
        -------
        overlapvector : arraylike, shape (n_peaks,)
        A vector indicating the number of datasets in datasets
        overlapping with the corresponding peak in countdataset.
        
        overlapmatrix : arraylike, shape (n_peaks, n_datasets)
        A boolean matrix indicating whether the peak countdataset
        overlaps with a peak in the corresponding dataset. Columns
        are ordered respective of the ordering in the dataset tuple.
        r   N)r   r+   r   r   r1   Úastyper    )	ZcountdatasetÚdatasetsr   r   r-   ZoverlapvectorZoverlapmatrixr   Zcurvr   r   r	   ÚgetOverlapCountÌ   s   $
r4   c                 C   sØ  t ƒ }tdt| ƒƒD ]}t| | ||ƒ||< q
t|d ƒ}tdt| ƒƒD ]}t |t|| ƒ¡}q$g }|D ]¯}tjt| ƒtjd}	g }
	 g }tdt| ƒƒD ]}| | }|| | |	| df }| 	||dd…f ¡ qMt 
|¡}d}|d|f }d}|d|f }tdt|ƒƒD ]}|||f }||kr—|}|}|||f }||k r¥|}|}q‡||k r²|
 	|||g¡ |	| d |	|< d}tdt|	ƒƒD ]}|	| t|| | ƒkrÕd} nqÃ|rÙnqD|
D ]}| 	|¡ qÜq5tj
|tdS )a¤  Returns a strict set of consensus peaks requiring that every
        sample overlaps every other sample in the consensus peak region.
        
        Time Complexity: O(m^2*n + m*nlogn) m=#of datasets, n=#peaks
        
        Parameters
        ----------
        data : tuple
        A tuple of numpy arrays containing peak data in position format.
        
        chridx : int
        The chromsome index of the data parameter. Default: 0
        
        startidx : int
        The start index of the data parameter. Default: 1
        
        endidx : int
        The end index of the data parameter. Default: 2
        
        Returns
        -------
        rv : arraylike, shape (n_peaks,)
        A numpy array of genomic positions where peaks across
        all datasets overlap.
        
        Notes
        -----
        Chromosome start and end positions must be the same over all peak
        datasets.
        r   r   r)   TNF)r   r   r   r   Úlistr   Úunion1dr+   Úint32r!   ÚarrayÚobject)r   r   r   r-   Z
sorteddatar   ZchromosomesZallchrstrictpeaksr.   ZcountersZstrictpeaksZcurlistÚcurdataZcurindexZ	minendidxZminendZmaxstartidxZmaxstartr/   r0   Z
maxreachedÚcpr   r   r	   ÚgetStrictConsensusPeaksú   s^    
€þÚ(ÿr<   c                 C   sÔ   t  | ¡}t|ƒ}g }|D ]T}|| }||d d d…f }	tdt|ƒƒD ]0}
|||
df d d…f }|d |	d krI| ||	d |	d g¡ |}	q$t|	d |d ƒ|	d< q$| ||	d |	d g¡ qt j|t jdS )N©r   r   é   r   r   r)   )	r   r   r   r   r   r!   Úmaxr8   r9   )r3   r   r   r-   ZcombineddataZsortedlocationsr   r.   Z	locationsZcurlocir   Znextlocir   r   r	   ÚgetUnionPeaksU  s   
r@   )r   r=   )r   )r   r   r   r   r   r   )r   r   r   )Únumpyr   ÚpandasÚpdr
   r   r   r(   r1   r4   r<   r@   r   r   r   r	   Ú<module>   s    



#
?
2
.[