o
    Uݢgh"                     @  s   d dl mZ d dlZd dlZd dlmZ d dlZd dlm	Z	m
Z
 G dd deZG dd deZd	d
 Zdd Z	d!d"ddZd#ddZd$ddZd%dd ZdS )&    )annotationsN)Counter)ensure_binary
ensure_strc                      s$   e Zd Z fddZdd Z  ZS )CSVParseExceptionc                   s   t  | || _d S N)super__init__msg)selfr
   	__class__ c/oak/stanford/groups/akundaje/marinovg/programs/cellranger-9.0.1/lib/python/cellranger/csv_utils.pyr	      s   
zCSVParseException.__init__c                 C  s   | j S r   )r
   )r   r   r   r   __str__   s   zCSVParseException.__str__)__name__
__module____qualname__r	   r   __classcell__r   r   r   r   r      s    r   c                   @  s   e Zd ZdZdS )CSVEmptyExceptionzThe CSV file has no data rows.N)r   r   r   __doc__r   r   r   r   r      s    r   c                 c  sj   t | ddd}d}d}z:|D ]5}| }|sq|dsEd}|r1ddd	 |dD }d}| sBtd
| d| d| |V  qW nO ty } zCt|j	d d}|j	| }	|j
||j	d  jddddddd}
d|	 d }td
| d| d|j d|j	 d|
 d| |d}~ww W d   n1 sw   Y  |std
| d| ddS )zIterates through non-comment lines in the file, returning them as unicode strings.

    Also skips the BOM if present.

    Raises:
        CSVParseException: The file has no non-comment lines or some lines can't be decoded as ascii.
    z	utf-8-sigN)encodingnewlineFT#,c                 s  s    | ]}|  V  qd S r   )strip).0xr   r   r   	<genexpr>2   s    z"_iter_file_rows.<locals>.<genexpr>The 
 csv file z& contains non-ascii characters.

Row:
   r   zutf-8replace)errors 
^z+ has one or more invalid utf-8 characters: z2. The first bad character is at absolute position z9.
This snippet shows the offending character in context:
z has no data.)openr   
startswithjoinsplitisasciir   UnicodeDecodeErrormaxstartobjectdecoder"   reasonr   )filename_bytesfilenamedescriptive_namefZhas_datafirstrowerrZcontext_startZbadchar_offsetZsnippetZcaretr   r   r   _iter_file_rows   sb   

,r:   c           	   	   C  s   | j }t|}|t|std|d|d||rSt|}||sSt|| }d|d|d|}|rG|dd|7 }|d| d7 }t|t	|t	|k rsd|dtdd	 t
| D }t|d S )
NzThe {} file header does not contain one or more required comma-separated fields: "{}".
The following fields were found: "{}".
Please check that your file is in CSV format and has the required field names.z, zThe {} file header contains invalid columns: "{}".
Only the following comma-delimited fields may be used in the file header: "{}".z*
The following columns are required: "{}".z*
Please check that you have formatted the z< file correctly and included the appropriate column headers.z"{} csv has a duplicated column: {}c                 s  s     | ]\}}|d kr|V  qdS )   Nr   )r   namecountr   r   r   r   v   s    z$_validate_columns.<locals>.<genexpr>)Z
fieldnames	frozenset
issupersetsetr   formatr*   issubsetsortedlenr   items)	readerr5   required_cols
valid_colsZ	col_namesZcol_setZ	valid_setZextra_columnsr
   r   r   r   _validate_columnsR   sF   

rI   r4   os.PathLikec                 C  sx   t t| }tj|std| d|  t|tjs*d| d|  }t|t	t
|| |}t|||| |S )a1  Returns non-comment lines from the csv files.

    Verifies ASCII encoding and no duplicate columns.

    Args:
        filename: The csv file to open.
        descriptive_name (str): The description of the file, to use in error messages.
        required_cols (sequence of str): columns which must be present.
        valid_cols (sequence of str): columns which may be present.  If provided,
            then it is an error for other columns to be present.

    Returns:
        csv.DictReader: The non-comment rows.

    Raises:
        CSVParseException
    zCould not find the r    r   z5 csv is not readable, please check file permissions: )r   osfspathpathisfiler   accessR_OKcsvZ
DictReaderr:   rI   )r4   r5   rG   rH   r3   r
   rF   r   r   r   load_csv_filter_comments{   s   rR   out_csvbcs_per_genomedict[str, list[str]]c                 C  sp   t | d)}tj|dd}| D ]\}}|D ]}|t|t|g qqW d   dS 1 s1w   Y  dS )zWrite the barcodes to a CSV.

    Args:
        bcs_per_genome (dict of str to list): Map each genome to its cell-associated barcodes
    wr&   ZlineterminatorN)r(   rQ   writerrE   writerowr   )rS   rT   r6   rX   ZgenomeZbcsZbcr   r   r   write_filtered_barcodes   s   "rZ   str | bytesbarcode_list
np.ndarray	in_tissue
list[bool]normalization_arrayc           	   
   C  s   |j |j ksJ dd|j  d d|j  d |j d t|ks4J dd|j  d dt| d t| d	6}tj|d
d}|g d t|||D ]\}}}|t|tt	|t
j|ddg qNW d   dS 1 srw   Y  dS )aK  Write the normalization constant for each barcode to a CSV.

    Args:
        out_csv (Union[str, bytes]): Path of the CSV
        barcode_list (np.ndarray): list of barcodes
        in_tissue (list[bool]): boolean list of in tissue
        normalization_array (np.ndarray): normalization for each barcode - in the same order
    z;Lengths of barcodes and normalization factorsdo not match. zbarcode list length: z. zNormalization array shape: .r   z6Lengths of barcodes and in tissue vectordo not match. zIn tissue length: rV   r&   rW   )barcoder^   Znormalization_factor   )Z	precisionN)shaperD   r(   rQ   rX   rY   zipr   strintnpZformat_float_positional)	rS   r\   r^   r`   r6   rX   rb   Zbc_in_tissueZnormalized_valuer   r   r   write_isotype_normalization_csv   s8   
"ri   r;   c              
   C  s   | sd}dS t |dN}t| D ]@\}}t |0}g }t|D ]	}|t| q|dkr7|D ]}	||	 q/|D ]}
||
 q9W d   n1 sKw   Y  qW d   dS 1 s\w   Y  dS )zCombine a list of CSV files.

    Files specified in input_csvs are combined into
    a single output csv in output_csv. It is assumed that all
    CSVs have the same header structure. The number of header
    lines is specified in header_lines.
    NrV   r   )r(   	enumeraterangeappendnextwrite)Z
input_csvsZ
output_csvZheader_linesoutiZicsvinfileheader_Zhlineliner   r   r   combine_csv   s&   
"ru   r   )r4   rJ   )rS   rJ   rT   rU   )rS   r[   r\   r]   r^   r_   r`   r]   )r;   )
__future__r   rQ   rK   collectionsr   Znumpyrh   Zsixr   r   	Exceptionr   r   r:   rI   rR   rZ   ri   ru   r   r   r   r   <module>   s   	8*
#
)