o
    rei7                     @   sH   d Z ddlZddlZdddZdd ZdddZdd	d
ZdddZdS )a  
Shared configuration module for model datasets.

This module provides a unified way to access model dataset configurations
used across multiple scripts (run_filter.py, run_kmeans.py, etc.).

The configuration can be read from Snakefile.original or use a fallback
hardcoded mapping.
    Nc              
   C   s  | du r+t jt jt}t j|dt j|dg}|D ]}t j|r*|}  nq| rt j| rzt| d}| }W d   n1 sHw   Y  i }d}t	
||}|r| }	d}
d}d}|	|  d }|t|k r|| }|d	v r|dks||d  d
kr|sd}|}n>||krd}d}n5|s|dkr|
d7 }
n*|dkr|
d8 }
|
dkr||	|d  }t|| d|v rtd|   |d W S n
|d7 }|t|k sqW n ty } ztd|  W Y d}~nd}~ww td t S )a7  
    Get VARIANT_DATASET_CONFIGS from Snakefile or return fallback config.
    
    Parameters:
    -----------
    snakefile_path : str, optional
        Path to Snakefile.original. If None, tries to find it automatically.
    
    Returns:
    --------
    dict
        VARIANT_DATASET_CONFIGS dictionary
    NzSnakefile.originalZ	Snakefilerz VARIANT_DATASET_CONFIGS\s*=\s*\{r   F   )"'\T{}ZVARIANT_DATASET_CONFIGSz$Loaded VARIANT_DATASET_CONFIGS from z/Warning: Could not parse Snakefile for config: z0Using fallback hardcoded VARIANT_DATASET_CONFIGS)ospathdirnameabspath__file__joinexistsopenreadresearchstartendlenexecprint	Exceptionget_fallback_config)snakefile_path
script_dirZpotential_pathsr
   fcontent	namespacepatternmatchZ	start_posZbrace_countZ	in_stringZstring_charicharZ
config_stre r%   /oak/stanford/groups/akundaje/airanman/projects/lab/rare-disease-manuscript/curation/broad/varbook-container/snakemake/model_datasets_config.pyget_model_datasets_config   sn   
 


r'   c                   C   s   dddgddgdgiS )z
    Get fallback hardcoded VARIANT_DATASET_CONFIGS.
    
    Returns:
    --------
    dict
        Fallback VARIANT_DATASET_CONFIGS
    z4Broad neurodevelopmental and neuromuscular disorderszFetal BrainzKUN_FB*z	KUN_HDMA*)namemodelsZmodel_supersetr%   r%   r%   r%   r&   r   _   s   
r   c                 C   sT   t |}| |vr
g S ||  }t|trd|v r|d S t|tr"|S td|  d)a  
    Get the list of model dataset configs for a variant dataset.
    
    Handles both list and dict formats in VARIANT_DATASET_CONFIGS.
    
    Parameters:
    -----------
    variant_dataset : str
        Variant dataset name
    snakefile_path : str, optional
        Path to Snakefile.original
    
    Returns:
    --------
    list
        List of model dataset config dicts
    model_datasetsz+Invalid config format for variant_dataset 'z2': expected list or dict with 'model_datasets' key)r'   
isinstancedictlist
ValueError)variant_datasetr   Zconfigsconfigr%   r%   r&   get_model_datasets_lists   s   

r1   c                 C   sh   t | |}|D ]}t|tr!|d|kr!|dg }|r!|  S qtd| d|  ddd |D  )a  
    Get model patterns for a model_dataset name.
    
    Parameters:
    -----------
    variant_dataset : str
        Variant dataset name
    model_dataset_name : str
        Model dataset name (e.g., "Fetal Brain")
    snakefile_path : str, optional
        Path to Snakefile.original
    
    Returns:
    --------
    list of str
        Model patterns (e.g., ["KUN_FB*"])
    r(   r)   z1Could not find model patterns for model_dataset 'z' in variant_dataset 'z'. Available model_datasets: c                 S   s    g | ]}t |tr|d qS r(   r+   r,   get.0cr%   r%   r&   
<listcomp>   s     z3get_model_patterns_from_dataset.<locals>.<listcomp>)r1   r+   r,   r4   r.   )r/   model_dataset_namer   r*   r0   r)   r%   r%   r&   get_model_patterns_from_dataset   s   
r:   c                 C   s   t | |}dd |D S )a7  
    List all available model datasets for a variant dataset.
    
    Parameters:
    -----------
    variant_dataset : str
        Variant dataset name
    snakefile_path : str, optional
        Path to Snakefile.original
    
    Returns:
    --------
    list of str
        List of model dataset names
    c                 S   s(   g | ]}t |trd |v r|d qS r2   r3   r5   r%   r%   r&   r8      s   ( z'list_model_datasets.<locals>.<listcomp>)r1   )r/   r   r*   r%   r%   r&   list_model_datasets   s   
r;   )N)__doc__r	   r   r'   r   r1   r:   r;   r%   r%   r%   r&   <module>   s   

O

%#