o
    eiC                     @   sn  d Z ddlZddlZddlZddlZddlZddlmZ ddlm	Z	m
Z
mZ zddlZW n eyA   edejd ed Y nw zddlmZ W n ey_   ed	ejd ed Y nw zddlZd
ZW n eyz   dZedejd Y nw dedee
 fddZdedee
e	  fddZdedee
e
  fddZddefddZdd Zedkre  dS dS )u   
Scraper for HDMA motif lexicon from the MOTIFS.html page.

Extracts the pattern → motif_name_safe → annotation mapping from:
https://greenleaflab.github.io/HDMA/MOTIFS.html

The page uses DataTables with embedded JavaScript data.
    N)Path)DictListOptionalzDError: requests module not found. Install with: pip install requestsfile   )BeautifulSoupzPError: beautifulsoup4 module not found. Install with: pip install beautifulsoup4TFz:Warning: pandas not available, will use CSV writer insteadhtml_contentreturnc                 C   s  d}t || t j}|D ]1}z#|d}t|}t|tr0t|dkr0t|d tr0|W   S W q tj	t
fy=   Y qw t| d}|d}|D ]}|jdu rRqJ|j}	g d}
|
D ]R}t ||	t j}|D ]E}z.|d}t dd	|}t d
d|}t|}t|trt|dkr|W       S W qg tj	t
fy } zW Y d}~qgd}~ww q[g d}|D ]I}t ||	t j}|D ]<}z.|d}t dd	|}t d
d|}t|}t|trt|dkr|W       S W q tj	t
fy   Y qw qqJdS )z
    Extract DataTable data from HTML page.
    
    The data is embedded in JavaScript as a DataTable initialization.
    We need to find the data array in the script tags.
    z,var\s+\w*[Dd]ata\w*\s*=\s*(\[\[[\s\S]*?\]\])r   r   html.parserscriptN)"data"\s*:\s*(\[\[[\s\S]*?\]\])z'data'\s*:\s*(\[\[[\s\S]*?\]\])zdata\s*:\s*(\[\[[\s\S]*?\]\])z,\s*\]]z,\s*\]\s*\]z]])z"var\s+\w+\s*=\s*(\[\[[\s\S]*?\]\])z$const\s+\w+\s*=\s*(\[\[[\s\S]*?\]\])z"let\s+\w+\s*=\s*(\[\[[\s\S]*?\]\]))refinditer	MULTILINEgroupjsonloads
isinstancelistlenJSONDecodeError
ValueErrorr	   find_allstringDOTALLsub)r
   json_patternmatchesmatchdata_strdatasoupscriptsr   script_contentdata_patternspatternevar_patterns r+   /oak/stanford/groups/akundaje/airanman/projects/lab/rare-disease-manuscript/curation/broad/varbook-container/snakemake/./scrape_hdma_motifs.pyextract_datatable_data&   sj   

$



	



r-   c              
   C   s   d}t || }|D ]8}|d}t d|}|rBz|d}t|}t|tr4t|dkr4|W   S W q
 tj	t
fyA   Y q
w q
dS )z
    Alternative method: Extract data from DataTable initialization.
    
    Look for the full DataTable config object and extract the data array.
    z1\$\([^)]+\)\.DataTable\s*\(\s*(\{[\s\S]*?\})\s*\)r   r   r   N)r   r   r   searchr   r   r   r   r   r   r   )r
   r(   r    r!   
config_str
data_matchr"   r#   r+   r+   r,   extract_from_datatable_init}   s"   




	r1   c           	      C   s   t | d}|d}|du rdS g }|d}|r,dd |ddgD }|r,|| |d	}|rN|d
D ]}dd |ddgD }|rM|| q8|rR|S dS )zQ
    Fallback: Parse the HTML table directly if JavaScript extraction fails.
    r   tableNtheadc                 S      g | ]}|j d dqS T)stripget_text).0thr+   r+   r,   
<listcomp>       z$parse_html_table.<locals>.<listcomp>r:   tdtbodytrc                 S   r4   r5   r7   )r9   r=   r+   r+   r,   r;      r<   )r	   findr   append)	r
   r$   r2   rows
header_rowheadersr>   r?   cellsr+   r+   r,   parse_html_table   s$   





rF   /https://greenleaflab.github.io/HDMA/MOTIFS.htmlurlc                    s  t d|  d tj| dd}|  |j}d}t d t|}|du r,t d t|}|du r8t d t|}|du r@td	g d
}t	|d t
sUtdt| dt|}|dkratdt	|d t
rnt|d nd}|t|d kr|t|d krt d| d| dtjd g }t|D ]fdd|D }|| q|}t|}|rt|d nd}|dkrt	|d t
rt|d dkrtdd |d dd D nd}	tdd |d dd D  }
|	r	|
r	t|d t|d kr	|d }|dd }n|t|kr|d| n|}|}n|}|}|rt|d   fdd|D }t|t|krKt dt|t|  dtjd |}t| krt| k ru|dd tt| D  }t d   d!tjd n|d  }t d"  d!tjd |rtt|dkrtt|dkrt|d dkrt d#t|d  d!tjd t d$|dd%  tjd t d&|d dd%  tjd d}d}d}t|D ]\}|d'kr}q|d(kr}q|d)kr}qt|d dkr=d*d |D }t|dkr=|dd }|dur!|dkr!|d8 }|dur/|dkr/|d8 }|dur=|dkr=|d8 }g }t|D ].\}|krS|d+ qC|kr_|d, qC|krk|d( qC|| qC|}tr|rtj||d-S tj|d-S ||d.S )/z
    Scrape the HDMA motif lexicon from the HTML page.
    
    Returns either a pandas DataFrame (if available) or a list of lists with headers.
    z	Fetching z...   )timeoutNz7Attempting to extract DataTable data from JavaScript...z'Trying alternative extraction method...z%Falling back to HTML table parsing...z%Could not extract data from HTML page)pattern_classidx_uniq
motif_namemotif_name_safe
annotationannotation_broadcategoryquery_consensuscwm_fwdcwm_rev
total_hitstotal_n_seqletsn_component_celltypes	top_organcwm_entropyentropy_ratior(   
best_matchbest_match_TOMTOM_qvalr   z%Data appears to be a flat array with z1 elements. Expected nested array (list of lists).zExtracted data is emptyr         zDetected transposed data: z columns with z rows each. Transposing...r   c                    s$   g | ]} t |k r|  nd qS ) r   r9   col)ir+   r,   r;      s   $ z&scrape_hdma_motifs.<locals>.<listcomp>c                 s   s    | ]}t |tV  qd S )N)r   strr9   xr+   r+   r,   	<genexpr>   s    z%scrape_hdma_motifs.<locals>.<genexpr>Fc                 s   s6    | ]}t |trt|d ddd V  qdS ).r_   -N)r   rd   replaceisdigitre   r+   r+   r,   rg      s    
c                    s   g | ]
}t | kr|qS r+   r`   r9   row)expected_lenr+   r,   r;     s    zWarning: Filtered out z rows with incorrect lengthc                 S   s   g | ]}d | qS )col_r+   r9   rc   r+   r+   r,   r;     s    zWarning: Extended headers to z columnszWarning: Truncated headers to zDebug: First row has zDebug: Headers: 
   zDebug: First row values: r[   rN   rO   c                 S   s   g | ]}|d d qS )r   Nr+   rl   r+   r+   r,   r;   C  r<   r(   rM   columns)rD   rB   )printrequestsgetraise_for_statustextr-   r1   rF   r   r   r   r   sysstderrrangerA   allany	enumerate
HAS_PANDASpd	DataFrame)rH   responser
   r#   rs   num_top_levelnum_elements_firstrB   rm   first_row_all_stringsfirst_row_not_numericrD   
valid_rowsbest_match_idxmotif_name_safe_idxannotation_idxcol_namenew_headersr+   )rn   rc   r,   scrape_hdma_motifs   s   
 2
&"  







r   c               
      s&  t jdd} | jdddd | jdtdd	 | jd
dg ddd |  }zAt|j}trt|t	j
r| tdt  d tdt j   fdd|jD }|r] |  }ntdtjd   }|jrqt|j}ntd}|j|ddd tdt| d|  td t|d  W d S |}|d }|d }tdt| d td|  g }	g }
|jD ]}||v r||}|	| |
| q|
stdtjd ttt|}	|}
|jrt|j}ntd}t|d d!d"/}tj|dd#}||
 |D ]tt|	kr|fd$d|	D  qW d    n	1 s+w   Y  tdt| d|  td% t|d d D ]\}tt|	krctd fd&d|	D  qHW d S  t!y } ztd'| tjd d(d l"}|#  t$d) W Y d }~d S d }~ww )*Nz/Scrape HDMA motif lexicon from MOTIFS.html page)descriptionz--urlrG   zURL of the HDMA MOTIFS page)defaulthelpz--outputz6Output TSV file path (default: hdma_motif_mapping.tsv))typer   z	--columns+)r(   rN   rO   rP   r[   zfColumns to include in output (default: pattern motif_name_safe annotation annotation_broad best_match))nargsr   r   z
Extracted z motifsz	Columns: c                    s   g | ]	}| j v r|qS r+   rr   ra   )dfr+   r,   r;     s    zmain.<locals>.<listcomp>z@Warning: None of the requested columns found. Using all columns.r   zhdma_motif_mapping.tsv	F)sepindexzSaved z motifs to z
Sample data:rq   rD   rB   wr_   )newline)	delimiterc                    s   g | ]} | qS r+   r+   rp   rm   r+   r,   r;     s    z
Sample data (first 10 rows):c                    s   g | ]}t  | qS r+   )rd   rp   r   r+   r,   r;     r<   zError: r   r   )%argparseArgumentParseradd_argumentrd   
parse_argsr   rH   r   r   r   r   rt   r   r   rs   copyry   rz   outputr   to_csvhead	to_stringr   rA   r{   opencsvwriterwriterowmaxr~   join	Exception	traceback	print_excexit)parserargsresultavailable_columns	df_outputoutput_path	data_dictrD   rB   col_indicesoutput_headersrb   idxfr   rc   r)   r   r+   )r   rm   r,   maind  s   





r   __main__)rG   )__doc__r   r   ry   r   r   pathlibr   typingr   r   r   ru   ImportErrorrt   rz   r   bs4r	   pandasr   r   rd   r-   r1   rF   r   r   __name__r+   r+   r+   r,   <module>   sH   	W .j
