#!/usr/bin/env python3
"""
Simple script to explore the structure of the HDMA motif compendium HDF5 file.
Focuses on displaying motif names and overall file structure.
"""

import h5py
import sys
from pathlib import Path


def explore_hdf5_structure(name, obj, depth=0, max_depth=10):
    """Recursively explore HDF5 file structure."""
    indent = "  " * depth
    if isinstance(obj, h5py.Group):
        print(f"{indent}Group: {name}")
        if depth < max_depth:
            for key in obj.keys():
                explore_hdf5_structure(key, obj[key], depth + 1, max_depth)
    elif isinstance(obj, h5py.Dataset):
        print(f"{indent}Dataset: {name} (shape: {obj.shape}, dtype: {obj.dtype})")
        # If it's a small string dataset, show a sample
        if obj.dtype.kind == 'S' or obj.dtype.kind == 'U':
            try:
                if obj.size <= 10:
                    print(f"{indent}  Values: {obj[:]}")
                elif obj.size > 0:
                    sample = obj[:min(5, obj.size)]
                    print(f"{indent}  Sample values: {sample}")
            except:
                pass


def find_motif_names(hdf5_file):
    """Search for motif names in the HDF5 file."""
    motif_names = []
    
    def search_for_motifs(name, obj):
        """Helper function to search for motif-related data."""
        if isinstance(obj, h5py.Dataset):
            # Check if dataset name suggests it contains motif names
            if 'motif' in name.lower() or 'name' in name.lower():
                try:
                    if obj.dtype.kind == 'S' or obj.dtype.kind == 'U':
                        data = obj[:]
                        if len(data) > 0:
                            print(f"\nFound potential motif names in: {name}")
                            print(f"  Shape: {obj.shape}, dtype: {obj.dtype}")
                            if obj.size <= 100:
                                print(f"  All values:")
                                for i, val in enumerate(data):
                                    if isinstance(val, bytes):
                                        val = val.decode('utf-8')
                                    print(f"    [{i}] {val}")
                                    motif_names.append(val)
                            else:
                                print(f"  First 20 values:")
                                for i, val in enumerate(data[:20]):
                                    if isinstance(val, bytes):
                                        val = val.decode('utf-8')
                                    print(f"    [{i}] {val}")
                                    motif_names.append(val)
                except Exception as e:
                    print(f"  Error reading {name}: {e}")
    
    hdf5_file.visititems(search_for_motifs)
    return motif_names


def main():
    hdf5_path = Path("/oak/stanford/groups/akundaje/airanman/refs/HDMA/motif_compendium.modisco_object.h5")
    
    if not hdf5_path.exists():
        print(f"Error: File not found: {hdf5_path}")
        sys.exit(1)
    
    print(f"Exploring HDF5 file: {hdf5_path}")
    print("=" * 80)
    
    try:
        with h5py.File(hdf5_path, 'r') as f:
            print("\n=== File Structure ===")
            print(f"File mode: {f.mode}")
            print(f"Number of top-level keys: {len(f.keys())}")
            print(f"Top-level keys: {list(f.keys())}")
            
            print("\n=== Detailed Structure ===")
            for key in f.keys():
                explore_hdf5_structure(key, f[key], depth=0, max_depth=5)
            
            print("\n=== Searching for Motif Names ===")
            motif_names = find_motif_names(f)
            
            if motif_names:
                print(f"\n=== Summary: Found {len(motif_names)} motif names ===")
                unique_names = list(set(motif_names))
                print(f"Unique motif names: {len(unique_names)}")
                if len(unique_names) <= 50:
                    print("\nAll unique motif names:")
                    for name in sorted(unique_names):
                        print(f"  - {name}")
                else:
                    print("\nFirst 50 unique motif names:")
                    for name in sorted(unique_names)[:50]:
                        print(f"  - {name}")
            else:
                print("\nNo obvious motif names found. Checking common locations...")
                # Check common Modisco structure locations
                common_paths = [
                    'metacluster_0/patterns',
                    'metacluster_0/seqlets',
                    'patterns',
                    'seqlets',
                ]
                for path in common_paths:
                    if path in f:
                        print(f"\nFound path: {path}")
                        explore_hdf5_structure(path, f[path], depth=0, max_depth=3)
    
    except Exception as e:
        print(f"Error reading HDF5 file: {e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)


if __name__ == "__main__":
    main()



