import gzip

def gtf_to_bed(gtf_file, bed_file):
    """Convert the gene records of a gzipped GTF file into a BED-like table.

    Only records whose feature column is ``gene`` are written. Comment lines
    (starting with ``#``), blank lines and all other features are skipped.
    The first record seen for a given gene name wins; later records with the
    same name are dropped so the ``name`` column stays unique.

    Output columns (tab-separated, preceded by a ``#``-prefixed header line):
    chr, start (GTF start minus one), end, name, score (always ``0``),
    strand, Ensembl_ID and gene_type.

    Args:
        gtf_file: Path to a gzip-compressed GTF file.
        bed_file: Path of the text file to write.

    Raises:
        ValueError: If a non-blank data line does not have nine tab-separated
            columns, or a gene record has a non-integer start.
        KeyError: If a gene record lacks ``gene_id``, or has neither a
            ``gene_biotype`` nor a ``gene_type`` attribute.
    """
    seen_names = set()
    with gzip.open(gtf_file, 'rt') as gtf, open(bed_file, 'w') as bed:
        bed.write('#chr\tstart\tend\tname\tscore\tstrand\tEnsembl_ID\tgene_type\n')
        for line_number, line in enumerate(gtf, start=1):
            if line.startswith('#'):
                continue
            record = line.strip()
            if not record:
                # Blank lines carry no data; skip them instead of failing on
                # the column unpacking below.
                continue
            fields = record.split('\t')
            if len(fields) != 9:
                raise ValueError(
                    f'{gtf_file}: line {line_number}: expected 9 '
                    f'tab-separated columns, found {len(fields)}'
                )
            chrom, _, feature, start, end, _, strand, _, attributes = fields

            # Filter before doing any per-record work: only gene records are
            # ever written, so nothing else needs its attributes parsed.
            if feature != 'gene':
                continue

            # GTF coordinates are 1-based inclusive; BED starts are 0-based.
            start = int(start) - 1
            # Prefix bare names ("1", "X", "MT") with "chr". Names containing
            # a dot (e.g. "KI270728.1") are kept as they are, and names that
            # already start with "chr" are not prefixed a second time.
            if '.' not in chrom and not chrom.startswith('chr'):
                chrom = f'chr{chrom}'

            attribute_dict = {}
            for attribute in attributes.split(';'):
                key, _, value = attribute.strip().partition(' ')
                if not key:
                    # Empty field, e.g. from a trailing or doubled ';'.
                    continue
                attribute_dict[key] = value.strip().strip('"')

            gene_id = attribute_dict['gene_id']
            gene_name = attribute_dict.get('gene_name', gene_id)
            if gene_name in seen_names:
                continue
            # Ensembl files name this attribute "gene_biotype"; GENCODE files
            # call it "gene_type".
            gene_type = attribute_dict.get(
                'gene_biotype', attribute_dict.get('gene_type')
            )
            if gene_type is None:
                raise KeyError(
                    f'{gtf_file}: line {line_number}: gene record has '
                    f'neither gene_biotype nor gene_type'
                )
            bed.write(f'{chrom}\t{start}\t{end}\t{gene_name}\t0\t{strand}\t{gene_id}\t{gene_type}\n')
            seen_names.add(gene_name)
            

# `snakemake` is not defined in this file; presumably it is the object that
# Snakemake's `script:` directive injects at run time (confirm against the
# calling rule). Each unpacking requires exactly one input / output path and
# raises ValueError otherwise.
[gtf_file] = snakemake.input
[bed_file] = snakemake.output

# Run the conversion for the single input/output pair of the rule.
gtf_to_bed(gtf_file, bed_file)