#
# License: FreeBSD (Berkeley Software Distribution)
# Copyright (c) 2013, Sara Sheehan, Kelley Harris, Yun Song
#
# strip a vcf file so we just have the haplotype information (creates a new file)
# note: for unphased sites, N's will be used, if unphased sites are too prevalent this could lead to poor results
#


import optparse
import sys


#------------------------------
# PARSE COMMAND LINE ARGUMENTS
#------------------------------


parser = optparse.OptionParser(description='creates a plot of the likelihood over the EM iterations')

parser.add_option('-v', '--vcf_filename', type='string', help='path to input vcf file')
parser.add_option('-o', '--out_filename', type='string', help='path to output stripped vcf file')

(opts, args) = parser.parse_args()

mandatories = ['vcf_filename','out_filename']
for m in mandatories:
    if not opts.__dict__[m]:
        print 'mandatory option ' + m + ' is missing\n'
        parser.print_help()
        sys.exit()



#---------
# HELPERS
#---------


# see if a genotype is phased or unambiguously called (i.e. homozygous)
# input: genotype, i.e. 0/0, 1|0 (both okay), or 1/0 (not okay)
def phased(genotype):
    if (genotype[0] == genotype[2]) or (genotype[1] == '|'):
        return True
    return False


# if 0, return reference allele; if 1, return alternative allele
def to_add(allele, ref, alt):
    if allele == '0':
        return ref
    else:
        return alt

    
#------
# MAIN
#------


# parse vcf file
vcf_file = file(opts.vcf_filename,'r')
out_file = file(opts.out_filename,'w')

for line in vcf_file:
    if line[0] != '#': # not a comment line

        # get tokens
        tokens = line.strip().split()
        chrom = tokens[0]
        pos = tokens[1]
        ref = tokens[3]
        alt = tokens[4]
        hap_lst_raw = tokens[9:]

        # parse haplotypes
        haps_str = ''
        for hap_raw in hap_lst_raw:
            genotype = hap_raw[:3]
            if phased(genotype):
                haps_str += to_add(genotype[0],ref,alt)
                haps_str += to_add(genotype[2],ref,alt)
            else: 
                haps_str += 'NN' # add unknown alleles

        # write the stripped down portion
        out_file.write(chrom + ' ' + pos + ' ' + haps_str + '\n')


# close files
vcf_file.close()
out_file.close()

