##################################
#                                #
# Last modified 06/02/2012       # 
#                                #
# Georgi Marinov                 #
#                                # 
##################################

import sys

try:
	import psyco
	psyco.full()
except:
	pass

def run():

    if len(sys.argv) < 3:
        print 'usage: python %s region chrFieldID leftFieldID rightFieldID outfilename [-labelFields ID1,...IDN] [-noNumbering] [-peakID fieldID] [-narrowPeak]' % sys.argv[0]
        print '       this script will take any file with genomic coordinates and convert it to format similar to ERANGE hts format, although with artificial scores' 
        print '       unless the -peakID or the -narrowPeak options are used, the peak will be calculated as the middle of the region' 
        print '       -labelFields will be merged with a __ between them in the indicated order; if you do not want the regions numbered, used the -noNumbering option' 
        sys.exit(1)

    inputfilename = sys.argv[1]
    chrFieldID = int(sys.argv[2])
    leftFieldID = int(sys.argv[3])
    rightFieldID = int(sys.argv[4])
    outputfilename = sys.argv[5]
    outfile = open(outputfilename, 'w')

    doNarrowPeak=False
    if '-narrowPeak' in sys.argv:
        doNarrowPeak=True

    doPeakID=False
    if '-peakID' in sys.argv:
        doPeakID=True
        peakFieldID = int(sys.argv[sys.argv.index('-peakID')+1])

    doPeakID=False
    if '-peakID' in sys.argv:
        doPeakID=True
        peakFieldID = int(sys.argv[sys.argv.index('-peakID')+1])

    doLabelFields=False
    if '-labelFields' in sys.argv:
        doLabelFields=True
        labels = sys.argv[sys.argv.index('-labelFields')+1].split(',')
        LabelFields=[]
        for ID in labels:
            LabelFields.append(int(ID))
        LabelFieldsDict={}
        doNumbering=True
        if '-noNumbering' in sys.argv:
            doNumbering=False

    outline = '#regionID\tchrom\tstart\tstop\tRPM\tfold\tmulti%\tplus%\tleftPlus%\tpeakPos\tpeakHeight\tpValue'
    outfile.write(outline + '\n')

    lineslist = open(inputfilename)
    i=0
    for line in lineslist:
        if line.startswith('#'):
            continue
        i+=1
        if i % 1000000 == 0:
            print i, 'lines processed'
        fields = line.strip().split('\t')
        if doLabelFields:
            regionID = ''
            for ID in LabelFields:
                regionID = regionID + fields[ID] + '__'
            regionID = regionID.replace('/','-')
            if doNumbering:
                if LabelFieldsDict.has_key(regionID):
                    LabelFieldsDict[regionID]+=1
                else:
                    LabelFieldsDict[regionID]=1
                regionID = regionID + str(LabelFieldsDict[regionID])
            else:
                regionID = regionID[0:-2]
        else:
            regionID = 'region' + str(i)
        if doNarrowPeak:
            chr = fields[0]
            left = int(fields[1])
            right = int(fields[2])
            peakoffset = int(fields[9])
            peak = left + peakoffset
        else:
            chr=fields[chrFieldID]
            left = int(fields[leftFieldID])
            right = int(fields[rightFieldID])
            if doPeakID:
                peak = fields[peakFieldID]
            else:
                peak = int((left + right)/2.0)
        outline = regionID + '\t' + chr + '\t' + str(left) + '\t' + str(right) + '\t5\t5\t1\t1\t1\t1' + '\t' + str(peak) + '\t5\t0'
        outfile.write(outline + '\n')

    outfile.close()

run()