##################################
#                                #
# Last modified 02/22/2011       # 
#                                #
# Georgi Marinov                 #
#                                # 
##################################

import sys
import string
from sets import Set


def _attribute_value(attributes, key):
    """Return the quoted value stored under ``key`` in an attribute string.

    The attribute column is searched for ``key "<value>";`` and ``<value>``
    is returned.  Raises IndexError when ``key "`` does not occur.
    """
    return attributes.split(key + ' "')[1].split('";')[0]


def _read_label_files(list_path):
    """Parse the list-of-files file into a ``{label: filename}`` dict.

    Each non-blank line is ``label <tab> filename``.  When a label is
    repeated, the last filename wins.
    """
    label_to_file = {}
    with open(list_path) as handle:
        for line in handle:
            stripped = line.strip()
            if not stripped:
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            fields = stripped.split('\t')
            label_to_file[fields[0]] = fields[1]
    return label_to_file


def _collect_tss_values(path, label, data):
    """Add the npIDR value of every TSS record in ``path`` to ``data``.

    ``data`` maps ``(chr, TSS, strand, GeneID, TranscriptID)`` to a
    ``{label: value}`` dict and is updated in place.  Lines starting with
    '#', blank lines, and records whose third column is not 'TSS' are
    skipped.
    """
    with open(path) as handle:
        for line in handle:
            if line.startswith('#'):
                continue
            stripped = line.strip()
            if not stripped:
                continue
            fields = stripped.split('\t')
            if fields[2] != 'TSS':
                continue
            attributes = fields[8]
            data_id = (
                fields[0],                                    # chromosome
                fields[3],                                    # TSS position
                fields[6],                                    # strand
                _attribute_value(attributes, 'gene_id'),
                _attribute_value(attributes, 'transcript_id'),
            )
            value = _attribute_value(attributes, 'npIDR')
            data.setdefault(data_id, {})[label] = value


def run():
    """Merge per-file TSS npIDR values into one tab-separated table.

    Command line: ``<list of files filename> outputfilename``.  The list
    file holds one ``label <tab> filename`` pair per line.  Every listed
    file is scanned for tab-separated records whose third column is 'TSS';
    for each such record the ``npIDR`` attribute from column nine is stored
    under the file's label.

    The output has a header line followed by one row per distinct
    (chr, TSS, strand, GeneID, TranscriptID) combination, sorted as tuples
    of strings, with one value column per label in sorted label order.

    Exits with status 1 after printing the usage when fewer than two
    arguments are supplied.
    """
    # Both the list file and the output filename are required.
    if len(sys.argv) < 3:
        print('usage: python %s <list of files filename> outputfilename ' % sys.argv[0])
        print('\tformat of list of files file: label <tab> filename')
        sys.exit(1)

    list_path = sys.argv[1]
    out_path = sys.argv[2]

    # Written when a label has no value for a TSS.
    # NOTE(review): presumably an out-of-range npIDR sentinel -- confirm.
    missing_value = '2'

    label_to_file = _read_label_files(list_path)
    labels = sorted(label_to_file)

    data = {}
    for label in labels:
        path = label_to_file[label]
        print(path)
        _collect_tss_values(path, label, data)

    header = ['#chr', 'TSS', 'strand', 'GeneID', 'TranscriptID(s)'] + labels
    with open(out_path, 'w') as outfile:
        outfile.write('\t'.join(header) + '\n')
        for data_id in sorted(data):
            values = data[data_id]
            row = list(data_id)
            row.extend(values.get(label, missing_value) for label in labels)
            # join keeps empty trailing columns, so rows stay aligned with
            # the header.
            outfile.write('\t'.join(row) + '\n')
            
# Script entry point.  There is no ``__main__`` guard, so run() executes
# whenever this file is run or imported.
run()
