##################################
#                                #
# Last modified 06/10/2010       # 
#                                #
# Georgi Marinov                 #
#                                # 
##################################

import sys
import string
from sets import Set


def run():

    if len(sys.argv) < 2:
        print 'usage: python %s SAMfilename outputfilename ' % sys.argv[0]
        print ' -multiReadCorrection not implemented yet' 
        sys.exit(1)
    
    cachePages = 2000000

    SAM = sys.argv[1]
    outfilename = sys.argv[2]

    OtherChrDict={}
    OtherChrDict['Reads']=[]
    OtherChrDict['Splices']=[]
    OtherChrDict['Multi']={'1':0, '3':0, '0':0, '255':0}
    ChrMDict={}
    ChrMDict['Reads']=[]
    ChrMDict['Splices']=[]
    ChrMDict['Multi']={'1':0, '3':0, '0':0, '255':0}

    i=0
    linelist = open(SAM)
    for line in linelist:
        if line.startswith('@'):
            continue
        fields=line.strip().split('\t')
        chr=fields[2]
        if chr=='chrM':
            ChrMDict['Reads'].append(fields[1])
            if 'N' in fields[5]:
                ChrMDict['Splices'].append(fields[1])
                ChrMDict['Multi'][fields[4]]+=1
        else:
            OtherChrDict['Reads'].append(fields[1])
            if 'N' in fields[5]:
                OtherChrDict['Splices'].append(fields[1])
                OtherChrDict['Multi'][fields[4]]+=1
        i+=1
        if i % 1000000 == 0:
            print i, 'lines processed'

    ChrMDict['Reads']=list(Set(ChrMDict['Reads']))
    ChrMDict['Splces']=list(Set(ChrMDict['Splices']))
    OtherChrDict['Reads']=list(Set(OtherChrDict['Reads']))
    OtherChrDict['Splces']=list(Set(OtherChrDict['Splices']))

    outfile = open(outfilename, 'w')

    outfile.write('chrM reads stats:'+'\n')
    outfile.write('Total chrM reads:'+str(len(ChrMDict['Reads']))+'\n')
    outfile.write('Spliced chrM reads:'+str(len(ChrMDict['Splices']))+'\n')
    outfile.write('Mutli_classes:\t0\t1\t3\t255\n')
    outfile.write('Counts:\t'+str(ChrMDict['Multi']['0'])+'\t'+str(ChrMDict['Multi']['1'])+'\t'+str(ChrMDict['Multi']['3'])+'\t'+str(ChrMDict['Multi']['255'])+'\n')
    outfile.write('=============================================================================\n')
    outfile.write('Non-chrM reads stats:'+'\n')
    outfile.write('Total reads:'+str(len(OtherChrDict['Reads']))+'\n')
    outfile.write('Spliced reads:'+str(len(OtherChrDict['Splices']))+'\n')
    outfile.write('Mutli_classes:\t0\t1\t3\t255\n')
    outfile.write('Counts:\t'+str(OtherChrDict['Multi']['0'])+'\t'+str(OtherChrDict['Multi']['1'])+'\t'+str(OtherChrDict['Multi']['3'])+'\t'+str(OtherChrDict['Multi']['255'])+'\n')
    
    outfile.close()
            
run()
