##################################
#                                #
# Last modified 2017/12/04       # 
#                                #
# Georgi Marinov                 #
#                                # 
##################################

import sys
import string
from sets import Set
import pysam
import os

def FLAG(FLAG):

    Numbers = [0,1,2,4,8,16,32,64,128,256,512,1024]

    FLAGList=[]

    MaxNumberList=[]
    for i in Numbers:
        if i <= FLAG:
            MaxNumberList.append(i)

    Residual=FLAG
    maxPos = len(MaxNumberList)-1

    while Residual > 0:
        if MaxNumberList[maxPos] <= Residual:
            Residual = Residual - MaxNumberList[maxPos]
            FLAGList.append(MaxNumberList[maxPos])
            maxPos-=1
        else:
            maxPos-=1
  
    return FLAGList

def getReverseComplement(preliminarysequence):
    
    DNA = {'A':'T','T':'A','G':'C','C':'G','N':'N','a':'t','t':'a','g':'c','c':'g','n':'n'}
    sequence=''
    for i in range(len(preliminarysequence)):
        sequence=sequence+DNA[preliminarysequence[len(preliminarysequence)-i-1]]
    return sequence

def run():

    if len(sys.argv) < 3:
        print 'usage: python %s BAM(TopHat/STAR) outfile [-withMulti] [-sgRNAprefix string]' % sys.argv[0]
        print '\tNOTE: assumed guideID format:'
        print '\t\tENSG00000151418.11::ATP6V1G3::ENST00000309309.11::3'
        sys.exit(1)

    BAM = sys.argv[1]
    samtools = sys.argv[2]
    outfilename = sys.argv[3]

    withMulti = False
    if '-withMulti' in sys.argv:
        withMulti = True

    sgRNAprefix = 'sgRNA'
    if '-sgRNAprefix' in sys.argv:
        sgRNAprefix = sys.argv[sys.argv.index('-sgRNAprefix') + 1]

    GuideSeqDict = {}

    cmd = samtools + ' view ' + BAM
    p = os.popen(cmd, "r")
    line = 'line'
    while line != '':
        line = p.readline()
        if line == '':
            break
#    samfile = pysam.Samfile(BAM, "rb" )
#    for read in samfile.fetch(until_eof=True):
#        fields = str(read).split('\t')
        fields = line.strip().split('\t')
#        guideID = read.qname
        guideID = fields[0]
        try:
            geneID = guideID.split('::')[0]
            geneName = guideID.split('::')[1]
            transcriptID = guideID.split('::')[2]
        except:
            print 'problem with guide RNA read ID, skipping'
            print fields
#        sequence = read.seq
        sequence = fields[9]
        if 16 in FLAG(int(fields[1])):
            sequence = getReverseComplement(sequence)
#        if read.is_reverse:
#            sequence = getReverseComplement(sequence)
        chr = fields[2]
#        pos = read.pos
        pos = fields[3]
#        CIGAR = read.cigar
        CIGAR = fields[5]
        if GuideSeqDict.has_key(sequence):
            if GuideSeqDict[sequence]['coordinates'][0] != (chr,pos,CIGAR):
                if withMulti:
                    GuideSeqDict[sequence]['coordinates'].append((chr,pos,CIGAR))
                else:
                    print 'multimapping guide detected, exiting'
                    print fields
                    print GuideSeqDict[sequence]
                    sys.exit()
            pass
        else:
            GuideSeqDict[sequence] = {}
            GuideSeqDict[sequence]['coordinates'] = []
            GuideSeqDict[sequence]['geneNames'] = []
            GuideSeqDict[sequence]['geneIDs'] = []
            GuideSeqDict[sequence]['transcripts'] = []
        GuideSeqDict[sequence]['coordinates'].append((chr,pos,CIGAR))
        GuideSeqDict[sequence]['geneNames'].append(geneName)
        GuideSeqDict[sequence]['geneIDs'].append(geneID)
        GuideSeqDict[sequence]['transcripts'].append(transcriptID)

    outfile = open(outfilename, 'w')
    outline = '#sgRNA_ID\tgeneIDs\tgeneNames\ttranscripts\tmapping(s)\tgRNA_sequence\tgRNA_sequence_reverse_complement\tGC%'
    outfile.write(outline + '\n')

    OutlineList = []

    i=0
    for sequence in GuideSeqDict.keys():
        i+=1
        GuideSeqDict[sequence]['geneNames'] = list(Set(GuideSeqDict[sequence]['geneNames']))
        GuideSeqDict[sequence]['geneIDs'] = list(Set(GuideSeqDict[sequence]['geneIDs']))
        GuideSeqDict[sequence]['transcripts'] = list(Set(GuideSeqDict[sequence]['transcripts']))
        GuideSeqDict[sequence]['geneNames'].sort()
        GuideSeqDict[sequence]['geneIDs'].sort()
        GuideSeqDict[sequence]['transcripts'].sort()
        GuideSeqDict[sequence]['coordinates'] = list(Set(GuideSeqDict[sequence]['coordinates']))
        GuideSeqDict[sequence]['coordinates'].sort()
        outline = ''
        for ID in GuideSeqDict[sequence]['geneIDs']:
            outline = outline + ID + ',' 
        outline = outline[0:-1] + '\t'
        for ID in GuideSeqDict[sequence]['geneNames']:
            outline = outline + ID + ',' 
        outline = outline[0:-1] + '\t'
        for ID in GuideSeqDict[sequence]['transcripts']:
            outline = outline + ID + ',' 
        outline = outline[0:-1] + '\t'
        for (chr,pos,CIGAR) in GuideSeqDict[sequence]['coordinates']:
            outline = outline + chr + ':' + pos + ':' + CIGAR + ','
        outline = outline[0:-1] + '\t' + sequence + '\t' + getReverseComplement(sequence)
        GC = (sequence.count('G') + sequence.count('C'))/(len(sequence) + 0.0)
        outline = outline + '\t' + str(GC)
        OutlineList.append(outline)
 
    OutlineList.sort()

    i=0
    for outline in OutlineList:
        i+=1
        outfile.write(sgRNAprefix + '_' + str(i) + '\t' + outline + '\n')

    outfile.close()

run()
