##################################
#                                #
# Last modified 2019/01/18       # 
#                                #
# Georgi Marinov                 #
#                                # 
##################################

import sys
import string
import os

# FLAG field meaning (columns: hex value, decimal value, description; [n] refers to the notes below)
# 0x0001 1 the read is paired in sequencing, no matter whether it is mapped in a pair
# 0x0002 2 the read is mapped in a proper pair (depends on the protocol, normally inferred during alignment) [1]
# 0x0004 4 the query sequence itself is unmapped
# 0x0008 8 the mate is unmapped [1]
# 0x0010 16 strand of the query (0 for forward; 1 for reverse strand)
# 0x0020 32 strand of the mate [1]
# 0x0040 64 the read is the first read in a pair [1,2]
# 0x0080 128 the read is the second read in a pair [1,2]
# 0x0100 256 the alignment is not primary (a read having split hits may have multiple primary alignment records)
# 0x0200 512 the read fails platform/vendor quality checks
# 0x0400 1024 the read is either a PCR duplicate or an optical duplicate
#
# [1] Flags 0x0002, 0x0008, 0x0020, 0x0040 and 0x0080 are only meaningful when flag 0x0001 is set.
# [2] If the information on which read is the first in the pair was lost upstream, 0x0001 is set and 0x0040 and 0x0080 are both zero.

def FLAG(FLAG):
    """Decompose a SAM FLAG value into the individual bit values it contains.

    Parameters
    ----------
    FLAG : int
        The integer FLAG field of an alignment record.

    Returns
    -------
    list of int
        The powers of two whose bits are set in FLAG, largest first.
        The list is empty when FLAG is zero or negative.
    """
    FLAGList = []

    # Test every bit up to the highest one FLAG can contain, instead of
    # walking a fixed table: a table that stops at 1024 mis-decodes any
    # value with bit 2048 or above set.
    bit = 1
    while bit <= FLAG:
        if FLAG & bit:
            FLAGList.append(bit)
        bit <<= 1

    # Bits were collected smallest first; callers receive them largest first.
    FLAGList.reverse()

    return FLAGList

def run():
    """Tabulate an insert-length distribution from `samtools view` output.

    Reads three positional arguments from sys.argv: the BAM file name, the
    samtools executable, and the output file name. Every alignment line
    printed by `<samtools> view <BAM>` is handled as follows:

    * if the mate-unmapped bit (0x8) is set in the FLAG field, the line is
      counted as a singleton;
    * otherwise, if the mate position is greater than the read position, the
      line is skipped;
    * otherwise the length `pos - matepos + len(SEQ)` is counted.

    The output is a tab-separated table with a `#Length\tNumberPairs` header,
    one row per observed length in increasing order, and a final `singleton`
    row.

    Exits with status 1 when too few arguments are given, or (after writing
    the output) when the samtools command reports a non-zero exit status.

    NOTE(review): the mate's reference name (field 7) is not compared with the
    read's, and lines where pos == matepos are not skipped, so they are counted
    once per line - confirm that this is intended.
    """
    # Three positional arguments are read below, so argv needs four entries.
    if len(sys.argv) < 4:
        print('usage: python %s BAMfilename samtools outputfilename' % sys.argv[0])
        sys.exit(1)

    BAM = sys.argv[1]
    samtools = sys.argv[2]
    outfilename = sys.argv[3]

    InsertLengthDistribution = {}
    singletons = 0

    # NOTE: the command is a shell string, so paths containing spaces or
    # shell metacharacters must be quoted by the caller.
    cmd = samtools + ' view ' + BAM
    p = os.popen(cmd, "r")
    RL = 0
    try:
        for line in p:
            RL += 1
            if RL % 1000000 == 0:
                print(str(RL // 1000000) + 'M lines processed')
            fields = line.strip().split('\t')
            # Test the mate-unmapped bit directly so that higher FLAG bits
            # (e.g. 2048) cannot disturb the result.
            if int(fields[1]) & 0x8:
                singletons += 1
                continue
            pos = int(fields[3])
            matepos = int(fields[7])
            # Only the line whose mate is not downstream contributes a length.
            if matepos > pos:
                continue
            IL = pos - matepos + len(fields[9])
            InsertLengthDistribution[IL] = InsertLengthDistribution.get(IL, 0) + 1
    finally:
        # close() returns None on success and the exit status otherwise.
        status = p.close()

    with open(outfilename, 'w') as outfile:
        outfile.write('#Length\tNumberPairs' + '\n')
        for IL in sorted(InsertLengthDistribution):
            outfile.write(str(IL) + '\t' + str(InsertLengthDistribution[IL]) + '\n')
        # The singleton count is always written as the last row.
        outfile.write('singleton' + '\t' + str(singletons) + '\n')

    if status is not None:
        sys.stderr.write('warning: "%s" exited with status %s; '
                         'the distribution may be incomplete\n' % (cmd, status))
        sys.exit(1)
            
# Script entry point: executes unconditionally when the file is run (or imported);
# run() takes its arguments from sys.argv.
run()
