##################################
#                                #
# Last modified 08/29/2013       # 
#                                #
# Georgi Marinov                 #
#                                # 
##################################

import sys
import string
import math

# FLAG field meaning
# 0x0001 1 the read is paired in sequencing, no matter whether it is mapped in a pair
# 0x0002 2 the read is mapped in a proper pair (depends on the protocol, normally inferred during alignment) 1
# 0x0004 4 the query sequence itself is unmapped
# 0x0008 8 the mate is unmapped 1
# 0x0010 16 strand of the query (0 for forward; 1 for reverse strand)
# 0x0020 32 strand of the mate 1
# 0x0040 64 the read is the first read in a pair 1,2
# 0x0080 128 the read is the second read in a pair 1,2
# 0x0100 256 the alignment is not primary (a read having split hits may have multiple primary alignment records)
# 0x0200 512 the read fails platform/vendor quality checks
# 0x0400 1024 the read is either a PCR duplicate or an optical duplicate

def getReverseComplement(preliminarysequence):
    
    DNA = {'A':'T','T':'A','G':'C','C':'G','N':'N','a':'t','t':'a','g':'c','c':'g','n':'n'}
    sequence=''
    for j in range(len(preliminarysequence)):
        sequence=sequence+DNA[preliminarysequence[len(preliminarysequence)-j-1]]
    return sequence

def SAMoutput(read1,read2,TranscriptDict):

    FLAG1 = 1 + 2 + 64 + 
    FLAG2 = 1 + 2 + 128 + 

    ID = read1[ID]
    premRNA = False
    if ':pre_mRNA:' in ID:
        premRNA = True
    fields = ID.split(':')
    geneName = fields[0].split('@')[1]
    geneID = fields[1]
    transcriptName = fields[2]
    transcriptID = fields[3]
    chr = fields[4]
    if fields[5].split('-')[2] == 'plus_strand'
        strand = '+'
    if fields[5].split('-')[2] == 'minus_strand'
        strand = '-'
    if premRNA:
        pos_read1 = int(fields[6].split('pos_read1='))
        pos_read2 = int(fields[7].split('pos_read2='))
        CIGAR_read1 = int(fields[8].split('CIGAR_read1='))
        CIGAR_read2 = int(fields[9].split('CIGAR_read2='))
    else:
        pos_read1 = int(fields[7].split('pos_read1='))
        pos_read2 = int(fields[8].split('pos_read2='))
        CIGAR_read1 = int(fields[9].split('CIGAR_read1='))
        CIGAR_read2 = int(fields[10].split('CIGAR_read2='))

    if premRNA:
        

def run():

    if len(sys.argv) < 11:
        print 'usage: python %s GTF FASTQ FASTQ2' % sys.argv[0]
        print '\tthe script will print SAM alignments to standrd output; capture them with samtools and sort them to get the final BAM file'
        sys.exit(1)

    GTF = sys.argv[1]
    FASTQ1 = sys.argv[2]
    FASTQ2 = sys.argv[3]

    TranscriptDict = {}

    linelist = open(GTF)
    for line in linelist:
        if line.startswith('#'):
            continue
        fields=line.strip().split('\t')
        if fields[2]!='exon':
            continue
        chr=fields[0]
        start=int(fields[3])
        stop=int(fields[4])
        strand=fields[6]
        if 'transcript_name "' in fields[8]:
            transcriptName=fields[8].split('transcript_name "')[1].split('";')[0]
        else:
            transcriptName=fields[8].split('transcript_id "')[1].split('";')[0]
        transcriptID=fields[8].split('transcript_id "')[1].split('";')[0]
        if 'gene_name "' in fields[8]:
            geneName=fields[8].split('gene_name "')[1].split('";')[0]
        else:
            geneName=fields[8].split('gene_id "')[1].split('";')[0]
        geneID=fields[8].split('gene_id "')[1].split('";')[0]
        transcript = (geneID, geneName, transcriptID, transcriptName)
        if TranscriptDict.has_key(transcript):
            pass
        else:
            TranscriptDict[transcript]={}
            TranscriptDict[transcript]['exons']=[]
        TranscriptDict[transcript]['exons'].append((chr,start,stop,strand))

    for transcript in TranscriptDict.keys():
        TranscriptDict[transcript]['splicesites'] = {}
        TranscriptDict[transcript]['exons'].sort()
        (chr,start,stop,strand) = TranscriptDict[transcript]['exons']
        if strand == '+':
            pos = 0
            for (chr,start,stop,strand) in TranscriptDict[transcript]['exons']:
                TranscriptDict[transcript]['splicesites'][pos] = start
                pos += (stop-start)
                TranscriptDict[transcript]['splicesites'][pos] = stop
        if strand == '-':
            TranscriptDict[transcript]['exons'].reverse()
            pos = 0
            for (chr,start,stop,strand) in TranscriptDict[transcript]['exons']:
                TranscriptDict[transcript]['splicesites'][pos] = stop
                pos += (stop-start)
                TranscriptDict[transcript]['splicesites'][pos] = start

    linelist1 = open(FASTQ1)
    linelist2 = open(FASTQ1)
    while line1 != '':
        read1 = {}
        line1 = linelist1.readline()
        read1['ID'] = line.strip() 
        line1 = linelist1.readline()
        read1['sequence'] = line.strip() 
        line1 = linelist1.readline()
        line1 = linelist1.readline()
        read1['quality'] = line.strip() 
        read2 = {}
        line2 = linelist2.readline()
        read2['ID'] = line.strip() 
        line2 = linelist2.readline()
        read2['sequence'] = line.strip() 
        line2 = linelist2.readline()
        line2 = linelist2.readline()
        read2['quality'] = line.strip() 
        if read1['ID'][0] != '@' or read2['ID'][0] != '@':
            print 'fastq broken'
            sys.exit(1)
        def SAMoutput(read1,read2,TranscriptDict):   

run()