##################################
#                                #
# Last modified 2018/05/04       #
#                                #
# Georgi Marinov                 #
#                                # 
##################################

import sys
import string
import math
import random
from sets import Set
import time

def getReverseComplement(preliminarysequence):
    """Return the reverse complement of a DNA sequence.

    Upper- and lower-case bases are supported and case is preserved.  Besides
    A/C/G/T, the placeholders N and X (which map to themselves) and the IUPAC
    ambiguity codes are handled.  Ambiguity codes are complemented according
    to the bases they stand for: R (A/G) <-> Y (C/T), M (A/C) <-> K (G/T),
    B (C/G/T) <-> V (A/C/G), D (A/G/T) <-> H (A/C/T); S (C/G) and W (A/T) are
    their own complements.

    Raises KeyError if the sequence contains any other character.
    """
    complement = {
        'A': 'T', 'T': 'A', 'G': 'C', 'C': 'G', 'N': 'N', 'X': 'X',
        'a': 't', 't': 'a', 'g': 'c', 'c': 'g', 'n': 'n', 'x': 'x',
        'R': 'Y', 'Y': 'R', 'M': 'K', 'K': 'M', 'S': 'S', 'W': 'W',
        'r': 'y', 'y': 'r', 'm': 'k', 'k': 'm', 's': 's', 'w': 'w',
        'B': 'V', 'V': 'B', 'D': 'H', 'H': 'D',
        'b': 'v', 'v': 'b', 'd': 'h', 'h': 'd',
    }
    # Walk the input back to front and join once, instead of growing the
    # result string one character at a time.
    return ''.join([complement[base] for base in reversed(preliminarysequence)])

def retrievePrimers():
    """Return the hard-coded table of sub-library amplification primers.

    The result is a list of 83 ``(name, (leftPrimer, rightPrimer))`` tuples,
    in the order in which they are assigned to libraries: run() hands the
    i-th entry to the i-th non-comment line of its input file, prepends
    ``leftPrimer`` to every sgRNA and appends the reverse complement of
    ``rightPrimer``.  The ``name`` is written verbatim to the ``.primers``
    output file.
    """

    # NOTE(review): entries sublib_42 .. sublib_83 all reuse the 'oMCB1725'
    # prefix of sublib_41, whereas sublib_1 .. sublib_41 count up from
    # oMCB1685 -- presumably a copy/paste leftover; confirm the intended oligo
    # names before relying on them (they end up in the .primers output).
    primers = [
        ('oMCB1685_sublib_1', ('CTTAAACCGGCCAACATACC', 'ATGCTACTCGTTCCTTTCGA')),
        ('oMCB1686_sublib_2', ('TGCTCTTTATTCGTTGCGTC', 'TCTTATCGGTGCTTCGTTCT')),
        ('oMCB1687_sublib_3', ('TGAGCCTTATGATTTCCCGT', 'GTCCGTTTTCCTGAATGAGC')),
        ('oMCB1688_sublib_4', ('CGTTCTAAACGGCTAGATGC', 'AGTCTGTCTTTCCCCTTTCC')),
        ('oMCB1689_sublib_5', ('GTATCCGAAGCGTGGAGTAT', 'CAGGTATGCGTAGGAGTCAA')),
        ('oMCB1690_sublib_6', ('CCAAAGATTCAACCGTCCTG', 'ATTAGCCATTTCAGGACGGA')),
        ('oMCB1691_sublib_7', ('TATTCATGCTTGGACGGACT', 'ACTATGTACCGCTTGTTGGA')),
        ('oMCB1692_sublib_8', ('ATCGACAATGGTATGGCTGA', 'TATGTCTCCTAGCCACTCCT')),
        ('oMCB1693_sublib_9', ('GTCCTAGTGAGGAATACCGG', 'CCGAAGAATCGCAGATCCTA')),
        ('oMCB1694_sublib_10', ('TTAGATAGGTGTGTAGGCGC', 'TAAGGTGCGTACTAGCTGAC')),
        ('oMCB1695_sublib_11', ('TTCCGTTTATGCTTTCCAGC', 'TCCTTGGAGTTTAGAGCGAG')),
        ('oMCB1696_sublib_12', ('GTATAGTTTGTGCGGTGGTC', 'ATCAATCCCCTACACCTTCG')),
        ('oMCB1697_sublib_13', ('TCAGCCTTTCATTGATTGCG', 'TTCCTTGATACCGTAGCTCG')),
        ('oMCB1698_sublib_14', ('AGGGTCGTGGTTAAAGGTAC', 'CGTTTCTTTCCGGTCGTTAG')),
        ('oMCB1699_sublib_15', ('TGCAAGTGTACAAATCCAGC', 'GAACGGTGATCCCTTTCCTA')),
        ('oMCB1700_sublib_16', ('CTTAAGGTTTGCCCATTCCC', 'TGTTATAGCTTCCACGGTGT')),
        ('oMCB1701_sublib_17', ('TGGTTCGTTAGTCGATCTCC', 'AGACGGGATTTTACTGGGTC')),
        ('oMCB1702_sublib_18', ('TATTTTGTAGAGCGTTCGCG', 'TCTTTGCTTCGCAAGTCTTG')),
        ('oMCB1703_sublib_19', ('TTCTGTAAGTTTCGTCGGGA', 'CTAAACACCGCACCTCACTA')),
        ('oMCB1704_sublib_20', ('TTGACGTACGTAGGTTCTCC', 'GAACACAACTACACTGACGC')),
        ('oMCB1705_sublib_21', ('GAGATGAGTAGACGAGTGGG', 'ATGGTCACTGACTCGCATTA')),
        ('oMCB1706_sublib_22', ('CTTTGGGCTTTCAGATGAGC', 'CAAAGATTTCTGTCGGTCGG')),
        ('oMCB1707_sublib_23', ('TGTCATATGCTAACGTCCGT', 'TGGCTACTTTCTTAGCGGAA')),
        ('oMCB1708_sublib_24', ('TTGCGACATCACAATTCTCG', 'TACTTCGAGACTTCATGCGT')),
        ('oMCB1709_sublib_25', ('TCAGTATGGCGTCTTGAAGT', 'ATGGCCCGACCTCTATTATG')),
        ('oMCB1710_sublib_26', ('TCATGTCGTGACCAGTAGAC', 'TGGGTCTAGTGAACTTCGTC')),
        ('oMCB1711_sublib_27', ('AACTAACGGATTTAAGCGCG', 'AACATATGTTGCTTCGTCCG')),
        ('oMCB1712_sublib_28', ('CATTTTCTGTTCCCCAGTGG', 'TCGAGTTAGATTGTCACCCC')),
        ('oMCB1713_sublib_29', ('ATTTGCCTAACCACTCCACT', 'TCAGAGCTTTTCGGTACAGT')),
        ('oMCB1714_sublib_30', ('TGACTTATGAACCTTTGCGC', 'GCCCAGGAGTAGTCGTTAAT')),
        ('oMCB1715_sublib_31', ('ATAGGATTAGCTGATGGGCC', 'TCTGTGTTCCGACTAAGGTC')),
        ('oMCB1716_sublib_32', ('TGAGATTCGGGACTATTCGG', 'TCTGTTGTTAGACTCCGACC')),
        ('oMCB1717_sublib_33', ('TTGGTTAGTACACGGGACTC', 'GTACGTCTGAACTTGGGACT')),
        ('oMCB1718_sublib_34', ('ATTTGTGTATCGAGGCTCGT', 'AGACACGCGATTGTTTAACC')),
        ('oMCB1719_sublib_35', ('ATCGTTCCCCATCACATTCT', 'CCGTTCGTTTTGAGCACTTA')),
        ('oMCB1720_sublib_36', ('ATTACCATGTTATCGGGCGA', 'AGGTTAGGGAACGCAAGATT')),
        ('oMCB1721_sublib_37', ('TCGGTGGATATGACGTAACC', 'CCAGACTGTGCTCGTTATCT')),
        ('oMCB1722_sublib_38', ('GGTCAGATGGTTTACATGCG', 'AGTTGTTCTCTATCCGCGAT')),
        ('oMCB1723_sublib_39', ('TCTCGTTCGAAAATCATCGC', 'GATTAAATCTCGCCGGTGAC')),
        ('oMCB1724_sublib_40', ('TGCAAATGTGAGGTAGCAAC', 'TTGTAGTTTTCGCTTGCGTT')),
        ('oMCB1725_sublib_41', ('AAAGTCAAAGTGCGTTTCGT', 'TGTGTTGCTCTCTCATAGCC')),
        ('oMCB1725_sublib_42', ('GCTTATTCGTGCCGTGTTAT', 'TACTTTTGATTGCTGTGCCC')),
        ('oMCB1725_sublib_43', ('TTTGCTTCAGTCAGATTCGC', 'GTTCAATCACTGAATCCCGG')),
        ('oMCB1725_sublib_44', ('GTCGAGTCCTATGTAACCGT', 'CAGGGGTCGTCATATCTTCA')),
        ('oMCB1725_sublib_45', ('GTAAGATGGAAGCCGGGATA', 'CACCTCATAGAGCTGTGGAA')),
        ('oMCB1725_sublib_46', ('GGTGTCGCAACATGATCTAC', 'CGGTTCCTAGTCATGTTTGC')),
        ('oMCB1725_sublib_47', ('GTGCTAAGTCACACTGTTGG', 'TTGTACTAATCTCGTCCCGG')),
        ('oMCB1725_sublib_48', ('TCTAAACAGTTAGGCCCAGG', 'TTATGTTCACAACTGGCGTG')),
        ('oMCB1725_sublib_49', ('GTCTTTATACTTGCCTGCCG', 'TGGAACTGATTTGGCCTTTG')),
        ('oMCB1725_sublib_50', ('CACCGCGATCAATACAACTT', 'TATAGTTCCTCCCATGCACC')),
        ('oMCB1725_sublib_51', ('TTCGGATAGACTCAGGAAGC', 'ACAATAGACAGACCCATGCA')),
        ('oMCB1725_sublib_52', ('CCATTGATAGATTCGCTCGC', 'GAGTCGAGCTAGCATAGGAG')),
        ('oMCB1725_sublib_53', ('TTTTCTACTTTCCGGCTTGC', 'TTGTGGGAGCTTCTTACCAT')),
        ('oMCB1725_sublib_54', ('ATGACTATTGGGGTCGTACC', 'TCGTACGGGAATGACCATAG')),
        ('oMCB1725_sublib_55', ('TCGACAATAGTTGAGCCCTT', 'AGACACAACGTAGCCGATTA')),
        ('oMCB1725_sublib_56', ('GAGCCATGTGAAATGTGTGT', 'CGGACTAAAGGATCGAGTCA')),
        ('oMCB1725_sublib_57', ('CGTATACGTAAGGGTTCCGA', 'CATCGGATAACACAAAGCGT')),
        ('oMCB1725_sublib_58', ('TTATGATGTCCGGATACCCG', 'GATGTATACTCCACCGTGGT')),
        ('oMCB1725_sublib_59', ('TCTTAGAAATCCACGGGTCC', 'TGAGATATGTACCTGGTGCC')),
        ('oMCB1725_sublib_60', ('GAAGGGTGGATCATCGTACT', 'ATTCTTGGGCCTATCGTTGT')),
        ('oMCB1725_sublib_61', ('GGCTGTTAGTTTTAGAGCCG', 'AAACCATATACAGCCGTCGT')),
        ('oMCB1725_sublib_62', ('AGTGGTGTAGTGGCTTCTAC', 'TAGCTAAATCCCACCCGATG')),
        ('oMCB1725_sublib_63', ('CTCAGAGGGAGTTCAACTGT', 'GTGCGGTTACAGTTTTGACT')),
        ('oMCB1725_sublib_64', ('TTTGGCAGATCATTAACGGC', 'GGGACTACATAGGGTGACAG')),
        ('oMCB1725_sublib_65', ('TATGATCTCCGTACACGAGC', 'CGTTGTCGTTCCAAAGAAGT')),
        ('oMCB1725_sublib_66', ('AGTGCCATGTTATCCCTGAA', 'AGTCACACATATACGGACCC')),
        ('oMCB1725_sublib_67', ('TTATACATCTGGACGCCTCC', 'AGAGAACCCCTATTATGGCG')),
        ('oMCB1725_sublib_68', ('TCCTCGATTCTCCAATCAGG', 'TCGTTAGGCTAAAACATGCG')),
        ('oMCB1725_sublib_69', ('GCTTAACGCATTTCAAGCAC', 'TGATAGGTCGTTCAGCCTAC')),
        ('oMCB1725_sublib_70', ('CTTTTATGTTCCTCGCAGGG', 'TCGGGACTTTCATAAGCACT')),
        ('oMCB1725_sublib_71', ('GTGGGCGTTAGCAAATTACA', 'ATTTTATGCGTCCAGTTCGG')),
        ('oMCB1725_sublib_72', ('AGAGATTATTAGGCGTGGGG', 'AAGGCTGGTATTTCCCTTCA')),
        ('oMCB1725_sublib_73', ('TAGGATTACTGCTCGGTGAC', 'CATACTGTTGGTTGCTAGGC')),
        ('oMCB1725_sublib_74', ('TCGCGTGAGTGGTTCATATA', 'ATATACTGGATTCCGCCGTT')),
        ('oMCB1725_sublib_75', ('CAATAGATACCCACCCGTCA', 'ACTTATGAACCCTTGGCACT')),
        ('oMCB1725_sublib_76', ('ATATATCCGCCGTTGTACGT', 'ATAGATGTATGCCGTTCGGT')),
        ('oMCB1725_sublib_77', ('CGAGAGTCTCCCACGATATC', 'TCTCTGTTTTCCGCACTTTG')),
        ('oMCB1725_sublib_78', ('ATTCAGTTGGTCTTACGGGT', 'AGTTATTCGTCTTTCCCGGT')),
        ('oMCB1725_sublib_79', ('GGATTGCAACGTCAGGAAAT', 'TACAGGAATCTCCACGAAGC')),
        ('oMCB1725_sublib_80', ('GAATGTTGCAGACTGGAAGG', 'CCTCGGGCTTGTTACTAGAT')),
        ('oMCB1725_sublib_81', ('GTCCATGAATACAACACCGG', 'ATTCTTCCGTCCAACGTACT')),
        ('oMCB1725_sublib_82', ('TCGAACAATTTGCGATACCC', 'TAATCATACGAGTGGGCCTC')),
        ('oMCB1725_sublib_83', ('AAGTGCACATTTCGTTTCGA', 'AGTTGGTAGAATTGACCGGT'))
        ]

    return primers

def _readLibrary(path):
    """Load one sgRNA library CSV.

    Every row is expected to carry the guide ID in its first column and the
    sgRNA sequence in its second.  Rows are de-duplicated by sequence; when a
    sequence occurs more than once, the ID of its last occurrence is kept.

    Returns ``(sgRNAs, total_entries)``: the sorted list of unique
    ``(ID, sequence)`` pairs and the number of non-blank rows read.
    """
    guideToID = {}
    total_entries = 0
    with open(path) as library:
        for line in library:
            line = line.strip()
            if not line:
                # Tolerate blank rows (e.g. a trailing newline) instead of
                # failing with an IndexError.
                continue
            fields = line.split(',')
            guideToID[fields[1]] = fields[0]
            total_entries += 1
    sgRNAs = sorted((ID, guide) for guide, ID in guideToID.items())
    return sgRNAs, total_entries


def _processManifest(manifest, primers, orderOut, primersOut):
    """Write the order and primer records for every library in *manifest*.

    *manifest* is an iterable of tab-separated ``label, library.csv, N_copies``
    lines; the n-th library listed is paired with the n-th entry of *primers*.
    """
    OL = 0
    for inputline in manifest:
        if inputline.startswith('#'):
            continue
        if not inputline.strip():
            # Blank lines carry no library and must not consume a primer pair.
            continue
        inputfields = inputline.strip().split('\t')
        label = inputfields[0]
        libraryPath = inputfields[1]
        copies = int(inputfields[2])

        if OL >= len(primers):
            # Fail with an explanation rather than a bare IndexError.
            sys.exit('error: only %d primer pairs are available; none left for library %s'
                     % (len(primers), label))

        sgRNAs, total_entries = _readLibrary(libraryPath)
        # Single-argument print calls produce the same text under the
        # Python 2 print statement and the Python 3 print function.
        print('printing ' + str(copies) + ' copies of library in ' + libraryPath)
        print('found ' + str(total_entries) + ' total entries in library, '
              + str(len(sgRNAs)) + ' of them unique; discarding '
              + str(total_entries - len(sgRNAs)) + ' duplicates')

        (index, (leftPrimer, rightPrimer)) = primers[OL]
        outline = (label + '\t' + index + '\t' + leftPrimer + '\t' + rightPrimer
                   + '\t' + str(len(sgRNAs)) + ' x ' + str(copies))
        primersOut.write(outline + '\n')

        # The flanked oligos are identical for every copy, so build the
        # block of lines once and write it out `copies` times.
        rightPrimerRev = getReverseComplement(rightPrimer)
        oligoBlock = ''.join([ID + ',' + leftPrimer + guide + rightPrimerRev + '\n'
                              for (ID, guide) in sgRNAs])
        for c in range(copies):
            orderOut.write(oligoBlock)
        OL += 1


def run():
    """Command-line entry point: build an oligo order from sgRNA libraries.

    Usage: ``python <script> input outprefix``

    ``input`` is a tab-separated file with one library per line
    (``label<TAB>library.csv<TAB>N_copies``); lines starting with ``#`` and
    blank lines are skipped.  Each library is assigned the next primer pair
    from retrievePrimers(), in order of appearance.

    Two files are written:

    * ``<outprefix>.order.csv`` -- one ``ID,leftPrimer+sgRNA+revcomp(rightPrimer)``
      line per unique sgRNA, the whole library repeated ``N_copies`` times;
    * ``<outprefix>.primers`` -- one tab-separated line per library with its
      label, primer name, both primers and ``<unique guides> x <copies>``.

    Exits with status 1 when arguments are missing or when the input lists
    more libraries than there are primer pairs.
    """
    # Both positional arguments are required: sys.argv[2] is read below.
    if len(sys.argv) < 3:
        print('usage: python %s input outprefix' % sys.argv[0])
        print('\tassumed input format:')
        print('\t\tlabel\tlibrary.csv\tN_copies')
        sys.exit(1)

    manifestPath = sys.argv[1]
    outprefix = sys.argv[2]

    primers = retrievePrimers()

    # Context managers guarantee that all three files are closed (and the
    # outputs flushed) even if processing stops half-way.
    with open(outprefix + '.order.csv', 'w') as orderOut:
        with open(outprefix + '.primers', 'w') as primersOut:
            with open(manifestPath) as manifest:
                _processManifest(manifest, primers, orderOut, primersOut)

# Script entry point.  NOTE: there is no ``if __name__ == '__main__'`` guard,
# so importing this module also executes run() and reads sys.argv.
run()
