#!/usr/bin/awk -f ########## #The MIT License (MIT) # # Copyright (c) 2015 Aiden Lab # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN # THE SOFTWARE. ########## # Takes a SAM file, looks for chimeric reads and normal reads. Outputs # only those reads mapped to chromosomes 1-24 with MAPQ > 0. # # Output to fname1 is of the form: # strand1 chromosome1 position1 frag1 strand2 chromosome2 position2 frag2 # mapq1 cigar1 seq1 mapq2 cigar2 seq2 # where chr1 is always <= chr2 # # Output to fname2 retains SAM format. # # Chimeric reads are treated as follows: # "Normal" chimeras (with 3 reads, two with the ligation junction), where the # third read (w/o ligation junction) is within 20Kbp of one of the ligation # junction reads, are sent to fname1. All other chimeras (where either there # are more than 3 reads, or they don't contain the ligation junction, or the # one non-ligation junction end is far from the others, are sent to fname2. # # awk -f chimeric_blacklist.awk -v fname1="norm_chimera" fname2="abnorm_chimera" fname3="unmapped" # Juicer version 1.5 # returns absolute value function abs(value) { return (value<0?-value:value); } # returns minimum of two values function min(value1,value2) { return (value1 c2) return 0; # c1 == c2 if (s1 < s2) return 1; if (s1 > s2) return 0; # s1 == s2 && c1 == c2 if (p1 < p2) return 1; if (p1 > p2) return 0; # all are equal, doesn't matter return 1; } BEGIN{ OFS="\t"; tottot = -1; # will count first non-group } { # input file is sorted by read name. Look at read name to group # appropriately split($1,a,"/"); if(a[1]==prev){ # move on to next record. look below this block for actions that occur # regardless. count++; } else { # deal with read pair group tottot++; if (count==3) { # chimeric read for (j=1; j <= 3; j++) { split(c[j], tmp); split(tmp[1],readname,"/"); read[j] = readname[2]; name[j] = tmp[1]; # strand; Bit 16 set means reverse strand str[j] = and(tmp[2],16); # chromosome chr[j] = tmp[3]; # position pos[j] = tmp[4]; # mapq score m[j] = tmp[5]; # cigar string cigarstr[j] = tmp[6]; # sequence seq[j] = tmp[10]; #qual[j] = tmp[11]; # get rid of soft clipping to know correct position if (str[j] == 0 && tmp[6] ~/^[0-9]+S/) { split(tmp[6], cigar, "S"); pos[j] = pos[j] - cigar[1]; if (pos[j] <= 0) { pos[j] = 1; } } # get rid of hard clipping to know correct position else if (str[j] == 0 && tmp[6] ~/^[0-9]+H/) { split(tmp[6], cigar, "H"); pos[j] = pos[j] - cigar[1]; if (pos[j] <= 0) { pos[j] = 1; } } else if (str[j] == 16) { # count Ms,Ds,Ns,Xs,=s for sequence length seqlength=0; currstr=tmp[6]; # can look like 15M10S20M, need to count all not just first where=match(currstr, /[0-9]+[M|D|N|X|=]/); while (where>0) { seqlength+=substr(currstr,where,RLENGTH-1)+0; currstr=substr(currstr,where+RLENGTH); where=match(currstr, /[0-9]+[M|D|N|X|=]/); } pos[j] = pos[j] + seqlength - 1; # add soft clipped bases in for proper position if (tmp[6] ~ /[0-9]+S$/) { where = match(tmp[6],/[0-9]+S$/); cigloc = substr(tmp[6],where,RLENGTH-1) + 0; pos[j] = pos[j] + cigloc; } else if (tmp[6] ~ /[0-9]+H$/) { where = match(tmp[6],/[0-9]+H$/); cigloc = substr(tmp[6],where,RLENGTH-1) + 0; pos[j] = pos[j] + cigloc; } # Mitochrondria loops around if (chr[j] ~ /MT/ && pos[j] >= 16569) { pos[j] = pos[j] - 16569; } } # blacklist - if 3rd bit set (=4) it means unmapped mapped[j] = and(tmp[2],4) == 0; } dist[12] = abs(chr[1]-chr[2])*10000000 + abs(pos[1]-pos[2]); dist[23] = abs(chr[2]-chr[3])*10000000 + abs(pos[2]-pos[3]); dist[13] = abs(chr[1]-chr[3])*10000000 + abs(pos[1]-pos[3]); if (min(dist[12],min(dist[23],dist[13])) < 1000) { # The paired ends look like A/B...B. Make sure we take A and B. if (read[1] == read[2]) { # take the unique one "B" for sure read2 = 3; # take the end of "A/B" that isn't close to "B" read1 = dist[13] > dist[23] ? 1:2; } else if (read[1] == read[3]) { read2 = 2; read1 = dist[12] > dist[23] ? 1:3; } else if (read[2] == read[3]) { read2 = 1; read1 = dist[12] > dist[13] ? 2:3; } else { printf("reads strange\n") > "/dev/stderr" exit 1 } if (mapped[read1] && mapped[read2]) { count_norm++; if (less_than(str[read1],chr[read1],pos[read1],str[read2],chr[read2],pos[read2])) { # print str[read1],chr[read1],pos[read1],str[read2],chr[read2],pos[read2],m[read1],cigarstr[read1],seq[read1],m[read2],cigarstr[read2],seq[read2],name[read1],name[read2],qual[read1],qual[read2] > fname1; print str[read1],chr[read1],pos[read1],str[read2],chr[read2],pos[read2],m[read1],cigarstr[read1],seq[read1],m[read2],cigarstr[read2],seq[read2],name[read1],name[read2] > fname1; } else { # print str[read2],chr[read2],pos[read2],str[read1],chr[read1],pos[read1],m[read2],cigarstr[read2],seq[read2],m[read1],cigarstr[read1],seq[read1],name[read2],name[read1],qual[read2],qual[read1] > fname1; print str[read2],chr[read2],pos[read2],str[read1],chr[read1],pos[read1],m[read2],cigarstr[read2],seq[read2],m[read1],cigarstr[read1],seq[read1],name[read2],name[read1] > fname1; } } else { for (i in c) { print c[i] > fname3; } count_unmapped++; } } else { # chimeric read with the 3 ends > 1KB apart count_abnorm++; for (i in c) { print c[i] > fname2; } } } else if (count > 3) { # chimeric read > 3, too many to deal with count_abnorm++; for (i in c) { print c[i] > fname2; } } else if (count == 2) { # code here should be same as above, but it's a "normal" read j = 0; for (i in c) { split(c[i], tmp); split(tmp[1],readname,"/"); str[j] = and(tmp[2],16); chr[j] = tmp[3]; pos[j] = tmp[4]; m[j] = tmp[5]; cigarstr[j] = tmp[6]; seq[j] = tmp[10]; #qual[j] = tmp[11]; name[j] = tmp[1]; # blacklist - if 3rd bit set (=4) it means unmapped mapped[j] = and(tmp[2],4) == 0; if (str[j] == 0 && tmp[6] ~/^[0-9]+S/) { split(tmp[6], cigar, "S"); pos[j] = pos[j] - cigar[1]; if (pos[j] <= 0) { pos[j] = 1; } } else if (str[j] == 0 && tmp[6] ~/^[0-9]+H/) { split(tmp[6], cigar, "H"); pos[j] = pos[j] - cigar[1]; if (pos[j] <= 0) { pos[j] = 1; } } else if (str[j] == 16) { # count Ms,Ds,Ns,Xs,=s for sequence length seqlength=0; currstr=tmp[6]; where=match(currstr, /[0-9]+[M|D|N|X|=]/); while (where>0) { seqlength+=substr(currstr,where,RLENGTH-1)+0; currstr=substr(currstr,where+RLENGTH); where=match(currstr, /[0-9]+[M|D|N|X|=]/); } pos[j] = pos[j] + seqlength - 1; if (tmp[6] ~ /[0-9]+S$/) { where = match(tmp[6],/[0-9]+S$/); cigloc = substr(tmp[6],where,RLENGTH-1) + 0; pos[j] = pos[j] + cigloc; } if (tmp[6] ~ /[0-9]+H$/) { where = match(tmp[6],/[0-9]+H$/); cigloc = substr(tmp[6],where,RLENGTH-1) + 0; pos[j] = pos[j] + cigloc; } if (chr[j] ~ /MT/ && pos[j] >= 16569) { pos[j] = pos[j] - 16569; } } j++; } if (mapped[0] && mapped[1]) { count_reg++; if (less_than(str[0],chr[0],pos[0],str[1],chr[1],pos[1])) { # ideally we'll get rid of printing out cigar string at some point #print str[0],chr[0],pos[0],str[1],chr[1],pos[1],m[0],cigarstr[0],seq[0],m[1],cigarstr[1],seq[1],name[0],name[1],qual[0],qual[1] > fname1; print str[0],chr[0],pos[0],str[1],chr[1],pos[1],m[0],cigarstr[0],seq[0],m[1],cigarstr[1],seq[1],name[0],name[1] > fname1; } else { #print str[1],chr[1],pos[1],str[0],chr[0],pos[0],m[1],cigarstr[1],seq[1],m[0],cigarstr[0],seq[0],name[1],name[0],qual[1],qual[0] > fname1; print str[1],chr[1],pos[1],str[0],chr[0],pos[0],m[1],cigarstr[1],seq[1],m[0],cigarstr[0],seq[0],name[1],name[0] > fname1; } } else { for (i in c) { print c[i] > fname3; } count_unmapped++; } } else if (count == 1) { # this actually shouldn't happen, but it happens with alternate aligners on occasion count_abnorm++; for (i in c) { print c[i] > fname2; } } # reset variables delete c; count=1; prev=a[1]; } # these happen no matter what, after the above processing c[count] = $0; } END{ # deal with read pair group tottot++; if (count==3) { # chimeric read for (j=1; j <= 3; j++) { split(c[j], tmp); split(tmp[1],readname,"/"); read[j] = readname[2]; name[j] = tmp[1]; # strand str[j] = and(tmp[2],16); # chromosome chr[j] = tmp[3]; # position pos[j] = tmp[4]; # mapq score m[j] = tmp[5]; # cigar string cigarstr[j] = tmp[6]; # sequence seq[j] = tmp[10]; #qual[j] = tmp[11]; # get rid of soft clipping to know correct position if (str[j] == 0 && tmp[6] ~/^[0-9]+S/) { split(tmp[6], cigar, "S"); pos[j] = pos[j] - cigar[1]; if (pos[j] <= 0) { pos[j] = 1; } } else if (str[j] == 0 && tmp[6] ~/^[0-9]+H/) { split(tmp[6], cigar, "H"); pos[j] = pos[j] - cigar[1]; if (pos[j] <= 0) { pos[j] = 1; } } else if (str[j] == 16) { # count Ms,Ds,Ns,Xs,=s for sequence length seqlength=0; currstr=tmp[6]; # can look like 15M10S20M, need to count all not just first where=match(currstr, /[0-9]+[M|D|N|X|=]/); while (where>0) { seqlength+=substr(currstr,where,RLENGTH-1)+0; currstr=substr(currstr,where+RLENGTH); where=match(currstr, /[0-9]+[M|D|N|X|=]/); } pos[j] = pos[j] + seqlength - 1; # add soft clipped bases in for proper position if (tmp[6] ~ /[0-9]+S$/) { where = match(tmp[6],/[0-9]+S$/); cigloc = substr(tmp[6],where,RLENGTH-1) + 0; pos[j] = pos[j] + cigloc; } if (tmp[6] ~ /[0-9]+H$/) { where = match(tmp[6],/[0-9]+H$/); cigloc = substr(tmp[6],where,RLENGTH-1) + 0; pos[j] = pos[j] + cigloc; } # Mitochrondria loops around if (chr[j] ~ /MT/ && pos[j] >= 16569) { pos[j] = pos[j] - 16569; } } mapped[j] = and(tmp[2],4) == 0; } dist[12] = abs(chr[1]-chr[2])*10000000 + abs(pos[1]-pos[2]); dist[23] = abs(chr[2]-chr[3])*10000000 + abs(pos[2]-pos[3]); dist[13] = abs(chr[1]-chr[3])*10000000 + abs(pos[1]-pos[3]); if (min(dist[12],min(dist[23],dist[13])) < 1000) { # The paired ends look like A/B...B. Make sure we take A and B. if (read[1] == read[2]) { # take the unique one "B" for sure read2 = 3; # take the end of "A/B" that isn't close to "B" read1 = dist[13] > dist[23] ? 1:2; } else if (read[1] == read[3]) { read2 = 2; read1 = dist[12] > dist[23] ? 1:3; } else if (read[2] == read[3]) { read2 = 1; read1 = dist[12] > dist[13] ? 2:3; } else { printf("reads strange\n") > "/dev/stderr" exit 1 } if (mapped[read1] && mapped[read2]) { count_norm++; if (less_than(str[read1],chr[read1],pos[read1],str[read2],chr[read2],pos[read2])) { print str[read1],chr[read1],pos[read1],str[read2],chr[read2],pos[read2],m[read1],cigarstr[read1],seq[read1],m[read2],cigarstr[read2],seq[read2],name[read1],name[read2] > fname1; } else { print str[read2],chr[read2],pos[read2],str[read1],chr[read1],pos[read1],m[read2],cigarstr[read2],seq[read2],m[read1],cigarstr[read1],seq[read1],name[read2],name[read1] > fname1; } } else { for (i in c) { print c[i] > fname3; } count_unmapped++; } } else { # chimeric read with the 3 ends > 1KB apart count_abnorm++; for (i in c) { print c[i] > fname2; } } } else if (count > 3) { # chimeric read > 3, too many to deal with count_abnorm++; for (i in c) { print c[i] > fname2; } } else if (count == 2) { # code here should be same as above, but it's a "normal" read j = 0; for (i in c) { split(c[i], tmp); split(tmp[1],readname,"/"); str[j] = and(tmp[2],16); chr[j] = tmp[3]; pos[j] = tmp[4]; m[j] = tmp[5]; cigarstr[j] = tmp[6]; seq[j] = tmp[10]; #qual[j] = tmp[11]; name[j] = tmp[1]; if (str[j] == 0 && tmp[6] ~/^[0-9]+S/) { split(tmp[6], cigar, "S"); pos[j] = pos[j] - cigar[1]; if (pos[j] <= 0) { pos[j] = 1; } } else if (str[j] == 0 && tmp[6] ~/^[0-9]+H/) { split(tmp[6], cigar, "H"); pos[j] = pos[j] - cigar[1]; if (pos[j] <= 0) { pos[j] = 1; } } else if (str[j] == 16) { # count Ms,Ds,Ns,Xs,=s for sequence length seqlength=0; currstr=tmp[6]; where=match(currstr, /[0-9]+[M|D|N|X|=]/); while (where>0) { seqlength+=substr(currstr,where,RLENGTH-1)+0; currstr=substr(currstr,where+RLENGTH); where=match(currstr, /[0-9]+[M|D|N|X|=]/); } pos[j] = pos[j] + seqlength - 1; if (tmp[6] ~ /[0-9]+S$/) { where = match(tmp[6],/[0-9]+S$/); cigloc = substr(tmp[6],where,RLENGTH-1) + 0; pos[j] = pos[j] + cigloc; } if (tmp[6] ~ /[0-9]+H$/) { where = match(tmp[6],/[0-9]+H$/); cigloc = substr(tmp[6],where,RLENGTH-1) + 0; pos[j] = pos[j] + cigloc; } if (chr[j] ~ /MT/ && pos[j] >= 16569) { pos[j] = pos[j] - 16569; } } mapped[j] = and(tmp[2],4) == 0; j++; } if (mapped[0] && mapped[1]) { count_reg++; if (less_than(str[0],chr[0],pos[0],str[1],chr[1],pos[1])) { # ideally we'll get rid of printing out cigar string at some point # qual print str[0],chr[0],pos[0],str[1],chr[1],pos[1],m[0],cigarstr[0],seq[0],m[1],cigarstr[1],seq[1],name[0],name[1] > fname1; } else { print str[1],chr[1],pos[1],str[0],chr[0],pos[0],m[1],cigarstr[1],seq[1],m[0],cigarstr[0],seq[0],name[1],name[0] > fname1; } } else { for (i in c) { print c[i] > fname3; } count_unmapped++; } } resfile=fname1".res.txt"; printf("%d %d %d %d %d\n", tottot, count_unmapped, count_reg, count_norm, count_abnorm) >> resfile; }