#This makedoc is for the update to the v98 track (#29625) originally in the hg38.txt makedoc
#A user wrote in and many errors were found, largely due to the provider not correctly converting to proper BED file coordinates
#The following is the python script run to generate the new file, followed by the output

#Parse the COSMIC input file and try and correct the incorrect coordinates. See https://redmine.soe.ucsc.edu/issues/32430

import subprocess

#Vars
inputFilePath = "/hive/data/outside/cosmic/hg38/v98/ucsc_export.bed"
outputFilePath = "/hive/data/outside/cosmic/hg38/v98/cosmicV93.bed"

# (counter key, label printed in the summary), in the order the summary is printed.
summaryLabels = (
    ("pointSubsFixed", "Point substitutions fixed"),
    ("delinsFixed", "DeletionInsertions fixed"),
    ("deletionsFixed", "Deletions fixed"),
    ("insertionsFixed", "Insertions fixed"),
    ("badItems", "Illegal items with start > end"),
    ("criticalErrors", "Critical errors"),
    ("correctEntries", "Correct entries not changed"),
)


def formatBedLine(chrom, chromStart, chromEnd, refAllele, altAllele, strand, mutationID, legacyID):
    """Return one tab-separated output line, newline included.

    Column order written: chrom, chromStart, chromEnd, mutationID, a constant
    score of 0, strand, refAllele, altAllele, legacyID.
    """
    fields = (chrom, str(chromStart), str(chromEnd), mutationID, "0",
              strand, refAllele, altAllele, legacyID)
    return "\t".join(fields) + "\n"


def writeOutToFile(outputFile, chrom, chromStart, chromEnd, refAllele, altAllele, strand, mutationID, legacyID):
    """Write one entry to outputFile (any object with a write() method)."""
    outputFile.write(formatBedLine(chrom, chromStart, chromEnd, refAllele,
                                   altAllele, strand, mutationID, legacyID))


def bash(cmd):
    """Run cmd through the shell and return its combined stdout/stderr text.

    Raises RuntimeError (chained to the CalledProcessError) when the command
    exits with a non-zero status.
    """
    try:
        completed = subprocess.run(cmd, check=True, shell=True,
                                   stdout=subprocess.PIPE, universal_newlines=True,
                                   stderr=subprocess.STDOUT)
    except subprocess.CalledProcessError as e:
        raise RuntimeError("command '{}' return with error (code {}): {}".format(e.cmd, e.returncode, e.output)) from e
    return completed.stdout


def fixCoordinates(chromStart, chromEnd, refAllele, altAllele):
    """Apply the coordinate correction rules to a single entry.

    The rules are checked in order and the first match wins. Returns a tuple
    (newChromStart, newChromEnd, category) where category is one of the keys
    in summaryLabels.
    """
    size = chromEnd - chromStart

    # Incorrect insertions which have start > end: -2 to start, +1 to end
    if chromStart > chromEnd:
        return chromStart - 2, chromEnd + 1, "badItems"

    # Insertions, which can also be dups. Same coordinate insertions are fine
    # but dups need their start and end moved.
    if refAllele == "" and altAllele != "":
        if size == 2:
            return chromStart - 1, chromEnd - 2, "insertionsFixed"
        if size == 1:
            return chromStart - 1, chromEnd - 1, "insertionsFixed"
        # Left untouched. The original script bumped the running total a
        # second time here, which inflated "Total items"; these are counted
        # as unchanged entries instead.
        return chromStart, chromEnd, "correctEntries"

    # Deletions are wrong when they are only 1 or 2 bases in size. Some
    # 2 size items are already correct, so those are left alone.
    if altAllele == "" and refAllele != "" and len(refAllele) < 3:
        if size != 2:
            return chromStart - 1, chromEnd, "deletionsFixed"
        # Previously not counted in any category.
        return chromStart, chromEnd, "correctEntries"

    # Delins of different sizes need -1 to start
    if altAllele != "" and refAllele != "" and len(altAllele) != len(refAllele):
        return chromStart - 1, chromEnd, "delinsFixed"

    # Point substitutions need -1 to start
    if altAllele != "" and refAllele != "" and len(altAllele) == 1 and len(refAllele) == 1:
        return chromStart - 1, chromEnd, "pointSubsFixed"

    # Final check for any illegal coordinates, all cases should have been
    # covered before this (start > end was already handled by the first rule).
    if chromStart >= chromEnd:
        return chromStart, chromEnd, "criticalErrors"

    return chromStart, chromEnd, "correctEntries"


def fixEntries(inputFile, outputFile):
    """Correct every entry read from inputFile and write it to outputFile.

    inputFile is an iterable of tab-separated lines with the columns
    chrom, chromStart, chromEnd, refAllele, altAllele, strand, mutationID,
    legacyID, e.g.
    chr19 56183614 56183614 G T + COSV61999794 COSN20232200

    Entries with start > end and entries hitting the "should never come up"
    check are printed. Every entry is written out, corrected or not.
    Returns (counts, total): a dict of per-category counts and the number of
    entries read.
    """
    counts = dict.fromkeys((key for key, _label in summaryLabels), 0)
    total = 0
    for entry in inputFile:
        total += 1
        entry = entry.rstrip()
        entrySplit = entry.split("\t")
        chrom = entrySplit[0]
        chromStart = int(entrySplit[1])
        chromEnd = int(entrySplit[2])
        refAllele = entrySplit[3]
        altAllele = entrySplit[4]
        strand = entrySplit[5]
        mutationID = entrySplit[6]
        legacyID = entrySplit[7]

        newStart, newEnd, category = fixCoordinates(chromStart, chromEnd, refAllele, altAllele)
        counts[category] += 1
        if category == "badItems":
            print(entry)
        elif category == "criticalErrors":
            print("ERROR: This should never come up. Problem with variant below.")
            print(entry)

        #Write out corrected (if needed) coordinates to file:
        writeOutToFile(outputFile, chrom, newStart, newEnd, refAllele, altAllele, strand, mutationID, legacyID)
    return counts, total


def formatSummary(counts, total):
    """Return the multi-line summary: one line per category, then the total.

    Percentages are real percentages (count * 100 / total, one decimal) and
    are reported as 0.0 when total is 0 so an empty input does not divide by
    zero.
    """
    lines = []
    for key, label in summaryLabels:
        count = counts.get(key, 0)
        percent = round(100 * count / total, 1) if total else 0.0
        lines.append("{}: {} ({}%)".format(label, count, percent))
    lines.append("Total items: " + str(total))
    return "\n".join(lines)


def main():
    """Fix the COSMIC coordinates, print the summary and build the bigBed."""
    print("Printing entries that are illegal (chromStart > chromEnd)")
    # "with" closes both files even if an entry fails to parse.
    with open(inputFilePath, "r") as inputFile, open(outputFilePath, "w") as outputFile:
        counts, total = fixEntries(inputFile, outputFile)

    print(formatSummary(counts, total))

    bash("bedSort "+outputFilePath+" "+outputFilePath+".sorted")
    bash("bedToBigBed -type=bed6+3 -as=/hive/data/outside/cosmic/hg38/v98/cosmic.as -tab "+outputFilePath+".sorted /cluster/data/hg38/chrom.sizes /hive/data/outside/cosmic/hg38/v98/cosmic.bb")
    #make symlink
    bash("ln -s /hive/data/outside/cosmic/hg38/v98/cosmic.bb /gbdb/hg38/cosmic/cosmic.bb")


if __name__ == "__main__":
    main()

####OUTPUT#####
# NOTE: recorded from the run of the original version of this script, before
# the counting and percentage fixes above. In this output the values printed
# with "%" are fractions of the total (0.938 means 93.8%), and "Total items"
# is larger than the sum of the categories. The coordinate corrections
# themselves are unchanged.
#
# Printing entries that are illegal (chromStart > chromEnd)
# chr1 54139647 54139646 G - COSV61283612 COSM4967518
# chr1 92832120 92832119 C + COSV59873663 COSM5751392
# chr10 14908622 14908621 T - COSV57954760 COSM5751422
# chr11 32434699 32434698 T - COSV60065461 COSM5751469
# chr11 61967312 61967311 TCTTACTACTTTGACCGCGATGATGTGGCTTTGAAGAACTTTGCCAAATACTTTCTTCACCAATCTCATGAGGA - COSV56445358 COSM4746415
# chr12 25225627 25225626 T - COSV55926705 COSM5752083
# chr12 25225676 25225675 T - COSV55736226 COSM5751707
# chr14 32822108 32822107 A + COSV55234562 COSM5751856
# chr14 32822293 32822292 A + COSV55225526 COSM5751218
# chr14 32822984 32822983 A + COSV55217470 COSM5751765
# chr14 72465624 72465623 GGAT + COSV59575192 COSN190986
# chr14 99257556 99257555 T - COSV61735888 COSM5751532
# chr14 99257788 99257787 G - COSV61733306 COSM5751244
# chr16 67611246 67611245 A + COSV50461550 COSM5751489
# chr16 67611510 67611509 A + COSV50465990 COSM5751202
# chr17 7669666 7669665 T - COSV53205989 COSM5751520
# chr17 31327838 31327837 AGAGTTTA + COSV106105962 COSM34184
# chr19 10795440 10795439 A + COSV58965104 COSM5752008
# chr19 44274080 44274079 AACTCTCTGGTGAAGACCAGAATTCCTATTAAATATCCTGTCACTTACTT - COSV61933827 COSM5016626
# chr19 54145818 54145817 A + COSV55364997 COSM5751982
# chr19 54148744 54148743 A + COSV55366597 COSM5751751
# chr2 178443266 178443265 AAAGGGGGCATCAAAAAAGCAAGCCAAAAGGAACGCTGCT - COSV57870943 COSM4746427
# chr2 189854347 189854346 A + COSV59720649 COSM5751951
# chr2 241740952 241740951 G + COSV100345173 COSN31769167
# chr21 34834466 34834465 T - COSV55877103 COSM5751784
# chr3 52587352 52587351 A - COSV56310253 COSM422809
# chr3 195781704 195781703 CACGCCACCCCTCTTCATGTCACCAGCCCTTCCTCAGCATCCACAGGTGA - COSV57801139 COSM5016048
# chr6 31669484 31669483 CCT + COSV65507799 COSM306820
# chr6 134173149 134173148 A - COSV52804500 COSM1161565
# chr6 135195910 135195909 A + COSV57199385 COSM5751503
# chr6 138885581 138885580 G + COSV62884323 COSM5752103
# chr7 5986937 5986936 G - COSV56220859 COSM5751600
# chr7 50327638 50327637 A + COSV58792006 COSM5751834
# chrX 37453358 37453357 C + COSV66158042 COSM1161821
# chrX 41125720 41125719 TCTCGC + COSV61067912 COSN19269805
# Point substitutions fixed: 12394972 (0.938%)
# DeletionInsertions fixed: 4430 (0.0%)
# Deletions fixed: 322858 (0.024%)
# Insertions fixed: 316932 (0.024%)
# Illegal items with start > end: 35 (0.0%)
# Critical errors: 0 (0.0%)
# Correct entries not changed: 162397 (0.012%)
# Total items: 13218527