# for emacs: -*- mode: sh; -*- # Assembly Name: GRCm38.p4 # Description: Genome Reference Consortium Mouse Build 38 patch release 4 # (GRCm38.p4) # Organism name: Mus musculus # Taxid: 10090 # Submitter: Genome Reference Consortium # Date: 2015-3-23 # Assembly type: haploid-with-alt-loci # Release type: patch # Assembly level: Chromosome # Genome representation: full # GenBank Assembly Accession: GCA_000001635.6 (latest) # RefSeq Assembly Accession: GCF_000001635.24 (latest) # RefSeq Assembly and GenBank Assemblies Identical: yes ############################################################################## # mm10 patch 4 build ############################################################################## ## download sequence, prepare files (DONE - 2016-03-08 - Hiram) ############################################################################## mkdir /hive/data/genomes/mm10/bed/mm10Patch4 cd /hive/data/genomes/mm10/bed/mm10Patch4 mkdir genbank cd genbank time rsync -L -a -P \ rsync://ftp.ncbi.nlm.nih.gov/genomes/genbank/vertebrate_mammalian/Mus_musculus/all_assembly_versions/GCA_000001635.6_GRCm38.p4/ ./ # real 4m30.702s # appears to be the entire assembly: faSize GCA_000001635.6_GRCm38.p4_genomic.fna.gz # 2803568840 bases (79356978 N's 2724211862 real 1731886981 upper # 992324881 lower) in 196 sequences in 1 files # Total size: mean 14303922.7 sd 41517244.4 min 1976 (JH584295.1) # max 195471971 (CM000994.2) median 207968 # %35.40 masked total, %36.43 masked real # so the question is, what is new here compared to what we have in mm10 cd /hive/data/genomes/mm10/bed/mm10Patch4 time faCount genbank/GCA_000001635.6_GRCm38.p4_genomic.fna.gz \ > faCount.GRCm38.p4.txt # real 1m4.939s ~/kent/src/hg/makeDb/doc/mm10Patch4/scanAssemblyReport.pl ../../chrom.sizes \ faCount.GRCm38.p4.txt genbank/GCA_000001635.6_GRCm38.p4_assembly_report.txt \ | grep new | sed -e 's/^/# /' # one of the new sequences is missing some information from the # assembly_report: MMCHRUN_NODMRKTAC_CTG1 alt-scaffold na na GL456050.1 = NT_080256.1 NOD/MrkTac 142341 na # it appers to be a sequence that isn't located on this assembly # resulting in an incorrect name: # chrna_GL456050_alt 142341 GL456050.1 NT_080256.1 new # there are 130 new sequences: # chr1_KK082441_alt 456798 KK082441.1 NW_006763128.1 new # chr3_KQ030484_fix 214957 KQ030484.1 NW_012132900.1 new # chr4_JH792826_fix 368286 JH792826.1 NW_004058050.1 new # chr4_KQ030485_fix 185548 KQ030485.1 NW_012132901.1 new # chr4_KQ030486_fix 399265 KQ030486.1 NW_012132902.1 new # chr4_KQ030487_fix 316842 KQ030487.1 NW_012132903.1 new # chr4_KQ030488_fix 45901 KQ030488.1 NW_012132904.1 new # chr4_KQ030489_fix 188269 KQ030489.1 NW_012132905.1 new # chr5_JH792827_fix 205713 JH792827.1 NW_004058051.1 new # chr6_KK082442_fix 154766 KK082442.1 NW_006763129.1 new # chr6_KK082443_fix 620323 KK082443.1 NW_006763130.1 new # chr7_JH792828_fix 473738 JH792828.1 NW_004058052.1 new # chr9_KB469738v3_fix 210641 KB469738.3 NW_004450259.3 new # chr9_KQ030490_fix 120154 KQ030490.1 NW_012132906.1 new # chr10_KQ030491_fix 129865 KQ030491.1 NW_012132907.1 new # chr11_KB469739_fix 331111 KB469739.1 NW_004450260.1 new # chr12_KB469740_fix 316140 KB469740.1 NW_004450261.1 new # chr15_KQ030492_fix 166012 KQ030492.1 NW_012132908.1 new # chr15_KQ030493_fix 269476 KQ030493.1 NW_012132909.1 new # chr16_KB469741_fix 267241 KB469741.1 NW_004450262.1 new # chr17_KB469742_fix 1059955 KB469742.1 NW_004450263.1 new # chr18_JH792829_fix 352455 JH792829.1 NW_004058053.1 new # chr19_JH792830_fix 246751 JH792830.1 NW_004058054.1 new # chr19_KQ030494_fix 506812 KQ030494.1 NW_012132910.1 new # chrX_JH792831_fix 182256 JH792831.1 NW_004058055.1 new # chrX_KQ030495_fix 490000 KQ030495.1 NW_012132911.1 new # chrX_KQ030496_fix 260447 KQ030496.1 NW_012132912.1 new # chrX_KQ030497_fix 219721 KQ030497.1 NW_012132913.1 new # chrY_JH792832_fix 544189 JH792832.1 NW_004058056.1 new # chrY_JH792833_fix 221588 JH792833.1 NW_004058057.1 new # chrY_JH792834_fix 331480 JH792834.1 NW_004058058.1 new # chr19_GL456079_alt 46440 GL456079.1 NT_039697.1 new # chr2_GL456024v2_alt 410905 GL456024.2 NT_039223.3 new # chr3_GL456006_alt 320308 GL456006.1 NT_039248.1 new # chr3_GL456007_alt 273369 GL456007.1 NT_039256.1 new # chr3_GL456008_alt 124933 GL456008.1 NT_080257.1 new # chr5_GL456011_alt 180394 GL456011.1 NT_109599.1 new # chr6_GL456025_alt 145152 GL456025.1 NT_039373.1 new # chr6_GL456026v2_alt 636141 GL456026.2 NT_039382.4 new # chr7_GL456014_alt 91251 GL456014.1 NT_078572.1 new # chr12_GL456017v2_alt 1714434 GL456017.2 NT_114985.3 new # chr12_GL456074_alt 84250 GL456074.1 NT_114988.1 new # chr13_JH584305_alt 179252 JH584305.1 NT_187006.1 new # chr14_GL456019_alt 1666365 GL456019.1 NT_039614.1 new # chr16_JH584306_alt 480699 JH584306.1 NT_187007.1 new # chr16_JH584307_alt 250595 JH584307.1 NT_187008.1 new # chr17_JH584308_alt 273566 JH584308.1 NT_187010.1 new # chr17_JH584309_alt 162935 JH584309.1 NT_187011.1 new # chr17_JH590470_alt 1154078 JH590470.1 NT_187009.1 new # chrX_GL456031_alt 247850 GL456031.1 NT_111920.1 new # chrX_GL456032_alt 194271 GL456032.1 NT_039744.2 new # chrX_GL456033v2_alt 78689 GL456033.2 NT_039747.4 new # chr7_GL456013_alt 34498 GL456013.1 NT_039443.1 new # chr1_GL455991_alt 281255 GL455991.1 NT_039192.2 new # chr1_GL455992v2_alt 120856 GL455992.2 NT_039194.5 new # chr1_GL455993_alt 272653 GL455993.1 NT_039198.1 new # chr4_GL455994_alt 170592 GL455994.1 NT_039291.1 new # chr5_GL455995_alt 262166 GL455995.1 NT_039334.1 new # chr8_GL455996_alt 183135 GL455996.1 NT_039469.1 new # chr8_GL455997_alt 251056 GL455997.1 NT_039470.1 new # chr11_GL455998_alt 170397 GL455998.1 NT_078683.1 new # chr13_GL455999v2_alt 138435 GL455999.2 NT_039594.3 new # chr15_GL456000_alt 88442 GL456000.1 NT_039622.1 new # chr16_GL456001v2_alt 1618162 GL456001.2 NT_039634.4 new # chr16_JH584310_alt 261336 JH584310.1 NT_187012.1 new # chr16_JH584311_alt 189707 JH584311.1 NT_187013.1 new # chr16_JH584312_alt 184044 JH584312.1 NT_187014.1 new # chr16_JH584313_alt 374389 JH584313.1 NT_187015.1 new # chr16_JH584314_alt 170763 JH584314.1 NT_187016.1 new # chr17_GL456002v2_alt 285971 GL456002.2 NT_039666.2 new # chr17_GL456021v2_alt 28130 GL456021.2 NT_039661.4 new # chrX_GL456003v2_alt 414761 GL456003.2 NT_039739.2 new # chrX_GL456004_alt 123835 GL456004.1 NT_099024.1 new # chr1_GL456005_alt 154284 GL456005.1 NT_039195.1 new # chr1_JH584315_alt 220020 JH584315.1 NT_187017.1 new # chr4_GL456010_alt 76099 GL456010.1 NT_039296.1 new # chr6_GL456012v2_alt 312568 GL456012.2 NT_039372.2 new # chr6_GL456065_alt 198515 GL456065.1 NT_039367.1 new # chr11_GL456016_alt 161334 GL456016.1 NT_039525.1 new # chr14_GL456020_alt 105813 GL456020.1 NT_096797.1 new # chr16_GL456028v2_alt 176974 GL456028.2 NT_039630.5 new # chr17_GL456022v2_alt 1934410 GL456022.2 NT_039662.3 new # chr13_GL455990_alt 100803 GL455990.1 NT_039593.1 new # chrY_GL456080_alt 191521 GL456080.1 NT_166394.1 new # chrY_GL456081_alt 185346 GL456081.1 NT_166397.1 new # chrY_GL456082_alt 209635 GL456082.1 NT_166400.1 new # chr7_GL455989_alt 55856 GL455989.1 NT_095534.1 new # chr12_GL456068_alt 59641 GL456068.1 NT_096355.1 new # chr11_JH584316_alt 247497 JH584316.1 NT_187028.1 new # chr11_JH584317_alt 186379 JH584317.1 NT_187029.1 new # chr18_JH584318_alt 231969 JH584318.1 NT_187030.1 new # chrY_GL456071_alt 189119 GL456071.1 NT_166396.1 new # chrY_GL456072_alt 204663 GL456072.1 NT_166398.1 new # chrY_GL456073_alt 196014 GL456073.1 NT_166399.1 new # chr1_JH584320_alt 1003859 JH584320.1 NT_187019.1 new # chr1_JH584321_alt 3760957 JH584321.1 NT_187020.1 new # chr1_JH584322_alt 326027 JH584322.1 NT_187021.1 new # chr3_GL456042v2_alt 689363 GL456042.2 NT_114257.4 new # chr3_GL456044v2_alt 466430 GL456044.2 NT_166287.2 new # chr3_GL456045v2_alt 161821 GL456045.2 NT_039247.2 new # chr3_GL456048_alt 292416 GL456048.1 NT_039252.1 new # chr3_GL456049v2_alt 243366 GL456049.2 NT_039257.4 new # chr3_JH584323_alt 1153325 JH584323.1 NT_187022.1 new # chr4_GL456053v2_alt 3205425 GL456053.2 NT_166299.2 new # chr4_JH584324_alt 3050841 JH584324.1 NT_187023.1 new # chr4_JH584325_alt 215601 JH584325.1 NT_187024.1 new # chr4_JH584326_alt 1625541 JH584326.1 NT_187025.1 new # chr11_GL456060v2_alt 1462311 GL456060.2 NT_166317.2 new # chr11_JH584327_alt 1550185 JH584327.1 NT_187026.1 new # chr17_JH584328_alt 4256209 JH584328.1 NT_187027.1 new # chrna_GL456050_alt 142341 GL456050.1 NT_080256.1 new # chr6_GL456054v2_alt 5956088 GL456054.2 NT_166305.2 new # chr6_JH584264_alt 1759414 JH584264.1 NT_187002.1 new # chr11_JH584265_alt 3086744 JH584265.1 NT_187003.1 new # chr17_JH584266_alt 4910976 JH584266.1 NT_187004.1 new # chr17_JH584267_alt 1908920 JH584267.1 NT_187005.1 new # chr12_GL456349_alt 83612 GL456349.1 NT_166433.1 new # chr18_GL456070_alt 85548 GL456070.1 NT_165784.1 new # chr4_GL456009_alt 240498 GL456009.1 NT_109318.2 new # chr4_GL456064_alt 138473 GL456064.1 NT_039292.1 new # chr10_GL456015_alt 77891 GL456015.1 NT_078651.1 new # chr17_GL456069_alt 138490 GL456069.1 NT_097336.1 new # chr19_JH584319_alt 25407 JH584319.1 NT_187018.1 new # chr4_GL456075_alt 264822 GL456075.1 NT_166293.1 new # chr4_GL456076_alt 191695 GL456076.1 NT_166294.1 new # chr4_GL456077_alt 83218 GL456077.1 NT_166295.1 new # chr4_JH584268_alt 69753 JH584268.1 NT_186999.1 new # chr4_JH584269_alt 284739 JH584269.1 NT_187000.1 new # chr12_GL456078_alt 99963 GL456078.1 NT_166322.1 new # chr15_JH584270_alt 89177 JH584270.1 NT_187001.1 new # how much sequence: ~/kent/src/hg/makeDb/doc/mm10Patch4/scanAssemblyReport.pl ../../chrom.sizes \ faCount.GRCm38.p4.txt genbank/GCA_000001635.6_GRCm38.p4_assembly_report.txt \ | grep new | awk '{sum += $2; printf "%d\t%s\n", sum, $0}' | tail # 72697066 chr15_JH584270_alt 89177 JH584270.1 NT_187001.1 new ~/kent/src/hg/makeDb/doc/mm10Patch4/scanAssemblyReport.pl ../../chrom.sizes \ faCount.GRCm38.p4.txt genbank/GCA_000001635.6_GRCm38.p4_assembly_report.txt \ | grep new > new.sequences.list cut -f3 new.sequences.list > extract.new.list awk '{printf "s/%s/%s/; ", $3,$1}' new.sequences.list > genbankToUCSC.sed ~/kent/src/hg/makeDb/doc/mm10Patch4/scanAssemblyReport.pl ../../chrom.sizes \ faCount.GRCm38.p4.txt genbank/GCA_000001635.6_GRCm38.p4_assembly_report.txt \ | grep -v new > existing.sequences.list cut -f3 existing.sequences.list > extract.exist.list faSomeRecords genbank/GCA_000001635.6_GRCm38.p4_genomic.fna.gz \ extract.new.list stdout | sed -e 's/ .*//;' | \ sed -f genbankToUCSC.sed | gzip -c > mm10Patch4.fa.gz faSomeRecords genbank/GCA_000001635.6_GRCm38.p4_genomic.fna.gz \ extract.exist.list stdout | sed -e 's/ .*//;' | gzip -c > existing.fa.gz # verify same amount of sequence here as mm10: faSize existing.fa.gz # 2730871774 bases (78088274 N's 2652783500 real 1686409493 upper # 966374007 lower) in 66 sequences in 1 files # Total size: mean 41376845.1 sd 63617337.3 min 1976 (JH584295.1) # max 195471971 (CM000994.2) median 184189 # %35.39 masked total, %36.43 masked real # mm10 has different masking, but totals are the same head -1 ../../faSize.mm10.2bit.txt # 2730871774 bases (78088274 N's 2652783500 real 1454267808 upper 1198515692 lower) in 66 sequences in 1 files # verify correct amount of patch4 sequence here: faSize mm10Patch4.fa.gz # 72697066 bases (1268704 N's 71428362 real 45477488 upper # 25950874 lower) in 130 sequences in 1 files # Total size: mean 559208.2 sd 959105.8 min 25407 (chr19_JH584319_alt) # max 5956088 (chr6_GL456054v2_alt) median 221588 # %35.70 masked total, %36.33 masked real # this is the same total obtained before: # 72697066 chr15_JH584270_alt 89177 JH584270.1 NT_187001.1 new # and both together should equal the original full patch4 sequence zcat existing.fa.gz mm10Patch4.fa.gz | faSize stdin # 2803568840 bases (79356978 N's 2724211862 real 1731886981 upper 992324881 lower) in 196 sequences in 1 files # Total size: mean 14303922.7 sd 41517244.4 min 1976 (JH584295.1) max 195471971 (CM000994.2) median 207968 # %35.40 masked total, %36.43 masked real # construct locations file: ~/kent/src/hg/makeDb/doc/mm10Patch4/regionScan.pl extract.new.list \ ucsc.genbank.tab genbank/GCA_000001635.6_GRCm38.p4_assembly_regions.txt \ | egrep -v "MMCHR6_CHORI29_IDD6_3" > patchLocations.bed # what are these: # not finding PAR na on the newList # not finding PAR na on the newList # not done GL456050.1 # not done GL456071.1 # not done GL456072.1 # not done GL456073.1 # not done GL456080.1 # not done GL456081.1 # not done GL456082.1 # the PAR is merely these two items: # PAR X 169969759 170931299 PAR na na na # PAR Y 90745845 91644698 PAR na na na # Message from Valerie Schneider # These are alternate loci for which there is no placement defined on the # GRCm38 B57BL/6J primary assembly. # The rules for alternate loci in mouse, where the primary assembly unit # represents a single strain, are a little different than they are for human. # In mouse, each alt unit represents a different strain. In contrast to # human, mouse alt loci do not contain an "anchor" sequence that is part # of the primary assembly unit, b/c we never mix strain # sequences (on either primary or alt). # The lack of placements is allowed for mouse, but not for human. # All human alt loci must have a placement, and the anchor ensures # that placement can be made. # separate haplotypes from fix patches for two tracks: # the sed command turn the UCSC chrom names for these new sequences back # into their genbank names for external URL reference grep -v fix patchLocations.bed | sed -e 's/_alt//;' \ | sed -e 's/\tchr.*_/\t/;' | sed -e 's/v/./;' > mm10Patch4Haplotypes.bed hgLoadBed -type=bed4 mm10 mm10Patch4Haplotypes mm10Patch4Haplotypes.bed # Read 93 elements of size 4 from mm10Patch4Haplotypes.bed grep fix patchLocations.bed | sed -e 's/_fix//;' \ | sed -e 's/\tchr.*_/\t/;' | sed -e 's/v\([0-9]\)$/.\1/;' \ > mm10Patch4Patches.bed # verify none lost: wc -l mm10Patch4Haplotypes.bed mm10Patch4Patches.bed # 93 mm10Patch4Haplotypes.bed # 29 mm10Patch4Patches.bed # 122 total wc -l patchLocations.bed # 122 patchLocations.bed hgLoadBed -type=bed4 mm10 mm10Patch4Patches mm10Patch4Patches.bed # Read 41 elements of size 4 from mm10Patch4Patches.bed # construct 2bit file: faToTwoBit mm10Patch4.fa.gz mm10Patch4.unmasked.2bit twoBitInfo mm10Patch4.unmasked.2bit stdout | sort -k2nr > mm10Patch4.chrom.sizes # take a look at that to verify it looks OK: cat mm10Patch4.chrom.sizes | sed -e 's/^/# /;' # chr6_GL456054v2_alt 5956088 # chr17_JH584266_alt 4910976 # chr17_JH584328_alt 4256209 # chr1_JH584321_alt 3760957 # chr4_GL456053v2_alt 3205425 # chr11_JH584265_alt 3086744 # chr4_JH584324_alt 3050841 # chr17_GL456022v2_alt 1934410 # chr17_JH584267_alt 1908920 # chr6_JH584264_alt 1759414 # chr12_GL456017v2_alt 1714434 # chr14_GL456019_alt 1666365 # chr4_JH584326_alt 1625541 # chr16_GL456001v2_alt 1618162 # chr11_JH584327_alt 1550185 # chr11_GL456060v2_alt 1462311 # chr17_JH590470_alt 1154078 # chr3_JH584323_alt 1153325 # chr17_KB469742_fix 1059955 # chr1_JH584320_alt 1003859 # chr3_GL456042v2_alt 689363 # chr6_GL456026v2_alt 636141 # chr6_KK082443_fix 620323 # chrY_JH792832_fix 544189 # chr19_KQ030494_fix 506812 # chrX_KQ030495_fix 490000 # chr16_JH584306_alt 480699 # chr7_JH792828_fix 473738 # chr3_GL456044v2_alt 466430 # chr1_KK082441_alt 456798 # chrX_GL456003v2_alt 414761 # chr2_GL456024v2_alt 410905 # chr4_KQ030486_fix 399265 # chr16_JH584313_alt 374389 # chr4_JH792826_fix 368286 # chr18_JH792829_fix 352455 # chrY_JH792834_fix 331480 # chr11_KB469739_fix 331111 # chr1_JH584322_alt 326027 # chr3_GL456006_alt 320308 # chr4_KQ030487_fix 316842 # chr12_KB469740_fix 316140 # chr6_GL456012v2_alt 312568 # chr3_GL456048_alt 292416 # chr17_GL456002v2_alt 285971 # chr4_JH584269_alt 284739 # chr1_GL455991_alt 281255 # chr17_JH584308_alt 273566 # chr3_GL456007_alt 273369 # chr1_GL455993_alt 272653 # chr15_KQ030493_fix 269476 # chr16_KB469741_fix 267241 # chr4_GL456075_alt 264822 # chr5_GL455995_alt 262166 # chr16_JH584310_alt 261336 # chrX_KQ030496_fix 260447 # chr8_GL455997_alt 251056 # chr16_JH584307_alt 250595 # chrX_GL456031_alt 247850 # chr11_JH584316_alt 247497 # chr19_JH792830_fix 246751 # chr3_GL456049v2_alt 243366 # chr4_GL456009_alt 240498 # chr18_JH584318_alt 231969 # chrY_JH792833_fix 221588 # chr1_JH584315_alt 220020 # chrX_KQ030497_fix 219721 # chr4_JH584325_alt 215601 # chr3_KQ030484_fix 214957 # chr9_KB469738v3_fix 210641 # chrY_GL456082_alt 209635 # chr5_JH792827_fix 205713 # chrY_GL456072_alt 204663 # chr6_GL456065_alt 198515 # chrY_GL456073_alt 196014 # chrX_GL456032_alt 194271 # chr4_GL456076_alt 191695 # chrY_GL456080_alt 191521 # chr16_JH584311_alt 189707 # chrY_GL456071_alt 189119 # chr4_KQ030489_fix 188269 # chr11_JH584317_alt 186379 # chr4_KQ030485_fix 185548 # chrY_GL456081_alt 185346 # chr16_JH584312_alt 184044 # chr8_GL455996_alt 183135 # chrX_JH792831_fix 182256 # chr5_GL456011_alt 180394 # chr13_JH584305_alt 179252 # chr16_GL456028v2_alt 176974 # chr16_JH584314_alt 170763 # chr4_GL455994_alt 170592 # chr11_GL455998_alt 170397 # chr15_KQ030492_fix 166012 # chr17_JH584309_alt 162935 # chr3_GL456045v2_alt 161821 # chr11_GL456016_alt 161334 # chr6_KK082442_fix 154766 # chr1_GL456005_alt 154284 # chr6_GL456025_alt 145152 # chrna_GL456050_alt 142341 # chr17_GL456069_alt 138490 # chr4_GL456064_alt 138473 # chr13_GL455999v2_alt 138435 # chr10_KQ030491_fix 129865 # chr3_GL456008_alt 124933 # chrX_GL456004_alt 123835 # chr1_GL455992v2_alt 120856 # chr9_KQ030490_fix 120154 # chr14_GL456020_alt 105813 # chr13_GL455990_alt 100803 # chr12_GL456078_alt 99963 # chr7_GL456014_alt 91251 # chr15_JH584270_alt 89177 # chr15_GL456000_alt 88442 # chr18_GL456070_alt 85548 # chr12_GL456074_alt 84250 # chr12_GL456349_alt 83612 # chr4_GL456077_alt 83218 # chrX_GL456033v2_alt 78689 # chr10_GL456015_alt 77891 # chr4_GL456010_alt 76099 # chr4_JH584268_alt 69753 # chr12_GL456068_alt 59641 # chr7_GL455989_alt 55856 # chr19_GL456079_alt 46440 # chr4_KQ030488_fix 45901 # chr7_GL456013_alt 34498 # chr17_GL456021v2_alt 28130 # chr19_JH584319_alt 25407 zcat genbank/GCA_000001635.6_GRCm38.p4_assembly_structure/*/alt_scaffolds/AGP/*.agp.gz \ | sed -f genbankToUCSC.sed > mm10Patch4.agp checkAgpAndFa mm10Patch4.agp mm10Patch4.unmasked.2bit | tail -1 # All AGP and FASTA entries agree - both files are valid ############################################################################# # build mm10Patch4 database (DONE - 2016-05-16 - Hiram) # need this database for netClass operations during the chain/net # construction mkdir /hive/data/genomes/mm10Patch4 cd /hive/data/genomes/mm10Patch4 mkdir /gbdb/mm10Patch4 ln -s /hive/data/genomes/mm10/bed/mm10Patch4/mm10Patch4.unmasked.2bit ./ ln -s /hive/data/genomes/mm10/bed/mm10Patch4/mm10Patch4.agp ./ twoBitInfo mm10Patch4.unmasked.2bit stdout | sort -k2nr > chrom.sizes mkdir -p bed/chromInfo awk '{printf "%s\t%d\t/gbdb/mm10Patch4/mm10Patch4.2bit\n", $1, $2}' \ chrom.sizes > bed/chromInfo/chromInfo.tab hgsql -e 'create database mm10Patch4;' mm10 hgsql mm10Patch4 < $HOME/kent/src/hg/lib/grp.sql hgLoadSqlTab mm10Patch4 chromInfo $HOME/kent/src/hg/lib/chromInfo.sql \ bed/chromInfo/chromInfo.tab hgGoldGapGl -noGl mm10Patch4 mm10Patch4.agp featureBits -or -countGaps mm10Patch4 gold gap # 72697066 bases of 72697066 (100.000%) in intersection hgsql hgcentraltest -e 'INSERT INTO dbDb (name, description, nibPath, organism, defaultPos, active, orderKey, genome, scientificName, htmlPath, hgNearOk, hgPbOk, sourceName, taxId) VALUES ("mm10Patch4", "Mar. 2015", "/gbdb/mm10Patch4", "GRCm38.p4", "chr4_KQ030486_fix:1-399265", 1, 7764, "GRCm38.p4", "Mus musculus", "/gbdb/mm10Patch4/html/description.html", 0, 0, "GRCm38 Genome Reference Consortium Mouse Build 38 patch release 4", 10090); INSERT INTO defaultDb (genome, name) VALUES ("GRCm38.p4", "mm10Patch4"); INSERT INTO genomeClade (genome, clade, priority) VALUES ("GRCm38.p4", "haplotypes", 198);' mkdir html # copy description.html from mm10Patch1/html/description.html # edit to update for Patch4 mkdir -p /hive/data/genomes/mm10Patch4/bed/gc5Base cd /hive/data/genomes/mm10Patch4/bed/gc5Base hgGcPercent -wigOut -doGaps -file=stdout -win=5 -verbose=0 mm10Patch4 \ ../../mm10Patch4.unmasked.2bit | wigEncode stdin gc5Base.{wig,wib} # Converted stdin, upper limit 100.00, lower limit 0.00 mkdir /gbdb/mm10Patch4/wib ln -s `pwd`/bed/gc5Base/gc5Base.wib /gbdb/mm10Patch4/wib cd bed/gc5Base hgLoadWiggle -pathPrefix=/gbdb/mm10Patch4/wib mm10Patch4 gc5Base \ gc5Base.wig mkdir /hive/data/genomes/mm10Patch4/bed/repeatMasker cd /hive/data/genomes/mm10Patch4/bed/repeatMasker time (doRepeatMasker.pl -bigClusterHub=ku \ -workhorse=hgwdev -dbHost=hgwdev -buildDir=`pwd` mm10Patch4) \ > do.log 2>&1 # real 64m1.494s cat faSize.rmsk.txt # 72697066 bases (1268704 N's 71428362 real 39545014 upper 31883348 lower) # in 130 sequences in 1 files # Total size: mean 559208.2 sd 959105.8 min 25407 (chr19_JH584319_alt) # max 5956088 (chr6_GL456054v2_alt) median 221588 # %43.86 masked total, %44.64 masked real mkdir /hive/data/genomes/mm10Patch4/bed/simpleRepeat cd /hive/data/genomes/mm10Patch4/bed/simpleRepeat time (doSimpleRepeat.pl -bigClusterHub=ku -workhorse=hgwdev \ -smallClusterHub=ku -buildDir=`pwd` mm10Patch4) > do.log 2>&1 # real 6m33.708s cat fb.simpleRepeat # cat 2578582 bases of 71430661 (3.610%) in intersection # the simpleRepeat procedure fails in the cleanup step since there # is no TrfPart directory mkdir /hive/data/genomes/mm10Patch4/bed/windowMasker cd /hive/data/genomes/mm10Patch4/bed/windowMasker time (doWindowMasker.pl -buildDir=`pwd` -workhorse=hgwdev \ -dbHost=hgwdev mm10Patch4) > do.log 2>&1 # real 2m44.252s cat fb.mm10Patch4.rmsk.windowmaskerSdust.txt # 14367720 bases of 72697066 (19.764%) in intersection cd /hive/data/genomes/mm10Patch4 twoBitMask mm10Patch4.rmsk.2bit \ -add bed/simpleRepeat/trfMask.bed mm10Patch4.2bit twoBitToFa mm10Patch4.2bit stdout | faSize stdin # 72697066 bases (1268704 N's 71428362 real 39482408 upper 31945954 lower) # in 130 sequences in 1 files # Total size: mean 559208.2 sd 959105.8 min 25407 (chr19_JH584319_alt) # max 5956088 (chr6_GL456054v2_alt) median 221588 # %43.94 masked total, %44.72 masked real ln -s `pwd`/mm10Patch4.2bit /gbdb/mm10Patch4 ######################################################################### # ucscToINSDC table/track (DONE - 2016-04-14 - Hiram) # the sequence here is working for a 'refseq' assembly with a chrM # situation may be specific depending upon what is available in the assembly mkdir /hive/data/genomes/mm10Patch4/bed/ucscToINSDC cd /hive/data/genomes/mm10Patch4/bed/ucscToINSDC # special script to get the names directly out of the assembly report: ./processReport.pl | sort > ucscToINSDC.txt awk '{printf "%s\t%s\n", $2, $1}' ucscToINSDC.txt | sort > insdcToUcsc.txt # there is no name for chrM/NC_001323.1 sequence, there is no such # sequence with an INSDC name grep -v "^#" /hive/data/genomes/mm10/bed/mm10Patch4/genbank/GCA_000001635.6_GRCm38.p4_assembly_report.txt | cut -f5,7 \ | sed -e 's/na\b/notAvailable/;' | awk '{printf "%s\t%s\n", $1, $2}' \ | sort > insdc.refseq.txt # the sed \b means to match word awk '{printf "%s\t0\t%d\n", $1,$2}' ../../chrom.sizes \ | sort > name.coordinate.tab join insdc.refseq.txt insdcToUcsc.txt | tr '[ ]' '[\t]' | sort -k3 \ | join -2 3 name.coordinate.tab - | tr '[ ]' '[\t]' | cut -f1-3,5 \ > ucscToINSDC.bed # should be same line counts throughout (except for insdc.refseq.txt): wc -l * # 196 insdc.refseq.txt # 130 insdcToUcsc.txt # 130 name.coordinate.tab # 130 ucscToINSDC.bed # 130 ucscToINSDC.txt cut -f1 ucscToINSDC.bed | awk '{print length($0)}' | sort -n | tail -1 # 20 # use the 20 in this sed sed -e "s/21/20/" $HOME/kent/src/hg/lib/ucscToINSDC.sql \ | hgLoadSqlTab mm10Patch4 ucscToINSDC stdin ucscToINSDC.bed checkTableCoords mm10Patch4 # should cover %100 entirely: featureBits -countGaps mm10Patch4 ucscToINSDC # 72697066 bases of 72697066 (100.000%) in intersection join -1 2 <(sort -k2 ucscToINSDC.txt) insdc.refseq.txt | tr '[ ]' '[\t]' \ | sort -k2 | join -2 2 name.coordinate.tab - | tr '[ ]' '[\t]' \ | cut -f1-4 > ucscToRefSeq.bed cut -f1 ucscToRefSeq.bed | awk '{print length($0)}' | sort -n | tail -1 # 20 # use the 20 in this sed sed -e "s/21/20/" $HOME/kent/src/hg/lib/ucscToINSDC.sql \ | sed -e 's/INSDC/RefSeq/g;' > ucscToRefSeq.sql hgLoadSqlTab mm10Patch4 ucscToRefSeq ./ucscToRefSeq.sql ucscToRefSeq.bed checkTableCoords mm10Patch4 -table=ucscToRefSeq # should cover %100 all bases: featureBits -countGaps mm10Patch4 ucscToRefSeq # 72697066 bases of 72697066 (100.000%) in intersection ############################################################################## # cpgIslands on UNMASKED sequence (DONE - 2016-05-17 - Hiram) mkdir /hive/data/genomes/mm10Patch4/bed/cpgIslandsUnmasked cd /hive/data/genomes/mm10Patch4/bed/cpgIslandsUnmasked time (doCpgIslands.pl -dbHost=hgwdev -bigClusterHub=ku -buildDir=`pwd` \ -tableName=cpgIslandExtUnmasked \ -maskedSeq=/hive/data/genomes/mm10Patch4/mm10Patch4.unmasked.2bit \ -workhorse=hgwdev -smallClusterHub=ku mm10Patch4) > do.log 2>&1 # real 1m16.641s cat fb.mm10Patch4.cpgIslandExtUnmasked.txt # 563715 bases of 71430661 (0.789%) in intersection ########################################################################## # cpgIslands - (DONE - 2016-05-17 - Hiram) mkdir /hive/data/genomes/mm10Patch4/bed/cpgIslands cd /hive/data/genomes/mm10Patch4/bed/cpgIslands time (doCpgIslands.pl -dbHost=hgwdev -bigClusterHub=ku \ -workhorse=hgwdev -smallClusterHub=ku mm10Patch4) > do.log 2>&1 # real 1m14.891s cat fb.mm10Patch4.cpgIslandExt.txt # 558038 bases of 71430661 (0.781%) in intersection ############################################################################## # cytoBandIdeo - (DONE - 2016-05-17 - Hiram) mkdir /hive/data/genomes/mm10Patch4/bed/cytoBand cd /hive/data/genomes/mm10Patch4/bed/cytoBand makeCytoBandIdeo.csh mm10Patch4 ######################################################################### # CREATE MICROSAT TRACK (DONE - 2016-04-14 - Hiram) ssh hgwdev mkdir /hive/data/genomes/mm10Patch4/bed/microsat cd /hive/data/genomes/mm10Patch4/bed/microsat awk '($5==2 || $5==3) && $6 >= 15 && $8 == 100 && $9 == 0 {printf("%s\t%s\t%s\t%dx%s\n", $1, $2, $3, $6, $16);}' \ ../simpleRepeat/simpleRepeat.bed > microsat.bed hgLoadBed mm10Patch4 microsat microsat.bed # Read 2030 elements of size 4 from microsat.bed ############################################################################# # lastz alignments to mm10 (DONE - 2016-05-17 - Hiram) ############################################################################# mkdir /hive/data/genomes/mm10/bed/mm10Patch4/lastzMm10.2016-05-17 cd /hive/data/genomes/mm10/bed/mm10Patch4/lastzMm10.2016-05-17 # prepare bits of mm10 sequence to lastz align to the patches, # this is selecting out the specific section of mm10 where the patch # is supposed to match, and setting up lastz parameters rm -fr mm10Bits run.blastz run.blastz/tParts run.blastz/qParts psl mm10Bits.lift mkdir -p mm10Bits run.blastz run.blastz/tParts run.blastz/qParts psl cut -f4 ../patchLocations.bed | while read FIX do chr=`grep "${FIX}" ../patchLocations.bed | cut -f1` start=`grep "${FIX}" ../patchLocations.bed | cut -f2` end=`grep "${FIX}" ../patchLocations.bed | cut -f3` bitSize=`echo ${end} ${start} | awk '{printf "%d", $1-$2}'` chrSize=`grep -w "${chr}" ../../../chrom.sizes | cut -f2` # printf "grep '${FIX}' ../mm10Patch4.chrom.sizes | cut -f2\n" 1>&2 fixSize=`grep "${FIX}" ../mm10Patch4.chrom.sizes | cut -f2` # printf "${chr}:${start}-${end} vs. ${FIX}:0-${fixSize}\n" 1>&2 twoBitToFa /hive/data/genomes/mm10/mm10.unmasked.2bit:${chr}:${start}-${end} stdout \ | sed -e "s/${chr}:/${FIX}_/g" > mm10Bits/${FIX}.fa fixName=`head -1 mm10Bits/${FIX}.fa | sed -e 's/>//;'` printf "${start}\t${fixName}\t${fixSize}\t${chr}\t${chrSize}\n" 1>&2 printf "${start}\t${fixName}\t${fixSize}\t${chr}\t${chrSize}\n" >> mm10Bits.lift printf "/hive/data/genomes/mm10/bed/mm10Patch4/lastzMm10.2016-05-17/mm10Bits.2bit:${fixName}:0-${bitSize}\n" 1>&2 printf "/hive/data/genomes/mm10/bed/mm10Patch4/lastzMm10.2016-05-17/mm10Bits.2bit:${fixName}:0-${bitSize}\n" > run.blastz/qParts/${fixName}.lst printf "/hive/data/genomes/mm10/bed/mm10Patch4/mm10Patch4.unmasked.2bit:${FIX}:0-${fixSize}\n" > run.blastz/tParts/${fixName}.lst printf "/cluster/bin/scripts/blastz-run-ucsc -outFormat psl tParts/${fixName}.lst qParts/${fixName}.lst ../DEF {check out exists ../psl/${fixName}.psl}\n" 1>&2 printf "/cluster/bin/scripts/blastz-run-ucsc -outFormat psl tParts/${fixName}.lst qParts/${fixName}.lst ../DEF {check out exists ../psl/${fixName}.psl}\n" >> run.blastz/jobList done faToTwoBit mm10Bits/*.fa mm10Bits.2bit twoBitInfo mm10Bits.2bit stdout | sort -k2n > mm10Bits.chrom.sizes printf 'BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.66/bin/lastz # mouse vs mouse # maximum M allowed with lastz is only 254 BLASTZ_M=254 # lastz does not like the O= and E= lines in the matrix file BLASTZ_Q=/scratch/data/blastz/human_chimp.v2.q BLASTZ_O=600 BLASTZ_E=150 # other parameters from hg18 vs venter1 lastz on advice from Webb BLASTZ_K=10000 BLASTZ_Y=15000 BLASTZ_T=2 # TARGET: Mouse Mm10Patch4 SEQ1_DIR=/hive/data/genomes/mm10Patch4/mm10Patch4.unmasked.2bit SEQ1_LEN=/hive/data/genomes/mm10Patch4/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 SEQ1_LIMIT=1 SEQ1_IN_CONTIGS=0 # QUERY: Mouse Mm10 SEQ2_DIR=/hive/data/genomes/mm10/mm10.unmasked.2bit SEQ2_LEN=/scratch/data/mm10/chrom.sizes SEQ2_CTGDIR=/hive/data/genomes/mm10/bed/mm10Patch4/lastzMm10.2016-05-17/mm10Bits.2bit SEQ2_CTGLEN=/hive/data/genomes/mm10/bed/mm10Patch4/lastzMm10.2016-05-17/mm10Bits.chrom.sizes SEQ2_LIFT=/hive/data/genomes/mm10/bed/mm10Patch4/lastzMm10.2016-05-17/mm10Bits.lift SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_IN_CONTIGS=0 SEQ2_LIMIT=1 BASE=/hive/data/genomes/mm10/bed/mm10Patch4/lastzMm10.2016-05-17 TMPDIR=/dev/shm ' > DEF ssh ku cd /hive/data/genomes/mm10/bed/mm10Patch4/lastzMm10.2016-05-17/run.blastz para create jobList para try ... check ... push ... etc para time > run.time # Completed: 122 of 122 jobs # CPU time in finished jobs: 233s 3.89m 0.06h 0.00d 0.000 y # IO & Wait Time: 376s 6.26m 0.10h 0.00d 0.000 y # Average job time: 5s 0.08m 0.00h 0.00d # Longest finished job: 22s 0.37m 0.01h 0.00d # Submission to last job: 47s 0.78m 0.01h 0.00d # put together the individual results cd /hive/data/genomes/mm10/bed/mm10Patch4/lastzMm10.2016-05-17 mkdir pslParts cat psl/chr*.psl | gzip -c > pslParts/mm10Patch4.mm10.psl.gz # constructing a chain from those results mkdir -p /hive/data/genomes/mm10/bed/mm10Patch4/lastzMm10.2016-05-17/axtChain/run cd /hive/data/genomes/mm10/bed/mm10Patch4/lastzMm10.2016-05-17/axtChain/run time zcat ../../pslParts/mm10Patch4.mm10.psl.gz \ | axtChain -psl -verbose=0 -scoreScheme=/scratch/data/blastz/human_chimp.v2.q -minScore=2000 -linearGap=medium stdin \ /hive/data/genomes/mm10/bed/mm10Patch4/mm10Patch4.unmasked.2bit \ /hive/data/genomes/mm10/bed/mm10Patch4/lastzMm10.2016-05-17/mm10Bits.2bit \ stdout \ | chainAntiRepeat /hive/data/genomes/mm10/bed/mm10Patch4/mm10Patch4.unmasked.2bit \ /hive/data/genomes/mm10/bed/mm10Patch4/lastzMm10.2016-05-17/mm10Bits.2bit \ stdin mm10Patch4.mm10.preLift.chain # real 0m16.050s # -rw-rw-r-- 1 6410864 May 17 11:03 mm10Patch4.mm10.preLift.chain liftUp -chainQ mm10Patch4.mm10.lifted.chain \ ../../mm10Bits.lift carry mm10Patch4.mm10.preLift.chain # constructing the net files: cd /hive/data/genomes/mm10/bed/mm10Patch4/lastzMm10.2016-05-17/axtChain chainMergeSort run/mm10Patch4.mm10.lifted.chain \ | gzip -c > mm10Patch4.mm10.all.chain.gz chainSplit chain mm10Patch4.mm10.all.chain.gz # Make nets ("noClass", i.e. without rmsk/class stats which are added later): time chainPreNet -inclHap mm10Patch4.mm10.all.chain.gz \ ../../mm10Patch4.chrom.sizes \ /hive/data/genomes/mm10/chrom.sizes stdout \ | chainNet -inclHap stdin -minSpace=1 ../../mm10Patch4.chrom.sizes \ /hive/data/genomes/mm10/chrom.sizes stdout /dev/null \ | netSyntenic stdin noClass.net # real 0m0.811s # -rw-rw-r-- 1 1398298 May 17 11:06 noClass.net hgLoadChain -tIndex mm10Patch4 chainMm10 mm10Patch4.mm10.all.chain.gz # Loading 33147 chains into mm10Patch4.chainMm10 featureBits mm10Patch4 chainMm10Link # 65748666 bases of 71430661 (92.045%) in intersection netClass -verbose=0 -noAr noClass.net mm10Patch4 mm10 mm10Patch4.mm10.net netFilter -minGap=10 mm10Patch4.mm10.net \ | hgLoadNet -verbose=0 mm10Patch4 netMm10 stdin # Make liftOver chains: netChainSubset -verbose=0 noClass.net mm10Patch4.mm10.all.chain.gz stdout \ | chainStitchId stdin stdout | gzip -c > mm10Patch4.mm10.over.chain.gz # -rw-rw-r-- 1 225547 May 17 11:09 mm10Patch4.mm10.over.chain.gz # Make axtNet for download: one .axt per mm10Patch4 seq. netSplit noClass.net net cd .. mkdir -p axtNet # beware, tcsh scripting here: foreach f (axtChain/net/*.net) netToAxt $f axtChain/chain/$f:t:r.chain \ ../mm10Patch4.unmasked.2bit \ /hive/data/genomes/mm10/mm10.unmasked.2bit stdout \ | axtSort stdin stdout \ | gzip -c > axtNet/$f:t:r.mm10Patch4.mm10.net.axt.gz end # Make mafNet for multiz: one .maf per mm10Patch4 seq. mkdir -p mafNet # beware, tcsh scripting here: foreach f (axtNet/*.mm10Patch4.mm10.net.axt.gz) axtToMaf -tPrefix=mm10Patch4. -qPrefix=mm10. $f \ ../mm10Patch4.chrom.sizes \ /hive/data/genomes/mm10/chrom.sizes \ stdout \ | gzip -c > mafNet/$f:t:r:r:r:r:r.maf.gz end ############################################################################# # run this same business with mm10 as target, Patch4 sequence as query # (DONE - 2016-05-17 - Hiram) mkdir /hive/data/genomes/mm10/bed/mm10Patch4/lastzMm10Patch4.2016-05-17 cd /hive/data/genomes/mm10/bed/mm10Patch4/lastzMm10Patch4.2016-05-17 printf '# mouse vs. mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.66/bin/lastz # maximum M allowed with lastz is only 254 BLASTZ_M=254 # lastz does not like the O= and E= lines in the matrix file BLASTZ_Q=/scratch/data/blastz/human_chimp.v2.q BLASTZ_O=600 BLASTZ_E=150 # other parameters from hg18 vs venter1 lastz on advice from Webb BLASTZ_K=10000 BLASTZ_Y=15000 BLASTZ_T=2 # TARGET: Mouse Mm10 SEQ1_DIR=/hive/data/genomes/mm10/mm10.unmasked.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CTGDIR=/hive/data/genomes/mm10/bed/mm10Patch4/lastzMm10Patch4.2016-05-17/mm10Bits.2bit SEQ1_CTGLEN=/hive/data/genomes/mm10/bed/mm10Patch4/lastzMm10Patch4.2016-05-17/mm10Bits.chrom.sizes SEQ1_LIFT=/hive/data/genomes/mm10/bed/mm10Patch4/lastzMm10Patch4.2016-05-17/mm10Bits.lift SEQ1_CHUNK=10000000 SEQ1_LAP=10000 SEQ1_IN_CONTIGS=0 SEQ1_LIMIT=1 # QUERY: Mouse Mm10Patch4 SEQ2_DIR=/hive/data/genomes/mm10/bed/mm10Patch4/mm10Patch4.unmasked.2bit SEQ2_LEN=/hive/data/genomes/mm10/bed/mm10Patch4/mm10Patch4.chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_IN_CONTIGS=0 SEQ2_LIMIT=1 BASE=/hive/data/genomes/mm10/bed/mm10Patch4/lastzMm10Patch4.2016-05-17 TMPDIR=/dev/shm ' > DEF rm -fr mm10Bits run.blastz psl mm10Bits.lift mkdir -p mm10Bits run.blastz/tParts run.blastz/qParts psl cut -f4 ../patchLocations.bed | while read FIX do chr=`grep "${FIX}" ../patchLocations.bed | cut -f1` start=`grep "${FIX}" ../patchLocations.bed | cut -f2` end=`grep "${FIX}" ../patchLocations.bed | cut -f3` bitSize=`echo ${end} ${start} | awk '{printf "%d", $1-$2}'` chrSize=`grep -w "${chr}" ../../../chrom.sizes | cut -f2` fixSize=`grep "${FIX}" ../mm10Patch4.chrom.sizes | cut -f2` echo ${chr}:${start}-${end} vs. ${FIX}:0-${fixSize} 1>&2 twoBitToFa /hive/data/genomes/mm10/mm10.unmasked.2bit:${chr}:${start}-${end} stdout \ | sed -e "s/${chr}:/${FIX}_/g" > mm10Bits/${FIX}.fa fixName=`head -1 mm10Bits/${FIX}.fa | sed -e 's/>//;'` printf "${start}\t${fixName}\t${fixSize}\t${chr}\t${chrSize}\n" >> mm10Bits.lift printf "/hive/data/genomes/mm10/bed/mm10Patch4/lastzMm10Patch4.2016-05-17/mm10Bits.2bit:${fixName}:0-${bitSize}\n" > run.blastz/tParts/${FIX}.lst printf "/hive/data/genomes/mm10/bed/mm10Patch4/mm10Patch4.unmasked.2bit:${FIX}:0-${fixSize}\n" > run.blastz/qParts/${FIX}.lst printf "/cluster/bin/scripts/blastz-run-ucsc -outFormat psl tParts/${FIX}.lst qParts/${FIX}.lst ../DEF {check out exists ../psl/${FIX}.psl}\n" >> run.blastz/jobList done faToTwoBit mm10Bits/*.fa mm10Bits.2bit twoBitInfo mm10Bits.2bit stdout | sort -k2nr > mm10Bits.chrom.sizes ssh ku cd /hive/data/genomes/mm10/bed/mm10Patch4/lastzMm10Patch4.2016-05-17/run.blastz para create jobList para try ... check ... push ... etc para time # Completed: 122 of 122 jobs # CPU time in finished jobs: 225s 3.75m 0.06h 0.00d 0.000 y # IO & Wait Time: 361s 6.01m 0.10h 0.00d 0.000 y # Average job time: 5s 0.08m 0.00h 0.00d # Longest finished job: 20s 0.33m 0.01h 0.00d # Submission to last job: 43s 0.72m 0.01h 0.00d # put together the individual results cd /hive/data/genomes/mm10/bed/mm10Patch4/lastzMm10Patch4.2016-05-17 mkdir pslParts cat psl/chr*.psl | gzip -c > pslParts/mm10.mm10Patch4.psl.gz # -rw-rw-r-- 1 3139605 May 17 11:27 mm10.mm10Patch4.psl.gz # constructing a chain from those results mkdir -p /hive/data/genomes/mm10/bed/mm10Patch4/lastzMm10Patch4.2016-05-17/axtChain/run cd /hive/data/genomes/mm10/bed/mm10Patch4/lastzMm10Patch4.2016-05-17/axtChain/run time zcat ../../pslParts/mm10.mm10Patch4.psl.gz \ | axtChain -psl -verbose=0 -scoreScheme=/scratch/data/blastz/human_chimp.v2.q -minScore=2000 -linearGap=medium stdin \ /hive/data/genomes/mm10/bed/mm10Patch4/lastzMm10Patch4.2016-05-17/mm10Bits.2bit \ /hive/data/genomes/mm10/bed/mm10Patch4/mm10Patch4.unmasked.2bit \ stdout \ | chainAntiRepeat /hive/data/genomes/mm10/bed/mm10Patch4/lastzMm10Patch4.2016-05-17/mm10Bits.2bit \ /hive/data/genomes/mm10/bed/mm10Patch4/mm10Patch4.unmasked.2bit \ stdin mm10.mm10Patch4.preLift.chain # real 0m16.904s # -rw-rw-r-- 1 6251918 May 17 11:28 mm10.mm10Patch4.preLift.chain liftUp mm10.mm10Patch4.lifted.chain \ ../../mm10Bits.lift carry mm10.mm10Patch4.preLift.chain # constructing the net files: cd /hive/data/genomes/mm10/bed/mm10Patch4/lastzMm10Patch4.2016-05-17/axtChain chainMergeSort run/mm10.mm10Patch4.lifted.chain \ | gzip -c > mm10.mm10Patch4.all.chain.gz # -rw-rw-r-- 1 1423248 May 17 11:30 mm10.mm10Patch4.all.chain.gz hgLoadChain -tIndex mm10 chainMm10Patch4 mm10.mm10Patch4.all.chain.gz # Loading 31778 chains into mm10.chainMm10Patch4 featureBits mm10 chainMm10Patch4Link # 56269294 bases of 2652783500 (2.121%) in intersection chainSplit chain mm10.mm10Patch4.all.chain.gz # Make nets ("noClass", i.e. without rmsk/class stats which are added later): time chainPreNet -inclHap mm10.mm10Patch4.all.chain.gz \ /hive/data/genomes/mm10/chrom.sizes \ ../../mm10Patch4.chrom.sizes stdout \ | chainNet -inclHap stdin -minSpace=1 /hive/data/genomes/mm10/chrom.sizes \ ../../mm10Patch4.chrom.sizes stdout /dev/null \ | netSyntenic stdin noClass.net # real 0m0.576s netClass -verbose=0 -noAr noClass.net mm10 mm10Patch4 mm10.mm10Patch4.net # -rw-rw-r-- 1 2854952 Jun 29 16:44 mm10.mm10Patch4.net netFilter -minGap=10 mm10.mm10Patch4.net \ | hgLoadNet -verbose=0 mm10 netMm10Patch4 stdin # Make liftOver chains: netChainSubset -verbose=0 noClass.net mm10.mm10Patch4.all.chain.gz stdout \ | chainStitchId stdin stdout | gzip -c > mm10.mm10Patch4.over.chain.gz # -rw-rw-r-- 1 202518 Jun 29 16:47 mm10.mm10Patch4.over.chain.gz # Make axtNet for download: one .axt per mm10Patch4 seq. netSplit noClass.net net cd .. mkdir -p axtNet # more tcsh here foreach f (axtChain/net/*.net) netToAxt $f axtChain/chain/$f:t:r.chain \ /hive/data/genomes/mm10/mm10.unmasked.2bit \ ../mm10Patch4.unmasked.2bit stdout \ | axtSort stdin stdout \ | gzip -c > axtNet/$f:t:r.mm10.mm10Patch4.net.axt.gz end # Make mafNet for multiz: one .maf per mm10Patch4 seq. mkdir -p mafNet # tcsh loop again foreach f (axtNet/*.mm10.mm10Patch4.net.axt.gz) axtToMaf -tPrefix=mm10. -qPrefix=mm10Patch4. $f \ /hive/data/genomes/mm10/chrom.sizes \ ../mm10Patch4.chrom.sizes \ stdout \ | gzip -c > mafNet/$f:t:r:r:r:r:r.maf.gz end cd /hive/data/genomes/mm10/bed/mm10Patch4/lastzMm10Patch4.2016-05-17/axtChain mkdir -p queryChains chainSplit -q queryChains mm10.mm10Patch4.all.chain.gz # then run a 'lift over' chain/net on each single one mkdir -p singleLiftOver for F in queryChains/*.chain do C=`basename ${F}` B=`echo ${C} | sed -e "s/.chain//"` chainPreNet -inclHap ${F} /hive/data/genomes/mm10/chrom.sizes \ ../../mm10Patch4.chrom.sizes stdout \ | chainNet -inclHap stdin -minSpace=1 /hive/data/genomes/mm10/chrom.sizes \ ../../mm10Patch4.chrom.sizes singleLiftOver/${B}.raw.net \ /dev/null netSyntenic singleLiftOver/${B}.raw.net singleLiftOver/${B}.noClass.net netFilter -chimpSyn singleLiftOver/${B}.noClass.net > singleLiftOver/${B}.chimpSyn.net netChainSubset -verbose=0 singleLiftOver/${B}.noClass.net \ ${F} stdout \ | chainStitchId stdin stdout > singleLiftOver/${C} printf "${F} -> singleLiftOver/${C}" 1>&2 done # put the chains together into one file chainMergeSort singleLiftOver/chr*.chain | gzip -c \ > mm10.mm10Patch4.single.over.chain.gz # -rw-rw-r-- 1 268273 May 17 11:38 mm10.mm10Patch4.single.over.chain.gz # construct psl files from those chains chainToPsl mm10.mm10Patch4.single.over.chain.gz \ /hive/data/genomes/mm10/chrom.sizes \ ../../mm10Patch4.chrom.sizes \ /hive/data/genomes/mm10/mm10.unmasked.2bit \ ../../mm10Patch4.unmasked.2bit \ mm10.mm10Patch4.over.psl # pslCheck reports no errors pslCheck -db=mm10 mm10.mm10Patch4.over.psl # checked: 1417 failed: 0 errors: 0 # load this PSL track # this table name prefix altSeqLiftOverPsl is recognized in hgc clicks hgLoadPsl mm10 -table=altSeqLiftOverPslP4 mm10.mm10Patch4.over.psl mkdir /hive/data/genomes/mm10/bed/mm10Patch4/seqExt cd /hive/data/genomes/mm10/bed/mm10Patch4/seqExt twoBitToFa ../mm10Patch4.unmasked.2bit mm10Patch4.fa mkdir -p /gbdb/mm10/mm10Patch4 mm10Patch4 faSplit byname mm10Patch4.fa ./mm10Patch4/ ln -s `pwd`/mm10Patch4/*.fa /gbdb/mm10/mm10Patch4 hgLoadSeq -drop -seqTbl=seqMm10Patch4 -extFileTbl=extMm10Patch4 mm10 \ /gbdb/mm10/mm10Patch4/*.fa ############################################################################# # wrap up scripts and file editing (trackDb, hgdownload) # (DONE - 2016-05-17 - Hiram) # edit ${HOME}/kent/src/hg/makeDb/doc/mm10/patchDescr.pl to reflect this most # recent patch version. cd /hive/data/genomes/mm10/bed/mm10Patch4 ${HOME}/kent/src/hg/makeDb/doc/mm10Patch4/patchDescr.pl \ > ${HOME}/kent/src/hg/makeDb/trackDb/mouse/mm10/mm10Patch4.html mkdir /hive/data/genomes/mm10/bed/mm10Patch4/downloads cd /hive/data/genomes/mm10/bed/mm10Patch4/downloads ln -s ../mm10Patch4.unmasked.2bit mm10Patch4.2bit ln -s ../mm10Patch4.agp ln -s ../mm10Patch4.fa.gz md5sum * > md5sum.txt mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/mm10/mm10Patch4 ln -s `pwd`/* /usr/local/apache/htdocs-hgdownload/goldenPath/mm10/mm10Patch4/ # copy README.txt from previous patch and edit as needed - replace the # patch description lines with the output of running # /hive/data/genomes/mm10/bed/mm10Patch4/readmeLines.pl /cluster/home/hiram/kent/src/hg/makeDb/doc/mm10Patch4/readmeLines.pl \ genbank/GCA_000001635.6_GRCm38.p4_assembly_report.txt \ genbank/GCA_000001635.6_GRCm38.p4_assembly_regions.txt \ patchLocations.bed new.sequences.list #############################################################################