############################################################################ # DBSNP B147 / SNP147 (DONE 2/3/17 jcasper) # RedMine #16847 screen mkdir -p /hive/data/outside/dbSNP/147/chicken_galGal5 cd /hive/data/outside/dbSNP/147/chicken_galGal5 # Look at the directory listing of ftp://ftp.ncbi.nih.gov/snp/database/organism_data/ # to find the subdir name to use as orgDir below (chicken_9031 in this case). # Then click into that directory and look for file names like # b(1[0-9][0-9])_ # -- use the first num for build setting in config.ra # has B128 as its most recent build. # The buildAssembly setting in config.ra is empty because dbSNP stopped including # that in file names. # Had to fetch # ftp://ftp.ncbi.nlm.nih.gov/genomes/ASSEMBLY_REPORTS/All/GCF_000003055.6.assembly.txt # manually and install it in the config file because NCBI's FTP server # appears to be having some problems with providing the right header # information (curl and wget on our server complain about it, anyway). cp /hive/data/genomes/galGal5/refseq/GCF_000002315.4_Gallus_gallus-5.0_assembly_report.txt . cat > config.ra <& do.log & tail -f do.log #*** 23439 lines of b147_ContigInfo.bcp.gz contained contig names that #*** could not be mapped to chrom.size via their GenBank contig mappings; see #*** /hive/data/outside/dbSNP/147/chicken_galGal5/cantLiftUpSeqNames.txt . # #*** You must account for all 23870 contig_acc values in config.ra, #*** using the liftUp and/or ignoreDbSnpContigsFile settings (see -help output). #*** Check the auto-generated suggested.lft to see if it covers all #*** 23870 contigs; if it does, add 'liftUp suggested.lft' to config.ra. #*** Then run again with -continue=loadDbSnp cat cantLiftUpSeqNames.txt #unlocalized-scaffold (no chr1_AADN04000798v1_random or chr1_AADN04000798_random in chrom.sizes) NT_456096 chr1 #unlocalized-scaffold (no chr1_AADN04000799v1_random or chr1_AADN04000799_random in chrom.sizes) NT_456097 chr1 #... # Some trial and error is always required to get the config.ra just right. # This time, it turns out that the UCSC contig names for unplaced and # unlocalized scaffolds are based on the RefSeq names instead of GenBank contig # names, which the script doesn't expect. Time to generate some additional liftUp lines. # genLift.pl is as follows: # while ($refName = ) # { # chomp($refName); # $ucscLine = `grep $refName /hive/data/genomes/galGal5/chrom.sizes`; # chomp($ucscLine); # ($ucscName, $size) = split /\t/, $ucscLine; # printf ("0\t%s\t%s\t%s\t%s\t+\n", $refName, $size, $ucscName, $size); # } awk '{print $(NF-1)}' cantLiftUpSeqNames.txt | perl genLift.pl > supplemental.lft cat suggested.lft supplemental.lft > all.lft cat >> config.ra <> do.log 2>&1 & tail -f do.log # *** All done! # # This is a bit disingenuous. There was a further problem, which was that (at # the time of this build, anyway), once dbSNP maps SNPs to a contig, they # continue mapping to the contig in that orientation. This holds even if a # later assembly has that contig in reversed orientation on an assembled # chromosome. This build was the first time where we encountered an assembly # for which that mattered - a number of contigs were reversed in the galGal5 # assembly from how dbSNP mapped to them. doDbSnp.pl was not equipped to # handle the situation, and a few changes had to be made to support it. # # Furthermore, there was a second problem where it turns out that dbSNP was # mistakenly providing coordinates on assembled chromosomes (the phys_pos_from field) # for SNPs on unlocalized scaffolds. The snpNcbiToUcsc program had to be briefly # modified to ignore those values. zcat snp147.bed.gz \ | ~/kent/src/hg/utils/automation/categorizeSnps.pl #Mult: 73129 #Common: 18675 #Flagged: 0 #leftover: 20951514 # 18,675 SNPs out of ~21M that meet the threshold for Common... that's # pretty sparse. Not worth pushing out as a separate subtrack.