########################################################################### ### obtain the Ensembl sequence to create idKeys ########################################################################### mkdir -p /hive/data/genomes/hg38/bed/chromAlias/update.2022-11-09/ensemblRecent cd /hive/data/genomes/hg38/bed/chromAlias/update.2022-11-09/ensemblRecent wget --timestamping \ http://ftp.ensembl.org/pub/release-108/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.toplevel.fa.gz # alternate sequences are too large to place everything into one 2bit file mkdir split faSplit byname Homo_sapiens.GRCh38.dna.toplevel.fa.gz split/ # make 2bit files for each sequence mkdir -p twoBit for F in split/* do echo $F | sed -e 's/.fa$//; s#split/##;' | while read C do printf "faToTwoBit ${F} twoBit/${C}.2bit\n" done done > run.2bit.list # run 32 of them at a time parallel -j 32 < run.2bit.list # create chrom.sizes from those 2bit files: for F in twoBit/*.2bit do twoBitInfo $F stdout done | sort -k2,2nr > ensembl.GRCh38.p13.chrom.sizes # create twoBitDup idKeys for each sequence mkdir -p keys for F in twoBit/* do echo $F | sed -e 's/.2bit$//; s#twoBit/##;' | while read C do printf "twoBitDup ${F} -keyList=keys/${C}.txt\n" done done > run.twoBitDup.list # run 32 at a time: parallel -j 32 < run.twoBitDup.list # put all those keys together into one idKeys.txt file: cat keys/* | sort > ensembl.GRCh38.p13.idKeys.txt ########################################################################### # ready to match idKeys ########################################################################### # chrY doesn't match since Ensembl has masked the PAR on Y # but it is the same length sequence and does match except for the PAR # NOT doing the 'grep -v 270752' to eliminate the contamination sequence cd /hive/data/genomes/hg38/bed/chromAlias/update.2022-11-09/ensemblRecent ( printf "chrY\tY\n" join -t$'\t' ../../../idKeys.p14/hg38.idKeys.txt \ ensembl.GRCh38.p13.idKeys.txt | cut -f2-) | sort > perfect.ucsc.ensembl.tsv # the ones that don't match are because of the construction of the alt # sequence at Ensembl: join -v2 -t$'\t' ../../../idKeys.p14/hg38.idKeys.txt \ ensembl.GRCh38.p13.idKeys.txt | grep -v -w "Y" | cut -f2- \ | sort > noKeyMatch.ucsc.ensembl.tsv ########################################################################### # Extract the alt sequences from the artifical Ensembl chromosomes ########################################################################### cd /hive/data/genomes/hg38/bed/chromAlias/update.2022-11-09/ensemblRecent # this script will remove the leading and trailing N sequences from # the fasta, and it will measure the leading N length to get the lift # coordinate printf '#!/usr/bin/env perl use strict; use warnings; my $argc = scalar(@ARGV); if ($argc != 1) { printf STDERR "usage: ./trimEnds.pl file.fa > file.fa 2> lift.file\n"; exit 255; } my $skipBeginning = 1; my @endTrim; my $liftOffset = 0; my $inFile = shift; my $seqName = $inFile; $seqName =~ s/.fa//; $seqName =~ s#split/##; open (FH, "<$inFile") or die "can not read $inFile"; while (my $line = ) { chomp $line; if ($line =~ m/^>/) { printf "%s\n", $line; } elsif ($skipBeginning && ($line =~ m/^NN*N$/)) { $liftOffset += length($line); } elsif ($skipBeginning) { $skipBeginning = 0; my $lineLength = length($line); $line =~ s/^N+//; $liftOffset += $lineLength - length($line); printf STDERR "%s\t%d\n", $seqName, $liftOffset; push @endTrim, $line; } else { push @endTrim, $line; } } close (FH); my $endTrimmed = 0; my $arrayLength = scalar(@endTrim); for (my $i = $arrayLength - 1; $endTrimmed < 1 && $i > 0; --$i) { if ($endTrim[$i] =~ m/^$/) { delete $endTrim[$i]; } elsif ($endTrim[$i] =~ m/^N*$/) { delete $endTrim[$i]; } else { $endTrimmed = 1; $endTrim[$i] =~ s/N+$//; } } $arrayLength = scalar(@endTrim); for (my $i = 0; $i < $arrayLength; ++$i) { printf "%s\n", $endTrim[$i]; } ' > trimEnds.pl chmod +x trimEnds.pl mkdir -p noNs cut -f2 ../../noKeyMatch.ucsc.ensembl.tsv | grep -v -w Y | while read C do if [ -s "noNs/${C}.fa" ]; then printf "# done ${C}\n" 1>&2 else printf "./runTrim ${C}\n" fi done > trim.run.list mkdir noNs lift printf '#!/bin/bash export C=$1 ./trimEnds.pl split/${C}.fa > noNs/${C}.fa 2> lift/${C}.tsv ' > runTrim chmod +x runTrim # run 32 at a time parallel -j32 < trim.run.list sort lift/*.tsv > ensemblLift.tsv ### and make up a 2bit file with all these trimmed sequences faToTwoBit noNs/*.fa noNs.2bit ### and idKeys via twoBitDup twoBitDup -keyList=stdout noNs.2bit | sort > noNs.idKeys.txt ########################################################################### # continue to match idKeys for these alternate sequences ########################################################################### cd /hive/data/genomes/hg38/bed/chromAlias/update.2022-11-09/ensemblRecent join -t$'\t' ../../../idKeys.p14/hg38.idKeys.txt noNs.idKeys.txt \ | cut -f2- | sort > alts.OK.ucsc.ensembl.tsv # the outstanding mis-matches are due to Ensembl reversed sequence comm -23 <(cut -f2 ../../../idKeys.p14/hg38.idKeys.txt | sort) \ <(cut -f1 alts.OK.ucsc.ensembl.tsv perfect.ucsc.ensembl.tsv | sort) \ > to.be.determined.ucsc.ensembl.txt # reverse those UCSC sequences: mkdir -p ucscRc for C in `cat to.be.determined.ucsc.ensembl.txt` do printf "twoBitToFa /hive/data/genomes/hg38/hg38.2bit:${C} stdout | faRc stdin ucscRc/${C}_r.fa\n" done > faRc.run.list # run 32 at a time parallel -j 32 < faRc.run.list faToTwoBit ucscRc/*_r.fa ./ucscRc.2bit twoBitDup -keyList=stdout ucscRc.2bit | sort > ucscRc.idKeys.txt join -t$'\t' ucscRc.idKeys.txt ./update.2021-09-20/ensemblRecent/noNs.idKeys \ | cut -f2- | sed -e 's/RC_//;' > reversedAlts.ucsc.ensembl.tsv ### and finally, all together now: sort alts.OK.ucsc.ensembl.tsv perfect.ucsc.ensembl.tsv \ reversedAlts.ucsc.ensembl.tsv > ucscToEnsembl.hg38.p14.tsv # verify have matches to each Ensembl sequence: wc -l ensembl.GRCh38.p13.chrom.sizes ucscToEnsembl.hg38.p14.tsv # 639 ensembl.GRCh38.p13.chrom.sizes # 639 ucscToEnsembl.hg38.p14.tsv # save previous ucscToEnsembl table: hgsql -N -e 'select * from ucscToEnsembl;' hg38 \ > hg38.ucscToEnsembl.previous.tsv # this new one is only different since the contamination sequence # has not been removed: diff ucscToEnsembl.hg38.p14.tsv hg38.ucscToEnsembl.previous.tsv 619d618 < chrUn_KI270752v1 KI270752.1 # load the ucscToEnsembl table: printf ' # UCSC to Ensembl chr name translation CREATE TABLE ucscToEnsembl ( ucsc varchar(255) not null, # UCSC chromosome name ensembl varchar(255) not null, # Ensembl chromosome name #Indices PRIMARY KEY(ucsc(23)) ); ' > ucscToEnsembl.sql hgsql hg38 -e 'drop table ucscToEnsembl;' hgsql hg38 < ucscToEnsembl.sql hgsql hg38 \ -e 'LOAD DATA LOCAL INFILE "ucscToEnsembl.hg38.p14.tsv" INTO TABLE ucscToEnsembl' # save previous ensemblLift table: hgsql -N -e 'select * from ensemblLift;' hg38 \ | sort > hg38.ensemblLift.previous.tsv # appear to have identical results: sum hg38.ensemblLift.previous.tsv ensemblLift.tsv # 59375 13 hg38.ensemblLift.previous.tsv # 59375 13 ensemblLift.tsv # 'offset' is now a reserved MariaDb word, need to back quote it so it will work printf ' CREATE TABLE ensemblLift ( chrom varchar(255) not null, # Ensembl chromosome name `offset` int unsigned, # offset to add to UCSC position #Indices PRIMARY KEY(chrom(38)) ); ' > ensemblLift.sql hgsql hg38 -e 'drop table ensemblLift;' hgsql hg38 < ensemblLift.sql hgsql hg38 \ -e 'LOAD DATA LOCAL INFILE "ensemblLift.tsv" INTO TABLE ensemblLift' ########################################################################### ### obtain the Ensembl sequence to create idKeys ########################################################################### mkdir -p /hive/data/genomes/hg38/bed/chromAlias/update.2021-09-20/ensemblRecent cd /hive/data/genomes/hg38/bed/chromAlias/update.2021-09-20/ensemblRecent wget --timestamping \ http://ftp.ensembl.org/pub/release-104/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.toplevel.fa.gz # alternate sequences are too large to place everything into one 2bit file faSplit byname Homo_sapiens.GRCh38.dna.toplevel.fa.gz split/ # make 2bit files for each sequence mkdir -p twoBit for F in split/* do echo $F | sed -e 's/.fa$//; s#split/##;' | while read C do printf "faToTwoBit ${F} twoBit/${C}.2bit\n" done done # create twoBitDup idKeys for each sequence mkdir -p keys for F in twoBit/* do echo $F | sed -e 's/.2bit$//; s#twoBit/##;' | while read C do twoBitDup ${F} -keyList=keys/${C}.txt done done # put all those keys together into one idKeys.txt file: cat keys/* | sort > ensembl.GRCh38.p13.idKeys.txt ########################################################################### # ready to match idKeys ########################################################################### # chrY doesn't match since Ensembl has masked the PAR on Y # but it is the same length sequence and does match except for the PAR # the 'grep -v 270752' eliminates the contamination sequence cd /hive/data/genomes/hg38/bed/chromAlias ( printf "chrY\tY\n" join -t$'\t' ../idKeys.p13/hg38.idKeys.txt \ ./update.2021-09-20/ensemblRecent/ensembl.GRCh38.p13.idKeys.txt \ | cut -f2- ) | grep -v 270752 | sort > perfect.ucsc.ensembl.tsv # the ones that don't match are because of the construction of the alt # sequence at Ensembl: join -v2 -t$'\t' ../idKeys.p13/hg38.idKeys.txt \ ./update.2021-09-20/ensemblRecent/ensembl.GRCh38.p13.idKeys.txt \ | grep -v -w "Y" | cut -f2- | sort > noKeyMatch.ucsc.ensembl.tsv ########################################################################### # Extract the alt sequences from the artifical Ensembl chromosomes ########################################################################### cd /hive/data/genomes/hg38/bed/chromAlias/update.2021-09-20/ensemblRecent # this script will remove the leading and trailing N sequences from # the fasta, and it will measure the leading N length to get the lift # coordinate printf '#!/usr/bin/env perl use strict; use warnings; my $argc = scalar(@ARGV); if ($argc != 1) { printf STDERR "usage: ./trimEnds.pl file.fa > file.fa 2> lift.file\n"; exit 255; } my $skipBeginning = 1; my @endTrim; my $liftOffset = 0; my $inFile = shift; my $seqName = $inFile; $seqName =~ s/.fa//; $seqName =~ s#split/##; open (FH, "<$inFile") or die "can not read $inFile"; while (my $line = ) { chomp $line; if ($line =~ m/^>/) { printf "%s\n", $line; } elsif ($skipBeginning && ($line =~ m/^NN*N$/)) { $liftOffset += length($line); } elsif ($skipBeginning) { $skipBeginning = 0; my $lineLength = length($line); $line =~ s/^N+//; $liftOffset += $lineLength - length($line); printf STDERR "%s\t%d\n", $seqName, $liftOffset; push @endTrim, $line; } else { push @endTrim, $line; } } close (FH); my $endTrimmed = 0; my $arrayLength = scalar(@endTrim); for (my $i = $arrayLength - 1; $endTrimmed < 1 && $i > 0; --$i) { if ($endTrim[$i] =~ m/^$/) { delete $endTrim[$i]; } elsif ($endTrim[$i] =~ m/^N*$/) { delete $endTrim[$i]; } else { $endTrimmed = 1; $endTrim[$i] =~ s/N+$//; } } $arrayLength = scalar(@endTrim); for (my $i = 0; $i < $arrayLength; ++$i) { printf "%s\n", $endTrim[$i]; } ' > trimEnds.pl chmod +x trimEnds.pl mkdir -p noNs cut -f2 ../../noKeyMatch.ucsc.ensembl.tsv | grep -v -w Y | while read C do if [ -s "noNs/${C}.fa" ]; then printf "# done ${C}\n" else printf "# working ${C}\n" ./trimEnds.pl split/${C}.fa > noNs/${C}.fa fi done 2> ensemblLift.tsv ### and make up a 2bit file with all these trimmed sequences faToTwoBit noNs/*.fa noNs.2bit ### and idKeys via twoBitDup twoBitDup -keyList=stdout noNs.2bit | sort > noNs.idKeys.txt ########################################################################### # continue to match idKeys for these alternate sequences ########################################################################### cd /hive/data/genomes/hg38/bed/chromAlias join -t$'\t' ../idKeys.p13/hg38.idKeys.txt \ ./update.2021-09-20/ensemblRecent/noNs.idKeys.txt \ | cut -f2- | sort > alts.OK.ucsc.ensembl.tsv # the outstanding mis-matches are due to Ensembl reversed sequence comm -23 <(cut -f2 ../idKeys.p13/hg38.idKeys.txt | sort) \ <(cut -f1 alts.OK.ucsc.ensembl.tsv perfect.ucsc.ensembl.tsv | sort) \ | grep -v 270752 > to.be.determined.ucsc.ensembl.txt # reverse those UCSC sequences: mkdir -p ucscRc for C in `cat to.be.determined.ucsc.ensembl.txt` do twoBitToFa ../../hg38.p13.2bit:${C} stdout | faRc stdin ucscRc/${C}_r.fa done faToTwoBit ucscRc/*_r.fa ./ucscRc.2bit twoBitDup -keyList=stdout ucscRc.2bit | sort > ucscRc.idKeys.txt join -t$'\t' ucscRc.idKeys.txt ./update.2021-09-20/ensemblRecent/noNs.idKeys \ | cut -f2- | sed -e 's/RC_//;' > reversedAlts.ucsc.ensembl.tsv ### and finally, all together now: sort alts.OK.ucsc.ensembl.tsv perfect.ucsc.ensembl.tsv \ reversedAlts.ucsc.ensembl.tsv > ucscToEnsembl.hg38.p13.tsv ### after all this, there is one sequence in UCSC: chr15_KN538374v1_fix ### which can not be found in Ensembl. I don't know why. When I blat ### it at Ensembl, it says it is a section of chr15. This sequence ### is also highly related to chr15_KI270905v1_alt # load the ucscToEnsembl table: printf ' # UCSC to Ensembl chr name translation CREATE TABLE ucscToEnsembl ( ucsc varchar(255) not null, # UCSC chromosome name ensembl varchar(255) not null, # Ensembl chromosome name #Indices PRIMARY KEY(ucsc(23)) ); ' > ucscToEnsembl.sql hgsql hg38 -e 'drop table ucscToEnsembl;' hgsql hg38 < ucscToEnsembl.sql hgsql hg38 \ -e 'LOAD DATA LOCAL INFILE "ucscToEnsembl.hg38.p13.tsv" INTO TABLE ucscToEnsembl' printf ' CREATE TABLE ensemblLift ( chrom varchar(255) not null, # Ensembl chromosome name offset int unsigned not null, # offset to add to UCSC position #Indices PRIMARY KEY(chrom(38)) ); ' > ensemblLift.sql hgsql hg38 -e 'drop table ensemblLift;' hgsql hg38 < ensemblLift.sql hgsql hg38 \ -e 'LOAD DATA LOCAL INFILE "update.2021-09-20/ensemblRecent/ensemblLift.tsv" INTO TABLE ensemblLift'