# download and load ncbiGene track kent=$HOME/kent db=hg38 mkdir /cluster/data/genomes/$db/bed/ncbiGene cd /cluster/data/genomes/$db/bed/ncbiGene ftpFile=ftp://ftp.ncbi.nlm.nih.gov/genomes/H_sapiens/GFF/ref_GRCh38.p2_top_level.gff3.gz gff3File=`basename $ftpFile` echo "select * from ucscToRefSeq" | hgsql $db | tail -n +2 | awk '{print 0, $4, $3, $1, $3}' > refSeqToUcsc.lft wget $ftpFile /cluster/home/braney/bin/x86_64/gff3ToGenePred -useName -warnAndContinue -attrsOut=attrs -bad=bad.gp $gff3File stdout 2> convertErr.txt | liftUp -type=.gp -extGenePred lift.gp refSeqToUcsc.lft warn stdin 2> liftErr.txt wc -l lift.gp # 149502 wc -l bad.gp # 124 liftUp -type=.gp -extGenePred liftBad.gp refSeqToUcsc.lft warn bad.gp 2> liftErr2.txt awk '{count=$8; split($9,starts,","); split($10, stops, ","); split($15, frames, ","); for(ii=2; ii <=count; ii++) { if (starts[ii] < stops[ii-1]) {print $2, starts[ii-1], stops[ii-1], $1, frames[ii-1]; print $2,starts[ii], stops[ii], $1, frames[ii];}}}' liftBad.gp > ~/public_html/overlaps.txt tawk -f fixGp.awk liftBad.gp > liftFix.gp genePredCheck liftFix.gp #Error: liftFix.gp:1: NM_001242356.2 no exonFrame on CDS exon 4 #Error: liftFix.gp:2: NM_004829.6 no exonFrame on CDS exon 5 #Error: liftFix.gp:3: NM_001242356.2 no exonFrame on CDS exon 4 #Error: liftFix.gp:4: NM_004829.6 no exonFrame on CDS exon 5 #Error: liftFix.gp:5: NM_001242356.2 no exonFrame on CDS exon 4 #Error: liftFix.gp:6: NM_004829.6 no exonFrame on CDS exon 5 #Error: liftFix.gp:7: NM_001242356.2 no exonFrame on CDS exon 4 #Error: liftFix.gp:8: NM_004829.6 no exonFrame on CDS exon 5 #Error: liftFix.gp:9: NM_001242356.2 no exonFrame on CDS exon 4 #Error: liftFix.gp:10: NM_004829.6 no exonFrame on CDS exon 5 #Error: liftFix.gp:11: NM_001242356.2 no exonFrame on CDS exon 4 #Error: liftFix.gp:12: NM_004829.6 no exonFrame on CDS exon 5 #Error: liftFix.gp:20: NM_025259.5 no exonFrame on CDS exon 4 #Error: liftFix.gp:21: NM_172166.3 no exonFrame on CDS exon 4 #Error: liftFix.gp:22: NM_172165.3 no exonFrame on CDS exon 4 #Error: liftFix.gp:23: NM_002441.4 no exonFrame on CDS exon 4 #Error: liftFix.gp:32: NM_006781.4 exon 0 start 3664574 >= end 3664574 #Error: liftFix.gp:32: NM_006781.4 exonFrame on non-CDS exon 0 #Error: liftFix.gp:44: NM_018406.6 no exonFrame on CDS exon 30 #Error: liftFix.gp:81: NM_006781.4 exon 0 start 3655065 >= end 3655065 #Error: liftFix.gp:81: NM_006781.4 exonFrame on non-CDS exon 0 #Error: liftFix.gp:94: NM_007028.3 exon 3 start 1368529 >= end 1368529 #checked: 122 failed: 22 tawk '{print $1}' lift.gp | sort | uniq | wc #142893 tawk '{print $1}' attrs | sort | uniq > meta wc -l meta # 143746 meta for i in product Dbxref gene gbkey do echo $i tawk -v attr=$i '$2==attr {print $1,$3}' attrs | sort | uniq | join -t $'\t' /dev/stdin meta > out mv out meta done wc -l meta # 142767 meta egrep "^N(M|R)|^YP" lift.gp > curated.gp egrep "^X(M|R)" lift.gp > predicted.gp wc -l curated.gp predicted.gp # 54771 curated.gp # 89031 predicted.gp # 143802 total cat curated.gp predicted.gp | awk '{print $1}' | sort -u > tmp1 cat meta | awk '{print $1}' | sort -u > tmp2 join -v 1 tmp1 tmp2 | wc -l # 0 grep dropping convertErr.txt | wc -l # 124 awk '/isn/ {print $1}' liftErr.txt | sort -u # NW_009646208.1 # NW_009646209.1 hgLoadGenePred -genePredExt $db ncbiRefCurated curated.gp hgLoadGenePred -genePredExt $db ncbiRefPredicted predicted.gp hgLoadSqlTab $db ncbiRefLink $kent/src/hg/lib/ncbiRefLink.sql meta hgsql -e 'INSERT INTO trackVersion \ (db, name, who, version, updateTime, comment, source, dateReference) \ VALUES("hg38", "ncbiRefSeq", "braney", "107", now(), \ "http://www.ncbi.nlm.nih.gov/genome/annotation_euk/Homo_sapiens/107/", \ "ftp://ftp.ncbi.nlm.nih.gov/genomes/H_sapiens", \ "12 March 2015" );' hgFixed