# Download and load the ncbiGene (NCBI RefSeq) tracks for danRer10.
#
# Steps: fetch the NCBI GFF3 annotation, convert it to genePred, lift it from
# RefSeq sequence names to UCSC names, build the per-transcript metadata
# table, split curated from predicted transcripts, sanity-check the result,
# and load the tables.
#
# Trailing "# <count> ..." comments record the output of the original run.
#
# Strict mode (set -e / pipefail) is deliberately NOT enabled: several sanity
# checks below are expected to match nothing, and grep exits non-zero when it
# finds no match.

db=danRer10
workDir=/cluster/data/genomes/$db/bed/ncbiGene

# -p keeps the step re-runnable; bail out rather than scatter output files
# into whatever directory we happen to be in if the cd fails.
mkdir -p "$workDir"
cd "$workDir" || exit 1

ftpFile=ftp://ftp.ncbi.nlm.nih.gov/genomes/D_rerio/GFF/ref_GRCz10_top_level.gff3.gz
gff3File=$(basename "$ftpFile")

# Build the lift file from ucscToRefSeq: offset 0, column 4 as the old name,
# column 1 as the new name, and column 3 used as the size on both sides.
# NOTE(review): presumably the table columns are
# (chrom, chromStart, chromEnd, name) -- confirm against the table schema.
# tail -n +2 drops the header row printed by hgsql.
echo "select * from ucscToRefSeq" \
  | hgsql "$db" \
  | tail -n +2 \
  | awk '{print 0, $4, $3, $1, $3}' > refSeqToUcsc.lft

wget "$ftpFile"

# Convert GFF3 -> genePred on stdout and lift straight into lift.gp.
# Converter and lift diagnostics are kept in separate files for the checks
# further down; per-record attributes go to "attrs", rejects to bad.gp.
/cluster/home/braney/bin/x86_64/gff3ToGenePred -useName -warnAndContinue \
    -attrsOut=attrs -bad=bad.gp "$gff3File" stdout 2> convertErr.txt \
  | liftUp -type=.gp -extGenePred lift.gp refSeqToUcsc.lft warn stdin \
      2> liftErr.txt

wc -l lift.gp
# 54816 lift.gp
wc -l bad.gp
# 0 bad.gp

# Start the metadata table with the sorted, unique ids from column 1 of attrs.
tawk '{print $1}' attrs | sort -u > meta
wc -l meta
# 63880 meta

# Append one column per attribute. join is an inner join, so ids lacking any
# one of these attributes drop out of meta (hence the lower count afterwards).
for attrName in product Dbxref gene gbkey
do
    echo "$attrName"
    tawk -v attr="$attrName" '$2==attr {print $1,$3}' attrs \
      | sort -u \
      | join -t $'\t' /dev/stdin meta > out
    mv out meta
done
wc -l meta
# 63745 meta

# Split curated (NM_/NR_/NP_) from predicted (XM_/XR_) entries. This has to
# happen BEFORE the coverage check below, which reads both files.
grep -E "^N(M|R|P)" lift.gp > curated.gp
grep -E "^X(M|R)" lift.gp > predicted.gp
wc -l curated.gp predicted.gp
# 15143 curated.gp
# 39686 predicted.gp
# 54829 total
# NOTE(review): the recorded total (54829) does not match the recorded
# lift.gp line count above (54816); the counts may come from different runs.

# Coverage check: ids present in the gene sets but missing from meta.
awk '{print $1}' curated.gp predicted.gp | sort -u > tmp1
awk '{print $1}' meta | sort -u > tmp2
join -v 1 tmp1 tmp2 | wc -l
# 0

# Count converter messages that mention "dropping".
grep -c dropping convertErr.txt
# 0
# First field of every liftUp message containing "isn".
awk '/isn/ {print $1}' liftErr.txt | sort -u
# nothing

hgLoadGenePred -genePredExt "$db" ncbiRefCurated curated.gp
hgLoadGenePred -genePredExt "$db" ncbiRefPredicted predicted.gp
hgLoadSqlTab "$db" ncbiRefLink "$kent/src/hg/lib/ncbiRefLink.sql" meta

# Record the annotation release in hgFixed.trackVersion. The statement is a
# plain multi-line single-quoted string: bash keeps backslashes literal inside
# single quotes, so "\" line continuations would be sent to the SQL client.
hgsql -e 'INSERT INTO trackVersion
    (db, name, who, version, updateTime, comment, source, dateReference)
    VALUES("danRer10", "ncbiRefSeq", "braney", "104", now(),
    "http://www.ncbi.nlm.nih.gov/genome/annotation_euk/Danio_rerio/104/",
    "ftp://ftp.ncbi.nlm.nih.gov/genomes/D_rerio",
    "24 September 2014" );' hgFixed