#!/usr/bin/perl 
#===============================================================================
#
#         FILE:  knownGeneExtractor.pl
#
#        USAGE:  ./knownGeneExtractor.pl <knownGene>
#
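#  DESCRIPTION:  Extract tss/pas and gene range information directly from a
#                UCSC knownGene table, writing the files 'TSSPAS' and
#                'GeneRange' to the current directory. Boundary ('Bound')
#                output is currently disabled (commented out).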
#
#      OPTIONS:  ---
# REQUIREMENTS:  ---
#         BUGS:  ---
#        NOTES:  ---
#       AUTHOR:  Jianxing Feng (feeldead), feeldead@gmail.com
#      COMPANY:  THU
#      VERSION:  1.0
#      CREATED:  05/29/2010 01:05:14 PM
#     REVISION:  ---
#===============================================================================
#changed by Zheng Xia, 12/20/2010
#do not output the boundary information in Gene range.

use strict;
use warnings;

# Reads the tab-separated knownGene table named by the first command-line
# argument and writes two tab-separated files into the current directory:
#
#   GeneRange : <name> <chr> <strand> <start> <end>   (input columns 0-4)
#   TSSPAS    : <name> <start> <end>                  (input columns 0, 3, 4)
#
# Lines beginning with '#' are treated as comments and skipped, as are empty
# lines. Lines with fewer than five columns are skipped with a warning.
# Dies if the input cannot be opened or an output file cannot be
# created/flushed.

if ( @ARGV < 1 ) {
	die("Usage: ./knownGeneExtractor.pl <knownGene> \n");
}

my $known_gene_file = $ARGV[0];

# Three-argument open: the file name is never interpreted as a mode or a
# pipe, whatever characters it contains.
open(my $known_gene_fh, '<', $known_gene_file)
	or die("Error: cannot open file '$known_gene_file': $!\n");

#open(my $bound_fh, '>', 'Bound') or die("Error: cannot open file 'Bound': $!\n");
open(my $tsspas_fh, '>', 'TSSPAS')
	or die("Error: cannot open file 'TSSPAS': $!\n");
open(my $gene_range_fh, '>', 'GeneRange')
	or die("Error: cannot open file 'GeneRange': $!\n");

while ( my $line = <$known_gene_fh> ) {
	chomp $line;

	# Nothing to extract from an empty line.
	next if $line eq '';

	# Comment/header lines start with '#'. The match is anchored so that a
	# record whose name merely contains '#' is not dropped.
	next if $line =~ /^#/;

	my @fields = split( /\t/, $line );
	if ( @fields < 5 ) {
		warn("Warning: skipping line $. of '$known_gene_file': expected at least 5 tab-separated columns\n");
		next;
	}

	my ( $genename, $chr, $strand, $start, $end ) = @fields[ 0 .. 4 ];

	# Boundary output is disabled; kept for reference.
	#my @starts = split( /,/, $fields[8] );
	#my @ends   = split( /,/, $fields[9] );
	#
	#foreach my $pos (@starts)
	#{
	#	print {$bound_fh} "$chr\t$strand\t$pos\t0\n";
	#}
	#
	#foreach my $pos (@ends)
	#{
	#	print {$bound_fh} "$chr\t$strand\t$pos\t1\n";
	#}

	print {$gene_range_fh} "$genename\t$chr\t$strand\t$start\t$end\n";

	print {$tsspas_fh} "$genename\t$start\t$end\n";
}

close($known_gene_fh);

# Buffered write errors (e.g. a full disk) only surface when the handle is
# closed, so the write handles are checked here.
#close($bound_fh) or die("Error: cannot close file 'Bound': $!\n");
close($tsspas_fh) or die("Error: cannot close file 'TSSPAS': $!\n");
close($gene_range_fh) or die("Error: cannot close file 'GeneRange': $!\n");

print "Known gene extraction finished!\n";



