/*
 * Convert gtf to .bed files.
 * Note that all coordinates in a .gtf file are 1-based, whereas in prediction files start coordinates are 0-based and end coordinates are 1-based.
 */
#include <iostream>
#include <fstream>
#include <string>
#include <sstream>
#include <map>
#include <vector>
using namespace std;

/*
 * Remove the decoration around a GTF attribute value token: one leading
 * double quote, then one trailing ';', then one trailing double quote.
 * For example "TCONS_1"; becomes TCONS_1. An empty token is returned as is.
 */
static std::string stripattribute(std::string value){
	if(!value.empty() && value[0]=='"')value.erase(0,1);
	if(!value.empty() && value[value.size()-1]==';')value.erase(value.size()-1);
	if(!value.empty() && value[value.size()-1]=='"')value.erase(value.size()-1);
	return value;
}

/*
 * Write one transcript as a single output line:
 *   chrom <TAB> start <TAB> end <TAB> name <TAB> strand <TAB> starts <SPACE> ends
 * where starts and ends are comma separated lists with one entry per exon.
 * Exon coordinates are stored as read from the gtf file; every start is
 * written minus one, ends are written unchanged.
 * The transcript start is taken from the first stored exon and the transcript
 * end from the last one, so exons are expected in the order they appear in
 * the input. Nothing is written when there are no exons.
 */
static void writeisoform(std::ostream& os,
		const std::string& chrom,
		const std::string& name,
		char strand,
		const std::vector<std::pair<long,long> >& exons){
	if(exons.empty())return;
	os<<chrom<<"\t"
		<<exons.front().first-1<<"\t"<<exons.back().second<<"\t"
		<<name<<"\t"
		<<strand<<"\t";
	for(std::size_t i=0;i<exons.size();i++){
		if(i!=0)os<<",";
		os<<exons[i].first-1;
	}
	os<<" ";
	for(std::size_t i=0;i<exons.size();i++){
		if(i!=0)os<<",";
		os<<exons[i].second;
	}
	os<<"\n";
}

/*
 * Usage: gtf2pred gtffile
 *
 * Reads the gtf file given as the only argument and writes
 * <gtffile>.predict.txt. Only "exon" records are used. Consecutive exon
 * records carrying the same transcript_id are collected into one transcript,
 * and each transcript produces one output line (see writeisoform).
 * A strand of '.' is reported as '+'.
 *
 * Returns 0 on success and -1 on a usage error or when a file cannot be
 * opened or written.
 */
int main(int argc, char* argv[]){
	if(argc!=2){
		std::cerr<<"This program is used to convert cufflinks gtf file to isoform prediction file, which is the same output as isoinfer.\n";
		std::cerr<<"Usage: gtf2pred gtffile.\n";
		return -1;
	}
	std::ifstream ifs(argv[1]);
	if(!ifs.is_open()){
		std::cerr<<"Error opening gtf file "<<argv[1]<<std::endl;
		return -1;
	}
	std::string ofile(argv[1]);
	ofile+=".predict.txt";
	std::ofstream ofs(ofile.c_str());
	if(!ofs.is_open()){
		std::cerr<<"Error opening output file "<<ofile<<std::endl;
		return -1;
	}

	std::vector<std::pair<long,long> > oneisoform; //exons of the transcript being collected
	std::string oneline; //read one line

	//state of the transcript being collected
	std::string prevtransname;
	std::string prevchroname;
	char prevdirection='+';

	int transcounter=0;
	long linenumber=0;
	//Loop on getline itself so that a last line without a trailing newline is still processed.
	while(std::getline(ifs,oneline)){
		linenumber++;
		std::stringstream ss(oneline);
		std::string chrname; //1st, chromosome name
		std::string source;  //2nd, in cufflinks, should be "Cufflinks"
		std::string stype;   //3rd, should be transcript or exon
		ss>>chrname>>source>>stype;
		if(stype!="exon")continue; //ignore none-exon part (also skips blank and comment lines)

		std::pair<long,long> current; //4th and 5th, start and end
		//6th (score) and 8th (frame) are read as text: they are not used, and a
		//value such as "." must not put the stream into a failed state.
		std::string score;
		std::string frame;
		char direction='.'; //7th, direction
		std::string key;      //9th and 11th, should be "gene_id" and "transcript_id"
		std::string geneid;   //10th, gene_id (not used)
		std::string transname;//12th, transcript_id
		ss>>current.first>>current.second>>score>>direction>>frame
			>>key>>geneid>>key>>transname;
		if(!ss){
			std::cerr<<"Warning: skipping malformed exon record at line "<<linenumber<<std::endl;
			continue;
		}
		if(direction=='.')direction='+';
		transname=stripattribute(transname);

		if(transname!=prevtransname){
			//A new transcript starts here: write out the one collected so far.
			if(!prevtransname.empty()){
				transcounter++;
				if(transcounter%10000==1)std::cout<<"Writing transcripts "<<transcounter<<"...\n";
				writeisoform(ofs,prevchroname,prevtransname,prevdirection,oneisoform);
			}
			prevtransname=transname;
			prevchroname=chrname;
			prevdirection=direction;
			oneisoform.clear();
		}
		oneisoform.push_back(current);
	}

	//write the last record, in the same format as all the others
	if(!prevtransname.empty()){
		transcounter++;
		if(transcounter%10000==1)std::cout<<"Writing transcripts "<<transcounter<<"...\n";
		writeisoform(ofs,prevchroname,prevtransname,prevdirection,oneisoform);
	}

	ofs.close();
	ifs.close();
	//close() flushes the buffered output; a failure here means the file is incomplete.
	if(!ofs){
		std::cerr<<"Error writing output file "<<ofile<<std::endl;
		return -1;
	}
	return 0;
}
