/**
 * This program is used to compare predicted isoforms (gtf) with the true reference isoforms (gtf).
 * Both gtf files must be sorted according to the chromosome id and start position.

 */

#include <iostream>
#include <fstream>
#include <vector>
#include <map>
#include <string>
#include <sstream>
#include <cstdlib>
#include <cstdio>
//#include <cmath>

long abs2(long a){
  return (a>0?a:-a);
}

using namespace std;


vector<vector<pair<long,long> > > allisoforms;//save all possible isoforms
map<string,int> chrboundary;//the starting boundary of one chromosome
vector<string> isonames;//the names of isoforms
vector<char> isodirection;//directions

map<string,vector<string> > refinfo;

bool verbose=false;
double minfrac=0.5;

bool nisoform=false;

//the prefix to the map file
string mapprefix="";

bool longgenename=false;

/**
 * Condense isoforms with continuous exons.
 */
void condenseiso(vector<pair<long, long> >&oneiso){
  if(oneiso.size()<2)return;
  pair<long,long> crt=oneiso[0];
  vector<pair<long,long> >tmpiso;
  bool ifchange=false;
  for(int i=1;i<oneiso.size();i++){
    if(oneiso[i].first==crt.second){
      crt.second=oneiso[i].second;
      ifchange=true;
    }
    else{
      tmpiso.push_back(crt);
      crt=oneiso[i];
    }
  }
  tmpiso.push_back(crt);
  if(ifchange==false)return;
  oneiso=tmpiso;
}


/*
Parse the last field of gtf, return a <key,value> set
return -1 if fail
*/
int parsegtflastfield(stringstream& ss, map<string,string>&ret){
                
                vector<string> tk;
                while(ss.good()){
                  string tsp;
                  ss>>tsp;
                  if(tsp.size()>=2 && tsp[0]=='"' && tsp.substr(tsp.size()-2,2)=="\";")
                  tsp=tsp.substr(1,tsp.size()-3);
                  if(tsp!="")
                    tk.push_back(tsp);
                }
                if(int(tk.size())%2==1)return -1;
                for(int i=0;i<tk.size();i+=2){
                  ret[tk[i]]=tk[i+1];
                }
                return 0;
}


/**
 * Read reference isoforms
 */
int readrefgtf(char* filename) 
{ 
  ifstream ifs(filename);
  if(!ifs.is_open()){
    cerr<<"Error opening reference gtf file "<<filename<<endl;
    return -1;
  }

  string oneline;//line buffer
  string prevchrname="";
  string prevtransid="";


  string chrname; //1st, chromosome name
  string rectype; //3rd, record type 
  long startp,endp; //4th-5th, start position and end position
  char direction;//7th, direction (+ or -)
  string gene_id, trans_id; //9th-12th, gene id and transcript id;

  map<int,int> haschrboundary;//is used to check the order of two ref transcripts

  while(true){
    getline(ifs,oneline);
    if(ifs.eof())break;
    stringstream olb(oneline);
    olb>>chrname; //chromosome name
    if(chrname!=prevchrname){
      if(verbose)cout<<"Switching to chromosome "<<chrname<<"...\n";
      if(chrboundary.count(chrname)!=0){
        cerr<<"Error: the reference gtf should be sorted.\n";
      }
      chrboundary[chrname]=isonames.size();
      haschrboundary[isonames.size()]=1;
      prevchrname=chrname;
    }
    string tmp;
    olb>>tmp;
    olb>>rectype; //exon, CDS, start_codon, or stop_codon. Use exon only.
    if(rectype!="exon")continue;
    olb>>startp>>endp;
    string conf; //6th, confidence
    olb>>conf;
    olb>>direction;//7th, direction
    if(direction!='+' && direction != '-' && direction != '.' && direction != '*'){
      cerr<<"Error: direction should be either + or - or ., but I receive "<<direction<<" .\n";
    }
    char fshift; //8th, frame shift (0, 1, 2 or .)
    olb>>fshift;
    if(fshift!='0' && fshift!='1' && fshift!='2' && fshift!='.'){
      cerr<<"Error: frame shift should be 0, 1, 2 or .\n";
    }
                map<string,string> lfd;
                parsegtflastfield(olb,lfd);
                if(lfd.count("gene_id")==0 || lfd.count("transcript_id")==0){
                  cerr<<"Error: missing gene_id and transcript_id field.\n";
                  continue;
                }
                gene_id=lfd["gene_id"];
                trans_id=lfd["transcript_id"];
                if(longgenename)trans_id=gene_id+":"+trans_id;
                
                string exonnum="";
                string fpkmv="";
                if(lfd.count("exon_number")>0 && lfd.count("FPKM")>0){
                  exonnum=lfd["exon_number"];
                  fpkmv=lfd["FPKM"];
                }
               
    if(trans_id!=prevtransid){
      //a new record
      isonames.push_back(trans_id);
      isodirection.push_back(direction);
      allisoforms.push_back(vector<pair<long,long> >()); //a new record
      //
      prevtransid=trans_id;
                        if(fpkmv!=""){
                          vector<string> vs;
                          vs.push_back(fpkmv);
                          vs.push_back(exonnum);
                          vs.push_back("0");
                          refinfo[trans_id]=vs;
                        }
    }
    //save start and stop CDS
    allisoforms.back().push_back(make_pair<long,long>(startp,endp));
                if(fpkmv!=""){
                  vector<string>& nvs=refinfo[trans_id];
                  nvs[1]=exonnum;
                  int clen=atoi(nvs[2].c_str());
                  clen+=(endp-startp+1);
                  stringstream sss;
                  sss<<clen;
                  string clens;
                  sss>>clens;
                  nvs[2]=clens;
                }

  }//end ifs loop

  //check if the reference is sorted or not
  int nincorrect=0;
  for(int i=0;i<allisoforms.size()-1;i++){
    if(allisoforms[i][0].first>allisoforms[i+1][0].first && haschrboundary.count(i+1)==0){
      if(nincorrect<5)
        cerr<<"Warning: reference gtf is not sorted. Check "<<isonames[i]<<" and "<<isonames[i+1]<<endl;
      nincorrect++;
    }
  }
  if(nincorrect>=5)cout<<"...[More such errors]..."<<endl;
  cout<<"Unsorted pairs: "<<nincorrect<<endl;

  ifs.close();

  //condense
  for(int i=0;i<allisoforms.size();i++){
    condenseiso(allisoforms[i]);
  }
  return 0;

}
/**
 * Compare one isoform.
 * Return the corresponding reference isoform names if one match is found on reference isoforms (and the corresponding rmap is set to 1).
 * Return "" if no match is found.
 */
string compareoneisoform(vector<pair<long,long> >& oneiso, 
    int startr, int endr, //the start and end search indicator
    vector<int> &rmap, //reference map to be updated
    vector<string>& rmapcorr, //the correspondence of refmap to tmap
    string oneisoname //the name of this isoform
    ){
  long onestart=oneiso[0].first;
  long oneend=oneiso[oneiso.size()-1].second;
  for(int i=startr;i<endr;i++){
    vector<pair<long, long> >& thisref=allisoforms[i];

    //filter range
    long rst=thisref[0].first;
    long red=thisref[thisref.size()-1].second;
    if(oneend<rst || onestart >red)continue; //their range don't overlap

    //check if their boundary are completely identical, except the start and the end
    if(oneiso.size()!=thisref.size())continue;
    int n=oneiso.size();
    if(n>1){
      if(abs2(thisref[0].second-oneiso[0].second)>=2)continue;
      if(abs2(thisref[n-1].first-oneiso[n-1].first)>=2)continue;
    }
    else if(n==1){
      //if(thisref[0].first<=oneiso[0].first && thisref[0].second>=oneiso[0].second)
      //{ 
        //must contain 
      //}
      //else
      //  continue;
      long a=thisref[0].first,b=thisref[0].second,c=oneiso[0].first,d=oneiso[0].second;
      //if(d<a || c>b)continue;
      if(d<a){
        // c<d<a<b, no overlap
        continue;
      }
      else{
        //d>=a
        if(d<b){
          //a<d<b
          if(c<a){
            //c<a<d<b
            if( (d-a)*1.0/(b-a)<minfrac || (d-a)*1.0/(d-c))continue;
          }
          else{
            //a<c<d<b
            if ( (d-c)*1.0/(b-a)<minfrac)continue;
          }
        }
        else{
          //d>b
          if(c<a){
            //c<a<b<d
            if( (b-a)*1.0/(d-c)<minfrac)continue;
          }
          else if(c<b){
            //a<c<b<d
            if ((b-c)*1.0/(b-a)<minfrac || (b-c)*1.0/(d-c)<minfrac)continue;
          }
          else
            //a<b<c<d, no overlap
            continue;
        }
      }
    }
    bool hasmismatch=false;
    for(int j=1;j<n-1;j++){
      if(thisref[j].first!=oneiso[j].first || thisref[j].second!=oneiso[j].second){hasmismatch=true;break;}
    }
    if(hasmismatch)continue;
    if(rmap[i]==1){
      if(verbose)cout<<"Possible duplicate tmaps: "<<isonames[i]<<" vs "<<rmapcorr[i]<<" and "<<oneisoname<<"("<<oneiso.size()<<" exons).\n";
    }
    else{
      rmap[i]=1;
      rmapcorr[i]=oneisoname;
      if(verbose)cout<<"One match found with "<<oneiso.size()<<" exons: "<<isonames[i]<<" vs "<<oneisoname<<".\n";
    }
    return isonames[i];
  }
  return "";
}

void printstat(vector<int>& refmap, vector<int>& tmap,vector<int>&tmapexons,
    map<string,int>& isocount){
  int refone=0,tone=0;
  //refmap
        int nbin=12;
  vector<int>refexonscat(nbin,0);
  vector<int> refexons(nbin,0);
  for(int i=0;i<refmap.size();i++){
    int refex=allisoforms[i].size();
    if(refex>=nbin)refex=nbin-1;
    refexons[refex]++;
    if(refmap[i]==1){
      refone++;
      //stat according to # of exons
      refexonscat[refex]++;
    }
  }
  //tmap exons stat
  vector<int> tmapexonscat(nbin,0),tmapexonst(nbin,0);
  for(int i=0;i<tmap.size();i++){
    int tex=tmapexons[i];
    if(tex>=nbin)tex=nbin-1;
    tmapexonst[tex]++;
    if(tmap[i]==1){
      tone++;
      tmapexonscat[tex]++;
    }
  }
  cout<<"Reference transcriptome: total: "<<refmap.size()<<", matched: "<<refone<<" (" <<refone*1.0/refmap.size()<<").\n";
  cout<<"# of matched reference isoforms grouped by # of exons:\n";
  for(int i=1;i<refexonscat.size();i++)cout<<i<<"\t";cout<<endl;
  for(int i=1;i<refexonscat.size();i++)cout<<refexonscat[i]<<"\t";cout<<endl;
  cout<<"# of reference isoforms (true) grouped by # of exons:\n";
  for(int i=1;i<refexonscat.size();i++)cout<<refexons[i]<<"\t";cout<<endl;
  
  cout<<"Predicted transcriptome: total: "<<tmap.size()<<", matched: "<<tone<<" (" <<tone*1.0/tmap.size()<<").\n";
  cout<<"# of matched transcript isoforms grouped by # of exons:\n";
  for(int i=1;i<tmapexonscat.size();i++)cout<<i<<"\t";cout<<endl;
  for(int i=1;i<tmapexonscat.size();i++)cout<<tmapexonscat[i]<<"\t";cout<<endl;
  cout<<"# of transcipt isoforms (true) grouped by # of exons:\n";
  for(int i=1;i<tmapexonscat.size();i++)cout<<tmapexonst[i]<<"\t";cout<<endl;


  if(isocount.size()>0){
    cout<<"==============================\n";
    vector<int> refisoscat(10,0);
    vector<int> reftisoscat(10,0);

    for(int i=0;i<refmap.size();i++){
      if(isocount.count(isonames[i])==0){
        cerr<<"Warning: missing isoform count information for "<<isonames[i]<<endl;
        continue;
      }
      int nis=isocount[isonames[i]];
      if(nis>9)nis=9;
      refisoscat[nis]++;
      if(refmap[i]==1)reftisoscat[nis]++;
    }
    cout<<"# of matched reference isoforms grouped by # of isoforms:\n";
    for(int i=1;i<reftisoscat.size();i++)cout<<reftisoscat[i]<<"\t";
    cout<<endl;
    cout<<"# of reference isoforms (true) grouped by # of isoforms:\n";
    for(int i=1;i<refisoscat.size();i++)cout<<refisoscat[i]<<"\t";
    cout<<endl;

    
  }
}

int getisolen(vector<pair<long,long> >&oneiso){
  int s=0;
  for(int i=0;i<oneiso.size();i++){
    s+=oneiso[i].second-oneiso[i].first+1;
  }
  return s;
}


/**
 * compare object isoforms
 */
int compareobjgtf(char* filename,map<string,int>& isocount,bool outmapfile) 
{ 
  ifstream ifs(filename);
  if(!ifs.is_open()){
    cerr<<"Error opening obj gtf file "<<filename<<endl;
    return -1;
  }

  string oneline;//line buffer
  string prevchrname="";
  string prevtransid="";
  long prevstart=-1;
  double prevexplv=0;

  int nmsgerr=0;


  string chrname; //1st, chromosome name
  string rectype; //3rd, record type 
  long startp,endp; //4th-5th, start position and end position
  char direction;//7th, direction (+ or -)
  string gene_id, trans_id; //9th-12th, gene id and transcript id;

  //ref map
  vector<int> refmap(isonames.size(),0);
  //the correspondence of the refmap to tmap names
  vector<string> refmapcorr(isonames.size());
  //transcript map
  vector<int> tmap;
  vector<int> tmapnexons;
  vector<string> tmapnames;//names of the tmap transcripts
  vector<string> tmapcorr; //names of the tmap correspondence
  vector<double> tmapexplv;
  vector<int> tmaplen;


  vector<pair<long,long> > oneiso;
  int nerrdirmsg=0;
  double fpkm=0;
  while(true){
    getline(ifs,oneline);
    if(ifs.eof())break;
    stringstream olb(oneline);
    olb>>chrname; //chromosome name
    if(chrname!=prevchrname){
      if(verbose)cout<<"Switching to chromosome "<<chrname<<"...\n";
      prevstart=-1;
    }
    string tmp;
    olb>>tmp;
    olb>>rectype; //exon, CDS, start_codon, or stop_codon. Use exon only.

    olb>>startp>>endp; //start and end coordinates
    string conf; //6th, confidence
    olb>>conf;
    olb>>direction; //7th, direction
    if(direction!='+' && direction != '-' && direction !='.' && direction!='*'){
      if(nerrdirmsg<5)
        cerr<<"Error: direction should be either + or - or ., but I receive "<<direction<<" .\n";
      if(nerrdirmsg==5)
        cerr<<"[more such errors...]\n";
      nerrdirmsg++;
    }
    char fshift; //8th, frame shift (0, 1, 2 or .)
    olb>>fshift;
    if(fshift!='0' && fshift!='1' && fshift!='2' && fshift!='.'){
      cerr<<"Error: frame shift should be 0, 1, 2 or .\n";
    }
    
                map<string,string> lfd;
                parsegtflastfield(olb,lfd);
                if(lfd.count("gene_id")==0 || lfd.count("transcript_id")==0){
                  cerr<<"Error: missing gene_id and transcript_id field.\n";
                  continue;
                }
                gene_id=lfd["gene_id"];
                trans_id=lfd["transcript_id"];
                if(longgenename)trans_id=gene_id+":"+trans_id;
                fpkm=0;
                if(lfd.count("FPKM")>0){
                   fpkm=atof(lfd["FPKM"].c_str());
                }

    if(rectype!="exon")continue;
    if(trans_id!=prevtransid){
      if(prevtransid!=""){
        //compare here
        long startr=0,endr=isonames.size();
        map<string,int>::iterator tx=chrboundary.find(prevchrname);
        int cprs;//indicating if one match is found or not
        string retnames="";
        if(tx!=chrboundary.end()){
          startr=tx->second;
          tx++;
          if(tx!=chrboundary.end())
            endr=tx->second;
        
          condenseiso(oneiso);
          retnames=compareoneisoform(oneiso, startr,endr,refmap, refmapcorr,prevtransid);
          if(retnames=="")cprs=0;
          else cprs=1;
          //cout<<"Comparing isoform "<<prevtransid<<" with ref range ["<<startr<<","<<endr<<")"<<endl;
        }
        else{
          if(verbose)cout<<"Warning: unknown chromosome name "<<chrname<<" found in reference isoform.\n";
          cprs=0;
        }
        tmap.push_back(cprs);
        tmapnexons.push_back(oneiso.size());
        tmapnames.push_back(prevtransid);
        tmapcorr.push_back(retnames);
        tmapexplv.push_back(prevexplv);
        tmaplen.push_back(getisolen(oneiso));

      }
      oneiso.clear();
      //not sorted ones
      if(prevstart>startp){
        if(nmsgerr<5)
          cerr<<"Warning: the object transcript is not sorted. Check "<<prevtransid<<" and "<<trans_id<<endl;
        nmsgerr++;
      }
      //
      prevtransid=trans_id;
      prevexplv=fpkm;
      prevstart=startp;
      prevchrname=chrname;
    }// end if(trans_id!=prevtransid)
    //save start and stop CDS
    oneiso.push_back(make_pair<long,long>(startp,endp));


  }//end ifs loop
  //compare the last record
  if(oneiso.size()!=0){
        long startr=0,endr=isonames.size();
        map<string,int>::iterator tx=chrboundary.find(chrname);
        int cprs;
        string retnames="";
        if(tx!=chrboundary.end()){
          startr=tx->second;
          tx++;
          if(tx!=chrboundary.end())
            endr=tx->second;
        
          retnames=compareoneisoform(oneiso, startr,endr,refmap,refmapcorr,trans_id);
          if(retnames=="")cprs=0;
          else cprs=1;
          //cout<<"Comparing isoform "<<trans_id<<" with ref range ["<<startr<<","<<endr<<")"<<endl;
        }
        else{
          if(verbose)  cout<<"Warning: unknown chromosome name "<<chrname<<"found in reference isoform.\n";
          cprs=0;
        }
        tmap.push_back(cprs);
        tmapnexons.push_back(oneiso.size());
        tmapnames.push_back(trans_id);
        tmapcorr.push_back(retnames);
        tmapexplv.push_back(fpkm);
        tmaplen.push_back(getisolen(oneiso));

  }
  ifs.close();

  if(nmsgerr>5)cout<<"...[More such errors]..."<<endl;
  cout<<"Total unsorted object transcript pairs: "<<nmsgerr<<endl;


  //get statistics from refmap and tmap
  printstat(refmap,tmap,tmapnexons,isocount);  


  if(outmapfile==true){
    ofstream rifs((mapprefix+"reference.map.txt").c_str());
    if(!rifs.is_open()){
      cerr<<"Error opening reference.map.txt...\n";
    }
    else{
      //header
      int nsupp=0;
      if(refinfo.size()>0) nsupp=(refinfo.begin())->second.size();
      rifs<<"ID\tMatched\tMatchID";
      for(int i=0;i<nsupp;i++){
        rifs<<"\tV"<<i;
      }
      rifs<<endl;
      for(int i=0;i<refmap.size();i++){
        rifs<<isonames[i]<<"\t"<<refmap[i]<<"\t";
        if(refmapcorr[i]!="")rifs<<refmapcorr[i];
        else rifs<<"NULL";  
        if(refinfo.size()>0){
          if (refinfo.count(isonames[i])>0){
            for(int j=0;j<refinfo[isonames[i]].size();j++){
            rifs<<"\t"<<refinfo[isonames[i]][j];
            }
          }
          else{
            for(int j=0;j<(refinfo.begin())->second.size();j++){

              rifs<<"\tNA";
            }
          }
          
        }
        rifs<<endl;
      }
    }
    rifs.close();
    ofstream tifs((mapprefix+"transcript.map.txt").c_str());  
    if(!tifs.is_open()){
      cerr<<"Error opening transcript.map.txt...\n";
    }
    else{
      tifs<<"ID\tMatched\tMatchID\tFPKM\tExon\tLength\n";
      for(int i=0;i<tmap.size();i++){
        tifs<<tmapnames[i]<<"\t"<<tmap[i]<<"\t";
        if(tmapcorr[i]!="")tifs<<tmapcorr[i];
        else tifs<<"NULL";
        tifs<<"\t"<<tmapexplv[i];
        tifs<<"\t"<<tmapnexons[i]; // Number of exons
        tifs<<"\t"<<tmaplen[i];
        tifs<<endl;
      }
    }
    tifs.close();

    

  }

  return 0;

}


int main(int argc, char* argv[]){
  if(argc<3){
    cout<<"Usage: comparegtf {options} <reference gtf> <obj gtf>\n\n";
    cout<<"This program is used to compare isoforms between reference annotation (reference gtf) and the result of predictions (obj gtf).\n\n";
    cout<<"Options:\n\n";
    cout<<"\t-v\t\tverbose output.\n\n";
    cout<<"\t-l\t\tUse gene_id:transcript_id as the name of the transcript.\n\n";
    cout<<"\t-n <string>\tAdditionally grouping results by the number of isoforms. Isoform count file is needed.\n\n";
    cout<<"\t-m\t\tOutput the correspondence map file (transcript.map.txt and reference.map.txt) to the current directory.\n\n";
    cout<<"\t-b <string>\tSpecify the prefix of the map file. The map file will become [prefix.]transcript.map.txt and [prefix.]reference.map.txt.\n\n";
    cout<<"\t-t <string>\tPrint the reference information from refinfofile. This file is a tab separated file, each line including the information of each reference file. The first field of each line should be the reference id matched to records in refgtf.\n\n";
    return -1;
  }
  string nisoformfile="";
  bool outmapfile=false;
  for(int i=1;i<argc;i++){
    if(string(argv[i])=="-v"){
      cout<<"Verbose output...\n";
      verbose=true;
    }
    if(string(argv[i])=="-l"){
      longgenename=true;
    }
    if(string(argv[i])=="-m"){
      outmapfile=true;
      cout<<"Output transcript.map.txt and reference.map.txt...\n";
    }
    if(i<argc-1 && string(argv[i])=="-n"){
      nisoform=true;
      nisoformfile=string(argv[i+1]);
    }
    if(i<argc-1 && string(argv[i])=="-b"){
      mapprefix=string(argv[i+1])+".";
    }
    if(i<argc-1 && string(argv[i])=="-f"){
      minfrac=atof(argv[i+1]);
      cout<<"Minfrac:"<<minfrac<<endl;
    }
    if(string(argv[i])=="-t"){
      ifstream riif(argv[i+1]);
      if(riif.is_open()){
        string ol;
        int rinc=0;
        while(true){
          getline(riif,ol);
          if(riif.eof())break;
          rinc=rinc+1;
          if(rinc==1){
            // check the number of fields
          }
          stringstream iss(ol);
          string rrid;
          iss>>rrid;
          refinfo[rrid]=vector<string>();
          while(!iss.eof()){
            string tmp;
            iss>>tmp;
            refinfo[rrid].push_back(tmp);
          }


        }
        cout<<"Reading "<<rinc<<" reference records from "<<argv[i+1]<<endl;
        riif.close();
      }
    }
  }
  //reading isoform count file
  map<string,int>isocount;
  if(nisoform==true){
    ifstream ifs(nisoformfile.c_str());
    if(!ifs.is_open()){
      cerr<<"Error opening isoform count file "<<nisoformfile<<endl;
      return -1;
    }
    cout<<"Reading isoform names and counts from "<<nisoformfile<<"...\n";
    string oneline;
    while(true){
      getline(ifs,oneline);
      if(ifs.eof())break;
      stringstream ss(oneline);
      string isoname;int isoct;
      ss>>isoname>>isoct;
      isocount[isoname]=isoct;
      //cout<<"Reading "<<isoname<<" count: "<<isoct<<endl;
    }
    cout<<"Total number of isoformnames: "<<isocount.size()<<endl;
    ifs.close();
  }
  cout<<"Reference gtf file: "<<argv[argc-2]<<endl;
  cout<<"Object gtf file: "<<argv[argc-1]<<endl;

  int ret=readrefgtf(argv[argc-2]);

  if(ret==-1)return -1;
  cout<<"Total number of reference isoforms: "<<allisoforms.size()<<endl;

  cout<<"Comparing transcripts...\n";
  compareobjgtf(argv[argc-1],isocount,outmapfile);
  return 0;
}
