/*
 * lassoinvoke.cpp
 * This file is part of isoLasso library; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation; version 2.1 of the License.
 *
 *  Created on: 2011-10-14
 *      Author: daniel
 */
#include <stdio.h>
#include <vector>
#include <fstream>
#include <sstream>
#include <iostream>
#include <string.h>
#include <stdlib.h>
#include <time.h>
#include "library.h"
#include "predExpLevel.h"
#include "NewInstance.h"
using namespace std;

long begintime, checkpoint;


long pedist=200; 
long pestd=20; 
int minrcount=-1; 
long totalreads=40000000;

string instancefile;
string outputfile;

bool directref = false;
bool forceref=false;

float expcutoff = 0.1;

//fix instance id
string instid="";

/* Get the total number of reads */
long getTotalReads(string instfile){
  long total=0;
  ifstream ifs(instfile.c_str());
  if(!ifs.is_open()){
    cerr<<"Error opening file "<<instfile<<endl;
    return -1;
  }
  string oneline;
  while(true){
    getline(ifs,oneline);
    if(ifs.eof())break;
    string firstworld;
    stringstream ss(oneline);
    ss>>firstworld;
    if(firstworld!="Reads")continue;
    long cread;
    ss>>cread;
    total+=cread;
  }
  ifs.close();
  return total;
}


void printmsg(){
  cerr<<"isolasso: the C++ version of IsoLasso program.\n\n";
  cerr<<"Version: 2.6\n\n";
  cerr<<"Usage: isolasso {Options} <Instance file>\n\n";
  cerr<<"Input: the instance file generated by processsam.\n\n";
  cerr<<"Options:\n\n";
  cerr<<"Parameters:\n\n";
  cerr<<"\t-p/--pairend <int,int>\t\tSpecify the paired-end read span and standard derivation. Default 200,20. You may use this Python script to estimate both values from a given SAM/BAM file (http://www.cs.ucr.edu/~liw/scripts.html#insertsam).\n\n";
  cerr<<"\t-c/--min-read-num <int>\t\tThe minimum number of clustered reads to output. Default 0.\n\n";
  cerr<<"\nIO Options:\n\n";
  cerr<<"\t--minexp <float>\t\tThe minimum expression level threshold cutoff. Default 0.1.\n\n";
  cerr<<"\t--verbose\t\t\tEnable verbose output.\n\n";
  cerr<<"\t-o/--prefix <string>\t\tSpecify the prefix of the  output files. The default value is the instance file.\n\n"; 
  cerr<<"\t--no-filter\t\t\tDo not automatically filter isoforms. If this option is on and --minexp is 0, the predicted expression levels of some isoforms will be 0.\n\n";
  cerr<<"\t--id <string>\t\t\tOnly predict the instance with specified ID.\n\n";
  cerr<<"\nReference Options:\n\n";
  cerr<<"\t-d/--directref\t\t\tOutput gene annotation (the Refs field in the instance file) directly. All expression levels are assigned 1.\n\n";
  cerr<<"\t--forceref\t\t\tCalculate the expression levels of gene annotations (the Refs field in the instance file). Using this option will automatically turn on the '--no-filter' option.\n\n";
  cerr<<"\nCEM Options:\n\n";
  cerr<<"\t--useem\t\t\t\tUse EM algorithm instead of LASSO algorithm (which is default) to estimate expression levels.\n\n";
  cerr<<"\t--usebias\t\t\tUse quasi-multinomial bias correction.\n\n";
  cerr<<"\t--elim\t\t\t\tAllow CEM to eliminate low probability isoforms during the iteration.\n\n";
  cerr<<"\t--correctn\t\t\tCorrect the gene read counts according to the quasi-multinomial bias parameter.\n";
  cerr<<"\t\t\t\t\tWarning: this is an experimental option so use it at your own risk. Due to the sample uncertainty, the calculation of the bias parameter may skew the distribution of some of the highly expressed genes.\n\n";
  cerr<<"\t--alpha <float>\t\t\tSpecify the parameter of the negative Dirichlet prior. Default 5.\n\n";
  cerr<<"\t--min-frac <float>\t\tThe minimum fraction of isoforms to be reported. Default 0.01. This option is invalid if --no-filter option is set.\n\n";
}

/* Parse options */
int parseopt(int argc, char* argv[], vector<string>& varargin){
  instancefile = string(argv[argc-1]);
  outputfile = instancefile+".pred";
  if(argc==1){
    printmsg();
    return -1;
  }
  for(int i=0;i<argc;i++){
    if ( string(argv[i])=="-d" || string(argv[i])== "--directref") {
      //output the annotation directly, without expression level calculation
      directref = true;
      printf("Output annotation directly. All expression levels are assigned 1.");
      minrcount = -1;
    }
    if (string(argv[i])=="--useposbias") {
      //TODO
      printf("The --useposbias optin is not supported yet\n");
    }
    if(i<argc-1){
      if ( string(argv[i])=="-o" || string(argv[i])== "--prefix") {
        outputfile=string(argv[i+1])+".pred";
      }
      if ( string(argv[i])=="-p" || string(argv[i])== "--pairend") {
        sscanf(argv[i+1],"%d,%d",&pedist,&pestd);
      }
      if ( string(argv[i])=="-c" || string(argv[i])== "--min-read-num") {
        sscanf(argv[i+1],"%d", &minrcount);
      }
      if (  string(argv[i])== "--minexp") {
        sscanf(argv[i+1],"%f", &expcutoff);
      }
      if(string(argv[i])=="--forceref"){
        forceref=true;
        varargin.push_back("--no-filter");//add no-filter option
        expcutoff=-1.0;
        minrcount=0;
      }
      if ( string(argv[i])== "--id") {
        instid=string(argv[i+1]);
      }
    }
  }
  //total read count
  totalreads = getTotalReads(instancefile);
  if(totalreads==-1) return -1;

  for (int i = 0; i < argc; ++i) {
    varargin.push_back(string(argv[i]));
  }

  return 0;
}
int main(int argc, char *argv[]) {

  long npred = 0, nneg = 0;
  double lambda = 100; 
  vector<string> varargin;
  //TODO posprofile
  //if (argc < 7) printf("Invalid number of parameter\n");

  if(parseopt(argc,argv,varargin)==-1) return -1;

  FILE * fid = fopen(instancefile.c_str(),"r");
  FILE * foutid = fopen(outputfile.c_str(),"w");

  long totalreadcnt=0;
  NewInstance thisitem;
  long readcnt;
  int success;
  char tmp[MAXSTRLEN];
  while (1){
    int cc = fgetc(fid);
    if (cc == '@') fgets(tmp, MAXSTRLEN, fid);
    else {
      ungetc(cc,fid);
      break;
    }
  }
  fprintf(foutid,"#");
  for(int i=0;i<argc;i++)fprintf(foutid,"%s ",argv[i]); fprintf(foutid,"\n");
  fprintf(foutid, "#count\tID\tChr\tDir\tIsoformStart\tIsoformEnd\tExonStart\tExonEnd\tFPKM\n");

  while(!feof(fid)) {
    if (testRunningTime) {
      begintime = clock();
      printf("Begin to reading instance from %ld\n", begintime);
    }
    thisitem = readoneinstance(fid, readcnt, success);
    if (success==0 || success==-1) {
      if(success==0)
        printf("IsoLasso:IO','Warning: unsuccessful instance reading...\n");
      break;
    }
    thisitem.totalReadCnt = totalreads;
    if (thisitem.readCnt < minrcount) {
      continue;
    }
    thisitem.peReadDis = pedist;
    thisitem.peReadDisSTD = pestd;
    totalreadcnt = totalreadcnt + readcnt;
    printf("Reading instance %s\n", thisitem.id.c_str());
    if(instid!="" && thisitem.id!=instid)continue;

    printf("#####Predicting Instance id: %s nexons: %ld Reads: %ld, boundary: %s:%ld-%ld\n",
      thisitem.id.c_str(), thisitem.nExons, thisitem.readCnt, thisitem.chromosome.c_str(), thisitem.boundary[0], thisitem.boundary[1]);
    if (directref) {
      thisitem.Pred = thisitem.Refs;
      for (int i = 0; i < thisitem.Pred.size(); ++i){
        thisitem.PredExp.push_back(1);
        thisitem.PredSubInst.push_back(1);
      }
      npred = npred + thisitem.Pred.size();
    }
    else {

      STAT stat = predExpLevel(thisitem, lambda, varargin );
      if (stat.nw == -1) continue;
      npred += stat.nw;
      nneg += stat.negw;
    }
    if (testRunningTime) {
      checkpoint = clock();
      printf("End calculating expression level at %ld\t", checkpoint);
      printf("Previous phase cost %ld\n", checkpoint - begintime);
      begintime = checkpoint;
    }
    //WL: 


    writeoneinstance(foutid, thisitem, thisitem.id, expcutoff, minrcount);
    if (testRunningTime) {
      checkpoint = clock();
      printf("End writting instance at %ld\t", checkpoint);
      printf("Previous phase cost %ld\n", checkpoint - begintime);
      begintime = checkpoint;
    }

    if(instid!="" && thisitem.id==instid)break;
  }
  fclose(fid);
  fclose(foutid);
  return 0;
}
