/*
 * Converts .sam files into .read_info files.
 * Note that .sam coordinates are 1-based, whereas read_info uses 0-based start and 1-based end coordinates.

 */



#include <iostream>
#include <fstream>
#include <sstream>
#include <string>
#include <vector>
#include <cstdlib>
#include <cstdio>
#include <ctime>
#include "samio.h"
#include "structdef.h"

using namespace std;


/*
 * Print the processsam usage/help text to standard error.
 *
 * The text is kept in four static tables (banner, parameter options, IO
 * options, notes) that are streamed to cerr in that order. Each table is one
 * string built from adjacent literals, so one source line corresponds to one
 * fragment of the help output.
 */
void printMsg(){
    // Banner, usage line and input description.
    static const char banner[] =
        "processsam: generate the instance file required for IsoLasso.\n\n"
        "version: 2.5.2, last update: 02/20/2012.\n\n"
        // Disabled: "Error: input file required.\n"
        "Usage: processsam {options} <in.sam|->\n\n"
        "Input:\n"
        "A SAM format file containing the read mapping information, or '-' to read from STDIN. See NOTE for further information.\n"
        "\nOptions:\n\n";

    // Options that tune clustering / filtering.
    static const char parameterOptions[] =
        "PARAMETERS:\n\n"
        "  -g/--min-gap-length <int>\tThe minimum length of the gap between two reads to be considered as separate genes. Default 100.\n\n"
        "  -c/--min-read-num <int>\tThe minimum number of clustered reads to output. Default 4.\n\n"
        // Disabled: "-i\t\t\tWrite instance file.\n"
        "  -k/--max-pe-span <int> \tThe maximum pair-end spanning. Paired-end reads whose spanning exceeds this number will be discarded. Default 700000.\n\n"
        "  -s/--max-num-instance \tThe maximum number of instances be written to the file. Default -1 (no limit)\n\n"
        "  -u/--min-cvg-cut <0.0-1.0>\tThe fraction for coverage cutoff, should be between 0-1. A higher value will be more sensitive to coverage discrepancies in one gene. Default 0.05.\n\n"
        "  -b/--single-only \t\tTreat reads as single-end reads, even if they are paired-end reads.\n\n"
        "  -j/--min-junc-count <int>\tMinimum junction count. Only junctions  with no less than this number of supporting reads are considered. Default 1.\n\n"
        "  -d/--direction <+|->\tIf this is the stranded RNA-Seq data, specify the direction of the data. Default '.' (non-stranded).\n\n";
        // Disabled: "-f [generange file]\tprovide gene range file. ..."

    // Options that select input annotation files and output files.
    // The two trailing "\n" fragments are the blank lines separating this
    // section from the notes.
    static const char ioOptions[] =
        "IO OPTIONS:\n\n"
        "  -n/--isoinfer\t\t\tGenerate IsoInfer input files (.readinfo, .bound and .generange). Default off.\n\n"
        "  -x/--annotation <string>\tProvide existing gene annotation file (in BED format). Adding this parameter will automatically incorporate existing gene annotation information into instance file. The bed file should be sorted according to the chromosome name and starting position of isoforms. This option is mutually exclusive to the -r/--range option.\n\n"
        // Disabled: "  -r/--ref-only\t\t\tUse existing gene annotation file to define gene range and boundary. ..."
        "  -r/--range <string>\t\tUse the provided gene ranges specified by the file (in BED format). This option is mutually exclusive to the -x/--annotation option.\n\n"
        "  -e/--segment-bound <string>\tProvide the exon-intron boundary information specified by the filename. See NOTE for more information about the file format.\n\n"
        "  -a/--annotation\t\tOutput annoation files, including read coverage (.real.wig), read coverage considering junctions and paired-end read spans (.wig), instance range and boundary (.bound.bed), junctions (.bed) and  junction summary (.junction.bed).\n\n"
        "  -o/--prefix <string>\t\tSpecify the prefix of all generated files. The default value is the provided file name. \n\n"
        "  -v/--no-coverage\t\tDon't output coverage information to the instance file.\n\n"
        "\n"
        "\n";

    // Free-form notes; the lone "\n" fragments are blank separator lines.
    static const char notes[] =
        "NOTE\n\n\t1. processsam acceptes STDIN input of sam file by using '-' as filename. This is especially useful if you have the .bam file (e.g., from Tophat output), or you want to do some read filtering before running IsoLasso.  For example, if Samtools is installed, then use the following command to run processsam on only chromosome 1 reads:\n"
        "\n"
        "\tsamtools view accepted_hits.bam chr1 | processsam -a -o accepted_hits -\n\n"
        "\n"
        "\t2. The sam/bam file must be sorted according to the chromosome name and starting position. The bam file format can be sorted using 'samtools sort' command, while for the sam file, you can use the sort command. In Unix or Mac systems, use the following command: \n"
        "\n"
        "\tsort -k 3,3 -k 4,4n in.sam > in.sorted.sam\n"
        "to sort in.sam  into in.sorted.sam, or use the pipe:\n"
        "\tsort -k 3,3 -k 4,4n in.sam | processsam -a -o accepted_hits -\n"
        "\n"
        "\t3. The exon-intron boundary file (specified by -e/--segment-bound option) records the exon-intron boundary used by IsoLasso. Each line in the file represents one boundary information, and should include chromosome name, start position, end position (equal to start position) and direction (+/-). These fields should be tab-separated, and only the first 4 fields are used. For example,\n"
        "\n"
        "\tchr1\t15796\t15796\t+\n"
        "\n";
        // Disabled note 4: gene range file format for -r/--range,
        // example "\tchr1\t1558850\t1559210\t+\n".

    std::cerr << banner << parameterOptions << ioOptions << notes;
    // Leave the stream flushed on return.
    std::cerr.flush();
}

/*
 * Program entry point.
 *
 * With no command-line arguments, the usage text is printed and the program
 * exits with status 0. Otherwise the last argument is taken as the input file
 * name and handed, together with a copy of the complete argument list
 * (including argv[0]), to readSamFile(). The processor time consumed by that
 * call, as measured by clock(), is then reported on standard output.
 *
 * NOTE(review): option parsing presumably happens inside readSamFile(), which
 * is not visible in this file -- confirm against samio.h.
 */
int main(int argc, char* argv[]){
  // Nothing but the program name: show the help text and stop.
  if(argc<2){
    printMsg();
    return 0;
  }

  // The input file is always the final argument (see the usage line).
  std::string insFile(argv[argc-1]);

  // Copy every argv entry into an owning vector of strings.
  std::vector<std::string> args(argv, argv+argc);

  const clock_t started=clock();
  readSamFile(insFile, args);
  const clock_t elapsed=clock()-started;

  // Convert clock ticks to seconds before printing.
  const double seconds=static_cast<double>(elapsed)/static_cast<double>(CLOCKS_PER_SEC);
  std::cout<<"running time: "<<seconds<<" seconds."<<std::endl;
  return 0;
}


