/* hgLoadOut - load RepeatMasker .out files into database. */

/* Copyright (C) 2014 The Regents of the University of California 
 * See kent/LICENSE or http://genome.ucsc.edu/license/ for licensing information. */
#include "common.h"
#include "linefile.h"
#include "dystring.h"
#include "options.h"
#include "cheapcgi.h"
#include "hCommon.h"
#include "hdb.h"
#include "jksql.h"
#include "rmskOut.h"
#include "repMask.h"

boolean noBin = FALSE;
boolean split = FALSE;
boolean noSplit = FALSE;
char *tabFileName = NULL;
char *suffix = NULL;
int badRepCnt = 0;

/* command line option specifications */
static struct optionSpec optionSpecs[] = {
    {"tabFile", OPTION_STRING},
    {"tabfile", OPTION_STRING},
    {"nosplit", OPTION_BOOLEAN},
    {"noSplit", OPTION_BOOLEAN},
    {"split",   OPTION_BOOLEAN},
    {"table", OPTION_STRING},
    {NULL, 0}
};

void usage()
/* Explain usage and exit. */
{
errAbort(
  "hgLoadOut - load RepeatMasker .out files into database\n"
  "usage:\n"
  "   hgLoadOut database file(s).out\n"
  "For multiple files chrN.out this will create the single table 'rmsk'\n"
  "in the database, use the -split argument to obtain separate chrN_rmsk tables.\n"
  "options:\n"
  "   -tabFile=text.tab - don't actually load database, just create tab file\n"
  "   -split - load chrN_rmsk separate tables even if a single file is given\n"
  "   -table=name - use a different suffix other than the default (rmsk)\n"
  "note: the input file.out can also be a compressed file.out.gz file,\n"
  "      or a URL to a file.out or file.out.gz");
}

void badFormat(struct lineFile *lf, int id)
/* Print generic bad format message. */
{
errAbort("Badly formatted line %d in %s\n", lf->lineIx, lf->fileName);
}

int makeMilli(char *s, struct lineFile *lf)
/* Convert ascii floating point to parts per thousand representation. */
{
/* Cope with -0.0  and -0.2 etc.*/
if (s[0] == '-')
    {
    if (!sameString(s, "-0.0"))
        warn("Strange perc. field %s line %d of %s", s, lf->lineIx, lf->fileName);
    s = "0.0";
    }
if (!isdigit(s[0]))
    badFormat(lf,1);
return round(10.0*atof(s));
}

int parenSignInt(char *s, struct lineFile *lf)
/* Convert from ascii notation where a parenthesized integer is negative
 * to straight int. */
{
if (s[0] == '(')
    return -atoi(s+1);
else
    return atoi(s);
}

boolean checkRepeat(struct rmskOut *r, struct lineFile *lf)
/* check for bogus repeat */
{
/* this is bogus on both strands */
if (r->repEnd < 0 || r->repStart > r->repEnd)
    {
    badRepCnt++;
    if (verboseLevel() > 1)
        {
        verbose(2, "bad rep range [%d, %d] line %d of %s \n",
		r->repStart, r->repEnd, lf->lineIx, lf->fileName);
        }
    return FALSE;
    }
return TRUE;
}

FILE *theFile = NULL;
struct hash *chromFpHash = NULL;
char *defaultTempName = "rmsk.tab";

FILE *getFileForChrom(char *chrom)
/* Return the appropriate file pointer for the given chrom if -split.
 * If not using -split, then there is only one file pointer in action. */
{
static int chromCount = 0;
char *tempName = tabFileName ? tabFileName : defaultTempName;

if (split)
    {
    if (chromFpHash == NULL)
	chromFpHash = hashNew(10);
    theFile = (FILE *)hashFindVal(chromFpHash, chrom);
    if (theFile == NULL)
	{
	char fName[512];
	if (chromCount > 1000)
	    {
	    /* We will probably run into the operating system limit on open
	     * files well before this point, but just in case... */
	    errAbort("-split should not be used when there are >1000 sequences"
		     " -- too many split tables.");
	    }
	safef(fName, sizeof(fName), "%s_%s", chrom, tempName);
	theFile = mustOpen(fName, "w");
	hashAdd(chromFpHash, chrom, theFile);
	chromCount++;
	}
    }
else
    {
    if (theFile == NULL)
	theFile = mustOpen(tempName, "w");
    }
return theFile;
}

void helCarefulClose(struct hashEl *hel)
/* Call carefulClose on hashed file pointer. */
{
FILE *f = (FILE *)(hel->val);
carefulClose(&f);
}

void closeFiles()
/* If split, close each file pointer in chromFpHash.  Otherwise close the
 * single file pointer that we've been using. */
{
if (split)
    {
    hashTraverseEls(chromFpHash, helCarefulClose);
    }
else
    carefulClose(&theFile);
}

void readOneOut(char *rmskFile)
/* Read .out file rmskFile, check each line, and print OK lines to .tab. */
{
struct lineFile *lf;
char *line, *words[24];
int lineSize, wordCount;

/* Open .out file and process header. */
lf = rmskLineFileOpen(rmskFile);

/* Process line oriented records of .out file. */
while (lineFileNext(lf, &line, &lineSize))
    {
    static struct rmskOut r;
    char *s;

    wordCount = chopLine(line, words);
    if (wordCount < 14)
        errAbort("Expecting 14 or 15 words line %d of %s", 
	    lf->lineIx, lf->fileName);
    r.swScore = atoi(words[0]);
    r.milliDiv = makeMilli(words[1], lf);
    r.milliDel = makeMilli(words[2], lf);
    r.milliIns = makeMilli(words[3], lf);
    r.genoName = words[4];
    r.genoStart = atoi(words[5])-1;
    r.genoEnd = atoi(words[6]);
    r.genoLeft = parenSignInt(words[7], lf);
    r.strand[0]  = (words[8][0] == '+' ? '+' : '-');
    r.repName = words[9];
    r.repClass = words[10];
    char *repClassTest = cloneString(r.repClass);
    stripChar(repClassTest, '(');
    stripChar(repClassTest, ')');
    int nonDigitCount = countLeadingNondigits(repClassTest);
    int wordOffset = 0;
    // this repClass is only digits, (or only (digits) with surrounding parens)
    //   this is the sign of an empty field here
    // due to custom library in use that has no class/family indication
    if (0 == nonDigitCount)
        {
        wordOffset = 1;
        r.repClass = cloneString("Unspecified");
        r.repFamily = cloneString("Unspecified");
        }
    else
        {
        s = strchr(r.repClass, '/');
        if (s == NULL)
            r.repFamily = r.repClass;
        else
           {
           *s++ = 0;
           r.repFamily = s;
           }
        }
    r.repStart = parenSignInt(words[11-wordOffset], lf);
    r.repEnd = atoi(words[12-wordOffset]);
    r.repLeft = parenSignInt(words[13-wordOffset], lf);
    r.id[0] = ((wordCount > (14-wordOffset)) ? words[14-wordOffset][0] : ' ');
    if (checkRepeat(&r, lf))
        {
	FILE *f = getFileForChrom(r.genoName);
        if (!noBin)
            fprintf(f, "%u\t", hFindBin(r.genoStart, r.genoEnd));
        rmskOutTabOut(&r, f);
        }
    }
}

void loadOneTable(char *database, struct sqlConnection *conn, char *tempName, char *tableName)
/* Load .tab file tempName into tableName and remove tempName. */
{
struct dyString *query = dyStringNew(1024);

verbose(1, "Loading up table %s\n", tableName);
if (sqlTableExists(conn, tableName))
    {
    sqlDyStringPrintf(query, "DROP table %s", tableName);
    sqlUpdate(conn, query->string);
    }

/* Create first part of table definitions, the fields. */
dyStringClear(query);
sqlDyStringPrintf(query, 
"CREATE TABLE %s (\n"
"   bin smallint unsigned not null,     # bin index field for range queries\n"
"   swScore int unsigned not null,	# Smith Waterman alignment score\n"
"   milliDiv int unsigned not null,	# Base mismatches in parts per thousand\n"
"   milliDel int unsigned not null,	# Bases deleted in parts per thousand\n"
"   milliIns int unsigned not null,	# Bases inserted in parts per thousand\n"
"   genoName varchar(255) not null,	# Genomic sequence name\n"
"   genoStart int unsigned not null,	# Start in genomic sequence\n"
"   genoEnd int unsigned not null,	# End in genomic sequence\n"
"   genoLeft int not null,		# -#bases after match in genomic sequence\n"
"   strand char(1) not null,		# Relative orientation + or -\n"
"   repName varchar(255) not null,	# Name of repeat\n"
"   repClass varchar(255) not null,	# Class of repeat\n"
"   repFamily varchar(255) not null,	# Family of repeat\n"
"   repStart int not null,		# Start (if strand is +) or -#bases after match (if strand is -) in repeat sequence\n"
"   repEnd int not null,		# End in repeat sequence\n"
"   repLeft int not null,		# -#bases after match (if strand is +) or start (if strand is -) in repeat sequence\n"
"   id char(1) not null,		# First digit of id field in RepeatMasker .out file.  Best ignored.\n"
"             #Indices\n"
, tableName);

/* Create the indexes */
if (!noSplit)
    {
    sqlDyStringPrintf(query, "   INDEX(bin))\n");
    }
else
    {
    int indexLen = hGetMinIndexLength(database);
    sqlDyStringPrintf(query, "   INDEX(genoName(%d),bin))\n", indexLen);
    }

sqlUpdate(conn, query->string);

/* Load database from tab-file. */
dyStringClear(query);
sqlDyStringPrintf(query, "LOAD data local infile '%s' into table %s",
	       tempName, tableName);
sqlUpdate(conn, query->string);
remove(tempName);
}

void processOneOut(char *database, struct sqlConnection *conn, char *rmskFile, char *suffix)
/* Read one RepeatMasker .out file and load it into database. */
{
verbose(1, "Processing %s\n", rmskFile);

if (split || noSplit)
    errAbort("program error: processOneOut doesn't do -split or -nosplit.");

readOneOut(rmskFile);

/* Create database table (if not -tabFile). */
if (tabFileName == NULL)
    {
    char dir[256], base[128], extension[64];
    char tableName[256];
    splitPath(rmskFile, dir, base, extension);
    chopSuffix(base);
    safef(tableName, sizeof(tableName), "%s_%s", base, suffix);
    closeFiles();
    loadOneTable(database, conn, defaultTempName, tableName);
    }
}

struct sqlConnection *theConn = NULL;

void loadOneSplitTable(struct hashEl *hel)
/* Load the table for the given chrom. */
{
char tempName[512];
char tableName[256];
char *chrom = hel->name;
safef(tempName, sizeof(tempName), "%s_%s", chrom, defaultTempName);
safef(tableName, sizeof(tableName), "%s_%s", chrom, suffix);
loadOneTable(sqlGetDatabase(theConn), theConn, tempName, tableName);
}

void loadSplitTables(char *database, struct sqlConnection *conn)
/* For each chrom in chromHash, load its tempfile into table chrom_suffix. */
{
theConn = conn;
hashTraverseEls(chromFpHash, loadOneSplitTable);
}


void hgLoadOut(char *database, int rmskCount, char *rmskFileNames[], char *suffix)
/* hgLoadOut - load RepeatMasker .out files into database. */
{
struct sqlConnection *conn = NULL;
int i;

if (tabFileName == NULL)
    {
    conn = hAllocConn(database);
    verbose(2,"#\thgLoadOut: connected to database: %s\n", database);
    }
for (i=0; i<rmskCount; ++i)
    {
    if (split || noSplit)
	readOneOut(rmskFileNames[i]);
    else
	processOneOut(database, conn, rmskFileNames[i], suffix);
    }
closeFiles();
if (tabFileName == NULL)
    {
    if (split)
	loadSplitTables(database, conn);
    else if (noSplit)
	loadOneTable(database, conn, defaultTempName, suffix);
    }
hFreeConn(&conn);
if (badRepCnt > 0)
    {
    warn("note: %d records dropped due to repEnd < 0 or repStart > repEnd\n", badRepCnt);
    if (verboseLevel() < 2)
        warn("      run with -verbose=2 for details\n");
    }
}

int main(int argc, char *argv[])
/* Process command line. */
{
optionInit(&argc, argv, optionSpecs);
if (argc < 3)
    usage();
// default changed to noSplit Jan 2014 , ignore noSplit arguments
split = optionExists("split");
noSplit = ! split;
suffix = optionVal("table", "rmsk");
tabFileName = optionVal("tabFile", tabFileName);
if (tabFileName == NULL)
    tabFileName = optionVal("tabfile", tabFileName);
if (split && noSplit)
    errAbort("-split and -nosplit cannot be used together.");
hgLoadOut(argv[1], argc-2, argv+2, suffix) ;
return 0;
}
