/* kgAliasP - read and parse a SWISS-PROT data file to generate
   the protein part of the gene alias list for the Known Genes track */

/* Copyright (C) 2013 The Regents of the University of California 
 * See kent/LICENSE or http://genome.ucsc.edu/license/ for licensing information. */

#include "common.h"
#include "hCommon.h"
#include "hdb.h"

void usage()
/* Explain usage and exit. */
{
/* The synopsis names this program (kgAliasP) and describes the second
 * argument as a file, since main() opens it as one. */
errAbort(
  "kgAliasP - create gene alias (protein part) .tab files\n"
  "usage:\n"
  "   kgAliasP xxxx yyyy zzzz\n"
  "            xxxx is genome database name\n"
  "            yyyy is SWISS-PROT/TrEMBL protein data file name\n"
  "            zzzz is .tab output file name\n"
  "Example: kgAliasP hg15 /cluster/store5/fan/data/swissprot/0405/trembl_new.dat new.tab\n");
}

static void chopLineEnd(char *s)
/* Strip trailing newline/carriage-return characters from s in place,
 * then strip one trailing '.' if present. */
{
size_t len = strlen(s);

while (len > 0 && (s[len-1] == '\n' || s[len-1] == '\r'))
    s[--len] = '\0';
if (len > 0 && s[len-1] == '.')
    s[--len] = '\0';
}

static char *nextSeparator(char *s, size_t *retLen)
/* Return a pointer to the earliest " OR " or " AND " in s and store the
 * length of that separator in *retLen.  Return NULL (leaving *retLen
 * untouched) when s contains neither. */
{
char *orPos = strstr(s, " OR ");
char *andPos = strstr(s, " AND ");

if (orPos != NULL && (andPos == NULL || orPos < andPos))
    {
    *retLen = strlen(" OR ");
    return orPos;
    }
if (andPos != NULL)
    *retLen = strlen(" AND ");
return andPos;
}

static void writeAliases(FILE *f, char *kgID, char *names)
/* Split names (the text of a GN line after its tag) on " OR " / " AND "
 * and write one "kgID<tab>alias" line to f for each piece.  A leading
 * '(' is skipped and each alias is cut at its first ')'.  names is
 * modified in place. */
{
char *cur = names;

while (cur != NULL)
    {
    size_t sepLen = 0;
    char *alias, *sep, *paren;

    while (*cur == ' ') cur++;
    alias = cur;

    sep = nextSeparator(cur, &sepLen);
    if (sep == NULL)
	{
	cur = NULL;		/* last alias on this line */
	}
    else
	{
	*sep = '\0';
	cur = sep + sepLen;
	}

    /* clean up "(XXXX" or "XXXX)" */
    if (*alias == '(') alias++;
    paren = strchr(alias, ')');
    if (paren != NULL) *paren = '\0';

    fprintf(f, "%s\t%s\n", kgID, alias);
    }
}

int main(int argc, char *argv[])
/* Read records from the protein data file named by argv[2].  For each
 * record, look up the knownGene name whose proteinID equals the record's
 * ID, and write one "name<tab>alias" line to the file named by argv[3]
 * for every alias found on the record's GN lines.  argv[1] is the genome
 * database that is queried. */
{
struct sqlConnection *conn;
FILE *inf, *o1;
char cond_str[256];
char line[2000];
char *database, *proteinFileName, *outputFileName;
const char *idTag = "ID   ";
const char *gnTag = "GN   ";

if (argc != 4) usage();

database        = argv[1];
proteinFileName = argv[2];
outputFileName  = argv[3];

conn = hAllocConn(database);

/* mustOpen() aborts on failure (kent common library), so the results
 * need no NULL check. */
o1  = mustOpen(outputFileName, "w");
inf = mustOpen(proteinFileName, "r");

/* Each pass of the outer loop consumes one record: an ID line followed
 * by any number of other lines up to the "//" terminator. */
while (fgets(line, sizeof line, inf) != NULL)
    {
    char *id, *end, *answer;
    char *kgID = NULL;

    if (strncmp(line, idTag, strlen(idTag)) != 0)
	{
	fprintf(stderr, "expected ID line, but got: %s\n", line);
	exit(1);
	}

    /* The ID is the first token after the tag.  Also stop at the line
     * end so an ID line without a following space cannot crash. */
    id = line + strlen(idTag);
    end = strpbrk(id, " \r\n");
    if (end != NULL) *end = '\0';

    sqlSafef(cond_str, sizeof cond_str, "proteinID = '%s'", id);
    answer = sqlGetField(database, "knownGene", "name", cond_str);
    if (answer != NULL)
	{
	/* Keep a private copy so kgID can always be freed at the end of
	 * the record.  NOTE(review): ownership of sqlGetField()'s return
	 * value is not visible here; confirm whether answer should be
	 * freed as well. */
	kgID = strdup(answer);
	}

    while (fgets(line, sizeof line, inf) != NULL)
	{
	/* "//" signal end of a record */
	if ((line[0] == '/') && (line[1] == '/')) break;

	/* work on GN (Gene Name) lines only; the tag must start the line
	 * because the names are taken from a fixed offset */
	if (kgID != NULL && strncmp(line, gnTag, strlen(gnTag)) == 0)
	    {
	    char *names = line + strlen(gnTag);
	    chopLineEnd(names);
	    writeAliases(o1, kgID, names);
	    }
	}
    free(kgID);
    }

fclose(inf);
fclose(o1);
hFreeConn(&conn);
return(0);
}

