/* gensatFtpList - Given CHECKSUM.MD5 file with the file names munged, 
 * convert to names can actually download from ftp.ncbi.nlm.gov/pub/gensat. */

/* Copyright (C) 2011 The Regents of the University of California 
 * See kent/LICENSE or http://genome.ucsc.edu/license/ for licensing information. */
#include "common.h"
#include "linefile.h"
#include "hash.h"
#include "portable.h"
#include "options.h"


void usage()
/* Explain usage and exit. */
{
errAbort(
  "gensatFtpList - Given CHECKSUM.MD5 file with the file names munged,\n"
  "convert to names can actually download from ftp.ncbi.nlm.gov/pub/gensat\n"
  "usage:\n"
  "   gensatFtpList in.md5 out.md5\n"
  "options:\n"
  "   -xxx=XXX\n"
  );
}

static struct optionSpec options[] = {
   {NULL, 0},
};

char *monthNames[] = 
    {
    "Jan", "Feb", "Mar", "Apr", "May", "Jun",
    "Jul", "Aug", "Sep", "Oct", "Nov", "Dec",
    };
char *monthNums[] =
    {
    "01", "02", "03", "04", "05", "06",
    "07", "08", "09", "10", "11", "12",
    };

char *fixName(char *name, struct lineFile *lf)
/* Convert somethign that looks like:
 *   StJude_2004Aug31_rep2/g00005/g00005_sj_g44_142_ads.jpg
 * to something that looks like:
 *   StJude_2004Aug31_rep2/g00005/g00005_sj_g44_142_ads.jpg
 * This will consume name, converting it in place. 
 * Some of the input names may already be in the correct format. */
{
/* Essentially all we do is look for three letter month,
 * and replace it with two letter number. */
char *month;
static char abbrMonth[4];
int monthIx;
char *firstUnder = strchr(name, '_');
char *s, c;
if (firstUnder == NULL)
    errAbort("Can't find underbar in %s", name);

month = firstUnder + 5;
if (!(isdigit(month[0]) && isdigit(month[1])))
    {
    memcpy(abbrMonth, month, 3);
    monthIx = stringIx(abbrMonth, monthNames);
    if (monthIx < 0)
	errAbort("Can't find month %s in %s", abbrMonth, name);
    else
	{
	memcpy(month, monthNums[monthIx], 2);
	strcpy(month+2, month+3);
	}
    }

/* Convert from _ to - up to first slash */
s = name;
for (;;)
    {
    c = *s;
    if (c == 0 || c == '/')
       break;
    if (c == '_')
       *s = '-';
    s += 1;
    }
return name;
} 

void gensatFtpList(char *in, char *out)
/* gensatFtpList - Given CHECKSUM.MD5 file with the file names munged, 
 * convert to names can actually download from ftp.ncbi.nlm.gov/pub/gensat. */
{
struct lineFile *lf = lineFileOpen(in, TRUE);
FILE *f = mustOpen(out, "w");
char *line, *md5;
char *oldPrefix = "/am/ftp-gensat/";
int oldPrefixSize = strlen(oldPrefix);
struct hash *suffixHash = newHash(0);

while (lineFileNext(lf, &line, NULL))
    {
    char dir[PATH_LEN], file[PATH_LEN], extension[PATH_LEN];
    md5 = nextWord(&line);
    line = skipLeadingSpaces(line);
    if (stringIn(".icon.", line))
        continue;
    if (stringIn(".med.", line))
        continue;
    if (!startsWith(oldPrefix, line))
        errAbort("Missing %s line %d of %s", oldPrefix, lf->lineIx, 
		lf->fileName);
    line += oldPrefixSize;
    if (startsWith("duplicates/", line))
        continue;
    if (endsWith(line, ".txt") || endsWith(line, ".psd")
       || endsWith(line, ".zip") || endsWith(line, ".doc")
       || endsWith(line, ".list"))
       continue;
    splitPath(line, dir, file, extension);
    hashStore(suffixHash, extension);
    fprintf(f, "%s\t%s\n", md5, fixName(line, lf));
    }

}

int main(int argc, char *argv[])
/* Process command line. */
{
optionInit(&argc, argv, options);
if (argc != 3)
    usage();
gensatFtpList(argv[1], argv[2]);
return 0;
}
