/* vgLoadAllen - Create .ra and .tab files for loading Allen Brain Atlas images into VisiGene. */

/* Copyright (C) 2011 The Regents of the University of California 
 * See kent/LICENSE or http://genome.ucsc.edu/license/ for licensing information. */
#include "common.h"
#include "linefile.h"
#include "hash.h"
#include "options.h"
#include "obscure.h"
#include "portable.h"
#include "jpegSize.h"
#include "dnaseq.h"
#include "fa.h"


void usage()
/* Explain usage and exit.  The text describes the five positional
 * arguments in the order main() passes them to vgLoadAllen().  No options
 * are listed because the option table for this program is empty. */
{
errAbort(
  "vgLoadAllen - Create .ra and .tab files for loading Allen Brain Atlas images\n"
  "into VisiGene\n"
  "usage:\n"
  "   vgLoadAllen sourceImageDir allen.tab probes.fa name.tab outDir\n"
  "where sourceImageDir is the full-size jpg image dir converted from the ABA image Disk\n"
  "allen.tab is the ABA input \n"
  "probes.fa contains the sequence of all the probes\n"
  "name.tab is two columns: biological name and the aba probe id (used with the gene sorter)\n"
  "outDir is where the output .ra and tab files go.\n"
  );
}

/*  
 vgLoadAllen 
   sourceImageDir = /san/sanvol1/visiGene/gbdb/full/inSitu/Mouse/allenBrain 
     (if san is down can use /cluster/store11/visiGene/offline/allenBrain/imageDisk with .jp2)
   allen.tab     = /cluster/store11/visiGene/offline/allenBrain/probesAndData/allen20051021.tab 
   probes.fa      = /cluster/data/mm7/bed/allenBrain/allProbes.fa 
   name.tab       = /cluster/data/mm7/bed/allenBrain/allProbes.tab
   outDir         = output

to map refSeq to RP_ probe id used in track, use this:
   /cluster/data/mm7/bed/allenBrain/allProbes.tab
   NM_026675       RP_040914_02_A02
   NM_133764       RP_050317_04_H06

/cluster/data/mm7/bed/allenBrain> head allProbes.fa
>RP_040914_02_A02
TTGGAGAGCTGCCCTGTCAGACTATGGACCCCGAGGTGTCCCTGCTGCTG

Assumes that sourceImageDir has .jpg files in subdirs,
but only one level deep.

*/   

/* Command-line option table handed to optionInit() in main().  It holds
 * only the terminating {NULL, 0} entry, i.e. no options are declared. */
static struct optionSpec options[] = {
   {NULL, 0},
};


struct hash *makeImageHash(char *sourceImageDir)
/* look in each subdir for .jpg files
 * but only look in subdirs, and only one level deep.
 * hash key is the gene name which is the first part of filename up to "_" 
 * and the hash value is the relative path to the file from sourceImageDir. */
{
struct hash *hash = newHash(0);
struct fileInfo *dList = NULL, *dEntry;
dList = listDirX(sourceImageDir, "*", FALSE);
for (dEntry = dList; dEntry != NULL; dEntry = dEntry->next)
    {
    if (dEntry->isDir)
	{
	char newDir[256];
	struct fileInfo *fList = NULL, *fEntry;
	safef(newDir,sizeof(newDir),"%s/%s",sourceImageDir,dEntry->name);
	fList = listDirX(newDir, "*.jpg", FALSE);   
	for (fEntry = fList; fEntry != NULL; fEntry = fEntry->next)
	    {
	    char newPath[256];
	    char *underBar=NULL;
	    safef(newPath,sizeof(newPath),"%s/%s",dEntry->name,fEntry->name);
	    underBar = strchr(fEntry->name,'_');
	    if (underBar)
		{
		char *key = cloneStringZ(fEntry->name,underBar-fEntry->name);
		char *val = cloneString(newPath);
    		hashAdd(hash, key, val);
		verbose(2, "imageHash key=%s value=%s\n", key, val);
		}
	    }
	slFreeList(&fList);
	}
    }
slFreeList(&dList);    
return hash;
}


struct hash *makeNameHash(char *fileName)
/* Parse file in format:
 *    name  probeId
 * into hash keyed by name (first column) with probeId (second column)
 * values.  The values are cloned strings, since the row words are only
 * borrowed from the line file while it is being read. */
{
struct hash *hash = newHash(0);
struct lineFile *lf = lineFileOpen(fileName, TRUE);
char *row[2];

while (lineFileRow(lf, row))
    {
    /* The hash stores its own copy of the key, so only the value needs
     * to be cloned. */
    hashAdd(hash, row[0], cloneString(row[1]));
    }
lineFileClose(&lf);
return hash;
}

void writeRa(char *fileName)
/* Write our .ra file with information common to all Allen Brain Atlas
 * images.  Each setting is written as one "key value" line, so every
 * string below must end in a newline. */
{
FILE *f = mustOpen(fileName, "w");
fprintf(f, "submitSet abaMouse1\n");
fprintf(f, "fullDir http://hgwdev.gi.ucsc.edu/visiGene/full/inSitu/Mouse/allenBrain\n");
fprintf(f, "thumbDir http://hgwdev.gi.ucsc.edu/visiGene/200/inSitu/Mouse/allenBrain\n");
/* %%s puts a literal %s into the file. */
fprintf(f, "itemUrl http://www.brain-map.org/search.do?queryText=%%s\n");
fprintf(f, "priority 1200\n");
fprintf(f, "sliceType sagittal\n");
fprintf(f, "submissionSource Allen Brain Atlas (ABA)\n");
fprintf(f, "taxon 10090\n");
fprintf(f, "genotype wild type\n");
fprintf(f, "strain C57BL/6\n");
fprintf(f, "age 56\n");
fprintf(f, "sex male\n");
fprintf(f, "bodyPart brain\n");
/* The contributor key is written exactly once; this list already includes
 * "Allen Institute for Brain Science". */
fprintf(f, "contributor Boe A.,Dang C.,Jeung D.,Luong L.,Sunkin S.,Sutram M.,Youngstrom B.,Lein E.,Jones A.,ABA ABA,ABI ABI,Allen Institute for Brain Science,Allen Brain Atlas,\n");
fprintf(f, "acknowledgement "
    "Thanks to the Allen Institute for Brain "
    "Science, particularly to Andrew Boe, Chinh Dang, Darren Jeung, Lon "
    "Luong, Susan Sunkin, Madhavi Sutram, and Brian Youngstrom for helping "
    "make these images available in VisiGene.\n"
);
/* publication, journal and journalUrl are not written. */
fprintf(f, "year 2006\n");
fprintf(f, "setUrl http://www.brainatlas.org/\n");
fprintf(f, "copyright &copy; 2004-2006 Allen Institute for Brain Science\n");
fprintf(f, "probeColor blue/purple\n");
carefulClose(&f);
}



void writeTab(
	struct hash *imageHash, 
	struct hash *seqHash, 
	char *sourceImageDir, char *allenTab, 
	struct hash *nameHash, char *outName)
/* Synthesize data and write out tab-separated file with one line for
 * each image. */
{
char sourceImage[PATH_LEN];
FILE *f = mustOpen(outName, "w");
struct lineFile *lf = lineFileOpen(allenTab, TRUE);
char *row[5];

/* Write header. */
fprintf(f, "#");
fprintf(f, "gene\t");
fprintf(f, "refSeq\t");
fprintf(f, "locusLink\t"); 
fprintf(f, "submitId\t");    /* egeneid=68323 or genesym=1110003F05Rik */
fprintf(f, "fileName\t");
fprintf(f, "imageWidth\t");
fprintf(f, "imageHeight\t");
fprintf(f, "probeId\t");     /* actually, this not supported yet but would be great. */
fprintf(f, "seq\n");

while (lineFileRowTab(lf, row))
    {
    char *gene = row[0];
    /* char *geneName = row[1]; */
    char *entrez = row[2];
    char *refSeq = row[3];
    char *url = row[4];
    char *probeId = hashFindVal(nameHash, refSeq);
    struct dnaSeq *seq = NULL; 
    int width=0, height=0;
    char *relPath = hashFindVal(imageHash, gene);
    char *submitId = strchr(url,'='); 
    if (probeId)
    	seq = hashFindVal(seqHash, probeId);
    if (submitId)
	++submitId;  /* we want the string following first '=' */
    if (sameString(entrez,"0"))
	entrez = NULL;

    if (relPath)
	{
	safef(sourceImage, sizeof(sourceImage), "%s/%s", 
	    sourceImageDir, relPath);
	jpegSize(sourceImage, &width, &height);

	fprintf(f, "%s\t", gene);
	fprintf(f, "%s\t", refSeq);
	fprintf(f, "%s\t", entrez?entrez:"");
	fprintf(f, "%s\t", submitId);
	fprintf(f, "%s\t", relPath);
	fprintf(f, "%d\t", width);
	fprintf(f, "%d\t", height);
	fprintf(f, "%s\t", probeId?probeId:"");
	if (seq != NULL)
	    fprintf(f, "%s\n", seq->dna);
	else
	    fprintf(f, "\n");
	}
    }
lineFileClose(&lf);    
carefulClose(&f);
}

void vgLoadAllen(char *sourceImageDir, char *allenTab, char *probesFa,
	char *nameTab, char *outDir)
/* vgLoadAllen - Create .ra and .tab files for loading Allen Brain Atlas
 * images into VisiGene.  Builds the image, name and probe sequence hashes,
 * reports their sizes, then writes aba.ra and aba.tab into outDir. */
{
char *raName = "aba.ra";
char *tabName = "aba.tab";
char outPath[PATH_LEN];
struct hash *images, *names, *seqs;

images = makeImageHash(sourceImageDir);
names = makeNameHash(nameTab);
seqs = dnaSeqHash(faReadAllDna(probesFa));

verbose(1, "Got %d images\n", images->elCount);
verbose(1, "Got %d named probes\n", names->elCount);
verbose(1, "Got %d probe sequences\n", seqs->elCount);

/* The output directory has to exist before either file is opened. */
makeDir(outDir);

/* Settings shared by the whole submission set. */
safef(outPath, sizeof(outPath), "%s/%s", outDir, raName);
writeRa(outPath);

/* One line per image; outPath is reused for the second file. */
safef(outPath, sizeof(outPath), "%s/%s", outDir, tabName);
writeTab(images, seqs, sourceImageDir, allenTab, names, outPath);
}

int main(int argc, char *argv[])
/* Process command line: parse options, require exactly five positional
 * arguments, then run the loader on them. */
{
char *sourceImageDir, *allenTab, *probesFa, *nameTab, *outDir;

optionInit(&argc, argv, options);
if (argc != 6)
    usage();

/* Positional arguments are read only after optionInit has processed argv. */
sourceImageDir = argv[1];
allenTab = argv[2];
probesFa = argv[3];
nameTab = argv[4];
outDir = argv[5];
vgLoadAllen(sourceImageDir, allenTab, probesFa, nameTab, outDir);
return 0;
}
