/* validateCv - validate controlled vocabulary file and metadata. */

/* Copyright (C) 2012 The Regents of the University of California 
 * See kent/LICENSE or http://genome.ucsc.edu/license/ for licensing information. */
#include "common.h"
#include "linefile.h"
#include "hash.h"
#include "options.h"
#include "jksql.h"
#include "cv.h"
#include "ra.h"

void usage()
/* Explain usage and exit.
 * The message is handed to errAbort(), so this function is not expected to return.
 * Note that -level takes an integer value (it is registered as OPTION_INT and read
 * with optionInt()), so the synopsis shows it as -level={n} rather than a bare flag. */
{
errAbort(
    "validateCv - validates a controlled vocabulary file\n"
    "usage:\n"
    "   validateCv [-type={type} [-setting={setting}]] [-level={n}] cv.ra\n"
    "options:\n"
    "   -type={type} Type of terms to check, otherwise all types are checked.\n"
    "          -setting={setting} Check for just a single setting (only valid with -type).\n"
    "   -level={n}   Level of scrutiny (ignored if -setting is used):\n"
    "                0 (default) Only must haves.\n"
    "                1 Must haves and should haves.\n"
    "                2 Must haves, should haves and the kitchen sink.\n"
    "   -verbose=2   Will list all errors.  Otherwise just counts errors.\n"
    );
}

/* Command line options accepted by this program; the table is handed to optionInit() in main(). */
static struct optionSpec options[] = {
    {"level",  OPTION_INT},     // -level=N    : scrutiny level, read with optionInt() (default 0)
    {"type",   OPTION_STRING},  // -type=X     : restrict checking to a single type of term
    {"setting",OPTION_STRING},  // -setting=X  : restrict checking to one setting (main() requires -type)
    {NULL, 0},                  // end-of-table entry
};

int cvTagsAreTermDuplicates(struct hash *cvHashOfHashes)
// returns count of tags that are also terms
{
int count = 0;
struct hashCookie brownie = hashFirst(cvHashOfHashes);
struct hashEl* el = NULL;
while ((el = hashNext(&brownie)) != NULL)
    {
    struct hash *hash = el->val;
    char *val = hashFindVal(hash,CV_TAG);
    if (val != NULL)
        {
        if (sameString(val,el->name)) // term and tag can be identical in the same stanza
            continue;

        struct hashEl* conflictingEl = hashLookup(cvHashOfHashes,val);
        if (conflictingEl != NULL)
            {
            count++;
            char *type = hashFindVal(hash,CV_TYPE);
            char *conflictingType = hashFindVal(conflictingEl->val,CV_TYPE);
            if (type != NULL && conflictingType)
                verbose(2,"Tag '%s' in '%s => %s' is itself a term '%s => %s'.\n",
                        val,type,el->name,conflictingType,conflictingEl->name);
            else
                verbose(2,"Tag '%s' in '%s' is itself a term.\n",val,el->name);
            continue;
            }
        }
    }
return count;
}

int cvHashesWithoutSetting(struct hash *cvHashOfHashes,char *setting,
                           boolean mustHave,boolean unique)
// Walks every stanza hash held in cvHashOfHashes and returns an error count:
//   mustHave - each stanza that lacks 'setting' counts as one error.
//   unique   - each stanza whose value for 'setting' was already seen in a
//              previously visited stanza counts as one error.
// Individual problems are reported at verbose level 2.
{
int errors = 0;
struct hash *seenVals = NULL;   // values met so far; only created when unique is requested
struct hashCookie cookie = hashFirst(cvHashOfHashes);
struct hashEl *stanza = NULL;
for (stanza = hashNext(&cookie); stanza != NULL; stanza = hashNext(&cookie))
    {
    struct hash *settings = stanza->val;

    // The type only decorates the messages; it is not looked up when the
    // setting being checked is the type itself.
    char *type = differentString(setting,CV_TYPE) ? hashFindVal(settings,CV_TYPE) : NULL;
    char *val = hashFindVal(settings,setting);

    if (val == NULL)
        {
        if (!mustHave)
            continue;
        errors++;
        if (type == NULL)
            verbose(2,"Term '%s' is missing '%s'\n",stanza->name,setting);
        else
            verbose(2,"Term '%s => %s' is missing '%s'\n",type,stanza->name,setting);
        continue;
        }

    if (!unique)
        continue;

    if (seenVals == NULL)
        seenVals = hashNew(0);  // first value: nothing to compare against yet
    else if (hashLookup(seenVals,val) != NULL)
        {
        errors++;
        if (type == NULL)
            verbose(2,"Term '%s' setting '%s' is not unique\n",stanza->name,setting);
        else
            verbose(2,"Term '%s => %s' setting '%s' is not unique\n",type,stanza->name,setting);
        continue;
        }
    hashAdd(seenVals, val, (void *)1);
    }

if (seenVals != NULL)
    hashFree(&seenVals);

return errors;
}

struct slName *cvGetAllTypes(struct hash *cvHashOfHashes)
// returns a list of all types in the cvHashOfHashes)
{
struct slName *cvTypes = NULL;

struct hashCookie brownie = hashFirst(cvHashOfHashes);
struct hashEl* el = NULL;
while ((el = hashNext(&brownie)) != NULL)
    {
    struct hash *hash = el->val;
    char *type = hashFindVal(hash,CV_TYPE);
    if (type != NULL)
        slNameStore(&cvTypes, type);
    }
return cvTypes;
}

int cvTypeMustHaveSettings(struct slName **cvTypes,const char *type,const char *mustHaveSettings)
// Checks that each member of the cv hash for the given type has all required settings.
//   cvTypes          - list of types still to be checked; when 'type' is found on the list
//                      its element is removed from the list and freed.
//   type             - name of the type to check.
//   mustHaveSettings - whitespace separated names of the settings every member must have.
// Returns the count of errors: one per member missing a setting, or 1 when the type is
// not on the list or has no members.  Details are reported at verbose level 2.
// NOTE: 'type' may point at the name of the list element that gets freed here (validateCv
// passes cvTypes->name), so it must not be touched once the element has been released.
{
int count = 0;
int ix = slNameFindIx(*cvTypes, (char *)type);
if (ix < 0)
    {
    verbose(2,"Type '%s' cannot be found.\n",type);
    return 1;
    }

struct slName *cvType = slElementFromIx(*cvTypes,ix);
assert(cvType != NULL);
char *normalizedTerm = (char *)cvTermNormalized(cvType->name);

const struct hash *termHash = cvTermHash(normalizedTerm);
if (termHash != NULL)
    {
    // The input is const and nextWord() moves the pointer it is handed, so parse a private
    // copy through a separate cursor and keep the base pointer so the copy can be released.
    char *settingsCopy = cloneString(mustHaveSettings);
    char *cursor = settingsCopy;
    char *setting = NULL;
    while ((setting = nextWord(&cursor)) != NULL)
        count += cvHashesWithoutSetting((struct hash *)termHash,setting,TRUE,FALSE);
    freeMem(settingsCopy);
    }
else
    {
    count++;
    verbose(2,"Type %s has no members.\n",cvType->name);
    }

// This type has been dealt with: take it off the caller's list
slRemoveEl(cvTypes, cvType);
slNameFree(&cvType);
return count;
}

int validateCv(char *cvName,char *type,char *setting,int level)
/* validateCv - validate controlled vocabulary file and metadata. */
{

int count = 0;
struct slName *cvTypes = NULL;
if (type == NULL)
    {
    struct hash *cvHash = raReadAll(cvName, CV_TERM);

    // Now we can walk through some checks
    // All stanzas have unique terms - already shown by reading in cvHash?
    // All stanzas have types
    count += cvHashesWithoutSetting(cvHash,CV_TYPE,TRUE,FALSE); // must have, unique not necessary

    // All terms must have uniq tags
    count += cvHashesWithoutSetting(cvHash,CV_TAG,FALSE,TRUE); // Not necessary but must be unique
    count += cvTagsAreTermDuplicates(cvHash);

    // Get a list of all types, then walk throgh the types with specific or general restrictions
    cvTypes = cvGetAllTypes(cvHash);

    // At this point we are done with looking at cv as a single hash
    // and will use standard cv routines to examine the file.
    hashFree(&cvHash);
    }
else
    {
    if (sameWord(type,CV_TERM_ANTIBODY))
        cvTypes = slNameNew((char *)cvTypeNormalized(CV_TERM_ANTIBODY));
    else if (sameWord((char *)cvTermNormalized(type),CV_TERM_CELL))
        {
        // Curretly this is shielded in the lib and there is no code to get it
        #define CV_UGLY_TERM_CELL_LINE  "Cell Line"
        cvTypes = slNameNew(CV_UGLY_TERM_CELL_LINE);
        }
    else
        cvTypes = slNameNew(type);
    }


// override looking for the cv.ra file in the standard place.
cvFileDeclare(cvName);
struct dyString *dySettings = dyStringNew(512);
char *checkSettings = setting;

// typeOfTerms is the set of type definitions
if (type == NULL || sameWord(type,CV_TOT))
    {
    dyStringClear(dySettings);
    if (setting != NULL)
        dyStringAppend(dySettings,setting);
    else
        {
        dyStringAppend(dySettings,CV_LABEL " " CV_DESCRIPTION " " CV_VALIDATE " "
                CV_TOT_PRIORITY " " CV_TOT_CV_DEFINED);
        if (type != NULL && setting == NULL)
            verbose(1,"Must haves: %s\n",dyStringContents(dySettings));
        if (level > 0)
            {
            checkSettings = " " CV_TOT_SEARCHABLE " " CV_TOT_HIDDEN;
            if (type != NULL && setting == NULL)
                verbose(1,"Should haves:%s\n",checkSettings);
            dyStringAppend(dySettings,checkSettings);
            }
        }
    count += cvTypeMustHaveSettings(&cvTypes,CV_TOT,dyStringContents(dySettings));
    }

// Antibody: is special
if (type == NULL || sameWord(type,CV_TERM_ANTIBODY))
    {
    dyStringClear(dySettings);
    if (setting != NULL)
        dyStringAppend(dySettings,setting);
    else
        {
        dyStringAppend(dySettings,CV_TAG " " CV_TERM_LAB " " CV_VENDER_NAME " " CV_VENDOR_ID
                                " antibodyDescription " CV_TARGET " targetDescription");
        if (type != NULL && setting == NULL)
            verbose(1,"Must haves: %s\n",dyStringContents(dySettings));
        if (level > 0)
            {
            checkSettings = " " CV_ORDER_URL " validation targetId targetUrl";
            dyStringAppend(dySettings,checkSettings);
            if (type != NULL && setting == NULL)
                verbose(1,"Should haves:%s\n",checkSettings);
            }
        }
    count += cvTypeMustHaveSettings(&cvTypes,cvTypeNormalized(CV_TERM_ANTIBODY),
                                     dyStringContents(dySettings));
    }

// "Cell Line" is very special
if (type == NULL || sameWord((char *)cvTermNormalized(type),CV_TERM_CELL))
    {
    dyStringClear(dySettings);
    if (setting != NULL)
        dyStringAppend(dySettings,setting);
    else
        {
        dyStringAppend(dySettings,CV_TAG " " CV_DESCRIPTION " " CV_ORGANISM " " CV_SEX);
        if (type != NULL && setting == NULL)
            verbose(1,"Must haves: %s\n",dyStringContents(dySettings));
        if (level > 0)
            {
            checkSettings = " " CV_PROTOCOL " " CV_VENDER_NAME " " CV_VENDOR_ID
                            " " CV_ORDER_URL " " CV_TERM_ID  " " CV_TERM_URL;
            dyStringAppend(dySettings,checkSettings);
            if (type != NULL && setting == NULL)
                verbose(1,"Should haves:%s\n",checkSettings);
            }
        if (level > 1)
            {
            checkSettings = " " CV_LINEAGE " " CV_TIER " " CV_TISSUE " color karyotype";
            if (type != NULL && setting == NULL)
                verbose(1,"Kitchen sink:%s\n",checkSettings);
            dyStringAppend(dySettings,checkSettings);
            }
        }
    count += cvTypeMustHaveSettings(&cvTypes,CV_UGLY_TERM_CELL_LINE,dyStringContents(dySettings));
    }

// Other types with non-standard requirements
checkSettings = setting;
if (type == NULL || sameWord(type,CV_TERM_LAB))
    {
    if (setting == NULL)
        checkSettings = CV_TAG " " CV_DESCRIPTION " " CV_LABEL " " CV_ORGANISM
                        " labInst labPi labPiFull grantPi";
    if (type != NULL && setting == NULL)
        verbose(1,"Must haves: %s\n",checkSettings);
    count += cvTypeMustHaveSettings(&cvTypes,CV_TERM_LAB,checkSettings);
    }
if (type == NULL || sameWord(type,CV_TERM_GRANT))
    {
    if (setting == NULL)
        checkSettings = CV_TAG " " CV_DESCRIPTION " grantInst projectName";
    if (type != NULL && setting == NULL)
        verbose(1,"Must haves: %s\n",checkSettings);
    count += cvTypeMustHaveSettings(&cvTypes,CV_TERM_GRANT,checkSettings);
    }
if (type == NULL || sameWord(type,CV_TERM_LOCALIZATION))
    {
    if (setting == NULL)
        checkSettings = CV_TAG " " CV_DESCRIPTION " " CV_TERM_ID " " CV_TERM_URL;
    if (type != NULL && setting == NULL)
        verbose(1,"Must haves: %s\n",checkSettings);
    count += cvTypeMustHaveSettings(&cvTypes,CV_TERM_LOCALIZATION,checkSettings);
    }
if (type == NULL || sameWord(type,CV_TERM_SEQ_PLATFORM))
    {
    if (setting == NULL)
        checkSettings = CV_TAG " " CV_DESCRIPTION " geo";
    if (type != NULL && setting == NULL)
        verbose(1,"Must haves: %s\n",checkSettings);
    count += cvTypeMustHaveSettings(&cvTypes,CV_TERM_SEQ_PLATFORM,checkSettings);
    }

// walk through all the rest of types with standard requirements: tag and description
if (setting == NULL)
    checkSettings = CV_TAG " " CV_DESCRIPTION;
while (cvTypes != NULL)
    {
    if (type != NULL && setting == NULL)
        verbose(1,"Must haves: %s\n",checkSettings);
    count += cvTypeMustHaveSettings(&cvTypes,cvTypes->name,checkSettings);
    }

if (count > 0 || type != NULL)
    verbose(1,"Found %d error%s.\n",count,(count==1?"":"s"));

return count;
}

int main(int argc, char *argv[])
/* Process command line.
 * Expects exactly one positional argument (the cv.ra file); -setting is only accepted
 * together with -type.  The exit status is the number of errors found, capped at 255. */
{
optionInit(&argc, argv, options);
if (argc != 2)
    usage();
int level = optionInt("level", 0);
char *type   = optionVal("type",NULL);
char *setting   = optionVal("setting",NULL);
if (setting != NULL && type == NULL)
    {
    verbose(1,"ERROR: -setting=%s requires -type=?.\n",setting);
    usage();
    }

int errorCount = validateCv(argv[1],type,setting,level);

// Only the low 8 bits of an exit status reach the parent process on POSIX systems, so an
// unclamped count of 256 (or any multiple of it) would be reported as success.
return (errorCount > 255) ? 255 : errorCount;
}
