/* Copyright (C) 2014 The Regents of the University of California 
 * See kent/LICENSE or http://genome.ucsc.edu/license/ for licensing information. */

/* htmlPage - stuff to read, parse, and submit  htmlPages and forms. 
 *
 * typical usage is:
 *   struct htmlPage *page = htmlPageGet(url);
 *   htmlPageValidateOrAbort(page);
 *   var = htmlPageGetVar(page, page->forms, "org");
 *   if (var != NULL)
 *      printf("Organism = %s\n", var->curVal);
 *   htmlPageSetVar(page, page->forms, "org", "Human");
 *   newPage = htmlPageFromForm(page, page->forms, "submit", "Go");
 */

#include "common.h"
#include "errAbort.h"
#include "errCatch.h"
#include "memalloc.h"
#include "linefile.h"
#include "hash.h"
#include "dystring.h"
#include "cheapcgi.h"
#include "obscure.h"
#include "filePath.h"
#include "net.h"
#include "htmshell.h"
#include "htmlPage.h"


void htmlStatusFree(struct htmlStatus **pStatus)
/* Release the version string owned by *pStatus and then the
 * struct itself.  A NULL *pStatus is silently ignored. */
{
struct htmlStatus *st = *pStatus;
if (st == NULL)
    return;
freeMem(st->version);
freez(pStatus);
}

void htmlStatusFreeList(struct htmlStatus **pList)
/* Free every htmlStatus on the list, then set *pList to NULL. */
{
struct htmlStatus *cur = *pList;
while (cur != NULL)
    {
    /* Grab the link first: htmlStatusFree releases the element. */
    struct htmlStatus *following = cur->next;
    htmlStatusFree(&cur);
    cur = following;
    }
*pList = NULL;
}

void htmlCookieFree(struct htmlCookie **pCookie)
/* Release the strings owned by a cookie (name, value, domain, path,
 * expires) and then the cookie itself.  NULL *pCookie is ignored. */
{
struct htmlCookie *ck = *pCookie;
if (ck == NULL)
    return;
freeMem(ck->name);
freeMem(ck->value);
freeMem(ck->domain);
freeMem(ck->path);
freeMem(ck->expires);
freez(pCookie);
}

void htmlCookieFreeList(struct htmlCookie **pList)
/* Free every htmlCookie on the list, then set *pList to NULL. */
{
struct htmlCookie *cur = *pList;
while (cur != NULL)
    {
    /* Grab the link first: htmlCookieFree releases the element. */
    struct htmlCookie *following = cur->next;
    htmlCookieFree(&cur);
    cur = following;
    }
*pList = NULL;
}

struct htmlCookie *htmlCookieFileRead(char *fileName)
/* Load cookies from a line-oriented file and return them as a list in
 * file order.  On each line the first word is the cookie name and the
 * remainder (leading spaces skipped) is the cookie value.  Aborts if a
 * line has a name but no value. */
{
struct htmlCookie *cookieList = NULL;
struct lineFile *lf = lineFileOpen(fileName, TRUE);
char *rest;
while (lineFileNextReal(lf, &rest))
    {
    struct htmlCookie *ck;
    char *cookieName = nextWord(&rest);
    char *cookieValue = skipLeadingSpaces(rest);
    if (cookieValue == NULL)
        errAbort("Missing cookie value line %d of %s", lf->lineIx, lf->fileName);
    AllocVar(ck);
    ck->name = cloneString(cookieName);
    ck->value = cloneString(cookieValue);
    slAddHead(&cookieList, ck);
    }
lineFileClose(&lf);
/* Cookies were pushed on the head, so flip back to file order. */
slReverse(&cookieList);
return cookieList;
}

static void cookieOutput(struct dyString *dy, struct htmlCookie *cookieList)
/* Append a "Cookie:" header line holding every cookie in cookieList as
 * name=value pairs separated by "; ", terminated by CRLF.  Appends
 * nothing at all when the list is empty. */
{
struct htmlCookie *ck;
if (cookieList == NULL)
    return;
dyStringAppend(dy, "Cookie:");
for (ck = cookieList; ck != NULL; ck = ck->next)
    {
    /* First pair is preceded by a space, later ones by ';' and a space. */
    dyStringAppend(dy, (ck == cookieList) ? " " : "; ");
    dyStringAppend(dy, ck->name);
    dyStringAppendC(dy, '=');
    dyStringAppend(dy, ck->value);
    }
dyStringAppend(dy, "\r\n");
}


void htmlAttributeFree(struct htmlAttribute **pAttribute)
/* Release the name and val strings of an attribute and then the
 * attribute itself.  NULL *pAttribute is ignored. */
{
struct htmlAttribute *attr = *pAttribute;
if (attr == NULL)
    return;
freeMem(attr->name);
freeMem(attr->val);
freez(pAttribute);
}

void htmlAttributeFreeList(struct htmlAttribute **pList)
/* Free every htmlAttribute on the list, then set *pList to NULL. */
{
struct htmlAttribute *cur = *pList;
while (cur != NULL)
    {
    /* Grab the link first: htmlAttributeFree releases the element. */
    struct htmlAttribute *following = cur->next;
    htmlAttributeFree(&cur);
    cur = following;
    }
*pList = NULL;
}

void htmlTagFree(struct htmlTag **pTag)
/* Release a tag: its attribute list, its name, and the tag itself.
 * NULL *pTag is ignored. */
{
struct htmlTag *t = *pTag;
if (t == NULL)
    return;
htmlAttributeFreeList(&t->attributes);
freeMem(t->name);
freez(pTag);
}

void htmlTagFreeList(struct htmlTag **pList)
/* Free every htmlTag on the list, then set *pList to NULL. */
{
struct htmlTag *cur = *pList;
while (cur != NULL)
    {
    /* Grab the link first: htmlTagFree releases the element. */
    struct htmlTag *following = cur->next;
    htmlTagFree(&cur);
    cur = following;
    }
*pList = NULL;
}

void htmlFormVarFree(struct htmlFormVar **pVar)
/* Release a form variable: its current value, its list of predefined
 * values, its list of tag references, and the variable itself.
 * NULL *pVar is ignored. */
{
struct htmlFormVar *fv = *pVar;
if (fv == NULL)
    return;
freeMem(fv->curVal);
slFreeList(&fv->values);
slFreeList(&fv->tags);
freez(pVar);
}

void htmlFormVarFreeList(struct htmlFormVar **pList)
/* Free every htmlFormVar on the list, then set *pList to NULL. */
{
struct htmlFormVar *cur = *pList;
while (cur != NULL)
    {
    /* Grab the link first: htmlFormVarFree releases the element. */
    struct htmlFormVar *following = cur->next;
    htmlFormVarFree(&cur);
    cur = following;
    }
*pList = NULL;
}


void htmlFormFree(struct htmlForm **pForm)
/* Release a form: its variable list and then the form itself.
 * NULL *pForm is ignored. */
{
struct htmlForm *f = *pForm;
if (f == NULL)
    return;
htmlFormVarFreeList(&f->vars);
freez(pForm);
}

void htmlFormFreeList(struct htmlForm **pList)
/* Free every htmlForm on the list, then set *pList to NULL. */
{
struct htmlForm *cur = *pList;
while (cur != NULL)
    {
    /* Grab the link first: htmlFormFree releases the element. */
    struct htmlForm *following = cur->next;
    htmlFormFree(&cur);
    cur = following;
    }
*pList = NULL;
}

void htmlPageFree(struct htmlPage **pPage)
/* Release a page and everything it owns: url, status, header hash
 * (with values), cookies, the full text buffer, tags and forms.
 * NULL *pPage is ignored. */
{
struct htmlPage *pg = *pPage;
if (pg == NULL)
    return;
freez(&pg->url);
htmlStatusFree(&pg->status);
freeHashAndVals(&pg->header);
htmlCookieFreeList(&pg->cookies);
freez(&pg->fullText);
htmlTagFreeList(&pg->tags);
htmlFormFreeList(&pg->forms);
freez(pPage);
}

void htmlPageFreeList(struct htmlPage **pList)
/* Free every htmlPage on the list, then set *pList to NULL. */
{
struct htmlPage *cur = *pList;
while (cur != NULL)
    {
    /* Grab the link first: htmlPageFree releases the element. */
    struct htmlPage *following = cur->next;
    htmlPageFree(&cur);
    cur = following;
    }
*pList = NULL;
}

static int findLineNumber(char *start, char *pos)
/* Return the 1-based line number of pos within the text beginning at
 * start: one plus the number of newlines in start..pos inclusive. */
{
int lineNo = 1;
char *p = start;
while (p <= pos)
    {
    if (*p == '\n')
        lineNo += 1;
    p += 1;
    }
return lineNo;
}

struct htmlTag *findNextMatchingTag(struct htmlTag *list, char *name)
/* Walk list and return the first tag whose name matches name
 * (compared with sameWord), or NULL if there is none. */
{
struct htmlTag *candidate = list;
while (candidate != NULL && !sameWord(name, candidate->name))
    candidate = candidate->next;
return candidate;
}

static void tagVaWarn(struct htmlPage *page, struct htmlTag *tag, char *format, 
	va_list args)
/* Print warning message and some context of tag. */
{
char context[80];
strncpy(context, tag->start, sizeof(context));
context[sizeof(context)-1] = 0;
warn("Error near line %d of %s:\n %s", findLineNumber(page->htmlText, tag->start), 
	page->url, context);
vaWarn(format, args);
}

static void tagWarn(struct htmlPage *page, struct htmlTag *tag, char *format, ...)
/* printf-style front end to tagVaWarn: warn with tag context. */
{
va_list ap;
va_start(ap, format);
tagVaWarn(page, tag, format, ap);
va_end(ap);
}

static void tagAbort(struct htmlPage *page, struct htmlTag *tag, char *format, ...)
/* printf-style front end to tagVaWarn that then calls noWarnAbort()
 * once the message with tag context has been issued. */
{
va_list ap;
va_start(ap, format);
tagVaWarn(page, tag, format, ap);
va_end(ap);
noWarnAbort();
}

struct htmlStatus *htmlStatusParse(char **pText)
/* Read in status from first line.  Update pText to point to next line
 * (or to the terminating zero if there is no newline).
 * Note unlike many routines here, this does not insert zeros into text.
 * Returns a newly allocated htmlStatus holding the first word of the
 * line as version and the number after it as status.  Warns and returns
 * NULL if the line has no whitespace or the second field does not start
 * with a digit. */
{
char *text = *pText;
char *end = strchr(text, '\n');
char *codeText;
struct htmlStatus *status;
if (end != NULL)
   *pText = end+1;
else
   *pText = text + strlen(text);
end = skipToSpaces(text);
if (end == NULL)
    {
    warn("Short status line.");
    return NULL;
    }
/* Validate the status code before allocating anything, so the error
 * return below has nothing to clean up. */
codeText = skipLeadingSpaces(end);
if (!isdigit((unsigned char)codeText[0]))
    {
    warn("Not a number in status field");
    return NULL;
    }
AllocVar(status);
status->version = cloneStringZ(text, end-text);
status->status = atoi(codeText);
return status;
}

char *htmlNextCrLfLine(char **pS)
/* Return the line starting at *pS, zero-terminated in place, and
 * advance *pS to the start of the following line.  Returns NULL when
 * *pS is NULL or points at the end of the text.  The trailing "\r\n"
 * is overwritten with zeros; a line ending in a bare "\n" is reported
 * through verbose(1).  If no newline remains, the rest of the text is
 * returned and *pS is set to NULL. */
{
char *lineStart = *pS;
char *newline;
if (lineStart == NULL || *lineStart == 0)
    return NULL;
newline = strchr(lineStart, '\n');
if (newline == NULL)
    {
    verbose(1, "End of file in header\n");
    *pS = NULL;
    return lineStart;
    }
*newline = 0;
if (newline > lineStart && newline[-1] == '\r')
    newline[-1] = 0;
else
    verbose(1, "Missing <CR> in header line\n");
*pS = newline + 1;
return lineStart;
}

static void cookieParseNameValuePair(char *s, char **retName, char **retVal)
/* Split "name=value" in place at the first '='.  *retName is set to s
 * and *retVal to the text after the '='.  When there is no '=', the
 * whole of s is the name and *retVal is an empty string (it points at
 * the terminating zero of s, never beyond it). */
{
char *val = strchr(s, '=');
if (val == NULL)
    val = s + strlen(s);	/* No value: stay on the terminator. */
else
    *val++ = 0;			/* Cut the name off and step past '='. */
*retName = s;
*retVal = val;
}

static struct htmlCookie *parseCookie(char *s)
/* Parse out cookie line to the right of Set-Cookie.  The text is a
 * name=value pair optionally followed by ';'-separated attributes.
 * The domain, path, expires and secure attributes are recorded (names
 * matched without regard to case); others are ignored.  The input
 * buffer is modified in place.  Returns a newly allocated cookie. */
{
char *e, *eq, *name, *val;
struct htmlCookie *cookie;

/* Everything up to the first semicolon (or the whole string if there
 * is none) is the cookie name/value pair. */
e = strchr(s, ';');
if (e != NULL)
    *e++ = 0;
eq = strchr(s, '=');
cookieParseNameValuePair(s, &name, &val);
if (eq == NULL)
    val = name + strlen(name);	/* No '=': value is the empty string. */

AllocVar(cookie);
cookie->name = cloneString(name);
cookie->value = cloneString(val);

/* Loop through the attributes.  The last one normally has no trailing
 * semicolon, so a segment ends at either ';' or end of string. */
for (s = e; s != NULL; s = e)
    {
    s = skipLeadingSpaces(s);
    e = strchr(s, ';');
    if (e != NULL)
        *e++ = 0;
    if (s[0] == 0)
        continue;		/* Empty segment, e.g. after a trailing ';'. */

    eq = strchr(s, '=');
    cookieParseNameValuePair(s, &name, &val);
    if (eq == NULL)
        val = name + strlen(name);

    /* Save it away if it's one we know about. */
    if (sameWord(name, "domain"))
        {
        freeMem(cookie->domain);
        cookie->domain = cloneString(val);
        }
    else if (sameWord(name, "path"))
        {
        freeMem(cookie->path);
        cookie->path = cloneString(val);
        }
    else if (sameWord(name, "expires"))
        {
        freeMem(cookie->expires);
        cookie->expires = cloneString(val);
        }
    else if (sameWord(name, "secure"))
        cookie->secure = TRUE;
    }
return cookie;
}

static struct hash *htmlHeaderRead(char **pHtml, struct htmlCookie **pCookies)
/* Read in from second line through first blank line and
 * save in hash.  These lines are in the form name: value.
 * The hash key is the first word of the line, including its trailing
 * colon (e.g. "Content-Type:"), and the value is a clone of the rest of
 * the line.  Each "Set-Cookie:" line is also parsed and the resulting
 * cookie appended to *pCookies.  *pHtml is advanced past the header. */
{
struct hash *hash = hashNew(6);
for (;;)
    {
    char *line = htmlNextCrLfLine(pHtml);
    char *word;
    if (line == NULL)
	{
        warn("End of file in header");
	break;
	}
    word = nextWord(&line);
    if (word == NULL)	/* Blank line marks the end of the header. */
        break;
    line = skipLeadingSpaces(line);
    hashAdd(hash, word, cloneString(line));
    /* A header name with nothing after it can leave line NULL;
     * parseCookie needs real text to scan, so skip it in that case. */
    if (line != NULL && sameString(word, "Set-Cookie:"))
	{
	struct htmlCookie *cookie = parseCookie(line);
	if (cookie != NULL)
	    slAddTail(pCookies, cookie);
	}
    }
return hash;
}

static char *htmlAttributeFindVal(struct htmlAttribute *list, char *name)
/* Return the val of the first attribute in list whose name matches
 * name (compared with sameWord), or NULL when there is no match. */
{
struct htmlAttribute *attr = list;
while (attr != NULL)
    {
    if (sameWord(attr->name, name))
        return attr->val;
    attr = attr->next;
    }
return NULL;
}


char *htmlTagAttributeVal(struct htmlPage *page, struct htmlTag *tag, 
	char *name, char *defaultVal)
/* Look up attribute name on tag and return its value; fall back to
 * defaultVal when the tag has no such attribute.  page is unused. */
{
char *found = htmlAttributeFindVal(tag->attributes, name);
return (found != NULL) ? found : defaultVal;
}

char *htmlTagAttributeNeeded(struct htmlPage *page, struct htmlTag *tag, char *name)
/* Return the value of a required tag attribute.  If the attribute is
 * absent, issue a tag warning and return the placeholder "n/a". */
{
char *found = htmlTagAttributeVal(page, tag, name, NULL);
if (found != NULL)
    return found;
tagWarn(page, tag, "Missing %s attribute", name);
return "n/a";
}

boolean isSelfClosingTag(struct htmlTag *tag)
/* Return strue if last attributes' name is "/" 
 * Self-closing tags are used with html5 and SGV */
{
struct htmlAttribute *att = tag->attributes;
if (!att)
    return FALSE;
while (att->next) att = att->next;
if (sameString(att->name,"/"))
    return TRUE;
return FALSE;
}

static struct htmlTag *htmlTagScan(char *html, char *dupe)
/* Scan HTML for tags and return a list of them. 
 * Html is the text to scan, and dupe is a copy of it
 * which this routine will insert 0's in in the course of
 * parsing.  All scanning is done in dupe; tag->start and tag->end are
 * computed as the same offsets applied to html, so they point into the
 * unmodified text.  Tag and attribute names/values are cloned, so the
 * returned list does not reference dupe.  Tags are returned in
 * document order. */
{
char *s = dupe, c, *e, *tagName;
struct htmlTag *tagList = NULL, *tag;
struct htmlAttribute *att;
int pos;

for (;;)
    {
    c = *s++;
    if (c == 0)
        break;
    if (c == '<')
        {
	if (*s == '!')	/* HTML comment. */
	    {
	    /* "<!--" is skipped up to the matching "-->"; any other "<!"
	     * construct is skipped up to the next '>'.  No tag is
	     * recorded for either. */
	    s += 1;
	    if (s[0] == '-' && s[1] == '-')
	        s = stringIn("-->", s);
	    else
		s = strchr(s, '>');
	    if (s == NULL)
		{
	        warn("End of file in comment");
		break;
		}
	    }
	else
	    {
	    /* Grab first word into tagName. */
	    e = s;
	    for (;;)
	        {
		c = *e;
		if (c == '>' || c == 0 || isspace(c))
		    break;
		e += 1;
		}
	    /* Terminate the name in place unless we hit end of text;
	     * c still remembers which delimiter ended the name. */
	    if (c != 0)
	       *e++ = 0;
	    tagName = s;
	    s = e;
	    
	    /* Allocate tag, fill in name, and stick it on list. */
	    AllocVar(tag);
	    tag->name = cloneString(tagName);
	    slAddHead(&tagList, tag);
	    /* tagName is one past the '<', so back up one to get the
	     * offset of the '<' itself. */
	    pos = tagName - dupe - 1;
	    tag->start = html+pos;

	    /* If already got end tag (or EOF) stop processing tag. */
	    if (c == '>' || c == 0)
		{
		tag->end = html + (e - dupe);
	        continue;
		}

	    /* Process name/value pairs until get end tag. */
	    for (;;)
		{
		char *name, *val;
		boolean gotEnd = FALSE;

		/* Check for end tag. */
		s = skipLeadingSpaces(s);
		if (s[0] == '>' || s[0] == 0)
		    {
		    /* tag->end is one past the '>' (or at the end of text). */
		    tag->end = html + (s - dupe);
		    if (s[0] == '>')
			tag->end += 1;
		    break;
		    }

		/* Get name - everything up to equals. */
		/* NOTE(review): whitespace does not end the name here, so a
		 * valueless attribute followed by another attribute (e.g.
		 * "checked name=x") appears to be read as a single name
		 * containing a space - confirm whether that is intended. */
		e = s;
		for (;;)
		    {
		    c = *e;
		    if (c == '=')
		        break;
		    else if (c == '>')
		        break;
		    else if (c == 0)
		        break;
		    e += 1;
		    }
		if (c == 0)
		    {
		    warn("End of file in tag");
		    break;
		    }
		name = s;
		*e++ = 0;
		eraseTrailingSpaces(name);
		if (c == '>')
		    {
		    /* Attribute with no value that runs up to the end of the tag. */
		    val = "";
		    gotEnd = TRUE;
		    tag->end = html + (e - dupe);
		    }
		else if (isspace(c))
		    {
		    /* NOTE(review): the loop above only stops on '=', '>' or
		     * 0, so c cannot be whitespace here; this branch looks
		     * unreachable. */
		    val = "";
		    }
		else
		    {
		    /* c was '=': the value follows, either quoted or bare. */
		    val = e = skipLeadingSpaces(e);
		    if (e[0] == '"' || e[0] == '\'')
			{
			/* Quoted value is unquoted in place; e is advanced by
			 * the helper.  On failure the attribute is dropped and
			 * the scan of this tag stops (tag->end is not set on
			 * this path). */
			if (!parseQuotedStringNoEscapes(val, val, &e))
			    break;
			}
		    else
			{
			/* Bare value: runs to whitespace, '>' or end of text. */
			for (;;)
			    {
			    c = *e;
			    if (c == '>')
				{
				gotEnd = TRUE;
				*e++ = 0;
				tag->end = html + (e - dupe);
				break;
				}
			    else if (isspace(c))
				{
				*e++ = 0;
				break;
				}
			    else if (c == 0)
				break;
			    ++e;
			    }
			}
		    }
		
		AllocVar(att);
		att->name = cloneString(name);
		att->val = cloneString(val);
		// The html standard allows us to break quoted attributes into multiple lines using newlines,
		// but they are not part of the tag value itself, so 
		// Strip \n and \r chars from value (att->val);  
		stripChar(att->val, '\n');
		stripChar(att->val, '\r');
		attributeDecode(att->val);
		slAddTail(&tag->attributes, att);
		s = e;
		if (gotEnd)
		    break;
		}
            if (sameWord(tagName, "script")) /* Script tag - skip over javascript contents */
                {
                /* Jump to the closing tag (matched case-insensitively) so
                 * '<' characters inside the script are not taken for tags.
                 * The "</script>" itself is then scanned as a normal tag. */
                s = strcasestr(s, "</script>");
                if (s == NULL)
                    {
                    warn("End of file in script tag");
                    break;
                    }
                }
	    }
	}
    }
/* Tags were pushed on the head; restore document order. */
slReverse(&tagList);
return tagList;
}

static struct htmlFormVar *findOrMakeVar(struct htmlPage *page, char *name, 
	struct hash *hash, struct htmlTag *tag, struct htmlFormVar **pVarList)
/* Look up the form variable called name in hash.  If it is not there,
 * create it, register it in hash and push it onto *pVarList.  If it is
 * there but was made from a different kind of tag, warn and switch its
 * tagName to this tag's name.  Either way a reference to tag is added
 * to the variable, which is then returned.  The name and tagName
 * pointers are stored as given, not copied. */
{
struct htmlFormVar *fv = hashFindVal(hash, name);
if (fv != NULL)
    {
    if (!sameWord(fv->tagName, tag->name))
        {
        tagWarn(page, tag, "Mixing FORM variable tag types %s and %s", 
                fv->tagName, tag->name);
        fv->tagName = tag->name;
        }
    }
else
    {
    AllocVar(fv);
    fv->name = name;
    fv->tagName = tag->name;
    hashAdd(hash, name, fv);
    slAddHead(pVarList, fv);
    }
refAdd(&fv->tags, tag);
return fv;
}

static boolean isMixableInputType(char *type)
/* Return TRUE if it's a type you can mix with others ok, like
 * button, submit, and image. */
{
return sameWord(type, "BUTTON") || sameWord(type, "SUBMIT") 
	|| sameWord(type, "IMAGE");
}

static boolean areMixableInputTypes(char *type1, char *type2)
/* Return TRUE if type1 and type 2 can be safely mixed, i.e.
 * if type1 and type2 both pass isMixableInputType, OR
 * if type1 or type2 is HIDDEN. */
{
return sameWord(type1, "HIDDEN") || sameWord(type2, "HIDDEN")
    || (isMixableInputType(type1) && isMixableInputType(type2));
}

static void htmlFormVarAddValue(struct htmlFormVar *var, char *value)
/* Append a new slName holding value to the end of var's list of
 * predefined values. */
{
struct slName *entry = slNameNew(value);
slAddTail(&var->values, entry);
}


static struct htmlFormVar *formParseVars(struct htmlPage *page, struct htmlForm *form)
/* Return a list of variables parsed out of form.  
 * A form variable is something that may appear in the name
 * side of the name=value pairs that serves as input to a CGI
 * script.  The variables may be constructed from buttons, 
 * INPUT tags, OPTION lists, or TEXTAREAs. */
{
struct htmlTag *tag;
struct htmlFormVar *varList = NULL, *var;
struct hash *hash = newHash(0);
for (tag = form->startTag->next; tag != form->endTag; tag = tag->next)
    {
    if (sameWord(tag->name, "INPUT"))
        {
	char *type = htmlTagAttributeVal(page, tag, "TYPE", NULL);
	char *varName = htmlTagAttributeVal(page, tag, "NAME", NULL);
	char *value = htmlTagAttributeVal(page, tag, "VALUE", NULL);

	if (type == NULL)
	    type = "TEXT";
	if (varName == NULL)
	    {
	    if (!htmlTagAttributeVal(page, tag, "ONCHANGE", NULL)
	     && !htmlTagAttributeVal(page, tag, "ID", NULL)
	        && !sameWord(type, "SUBMIT") && !sameWord(type, "CLEAR")
	    	&& !sameWord(type, "BUTTON") && !sameWord(type, "RESET")
		&& !sameWord(type, "IMAGE"))
		tagWarn(page, tag, "Missing NAME attribute");
	    varName = "n/a";
	    }
	var = findOrMakeVar(page, varName, hash, tag, &varList); 
	if (var->type != NULL && !sameWord(var->type, type))
	    {
	    if (!areMixableInputTypes(var->type, type))
		tagWarn(page, tag, "Mixing input types %s and %s", var->type, type);
	    }
	var->type = type;
	if (sameWord(type, "TEXT") || sameWord(type, "PASSWORD") 
		|| sameWord(type, "FILE") || sameWord(type, "HIDDEN")
		|| sameWord(type, "IMAGE") || sameWord(type, "SEARCH")
                || sameWord(type, "COLOR"))
	    {
	    var->curVal = cloneString(value);
	    }
	else if (sameWord(type, "CHECKBOX"))
	    {
	    if (htmlTagAttributeVal(page, tag, "CHECKED", NULL) != NULL)
	        var->curVal = cloneString("on");
	    }
	else if (sameWord(type, "RADIO"))
	    {
	    if (htmlTagAttributeVal(page, tag, "CHECKED", NULL) != NULL)
	        var->curVal = cloneString(value);
	    htmlFormVarAddValue(var, value);
	    }
	else if ( sameWord(type, "RESET") || sameWord(type, "BUTTON") ||
		sameWord(type, "SUBMIT") || sameWord(type, "IMAGE") ||
		sameWord(type, "n/a"))
	    {
	    /* Do nothing. */
	    }
	else
	    {
	    tagWarn(page, tag, "Unrecognized INPUT TYPE %s", type);
	    }
	}
    else if (sameWord(tag->name, "SELECT"))
        {
	char *varName = htmlTagAttributeNeeded(page, tag, "NAME");
	struct htmlTag *subTag;
	var = findOrMakeVar(page, varName, hash, tag, &varList); 
	for (subTag = tag->next; subTag != form->endTag; subTag = subTag->next)
	    {
	    if (sameWord(subTag->name, "/SELECT"))
		{
		if (var->curVal == NULL && var->values != NULL)
		    {
		    var->curVal = cloneString(var->values->name);
		    }
		break;
		}
	    else if (sameWord(subTag->name, "OPTION"))
	        {
		char *val = cloneString(htmlTagAttributeVal(page, subTag, "VALUE", NULL));
		if (val == NULL)
		    {
		    char *e = strchr(subTag->end, '<');
		    if (e != NULL)
			val = cloneStringZ(subTag->end, e - subTag->end);
		    }
		if (val != NULL)
		    htmlFormVarAddValue(var, val);
		if (htmlTagAttributeVal(page, subTag, "SELECTED", NULL) != NULL)
		    {
		    if (val != NULL)
			var->curVal = cloneString(val);
		    }
		freez(&val);
		}
	    }
	}
    else if (sameWord(tag->name, "TEXTAREA"))
        {
	char *varName = htmlTagAttributeNeeded(page, tag, "NAME");
	char *e = strchr(tag->end, '<');
	var = findOrMakeVar(page, varName, hash, tag, &varList); 
	if (e != NULL)
	    var->curVal = cloneStringZ(tag->end, e - tag->end);
	}
    }
freeHash(&hash);    
slReverse(&varList);
for (var = varList; var != NULL; var = var->next)
    {
    slReverse(&var->tags);
    }
return varList;
}

static struct htmlForm *htmlParseForms(struct htmlPage *page,
	struct htmlTag *startTag, struct htmlTag *endTag)
/* Build the list of forms found in the tags from startTag up to (not
 * including) endTag.  Each FORM tag opens a form and records its name
 * (default "n/a"), action (required) and method (default "GET"); the
 * matching /FORM sets the form's endTag to the tag following it.
 * Nested FORMs and stray /FORMs produce warnings.  Once all forms are
 * found, their variables are parsed.  Forms are returned in document
 * order. */
{
struct htmlForm *forms = NULL, *openForm = NULL, *f;
struct htmlTag *tag = startTag;
while (tag != endTag)
    {
    if (sameWord(tag->name, "FORM"))
        {
        if (openForm != NULL)
            tagWarn(page, tag, "FORM inside of FORM");
        AllocVar(openForm);
        openForm->startTag = tag;
        slAddHead(&forms, openForm);
        openForm->name = htmlTagAttributeVal(page, tag, "name", "n/a");
        openForm->action = htmlTagAttributeNeeded(page, tag, "action");
        openForm->method = htmlTagAttributeVal(page, tag, "method", "GET");
        }
    else if (sameWord(tag->name, "/FORM"))
        {
        if (openForm != NULL)
            {
            openForm->endTag = tag->next;
            openForm = NULL;
            }
        else
            tagWarn(page, tag, "/FORM outside of FORM");
        }
    tag = tag->next;
    }
slReverse(&forms);
for (f = forms; f != NULL; f = f->next)
    f->vars = formParseVars(page, f);
return forms;
}

struct htmlPage *htmlPageParse(char *url, char *fullText)
/* Parse out page and return.  fullText is the complete response: status
 * line, header, then body.  On success the returned page keeps fullText
 * as page->fullText (htmlPageFree will free it) and page->htmlText
 * points at the body inside it.  Tags and forms are only parsed when
 * the content type starts with "text/html"; a missing Content-Type is
 * assumed to be text/html, with a warning.  Returns NULL if the status
 * line cannot be parsed, in which case fullText is left untouched and
 * still belongs to the caller. */
{
struct htmlPage *page;
char *dupe = cloneLongString(fullText);	/* Scratch copy the parsers may write zeros into. */
char *s = dupe;
struct htmlStatus *status = htmlStatusParse(&s);
char *contentType;

if (status == NULL)
    {
    freez(&dupe);	/* Don't leak the scratch copy on failure. */
    return NULL;
    }

AllocVar(page);
page->url = cloneString(url);
page->fullText = fullText;
page->status = status;
page->header = htmlHeaderRead(&s, &page->cookies);
contentType = hashFindVal(page->header, "Content-Type:");
if (contentType == NULL)	
    {
    warn("No contentType, assuming text/html");
    contentType = cloneString("text/html");
    hashAdd(page->header, "Content-Type:", contentType);
    }
/* s has advanced through dupe past the header; the body starts at the
 * same offset in fullText. */
page->htmlText = fullText + (s - dupe);
if (startsWith("text/html", contentType))
    {
    page->tags = htmlTagScan(page->htmlText, s);
    page->forms = htmlParseForms(page, page->tags, NULL);
    }
freez(&dupe);
return page;
}

struct htmlPage *htmlPageParseNoHead(char *url, char *htmlText)
/* Parse html text that has no http status line or header in front of
 * it.  The returned page keeps htmlText as both its fullText and its
 * htmlText; status, header and cookies are left empty. */
{
struct htmlPage *page;
char *scratch = cloneString(htmlText);	/* The tag scanner writes zeros into this. */
AllocVar(page);
page->url = cloneString(url);
page->htmlText = htmlText;
page->fullText = htmlText;
page->tags = htmlTagScan(page->htmlText, scratch);
page->forms = htmlParseForms(page, page->tags, NULL);
freez(&scratch);
return page;
}

struct htmlPage *htmlPageParseOk(char *url, char *fullText)
/* Parse the page and insist on success: abort if it cannot be parsed
 * or if its http status code is anything other than 200. */
{
struct htmlPage *page = htmlPageParse(url, fullText);
int code;
if (page == NULL)
   noWarnAbort();
code = page->status->status;
if (code != 200)
   errAbort("%s returned with status code %d", url, code);
return page;
}

char *htmlSlurpWithCookies(char *url, struct htmlCookie *cookies)
/* Send a GET request for url, passing the given cookies in a Cookie
 * header, and return everything that comes back as a string.  The
 * result is not parsed or validated and still includes the http
 * header lines; hand it to htmlPageParse() to get an actual page. */
{
struct dyString *requestHeader = dyStringNew(0);
struct dyString *response;
int socketFd;

cookieOutput(requestHeader, cookies);
socketFd = netOpenHttpExt(url, "GET", requestHeader->string);
response = netSlurpFile(socketFd);
close(socketFd);
dyStringFree(&requestHeader);
return dyStringCannibalize(&response);
}

struct htmlPage *htmlPageGetWithCookies(char *url, struct htmlCookie *cookies)
/* Get page from URL giving server the given cookies.   Note only the
 * name and value parts of the cookies need to be filled in. */
{
char *buf = htmlSlurpWithCookies(url, cookies);
return htmlPageParse(url, buf);
}

struct htmlPage *htmlPageForwarded(char *url, struct htmlCookie *cookies)
/* Get html page.  If it's just a forwarding link (the response has a
 * "Location:" header) then follow it, up to 7 hops.  Cookies is a
 * possibly empty list of cookies with name and value parts filled in.
 * Returns NULL if the page, or a page it forwards to, could not be
 * parsed. */
{
struct htmlPage *page = htmlPageGetWithCookies(url, cookies);
int level, maxLevels = 7;
/* htmlPageGetWithCookies can return NULL, so check page before
 * looking at its header on every hop. */
for (level = 0; level < maxLevels && page != NULL; ++level)
    {
    struct htmlPage *newPage;
    char *newUrl = hashFindVal(page->header, "Location:");
    if (newUrl == NULL)
        break;
    /* newUrl lives in page->header, so fetch before freeing page. */
    newPage = htmlPageGetWithCookies(newUrl, cookies);
    htmlPageFree(&page);
    page = newPage;
    }
return page;
}

struct htmlPage *htmlPageForwardedNoAbort(char *url, struct htmlCookie *cookies)
/* Like htmlPageForwarded, but run inside an errCatch: if an error is
 * caught, its message is issued as a warning and NULL is returned. */
{
struct htmlPage *result = NULL;
struct errCatch *catcher = errCatchNew();
if (errCatchStart(catcher))
    result = htmlPageForwarded(url, cookies);
errCatchEnd(catcher);
if (catcher->gotError)
    warn("%s", catcher->message->string);
errCatchFree(&catcher);
return result;
}


struct htmlPage *htmlPageGet(char *url)
/* Get a page.  If url names an existing file it is read from disk and
 * parsed as bare html (no http header); otherwise it is fetched from
 * the net with no cookies. */
{
char *contents;
if (!fileExists(url))
    return htmlPageGetWithCookies(url, NULL);
readInGulp(url, &contents, NULL);
return htmlPageParseNoHead(url, contents);
}

void htmlFormVarPrint(struct htmlFormVar *var, FILE *f, char *prefix)
/* Write one tab-separated line to f with the variable's name, tag
 * name, type and current value (naForNull applied to the last two),
 * followed by one line per predefined value.  Every line starts with
 * prefix. */
{
struct slName *choice;
char *typeText = naForNull(var->type);
char *valueText = naForNull(var->curVal);
fprintf(f, "%s%s\t%s\t%s\t%s\n", prefix, var->name, var->tagName, 
        typeText, valueText);
choice = var->values;
while (choice != NULL)
    {
    fprintf(f, "%s\t%s\n", prefix, choice->name);
    choice = choice->next;
    }
}

void htmlFormPrint(struct htmlForm *form, FILE *f)
/* Write the form's name, method and action on one tab-separated
 * line, then each of its variables indented by a tab. */
{
struct htmlFormVar *fv = form->vars;
fprintf(f, "%s\t%s\t%s\n", form->name, form->method, form->action);
while (fv != NULL)
    {
    htmlFormVarPrint(fv, f, "\t");
    fv = fv->next;
    }
}

struct htmlForm *htmlFormGet(struct htmlPage *page, char *name)
/* Return the first form on page whose name matches name (compared
 * with sameWord), or NULL if no form matches. */
{
struct htmlForm *candidate = page->forms;
while (candidate != NULL && !sameWord(candidate->name, name))
    candidate = candidate->next;
return candidate;
}

struct htmlFormVar *htmlFormVarGet(struct htmlForm *form, char *name)
/* Return the first variable in form whose name matches name (compared
 * with sameWord), or NULL if none does.  Aborts on a NULL form. */
{
struct htmlFormVar *candidate;
if (form == NULL)
    errAbort("Null form passed to htmlFormVarGet");
candidate = form->vars;
while (candidate != NULL && !sameWord(candidate->name, name))
    candidate = candidate->next;
return candidate;
}

void htmlFormVarSet(struct htmlForm *form, char *name, char *val)
/* Give the named variable of form a copy of val as its current value.
 * A variable that does not exist yet is created as an INPUT of type
 * TEXT and put at the head of the form's variable list; its name
 * pointer is stored as given, not copied.  Aborts on a NULL form. */
{
struct htmlFormVar *fv;
if (form == NULL)
    errAbort("Null form passed to htmlFormVarSet");
fv = htmlFormVarGet(form, name);
if (fv != NULL)
    {
    /* Existing variable: drop the old value before replacing it. */
    freez(&fv->curVal);
    }
else
    {
    AllocVar(fv);
    fv->name = name;
    fv->tagName = "INPUT";
    fv->type = "TEXT";
    slAddHead(&form->vars, fv);
    }
fv->curVal = cloneString(val);
}


struct htmlFormVar *htmlPageGetVar(struct htmlPage *page, struct htmlForm *form, char *name)
/* Look up the named variable in form, or in the first form of page
 * when form is NULL. */
{
struct htmlForm *target = (form != NULL) ? form : page->forms;
return htmlFormVarGet(target, name);
}

void htmlPageSetVar(struct htmlPage *page, struct htmlForm *form, char *name, char *val)
/* Set the named variable to val in form, or in the first form of page
 * when form is NULL.  Aborts if page is NULL or no form is available. */
{
struct htmlForm *target;
if (page == NULL)
    errAbort("Null page passed to htmlPageSetVar");
target = (form != NULL) ? form : page->forms;
if (target == NULL)
    errAbort("Null form in htmlPageSetVar");
htmlFormVarSet(target, name, val);
}

static void asciiEntityDecode(char *in, char *out, int inLength)
/* Decode from SGML Character Entity &# format to normal. 
 * Out will be a little shorter than in typically, and
 * can be the same buffer. Only supports ASCII charset.
 * Exactly inLength bytes of in are consumed and out is always
 * zero-terminated.  "&#" followed within 5 characters by ';' is
 * replaced by the character with that decimal code; codes that do not
 * parse or fall outside 1..255 become '?'.  A "&#" with no nearby ';'
 * is copied through unchanged. */
{
char *inEnd = in + inLength;
while (in < inEnd)
    {
    char c = *in++;
    if (c == '&' && in < inEnd && *in == '#')
	{
	char *semi;
	in++;
	/* Only look for the ';' inside the input we were given. */
	semi = memchr(in, ';', inEnd - in);
	if (semi == NULL || (semi - in) > 5)
	    { /* probably a badly formatted string, just recover and continue */
	    *out++ = '&';
	    *out++ = '#';
	    }
	else
	    {
	    int code;
	    if (sscanf(in, "%d", &code) != 1 || code <= 0 || code > 255)
		code = '?';
	    in = semi + 1;	/* Resume just past the ';'. */
	    *out++ = code;
	    }
	}
    else
	*out++ = c;
    }
*out = 0;
}


char *expandUrlOnBase(char *base, char *url)
/* Return a newly allocated URL made by resolving url against base.
 * It is assumed that url is relative to base and does not contain a protocol.
 * The result starts with base's protocol (if any) and host name.  If url
 * begins with '/' it replaces base's whole path; otherwise it is placed
 * in the directory of base's path, with a leading "../" resolved
 * through expandRelativePath. */
{
struct dyString *dy = NULL;
char *hostName, *pastHostName;
dy = dyStringNew(256);
/* Figure out first character past host name. Load up
 * return string with protocol (if any) and host name. */
if (startsWith("http:", base) || startsWith("https:", base) || startsWith("ftp:", base))
    hostName = (strchr(base, ':') + 3);	/* Step over the ':' and the two characters after it (expected "//"). */
else
    hostName = base;
pastHostName = strchr(hostName, '/');
if (pastHostName == NULL)
    pastHostName = hostName + strlen(hostName);
dyStringAppendN(dy, base, pastHostName - base);

/* Add url to return string after host name. */
if (startsWith("/", url))	/* New URL is absolute, just append to hostName */
    {
    dyStringAppend(dy, url);
    }
else
    {
    /* curDir..endDir is the directory part of base's path, without its
     * leading '/' and without the final path component. */
    char *curDir = pastHostName;
    char *endDir;
    if (curDir[0] == '/')
        curDir += 1;
    dyStringAppendC(dy, '/');
    endDir = strrchr(curDir, '/');
    if (endDir == NULL)
	endDir = curDir;	/* No directory part at all. */
    if (startsWith("../", url))
	{
	/* NOTE(review): expandRelativePath is defined elsewhere; it
	 * presumably resolves the "../" steps against dir and may return
	 * NULL, in which case nothing is appended after the '/' - confirm. */
	char *dir = cloneStringZ(curDir, endDir-curDir);
	char *path = expandRelativePath(dir, url);
	if (path != NULL)
	     {
	     dyStringAppend(dy, path);
	     }
	freez(&dir);
	freez(&path);
	}
    else
	{
	dyStringAppendN(dy, curDir, endDir-curDir);
	/* Make sure exactly one '/' separates the directory from url. */
	if (lastChar(dy->string) != '/')
	    dyStringAppendC(dy, '/');
	dyStringAppend(dy, url);
	}
    }
return dyStringCannibalize(&dy);
}

char *htmlExpandUrl(char *base, char *url)
/* Turn url, which may be relative to base, into a stand-alone URL in
 * newly allocated memory.  &#nn; character entities in url are decoded
 * first (some mailto: links use them to hide from spambots).  A url
 * already starting with http: or https: is returned as is (decoded).
 * A url containing a ':' but no http/https prefix is some protocol we
 * don't handle, such as mailto, and yields NULL. */
{
char *result;
char *decoded = cloneString(url);	/* Private copy: decoding edits in place. */
asciiEntityDecode(decoded, decoded, strlen(decoded));

if (startsWith("http:", decoded) || startsWith("https:", decoded))
    return decoded;

if (strchr(decoded, ':') == NULL)
    result = expandUrlOnBase(base, decoded);
else
    result = NULL;
freez(&decoded);
return result;
}

static void appendCgiVar(struct dyString *dy, char *name, char *value)
/* Append name=value to dy with the value cgi-encoded, preceded by '&'
 * unless dy is still empty.  A NULL value is treated as "".  The name
 * is appended as given. */
{
char *encoded = cgiEncode((value != NULL) ? value : "");
if (dy->stringSize != 0)
    dyStringAppendC(dy, '&');
dyStringAppend(dy, name);
dyStringAppendC(dy, '=');
dyStringAppend(dy, encoded);
freez(&encoded);
}

#define MIMEBUFSIZE 4096

static void appendMimeVar(struct dyString *dy, char *name, char *value, char *varType, char *boundary)
/* Append cgiVar with cgi-encoded value to dy. */
{
char *fileName = NULL;

if (value == NULL)
    value = "";
dyStringAppend(dy, "\r\n--");
dyStringAppend(dy, boundary);
dyStringAppend(dy, "\r\n");
dyStringAppend(dy, "content-disposition: form-data; name=\"");
dyStringAppend(dy, name);
dyStringAppend(dy, "\"");

if (varType && sameWord(varType, "FILE"))
    {
    fileName = strrchr(value,'/'); 
    if (fileName)
	++fileName;
    else
	fileName = value;
    dyStringAppend(dy, "; filename=\"");
    dyStringAppend(dy, fileName);
    dyStringAppend(dy, "\"");
    }
dyStringAppend(dy, "\r\n");
dyStringAppend(dy, "\r\n");
if (varType && sameWord(varType, "FILE") && !sameWord(value,""))
    {
    FILE *f = mustOpen(value, "r");
    char buf[MIMEBUFSIZE];
    int bytesRead = 0;
    do
	{
	bytesRead = fread(buf,1,MIMEBUFSIZE,f);
	if (bytesRead < 0)
	    errnoAbort("error reading file to upload %s",value);
    	dyStringAppendN(dy, buf, bytesRead);
	}
    while(bytesRead > 0);
    carefulClose(&f);
    }
else    
    dyStringAppend(dy, value);
}

static void appendMimeTerminus(struct dyString *dy, char *boundary)
/* Close off a multipart body in dy by writing the final delimiter:
 * a line made of "--", the boundary, and a trailing "--". */
{
char *pieces[3];
int i;

pieces[0] = "\r\n--";
pieces[1] = boundary;
pieces[2] = "--\r\n";
for (i = 0; i < 3; ++i)
    dyStringAppend(dy, pieces[i]);
}


static int countOccurrences(char *needle, int nLen, char *haystack, int hLen)
/* Return how many times the nLen bytes at needle are found by memMatch
 * in the hLen bytes at haystack.  After each hit the search resumes
 * just past the matched bytes, so matches are not allowed to overlap. */
{
int total = 0;

for (;;)
    {
    char *hit = memMatch(needle, nLen, haystack, hLen);
    char *resume;
    if (hit == NULL)
        break;
    total += 1;
    /* Drop everything up to and including this match from the haystack. */
    resume = hit + nLen;
    hLen -= (resume - haystack);
    if (hLen < 1)
        break;
    haystack = resume;
    }
return total;
}

static boolean isMimeEncoded(struct htmlForm *form)
/* determine if the form is using MIME encoding */
{
struct htmlAttribute *a;
for(a = form->startTag->attributes;a;a = a->next)
    if (sameWord(a->name,"ENCTYPE") && sameWord(a->val,"multipart/form-data"))
	return TRUE;
return FALSE;
}

char *htmlFormCgiVars(struct htmlPage *page, struct htmlForm *form, 
	char *buttonName, char *buttonVal, struct dyString *dyHeader)
/* Return cgi vars in name=val format from user having pressed
 * submit button of given name and value.
 *   page       - page the form lives on
 *   form       - form to submit; if NULL the first form of page is used
 *   buttonName - name of the pressed button, or NULL for none
 *   buttonVal  - value of the pressed button
 *   dyHeader   - http header being built; only written to for MIME forms
 * For an ordinary form the result is name=val pairs joined by '&' with
 * cgi-encoded values, button first.  For a form with ENCTYPE
 * multipart/form-data the result is the multipart body, in which the
 * button is only sent if the form has a variable of that name that is
 * not one of the value-carrying input types, and a Content-type line
 * naming the boundary is appended to dyHeader.
 * The returned string is handed over to the caller. */
{
struct dyString *dy = dyStringNew(0);
struct htmlFormVar *var;
boolean isMime;
int mimeParts = 0;
char boundary[256];

/* Settle on the form before inspecting it; looking up the encoding
 * first would dereference a NULL form. */
if (form == NULL)
    form = page->forms;
isMime = isMimeEncoded(form);

/* Seed once per call.  Reseeding inside the retry loop from a clock
 * with one second resolution would produce the very same boundary
 * again on a retry. */
if (isMime)
    srand( (unsigned)time( NULL ) );

while(TRUE)
    {
    /* Every attempt rebuilds the body from scratch, so the number of
     * boundary lines written has to start from zero again too. */
    mimeParts = 0;

    if (isMime)
        {
        /* Choose a new boundary: a run of dashes padded with random
         * capital letters to a total of 41 characters. */
        int i = 0;
        safef(boundary,sizeof(boundary),"%s", "---------");
        for(i=strlen(boundary);i<41;++i)
            {
            int r = (int)(26 * (rand() / (RAND_MAX + 1.0)));
            boundary[i] = r+'A';
            }
        boundary[i] = 0;
        }

    if (buttonName != NULL && !isMime)
        appendCgiVar(dy, buttonName, buttonVal);
    for (var = form->vars; var != NULL; var = var->next)
        {
        if (sameWord(var->tagName, "SELECT") || 
            sameWord(var->tagName, "TEXTAREA") || 
            (var->type != NULL &&
            ((sameWord(var->type, "RADIO") || sameWord(var->type, "TEXTBOX")
            || sameWord(var->type, "PASSWORD") || sameWord(var->type, "HIDDEN")
            || sameWord(var->type, "TEXT") || sameWord(var->type, "FILE")))))
            {
            /* These are always sent, with an empty value if unset. */
            char *val = var->curVal;
            if (val == NULL)
                val = "";
            if (isMime)
                {
                ++mimeParts;
                appendMimeVar(dy, var->name, val, var->type, boundary);
                }
            else
                appendCgiVar(dy, var->name, val);
            }
        else if (var->type != NULL && sameWord(var->type, "CHECKBOX"))
            {
            /* Checkboxes are only sent when they have a value. */
            if (var->curVal != NULL)
                {
                if (isMime)
                    {
                    ++mimeParts;
                    appendMimeVar(dy, var->name, var->curVal, var->type, boundary);
                    }
                else
                    appendCgiVar(dy, var->name, var->curVal);
                }
            }
        else if (isMime && buttonName && sameWord(buttonName,var->name))
            {
            ++mimeParts;
            appendMimeVar(dy, buttonName, buttonVal, NULL, boundary);
            }
        }
    if (isMime) 
        {
        /* The terminator carries the boundary once more. */
        ++mimeParts;
        appendMimeTerminus(dy,boundary);
        if (countOccurrences(boundary,strlen(boundary),dy->string,dy->stringSize) != mimeParts)
            { /* boundary was found in input! # occurrences not as expected */
            dyStringClear(dy);
            continue;  /* if at first you don't succeed, try another boundary string */
            }
        dyStringPrintf(dyHeader, "Content-type: multipart/form-data, boundary=%s\r\n",boundary);
        if (verboseLevel() == 2)
            {
            mustWrite(stderr, dyHeader->string, dyHeader->stringSize);
            mustWrite(stderr, dy->string, dy->stringSize);
            }
        }
    break;
    }

return dyStringCannibalize(&dy);
}

struct htmlPage *htmlPageFromForm(struct htmlPage *origPage, struct htmlForm *form, 
	char *buttonName, char *buttonVal)
/* Return a new htmlPage based on response to pressing indicated button
 * on indicated form in origPage.
 * The form action is resolved against the url of origPage and the
 * cookies of origPage are sent along.  For GET the variables are added
 * to the url after a '?', for POST they are written as the request
 * body with a Content-Length header.
 * Aborts if the action can't be turned into an http(s) url, if the
 * form method is neither GET nor POST, or if the connection can't be
 * opened.
 * NOTE(review): the POST body length is taken with strlen, so an
 * uploaded file containing zero bytes would presumably be cut short. */
{
struct htmlPage *newPage = NULL;
struct dyString *dyUrl = dyStringNew(0);
struct dyString *dyHeader = dyStringNew(0);
struct dyString *dyText = NULL;
char *url = htmlExpandUrl(origPage->url, form->action);
char *cgiVars = NULL;
/* A form without a method is submitted with GET in HTML. */
char *method = (form->method != NULL) ? form->method : "GET";
int isGet = sameWord(method, "GET");
int isPost = sameWord(method, "POST");
int contentLength = 0;
int sd = -1;

/* htmlExpandUrl gives NULL for protocols it doesn't know, like mailto. */
if (url == NULL)
    errAbort("Can't submit form, unsupported action %s",
        (form->action != NULL) ? form->action : "(null)");
/* Without this there would be no connection, and the code below would
 * read from and close file descriptor -1. */
if (!isGet && !isPost)
    errAbort("Can't submit form, unsupported method %s", method);

dyStringAppend(dyUrl, url);
cookieOutput(dyHeader, origPage->cookies);
cgiVars = htmlFormCgiVars(origPage, form, buttonName, buttonVal, dyHeader);
if (isGet)
    {
    dyStringAppend(dyUrl, "?");
    dyStringAppend(dyUrl, cgiVars);
    verbose(3, "GET %s\n", dyUrl->string);
    }
else
    {
    contentLength = strlen(cgiVars);
    verbose(3, "POST %s\n", dyUrl->string);
    dyStringPrintf(dyHeader, "Content-Length: %d\r\n", contentLength);
    }
sd = netOpenHttpExt(dyUrl->string, method, dyHeader->string);
if (sd < 0)
    errAbort("Couldn't open %s", dyUrl->string);
if (isPost)
    mustWriteFd(sd, cgiVars, contentLength);
dyText = netSlurpFile(sd);
close(sd);
newPage = htmlPageParse(url, dyStringCannibalize(&dyText));
freez(&url);
dyStringFree(&dyUrl);
dyStringFree(&dyHeader);
freez(&cgiVars);
return newPage;
}

struct slName *htmlPageScanAttribute(struct htmlPage *page, 
	char *tagName, char *attribute)
/* Gather the values of every attribute called 'attribute' on tags
 * called tagName, in the order they occur on the page.  A NULL
 * tagName selects all tags.  Returns a new slName list, NULL if
 * nothing was found. */
{
struct slName *found = NULL;
struct htmlTag *tag = page->tags;

while (tag != NULL)
    {
    if (tagName == NULL || sameWord(tagName, tag->name))
        {
        struct htmlAttribute *att = tag->attributes;
        while (att != NULL)
            {
            if (sameWord(attribute, att->name))
                {
                struct slName *el = slNameNew(att->val);
                slAddHead(&found, el);
                }
            att = att->next;
            }
        }
    tag = tag->next;
    }
/* Items were pushed on the front, so flip back into page order. */
slReverse(&found);
return found;
}

struct slName *htmlPageLinks(struct htmlPage *page)
/* Return list of the values of all HREF attributes on the page,
 * whatever tag they are on. */
{
struct slName *hrefs = htmlPageScanAttribute(page, NULL, "HREF");
return hrefs;
}

struct slName *htmlPageSrcLinks(struct htmlPage *page)
/* Return list of the values of all SRC attributes on the page,
 * whatever tag they are on. */
{
struct slName *sources = htmlPageScanAttribute(page, NULL, "SRC");
return sources;
}

struct htmlTableRow
/* State of the table row currently open while validateTables scans
 * the tags of a page. */
    {
    struct htmlTableRow *next;  /* List link; validateTables itself never follows it. */
    int tdCount;                /* Number of TD/TH opening tags seen in this row so far. */
    int inTd;                   /* TRUE after a TD/TH tag until the matching /TD or /TH. */
    };

struct htmlTable 
/* State of one open table while validateTables scans the tags of a
 * page.  Nested tables form a stack linked through next. */
    {
    struct htmlTable *next;     /* Enclosing table, or NULL for the outermost one. */
    struct htmlTableRow *row;   /* Row that is currently open, NULL between rows. */
    int rowCount;               /* Number of TR tags seen in this table so far. */
    };

static void validateTables(struct htmlPage *page, 
	struct htmlTag *startTag, struct htmlTag *endTag)
/* Scan tags from startTag up to, but not including, endTag and abort
 * through tagAbort on badly nested table markup: a TABLE inside
 * another table but not inside a cell, TR/TD/TH tags or their closing
 * tags outside of their container, a TR following a TR with no /TR
 * between, a row without cells, a table without rows, or a table that
 * is still open when endTag is reached. */
{
struct htmlTable *openTables = NULL;   /* Stack of open tables, innermost first. */
struct htmlTag *tag = startTag;

while (tag != endTag)
    {
    char *name = tag->name;
    struct htmlTable *cur = openTables;   /* Innermost open table, if any. */
    struct htmlTableRow *row = NULL;

    if (sameWord(name, "TABLE"))
        {
        struct htmlTable *fresh;
        /* A nested table is only accepted inside an open cell. */
        if (cur != NULL && (cur->row == NULL || !cur->row->inTd))
            tagAbort(page, tag, "TABLE inside of another table, but not inside of <TR><TD>\n");
        AllocVar(fresh);
        slAddHead(&openTables, fresh);
        }
    else if (sameWord(name, "/TABLE"))
        {
        if (cur == NULL)
            tagAbort(page, tag, "Extra </TABLE> tag");
        if (cur->rowCount == 0)
            tagAbort(page, tag, "<TABLE> with no <TR>'s");
        if (cur->row != NULL)
            tagAbort(page, tag, "</TABLE> inside of a row");
        openTables = cur->next;
        freez(&cur);
        }
    else if (sameWord(name, "TR"))
        {
        if (cur == NULL)
            tagAbort(page, tag, "<TR> outside of TABLE");
        if (cur->row != NULL)
            tagAbort(page, tag, "<TR>...<TR> with no </TR> in between");
        AllocVar(cur->row);
        cur->rowCount += 1;
        }
    else if (sameWord(name, "/TR"))
        {
        if (cur == NULL)
            tagAbort(page, tag, "</TR> outside of TABLE");
        if (cur->row == NULL)
            tagAbort(page, tag, "</TR> with no <TR>");
#ifdef LEGAL_ACTUALLY
        if (cur->row->inTd)
            {
            tagAbort(page, tag, "</TR> while <TD> is open");
            }
#endif /* LEGAL_ACTUALLY */
        if (cur->row->tdCount == 0)
            tagAbort(page, tag, "Empty row in <TABLE>");
        freez(&cur->row);
        }
    else if (sameWord(name, "TD") || sameWord(name, "TH"))
        {
        if (cur == NULL)
            tagAbort(page, tag, "<%s> outside of <TABLE>", name);
        row = cur->row;
        if (row == NULL)
            tagAbort(page, tag, "<%s> outside of <TR>", name);
#ifdef LEGAL_ACTUALLY
        if (row->inTd)
            {
            tagAbort(page, tag, "<%s>...<%s> with no </%s> in between", 
                name, name, name);
            }
#endif /* LEGAL_ACTUALLY */
        row->inTd = TRUE;
        row->tdCount += 1;
        }
    else if (sameWord(name, "/TD") || sameWord(name, "/TH"))
        {
        if (cur == NULL)
            tagAbort(page, tag, "<%s> outside of <TABLE>", name);
        row = cur->row;
        if (row == NULL)
            tagAbort(page, tag, "<%s> outside of <TR>", name);
        if (!row->inTd)
            tagAbort(page, tag, "<%s> with no <%s>", name, name+1);
        row->inTd = FALSE;
        }
    tag = tag->next;
    }
if (openTables != NULL)
    tagAbort(page, tag, "Missing </TABLE>");
}

static void checkTagIsInside(struct htmlPage *page, char *outsiders, char *insiders,  
	struct htmlTag *startTag, struct htmlTag *endTag)
/* Check that insiders are all bracketed by outsiders.
 *   outsiders - space separated names of the enclosing tags
 *   insiders  - space separated names of tags that need enclosing
 *   startTag  - first tag to look at
 *   endTag    - tag to stop in front of; NULL scans to the end of list
 * Both name lists are upper-cased here and then looked up with the tag
 * name as hash key, so this presumably relies on tag names having been
 * upper-cased already.  Aborts through tagAbort when an insider shows
 * up while no outsider is open, except for tags whose text starts with
 * <INPUT TYPE=HIDDEN NAME=. */
{
char *outDupe = cloneString(outsiders);
char *inDupe = cloneString(insiders);
char *line, *word;
int depth = 0;
struct htmlTag *tag;
struct hash *outOpen = newHash(8);
struct hash *outClose = newHash(8);
struct hash *inHash = newHash(8);
char buf[256];

/* Create hashes of all insiders */
line = inDupe;
while ((word = nextWord(&line)) != NULL)
    {
    touppers(word);
    hashAdd(inHash, word, NULL);
    }

/* Create hash of open and close outsiders. */
line = outDupe;
while ((word = nextWord(&line)) != NULL)
    {
    touppers(word);
    hashAdd(outOpen, word, NULL);
    safef(buf, sizeof(buf), "/%s", word);
    hashAdd(outClose, buf, NULL);
    }

/* Stream through tags making sure that insiders are
 * at least one deep inside of outsiders.  Stop at endTag like the
 * other range validators do, rather than running on past it. */
for (tag = startTag; tag != NULL && tag != endTag; tag = tag->next)
    {
    char *type = tag->name;
    if (hashLookup(outOpen, type ))
        ++depth;
    else if (hashLookup(outClose, type))
        --depth;
    else if (hashLookup(inHash, type))
        {
        if (depth <= 0)
            {
            if (!startsWith("<INPUT TYPE=HIDDEN NAME=", tag->start))  // one exception hardwired
                tagAbort(page, tag, "%s outside of any of %s", type, outsiders);
            }
        }
    }
freeHash(&inHash);
freeHash(&outOpen);
freeHash(&outClose);
freeMem(outDupe);
freeMem(inDupe);
}

static void checkNest(struct htmlPage *page,
	char *type, struct htmlTag *startTag, struct htmlTag *endTag)
/* Make sure that tags of the given type and their closing tags are
 * balanced between startTag and endTag (endTag itself is not looked
 * at).  Aborts on a closing tag that has no opening tag before it,
 * and on opening tags that are left unclosed at the end. */
{
char closer[256];
int balance = 0;     /* Opening tags seen minus closing tags seen. */
struct htmlTag *tag = startTag;

safef(closer, sizeof(closer), "/%s", type);
while (tag != endTag)
    {
    if (sameWord(tag->name, type))
        balance += 1;
    else if (sameWord(tag->name, closer))
        {
        balance -= 1;
        if (balance < 0)
            tagAbort(page, tag, "<%s> without preceding <%s>", closer, type);
        }
    tag = tag->next;
    }
if (balance != 0)
    errAbort("Missing <%s> tag", closer);
}

static void validateNestingTags(struct htmlPage *page,
	struct htmlTag *startTag, struct htmlTag *endTag,
	char *nesters[], int nesterCount)
/* Run checkNest over the tags from startTag to endTag once for each
 * of the nesterCount tag names in nesters. */
{
char **nester = nesters;
int remaining = nesterCount;

while (remaining > 0)
    {
    checkNest(page, *nester, startTag, endTag);
    ++nester;
    --remaining;
    }
}

static char *bodyNesters[] = 
/* Nesting tags that appear in body.  Each entry must be followed by a
 * comma: adjacent string literals are silently joined into one name. */
{
    "ADDRESS", "DIV", "H1", "H2", "H3", "H4", "H5", "H6",
    "ACRONYM", "BLOCKQUOTE", "CITE", "CODE", "DEL", "DFN",
    "DIR", "DL", "MENU", "OL", "UL", "CAPTION", "TABLE", 
    "A", "MAP", "OBJECT", "FORM", "SCRIPT", "SVG"
};

static char *headNesters[] =
/* Tags of the page header whose opening and closing tags get
 * checked for balance. */
{
    "TITLE",
    "SCRIPT",
};

static char *singleTons[] =
/* Tags which do not have closing tags.  The strict nesting check
 * never pushes these on its stack and rejects closing tags for them.
 * TRACK and WBR are void elements in html5 as well. */
{
"AREA",
"BASE",
"BR",
"COL",
"COMMAND",
"EMBED",
"FRAME",  // not in html5
"HR",
"IMG",
"INPUT",
"LINK",
"META",
"PARAM",
"SOURCE",
"TRACK",
"WBR"
};

static char *selfClosers[] =
/* SVG shape tags that may be written either with a separate closing
 * tag or closed in place with a trailing slash, in html5 or SVG.
 * The closed-in-place form needs a space in front of the slash.  Without
 * it there is no telling whether the slash belongs to an attribute
 * value, as in <img src=http://domain.com/image.jpg/>
 */
{
"CIRCLE", "ELLIPSE", "LINE", "PATH",
"POLYGON", "POLYLINE", "RECT",
};

static struct htmlTag *validateBody(struct htmlPage *page, struct htmlTag *startTag)
/* Validate the body of the page.  startTag is the first tag after
 * <BODY>.  Finds the closing </BODY>, then checks tables, that list
 * items, definition terms, COL and AREA sit inside their containers,
 * and that the nesting body tags are balanced.  Aborts if anything
 * is wrong, otherwise returns the tag that follows </BODY>. */
{
struct htmlTag *endTag = startTag;

/* Walk forward to the tag that closes the body. */
while (endTag != NULL && !sameWord(endTag->name, "/BODY"))
    endTag = endTag->next;
if (endTag == NULL)
    errAbort("Missing </BODY>");

validateTables(page, startTag, endTag);
checkTagIsInside(page, "DIR MENU OL UL", "LI", startTag, endTag);
checkTagIsInside(page, "DL", "DD DT", startTag, endTag);
checkTagIsInside(page, "COLGROUP TABLE", "COL", startTag, endTag);
checkTagIsInside(page, "MAP", "AREA", startTag, endTag);
#ifdef OLD   /* These days input type controls allowed outside forms because of javascript */
checkTagIsInside(page, "FORM SCRIPT", 
	"INPUT BUTTON /BUTTON OPTION SELECT /SELECT TEXTAREA /TEXTAREA"
	"FIELDSET /FIELDSET"
	, 
	startTag, endTag);
#endif /* OLD */
validateNestingTags(page, startTag, endTag, bodyNesters, ArraySize(bodyNesters));
return endTag->next;
}

static char *urlOkChars()
/* Return a newly allocated table of 256 entries, indexed by
 * character code, that holds 1 for characters that are ok in URLs
 * and 0 for all others. */
{
/* Punctuation accepted on top of letters and digits.  This list is a
 * little more inclusive than W3's: the apostrophe itself is apparently
 * ok, google apparently uses the pipe and the comma, and '#' starts a
 * URI fragment, typically an anchor. */
char *extras = "=-/%.;:_&+()$!*@'|,#";
char *table;
char *p;
int code;

AllocArray(table, 256);
for (code = 0; code < 256; ++code)
    {
    if (isalnum(code))
        table[code] = 1;
    }
for (p = extras; *p != 0; ++p)
    table[(unsigned char)*p] = 1;
return table;
}

static void validateCgiUrl(char *url)
/* Make sure URL follows basic CGI encoding rules.
 * Only urls starting with http: or https: are looked at, and of those
 * only the part after the first '?'.  Aborts on the first character
 * there that is not in the table made by urlOkChars.  A NULL url is
 * quietly accepted. */
{
static char *okChars = NULL;   /* Built on first use and then kept. */
char *query;
UBYTE c, *s;

if (url == NULL)
    return;
if (!startsWith("http:", url) && !startsWith("https:", url))
    return;
if (okChars == NULL)
    okChars = urlOkChars();

/* Keep url itself untouched so that the error message can show the
 * complete url rather than just what follows the question mark. */
query = strchr(url, '?');
if (query == NULL)
    return;
for (s = (UBYTE*)query+1; (c = *s) != 0; ++s)
    {
    if (!okChars[c])
        errAbort("Character %c not allowed in URL %s", c, url);
    }
}

static void validateCgiUrls(struct htmlPage *page)
/* Run validateCgiUrl on the action of every form and on every HREF
 * value found on the page. */
{
struct slName *hrefs = htmlPageLinks(page);
struct slName *href;
struct htmlForm *form = page->forms;

while (form != NULL)
    {
    validateCgiUrl(form->action);
    form = form->next;
    }
href = hrefs;
while (href != NULL)
    {
    validateCgiUrl(href->name);
    href = href->next;
    }
slFreeList(&hrefs);
}

static struct htmlTag *nextTagOfTypeInList(struct htmlTag *tagList, char *type)
/* Return the first tag in tagList, the head of the list included,
 * whose name matches type under sameString.  NULL if there is none. */
{
struct htmlTag *tag = tagList;

while (tag != NULL && !sameString(tag->name, type))
    tag = tag->next;
return tag;
}

static int countTagsOfType(struct htmlTag *tagList, char *type)
/* Return how many tags in tagList have a name that matches type
 * under sameString. */
{
int matches = 0;
struct htmlTag *tag = tagList;

while (tag != NULL)
    {
    if (sameString(tag->name, type))
        matches += 1;
    tag = tag->next;
    }
return matches;
}

static void checkExactlyOne(struct htmlTag *tagList, char *type)
/* Abort unless tagList holds one and only one tag of the given type. */
{
int found = countTagsOfType(tagList, type);

if (found == 1)
    return;
errAbort("Expecting exactly 1 <%s>, got %d", type, found);
}


void htmlPageFormOrAbort(struct htmlPage *page)
/* Abort if page is NULL or has no form on it. */
{
char *problem = NULL;

if (page == NULL)
    problem = "Can't validate NULL page";
else if (page->forms == NULL)
    problem = "No form found";
if (problem != NULL)
    errAbort("%s", problem);
}

void htmlPageValidateOrAbort(struct htmlPage *page)
/* Do some basic validations.  Aborts if there is a problem.
 * Nothing is checked when the page has a Content-Type: header that
 * does not start with text/html.  Otherwise all tag names are
 * upper-cased in place and the page must consist of <HTML>, an
 * optional <HEAD> (a missing HEAD or TITLE only draws a warning),
 * exactly one <BODY> that passes validateBody, and a closing </HTML>.
 * Finally the urls of forms and links are checked. */
{
struct htmlTag *tag;
boolean gotTitle = FALSE;
char *contentType = NULL;

if (page == NULL)
    errAbort("Can't validate NULL page");
if (page->header != NULL)
    contentType = hashFindVal(page->header, "Content-Type:");
if (contentType != NULL && !startsWith("text/html", contentType))
    return;

/* Check for an empty page first.  Counting BODY tags first would
 * report a page without tags as having no BODY, and this message
 * could never be reached. */
if (page->tags == NULL)
    errAbort("No tags");

/* To simplify things upper case all tag names. */
for (tag = page->tags; tag != NULL; tag = tag->next)
    {
    touppers(tag->name);
    if (isEmpty(tag->name)) // causes a blank tag
        tagAbort(page, tag, "Space not allowed between opening bracket < and tag name");
    if (startsWith("/", tag->name))
        {
        if (sameString(tag->name,"/")) // causes a blank close tag
            tagAbort(page, tag, "Space not allowed between opening bracket </ and closing tag name");
        if (tag->attributes)
            tagAbort(page, tag, "Attributes are not allowed in closing tag: [%s]", tag->name);
        }
    }

checkExactlyOne(page->tags, "BODY");

/* Validate header, and make a suggestion or two */
tag = page->tags;
if (!sameWord(tag->name, "HTML"))
    errAbort("Doesn't start with <HTML> tag");
struct htmlTag *headTag = nextTagOfTypeInList(tag->next, "HEAD");
if (headTag == NULL)
    warn("No <HEAD> tag after <HTML> tag");
else
    {
    /* Walk from <HEAD> to </HEAD>, noting whether a title shows up. */
    tag = headTag;
    for (;;)
        {
        tag = tag->next;
        if (tag == NULL)
            errAbort("Missing </HEAD>");
        if (sameWord(tag->name, "TITLE"))
            gotTitle = TRUE;
        if (sameWord(tag->name, "/HEAD"))
            break;
        }
    if (!gotTitle)
        warn("No title in <HEAD>");
    validateNestingTags(page, page->tags, tag, headNesters, ArraySize(headNesters));
    tag = tag->next;
    }
if ((tag = nextTagOfTypeInList(tag, "BODY")) == NULL)
    errAbort("<BODY> tag does not follow <HTML> tag");
tag = validateBody(page, tag->next);
if (tag == NULL || !sameWord(tag->name, "/HTML"))
    errAbort("Missing </HTML>");
validateCgiUrls(page);
}

void htmlPageStrictTagNestCheck(struct htmlPage *page)
/* Do strict tag nesting check.  Aborts if there is a problem.
 * All tag names of the page are upper-cased in place first.  Then
 * every opening tag is pushed on a stack, except P, the singleTons,
 * and selfClosers that are closed in place.  Every closing tag except
 * /P has to match the tag on top of the stack.  LI tags that are
 * still open when /UL or /OL arrives are dropped with a warning.
 * A tags may not be nested, closing tags may not have attributes,
 * and singleTons may not have closing tags. */
{
struct htmlTag *tag;
/* To simplify things upper case all tag names. */
for (tag = page->tags; tag != NULL; tag = tag->next)
    touppers(tag->name);

/* Add singleton tags to hash. */
struct hash *singleTonHash = hashNew(8);
int i;
int count=ArraySize(singleTons);
for (i=0; i<count; ++i)
    hashAdd(singleTonHash, singleTons[i], NULL);

/* Add selfCloser tags to hash. */
struct hash *selfCloserHash = hashNew(8);
count=ArraySize(selfClosers);
for (i=0; i<count; ++i)
    hashAdd(selfCloserHash, selfClosers[i], NULL);

boolean inA = FALSE;  // inside A tag. (A tags may not be nested.)
struct slName *tagStack = NULL;
for (tag = page->tags; tag != NULL; tag = tag->next)
    {
    if (isEmpty(tag->name)) // causes a blank tag
        tagAbort(page, tag, "Space not allowed between opening bracket < and tag name");
    if (sameString(tag->name,"A")) // A open tag
        {
        if (inA) 
            tagAbort(page, tag, "A tags may not be nested inside one another.");
        else
            inA = TRUE;
        }

    if (startsWith("/", tag->name))
        {
        if (sameString(tag->name,"/")) // causes a blank close tag
            tagAbort(page, tag, "Space not allowed between opening bracket </ and closing tag name");
        if (tag->attributes)
            tagAbort(page, tag, "Attributes are not allowed in closing tag: [%s]", tag->name);
        if (sameString(tag->name,"/A")) // A close tag
            {
            if (inA) 
                inA = FALSE;
            else
                tagAbort(page, tag, "/A close tag with no open tag.");
            }
        if (hashLookup(singleTonHash, tag->name+1))
            tagAbort(page, tag, "Tag %s closing tag not allowed for singleton tags.", tag->name);
        if (!sameString("P", tag->name+1))
            {
            if (!tagStack)
                tagAbort(page, tag, "No tags still left on stack. Closing tag %s has no corresponding open tag.", tag->name);
            struct slName *top = slPopHead(&tagStack);
            // flush LI tags still on stack when /UL or /OL encountered
            // since the missing /LI tags are usually tolerated. 
            while ((sameString(tag->name, "/UL") || sameString(tag->name, "/OL")) && sameString(top->name,"LI"))
                {
                tagWarn(page, tag, "Closing tag %s found. LI tag on stack. Missing /LI tag. Please fix. Continuing.", tag->name);
                freeMem(top);
                top = slPopHead(&tagStack);
                // The stack can run dry here, as with <LI></UL> and no <UL>.
                if (top == NULL)
                    tagAbort(page, tag, "No tags still left on stack. Closing tag %s has no corresponding open tag.", tag->name);
                }
            if (!sameString(top->name,tag->name+1))
                {
                tagAbort(page, tag, "Closing tag %s found, tag %s at top of stack.", tag->name, top->name);
                }
            // Popped entries are no longer on the stack, so free them here.
            freeMem(top);
            }
        }
    else
        {
        if (
            ! hashLookup(singleTonHash, tag->name) 
         && !(hashLookup(selfCloserHash, tag->name) && isSelfClosingTag(tag))
         && ! sameString("P", tag->name))
            {
            slAddHead(&tagStack, slNameNew(tag->name));
            }
        }
    }
if (tagStack)
    errAbort("Some tags still left on stack. Open tag %s missing its closing tag.", tagStack->name);
freeHash(&singleTonHash);
freeHash(&selfCloserHash);
}
