/* paraNodeStatus - Check status of paraNode on a list of machines. */
#include "paraCommon.h"
#include "linefile.h"
#include "hash.h"
#include "options.h"
#include "net.h"
#include "internet.h"
#include "paraLib.h"
#include "rudp.h"
#include "paraMessage.h"

char *version = PARA_VERSION;   /* Version string, set from PARA_VERSION; printed by usage(). */

void usage()
/* Print the help text (program version, synopsis and options) through
 * errAbort, which reports the message and ends the program. */
{
errAbort("paraNodeStatus - version %s\n"
         "Check status of paraNode on a list of machines.\n"
         "usage:\n"
         "    paraNodeStatus machineList\n"
         "options:\n"
         "    -retries=N  Number of retries to get in touch with machine.\n"
         "        The first retry is after 1/100th of a second. \n"
         "        Each retry after that takes twice as long up to a maximum\n"
         "        of 1 second per retry.  Default is 7 retries and takes\n"
         "        about a second.\n"
         "    -long  List details of current and recent jobs.\n",
         version);
}

void listJobsErr(char *name, int n)
/* Warn that the listJobs reply from machine 'name' was incomplete.
 * n is a small code telling which receive step failed; showLong passes
 * 1 (running job line), 2 (recent count), 3 (recent job line) or
 * 4 (recent job result line). */
{
warn("%s: listJobs bad response %d", name, n);
}

void showLong(struct paraMessage *pm, char *name, struct rudp *ru, int *pRunning, int *pRecent)
/* Fetch and display response to listJobs message.
 * Increment running and recent counts. */
{
int running, recent, i;
struct paraMultiMessage pmm;

/* ensure the multi-message response comes from the correct ip and has no duplicate msgs*/
pmmInit(&pmm, pm);

if (!pmmReceive(&pmm, ru))
    {
    warn("%s: no listJobs response", name);
    return;
    }
running = atoi(pm->data);
for (i=0; i<running; ++i)
    {
    if (!pmmReceive(&pmm, ru))
        {
	listJobsErr(name, 1);
	return;
	}
    printf("%s %s %s\n", name, "running", pm->data);
    }
if (!pmmReceive(&pmm, ru))
    {
    listJobsErr(name, 2);
    return;
    }
recent = atoi(pm->data);
for (i=0; i<recent; ++i)
    {
    if (!pmmReceive(&pmm, ru))
        {
	listJobsErr(name, 3);
	return;
	}
    printf("%s %s %s\n", name, "recent", pm->data);
    if (!pmmReceive(&pmm, ru))
        {
	listJobsErr(name, 4);
	return;
	}
    printf("%s %s %s\n", name, "result", pm->data);
    }
printf("%s summary %d running %d recent\n", name, running, recent);
printf("\n");
*pRunning += running;
*pRecent += recent;
}

void paraNodeStatus(char *machineList)
/* paraNodeStatus - Check status of paraNode on a list of machines.
 * machineList names a file that is read row by row; the single word
 * of each row is used as a machine name.  For every machine a new rudp
 * connection is opened and either a "listJobs" message (with -long) or
 * a "status" message is sent.  Replies are printed per machine and a
 * grand total is printed at the end. */
{
struct lineFile *lf = lineFileOpen(machineList, FALSE);
boolean longFormat = optionExists("long");
/* The -retries option does not change between machines, so read it once. */
boolean haveRetries = optionExists("retries");
int retries = haveRetries ? optionInt("retries", 7) : 0;
char *row[1];
int totalCpu = 0, totalBusy = 0, totalRecent = 0;

while (lineFileRow(lf, row))
    {
    char *name = row[0];
    struct paraMessage pm;
    struct rudp *ru = rudpMustOpen();

    /* Only override the rudp default when the user asked for it. */
    if (haveRetries)
        ru->maxRetries = retries;
    pmInitFromName(&pm, name, paraNodePortStr);
    if (longFormat)
        {
        pmPrintf(&pm, "%s", "listJobs");
        /* NOTE(review): a failed send prints nothing here, unlike the
         * "unreachable" line of the short format - confirm intended. */
        if (pmSend(&pm, ru))
            showLong(&pm, name, ru, &totalBusy, &totalRecent);
        }
    else
        {
        pmPrintf(&pm, "%s", "status");
        if (pmSend(&pm, ru))
            {
            if (pmReceive(&pm, ru))
                {
                /* Reply is expected to look like "<busy> of <cpus> ...". */
                char *words[3];
                int wordCount;

                /* Print before chopping, since chopping splits pm.data in place. */
                printf("%s %s\n", name, pm.data);
                wordCount = chopLine(pm.data, words);
                /* Check the word count first so that words[1] and words[2]
                 * are never read when the reply is shorter than expected. */
                if (wordCount < 3 || !sameString(words[1], "of"))
                    errAbort("paraNode status message format changed");
                totalBusy += atoi(words[0]);
                totalCpu += atoi(words[2]);
                }
            else
                {
                printf("%s no status return: %s\n", name, strerror(errno));
                }
            }
        else
            {
            printf("%s unreachable\n", name);
            }
        }
    rudpClose(&ru);
    }
/* Release the machine list; the original left it open until exit. */
lineFileClose(&lf);

if (longFormat)
    printf("%d running, %d recent\n", totalBusy, totalRecent);
else
    printf("%d of %d CPUs busy total\n", totalBusy, totalCpu);
}


int main(int argc, char *argv[])
/* Entry point: parse command line options, insist on exactly one
 * positional argument (the machine list file), then run the check. */
{
char *machineList;

optionHash(&argc, argv);
if (argc != 2)
    usage();
machineList = argv[1];
paraNodeStatus(machineList);
return 0;
}
