/*
 * PeakSeq
 * Version 1.01
 * Paper by Joel Rozowsky, et. al.
 * Coded in C by Theodore Gibson.
 * analyze.c
 * This module deals with BH correction for multiple hypothesis testing.
 */


#include <stdio.h>
#include <string.h>
#include "analyze.h"
#include "io.h"
#include "util.h"
#include "config.h"


//----------------------------------------------------------------------------
// PRIVATE STRUCTURE DEFINITIONS
//----------------------------------------------------------------------------
/*
 * This structure stores all information about a single peak outputted to
 * 	PolII.txt.
 * Datum is a pointer to this structure.
 */
typedef struct datum
{
	// The chromosome string from chr1 to chrM.
	String cname;
	// The start position of the peak.
	int start;
	// The stop position of the peak.
	int stop;
	// The enrichment.
	long double enrich;
	// The excess.
	int excess;
	// The pvalue.
	long double pval;
	// The number of reads from the sample elands file in the peak.
	int sample_tags;
	// The adjusted number of reads from the control input file in the peak.
	int control_tags;
	// The zscore -- only initialized for very small p-values.
	long double zscore;
}* Datum;

struct data
{
	// The dynamic array of peaks.
	Datum* ps;
	// The number of peaks inserted into the array so far.
	int nps;
	// The number of peaks that can be stored in the array without allocating
	// more memory.
	int size;
};


//----------------------------------------------------------------------------
// PRIVATE FUNCTION PROTOTYPES
//----------------------------------------------------------------------------
Datum* mergesortData( Datum* left, const int num );
Datum* mergeData( Datum* left, Datum* right, const int lnum, const int rnum );
Datum* mergesortZscore( Datum* left, const int num );
Datum* mergeZscore( Datum* left, Datum* right, const int lnum,
		const int rnum );
void freeDatum( Datum d );


//----------------------------------------------------------------------------
// PUBLIC FUNCTIONS
//----------------------------------------------------------------------------
Data newData( void )
{
	// Allocate memory for the structure.
	Data data = safe_malloc( sizeof(*data) );

	// Initialize the fields.
	data->nps = 0;
	data->size = 10;

	// Allocate memory for the array.
	data->ps = safe_malloc( data->size * sizeof(struct datum) );

	return data;
}

void insertData( Data data, const String cname, const int start,
		const int stop, const int sample_tags, const int control_tags,
		const long double enrich, const int excess, const long double pval,
		const long double zscore )
{
	// Allocate memory for the structure.
	Datum d = safe_malloc( sizeof(*d) );

	// Allocate memory for the chromosome String.
	d->cname = safe_malloc( 6 * sizeof(char) );

	// Initialize the fields.
	strcpy( d->cname, cname );
	d->start = start;
	d->stop = stop;
	d->sample_tags = sample_tags;
	d->control_tags = control_tags;
	d->enrich = enrich;
	d->excess = excess;
	d->pval = pval;
	d->zscore = zscore;

	// Insert the datum structure into the array.
	data->ps[data->nps] = d;
	data->nps++;

	// Allocate more memory if necessary.
	if( data->nps >= data->size )
	{
		data->size *= 2;
		data->ps = safe_realloc( data->ps,
				data->size * sizeof(struct datum) );
	}
}

void sortData( Data data )
{
	Datum* temp  = mergesortData( data->ps, data->nps );
	free( data->ps );
	data->ps = temp;
}

void fprintfData( FILE* out, Data data )
{
	if( data->nps <= 0 )
	{
		freeData( data );
		return;
	}

	// Print the header for each column in the file.
	fprintf(out, "Chr\tStart\tStop\tSample_tags\tAdj_control_tags\t"
			"Enrichment\tExcess\tp_value\tq_value\n");

	// On each run of the loop, this field will contain the rank based on the
	// p-value of each peak.
	int rank = 0;

	// On each run of the loop, this field will contain the q-value for the
	// peak.
	long double qvalue = 0.0L;

	// The number of peaks with a 0 p-value.
	int nzeros = 0;

	// Count the number of peaks whose pval fields are equal to 0.  These
	// p-values are not actually 0, but they were too small to be calculated.)
	for( int i = 0; data->ps[i]->pval == 0.0L; i++ )
	{
		nzeros++;
		if( i >= data->nps ) break;
	}


	// Now that the number of peaks with p-value of 0 is known, find the
	// number of non-zeros.
	int non_zeros = data->nps - nzeros;

	// Get a pointer to the array containing the non-zero peaks.  This is a
	// pointer to the first non-zero element in the array whose index is equal
	// to the number of zeros.
	Datum* non_zero_peaks = &data->ps[nzeros];

	// Sort the zero peaks by their z-score.
	Datum* zero_peaks = mergesortZscore( data->ps, nzeros );

	// This loop runs once for every peak with a zero p-value.
	while( rank < nzeros )
	{
		// Get a pointer to the peak.
		Datum d = zero_peaks[rank++];

		// Otherwise print the peak to the file..
		fprintf(out, "%s\t%d\t%d\t%d\t%d\t%Lg\t%d\t%Lg\t%Lg\n", d->cname,
				d->start, d->stop, d->sample_tags, d->control_tags, d->enrich,
				d->excess, d->pval, qvalue);

		// Free this datum structure.
		freeDatum( d );
	}

	// This loop runs once for every peak with a non-zero p-value.
	for( int i = 0; i < non_zeros; i++ )
	{
		// Get a pointer to the peak.
		Datum d = non_zero_peaks[i];

		// Increment the rank field so that it holds the rank of this peak.
		rank++;

		// Calculate the adjusted p-value using BH correction for multiple
		// hypothesis testing.
		qvalue = d->pval * data->nps / rank;

		// If the adjusted pvalue does not meet the criteria, skip this peak.
		if( qvalue > PVAL_THRESH )
		{
			freeDatum( d );
			continue;
		}

		// Otherwise print the peak to the file..
		fprintf(out, "%s\t%d\t%d\t%d\t%d\t%Lg\t%d\t%Lg\t%Lg\n", d->cname,
				d->start, d->stop, d->sample_tags, d->control_tags, d->enrich,
				d->excess, d->pval, qvalue);

		// Free this datum structure.
		freeDatum( d );
	}

	// Free the arrays.
	free( zero_peaks );
	free( data->ps );

	// Free the structure.
	free( data );
}

void freeData( Data data )
{
	// Free each datum structure.
	for(int i = 0; i < data->nps; i++)
		freeDatum(data->ps[i]);

	free( data->ps );	// Free the array.
	free( data );		// Free the structure.
}


//----------------------------------------------------------------------------
// PRIVATE FUNCTIONS
//----------------------------------------------------------------------------
/*
 * This function implements a standard mergesort algorithm on a data array,
 * 	sorting by p-value.
 * left: a pointer to the array to be sorted.
 * num: the number of elements in the array.
 * Outputs a pointer to the sorted array.
 */
Datum* mergesortData( Datum* left, const int num )
{
	// If the array contains only one element, return an array with just this
	// element.
	if ( num <= 1)
	{
		Datum* res = safe_malloc( sizeof(*res) );
		res[0] = left[0];
		return res;
	}

	// Otherwise split the array in half and sort each half of the list.
	int half = num / 2;

	// Get a pointer to the right half of the list.
	Datum* right = &left[half];

	// Sort both halves of the array.
	left = mergesortData( left, half );
	right = mergesortData( right, num - half );

	// Merge the two sorted halves.
	return ( mergeData( left, right, half, num - half ) );
}

/*
 * This function merges two already sorted arrays into a single sorted array.
 * left: a pointer to one of the arrays.
 * right: a pointer to the other array.
 * lnum: the number of elements in the left array.
 * rnum: the number of elements in the right array.
 * Outputs a pointer to the sorted array.
 */
Datum* mergeData( Datum* left, Datum* right, const int lnum, const int rnum )
{
	// The number of regions in the merged array must be calculated in order
	// to allocate enough storage in the new array to hold pointers to every
	// element in both of the input arrays.
	int num = lnum + rnum;
	Datum* result = safe_malloc( num * sizeof(*result) );

	int i = 0;	// The index of the current element in the left array.
	int j = 0;	// The index of the current element in the right array.

	// This loop runs while these indices are within the bounds of their
	// respective arrays.
	while ( (i < lnum) && (j < rnum) )
	{
		// If p-value of this position in the left array is less than the
		// p-value of this position in the right array, the position in the
		// left array comes next.
		if (left[i]->pval < right[j]->pval)
		{
			result[i+j] = left[i];
			i++;
		}

		// Otherwise, the current element in the right array should be the
		// next element in the result array.
		else
		{
			result[i+j] = right[j];
			j++;
		}
	}

	// Once the end of one of the arrays is reached, copy the remainder of the
	// other array.
	while (i < lnum)
	{
		result[i+j] = left[i];
		i++;
	}

	while (j < rnum)
	{
		result[i+j] = right[j];
		j++;
	}

	// Free the old arrays.
	free( left );
	free( right );

	return result;
}

/*
 * This function implements a standard mergesort algorithm on a data array,
 * 	sorting by z-score.
 * left: a pointer to the array to be sorted.
 * num: the number of elements in the array.
 * Outputs a pointer to the sorted array.
 */
Datum* mergesortZscore( Datum* left, const int num )
{
	// If the array contains only one element, return an array with just this
	// element.
	if ( num <= 1)
	{
		Datum* res = safe_malloc( sizeof(*res) );
		res[0] = left[0];
		return res;
	}

	// Otherwise split the array in half and sort each half of the list.
	int half = num / 2;

	// Get a pointer to the right half of the list.
	Datum* right = &left[half];

	// Sort both halves of the array.
	left = mergesortZscore( left, half );
	right = mergesortZscore( right, num - half );

	// Merge the two sorted halves.
	return ( mergeZscore( left, right, half, num - half ) );
}

/*
 * This function merges two already sorted arrays into a single sorted array.
 * left: a pointer to one of the arrays.
 * right: a pointer to the other array.
 * lnum: the number of elements in the left array.
 * rnum: the number of elements in the right array.
 * Outputs a pointer to the sorted array.
 */
Datum* mergeZscore( Datum* left, Datum* right, const int lnum,
		const int rnum )
{
	// The number of regions in the merged array must be calculated in order
	// to allocate enough storage in the new array to hold pointers to every
	// element in both of the input arrays.
	int num = lnum + rnum;
	Datum* result = safe_malloc( num * sizeof(*result) );

	int i = 0;	// The index of the current element in the left array.
	int j = 0;	// The index of the current element in the right array.

	// This loop continues while these indices are within the bounds of their
	// respective arrays.
	while ( (i < lnum) && (j < rnum) )
	{
		// If the z-score of this peak in the left array is more negative than
		// the z-score of this position in the right array, the position in
		// the left array comes next.
		if (left[i]->zscore < right[j]->zscore)
		{
			result[i+j] = left[i];
			i++;
		}
		// Otherwise, the current element in the right array should be the
		// next element in the result array.
		else
		{
			result[i+j] = right[j];
			j++;
		}
	}

	// Once the end of one of the arrays is reached, copy the remainder of the
	// other array.
	while (i < lnum)
	{
		result[i+j] = left[i];
		i++;
	}

	while (j < rnum)
	{
		result[i+j] = right[j];
		j++;
	}

	// Free the old arrays.
	free( left );
	free( right );

	return result;
}

/*
 * This function frees all memory associated with a single datum structure.
 * d: a pointer to the structure to be freed.
 */
void freeDatum( Datum d )
{
	free( d->cname );
	free( d );
}
