/*
 * PeakSeq
 * Version 1.01
 * Paper by Joel Rozowsky, et. al.
 * Coded in C by Theodore Gibson.
 * window.c
 * This module deals with everything that differs between windows.
*/

#include <stdio.h>
#include <string.h>
#include "window.h"
#include "filter.h"
#include "analyze.h"
#include "io.h"
#include "util.h"
#include "config.h"


//----------------------------------------------------------------------------
// PRIVATE STRUCTURE DEFINITIONS
//----------------------------------------------------------------------------
struct window
{
	// The mappability fraction.
	long double map_fract;
	// The number of reads.
	int n_reads;
	// An array holding the count of actual peaks at each threshold.  The
	// index of the array is the threshold - the minimum threshold.
	int* counts;
};

/*
 * This structure stores a single peak's start and stop positions.
 */
typedef struct
{
	// The start position of the peak.
	int start;
	// The stop position of the peak.
	int stop;
}* Peak;

struct actThresh
{
	// This flag tells whether a stop or start position is currently sought.
	// 0 indicates that a start of a peak is sought and 1 indicates that a
	// stop is sought.
	int flag;
	// This stores the most recently found stop position.  It is used to
	// determine whether the next peak should be merged with the previous peak
	// or if it is a new peak.
	int stop;
	// This is a pointer to the window indicated by peaks in this threshold.
	// It is used to count a peak in each window it overlaps.
	int w;

	// An array of the peaks at this threshold.
	Peak* ps;
	// The total number of peaks in this threshold.
	int nps;
	// The number of Peak pointers that can currently be stored in the array
	// without allocating more memory.
	int size;
	// A pointer to the last peak outputted to the file.
	int pp;
};


//----------------------------------------------------------------------------
// PRIVATE FUNCTION PROTOTYPES
//----------------------------------------------------------------------------
Window newWindow( void );
ActThresh newActThresh( void );
void process( Window* wa, ActThresh* ata, const int height, const int pos );
void insertPeak( ActThresh at, const int start );


//----------------------------------------------------------------------------
// PUBLIC FUNCTIONS
//----------------------------------------------------------------------------
Window** newWA( void )
{
	// Allocate memory for an array of Window pointers for each chromosome.
	Window** wa = safe_malloc( N_CHRS * sizeof(Window*) );

	// This loop runs once for each chromosome.
	for(int c = 0; c < N_CHRS; c++)
	{
		// Allocate memory for one Window pointer per window.
		wa[c] = safe_malloc( W_PER_C * sizeof(Window) );

		// Initialize all the window structures in this chromosome.
		for(int w = 0; w < W_PER_C; w++)
			wa[c][w] = newWindow();
	}

	return wa;
}

void getMapFracts( FILE* map, Window** wa )
{
	// These local variables are used for temporary storage.
	char cname[6];
	int window = 0;
	int count = 0;
	int cnum = 0;

	// These variables are used in loop and scanf() syntax.
	int ret = 0;
	char ch = '\0';

	// This loop runs until the end of the file is reached.
	while( (ch = getc(map)) != EOF )
	{
		// Return the character used for testing to the file.
		ungetc( ch, map );

		// Get the information from this line of the file.
		ret = fscanf(map, "%5s%d%d", cname, &window, &count);

		// If three arguments were not successfully read, there is an error in
		// this line, so skip the line.
		if( ret != 3 )
		{
			getNewline( map );
			printf("Error reading mappability file. Line skipped.\n");
			continue;
		}

		// Otherwise get the chromosome number from the string read.
		cnum = getCnum( cname );

		// If this function fails, skip this line.  This is probably a result
		// of a chromosome within the mappability file which is not between
		// the bounds of MIN_CNUM and MAX_CNUM as specified in config.h.
		// Also, if this window does not exist, skip the line.
		if( cnum == -1 || window >= W_PER_C || window < 0 )
		{
			getNewline( map );
			continue;
		}

		// Compute the mappability fraction and store it.
		wa[cnum-MIN_CNUM][window]->map_fract = ((long double) count) / W_SIZE;

		getNewline( map );	// Skip to the next line.
	}
}

void getNreadsStarts( FILE* eland, Window* wa, Starts starts )
{
	// These local variables are used for temporary storage.
	char seq[99];
	int pos = 0;
	char dir = '\0';

	// These variables are used in loop and scanf() syntax.
	int ret = 0;
	char ch = '\0';

	// This loop runs until the end of the file is reached.
	while( (ch = getc(eland)) != EOF )
	{
		// Return the character used for testing to the file.
		ungetc( ch, eland );

		// Get the information from this line of the file.
		ret = fscanf(eland, "%*s%98s%*s%*s%*s%*s%*s%d %c", seq, &pos, &dir);

		// If one argument was not successfully read, there is an error in
		// this line, so skip the line.
		if( ret != 3 )
		{
			getNewline( eland );
			printf("Error reading Eland file. Line skipped.\n");
			continue;
		}

		// If this is a reverse read, correct the position start.
		if( dir == 'R' )
			pos += strlen( seq ) - READ_LENGTH;

		// If the direction is not reverse or forward, there is an error in
		// this line, so skip the line.
		else if( dir != 'F' )
		{
			getNewline( eland );
			printf("Error reading Eland file. Line skipped.\n");
			continue;
		}

		// Find the window in which this position lies and check that it is
		// within the array memory allocated.  If it is beyond the allocated
		// memory, skip this line.
		int window = pos / W_SIZE;
		if( window >= W_PER_C || window < 0 )
		{
			getNewline( eland );
			continue;
		}

		// Increment the number of reads of the window in which this read
		// lies.
		wa[window]->n_reads++;

		// Insert this position into the position array.
		insertStart( starts, pos );

		getNewline( eland );	// Skip to the next line.
	}

	sortStarts( starts );		// Sort the position array by position.
}

ActThresh* newActThreshArray( void )
{
	// Allocate memory for the array.
	ActThresh* ata = safe_malloc( N_THRESHES * sizeof(ActThresh) );

	// Initialize each actThresh structure.
	for( int i = 0; i < N_THRESHES; i++)
		ata[i] = newActThresh();

	return ata;
}

void getActCounts( FILE* sgr, Window* wa, ActThresh* ata )
{
	// These local variables are used for temporary storage.
	int pos = 0;
	int height = 0;

	// These variables are used in loop and scanf() syntax.
	int ret = 0;
	char ch = '\0';

	// Allocate memory in the window array to store the counts for each
	// threshold in each window on this chromosome.
	for(int w = 0; w < W_PER_C; w++)
	{
		// Allocate the memory.
		wa[w]->counts = safe_malloc( N_THRESHES * sizeof(int) );

		// Initialize the fields.
		for( int i = 0; i < N_THRESHES; i++ )
			wa[w]->counts[i] = 0;
	}

	// This loop runs until the end of the file is reached.
	while( (ch = getc(sgr)) != EOF )
	{
		// Return the character used for testing to the file.
		ungetc( ch, sgr );

		// Get the information from this line of the file.
		ret = fscanf(sgr, "%*s%d%d", &pos, &height);

		// If two arguments were not successfully read, there is an error in
		// this line, so skip the line.
		if( ret != 2 )
		{
			getNewline( sgr );
			printf("Error reading SGR file. Line skipped.\n");
			continue;
		}

		// Use this position to find peaks.
		process( wa, ata, height, pos );

		getNewline( sgr );	// Skip to the next line.
	}

	// Store the stop position of the last peak as long as there was at least
	// one peak.
	for(int i = 0; i < N_THRESHES; i++)
	{
		// If there was at least 1 peak at this threshold, the following
		// comparisons will be safe.
		if( ata[i]->nps > 0 )
		{
			// If the stop of this peak was recorded in the at->stop field,
			// use at->stop.
			if( ata[i]->stop > ata[i]->ps[ata[i]->nps-1]->start )
				ata[i]->ps[ata[i]->nps-1]->stop = ata[i]->stop;

			// Otherwise the peak continued beyond the end of the region, so
			// call the end of the peak the final read from the file.
			else
			{
				ata[i]->ps[ata[i]->nps-1]->stop = pos;

				// Get the window in which this position is located.
				int stopw = pos / W_SIZE;

				// If the stop position of the last peak was never found, then
				// it is necessary to increment all the windows between the
				// start position of the peak and the window in which the
				// final position was located.
				while( (ata[i]->w+1) <= stopw )
				{
					ata[i]->w++;
					wa[ata[i]->w]->counts[i]++;
				}
			}
		}
	}
}

double getWMapFract( Window w )
{
	return w->map_fract;
}

int getWNReads( Window w )
{
	return w->n_reads;
}

int getWCount( Window w, const int thresh )
{
	return w->counts[thresh-MIN_THRESH];
}

void findPeaks( FILE* out, ActThresh at, Starts sample_starts,
		Starts control_starts, Data data, long double* pCache,
		long double* factCache, int* last_startp, int* last_stopp,
		const int w_start, const int w_stop, const long double scaling_factor,
		const int max_fact, const int max_exp, const String cname )
{
	// Advance until the first position located beyond the start of the window
	// is reached.
	while( (at->pp < at->nps) && (at->ps[at->pp]->stop < w_start) )
		at->pp++;

	// Check if this peak is within the window.  If it is within the window,
	// it is the first peak and a merge should be checked for with the last
	// peak from the previous window.
	if( (at->pp < at->nps) && (at->ps[at->pp]->start <= w_stop) )
	{
		// The start of the peak is the greater of either the peak start or
		// the window start.
		int start = (at->ps[at->pp]->start > w_start) ?
				at->ps[at->pp]->start : w_start;

		// If the last peak from the previous window is beyond the maximum gap
		// from this peak, this is a new peak, so print the previous peak and
		// record the starting position of this one.
		if( start - (*last_stopp) > MAX_GAP )
		{
			if( (*last_stopp) >= 0 )
				fprintfTagData( out, sample_starts, control_starts, data,
						pCache, factCache, (*last_startp), (*last_stopp),
						scaling_factor, max_fact, max_exp, cname );
			(*last_startp) = start;
		}

		// Either way, last_stop is the end of this peak and start looking for
		// the next peak.
		(*last_stopp) = (at->ps[at->pp]->stop < w_stop) ?
				at->ps[at->pp]->stop : w_stop;
		at->pp++;
	}

	// Now that the first peak in this window has been found, any more peaks
	// in this window must be at least max_gap apart and it is now only
	// necessary to make sure that the peak is within the window.
	while( (at->pp < at->nps) && (at->ps[at->pp]->start <= w_stop) )
	{
		// Print the previous peak now that it is complete and record the
		// starting position of this peak..
		fprintfTagData( out, sample_starts, control_starts, data, pCache,
				factCache, (*last_startp), (*last_stopp), scaling_factor,
				max_fact, max_exp, cname );
		(*last_startp) = at->ps[at->pp]->start;

		// The last_stop field is set to the lesser of the end of the peak and
		// the end of this window.
		(*last_stopp) = (at->ps[at->pp]->stop < w_stop) ?
				at->ps[at->pp]->stop : w_stop;

		at->pp++;	// Start looking for the next peak.
	}

	// Once a peak is reached that is outside the window, the pointer is
	// decremented in case the previous peak overlaps and is located in the
	// next window as well.
	if( at->pp > 0 )
		at->pp--;
}

void freeActThreshArray( ActThresh* ata )
{
	for(int i = 0; i < N_THRESHES; i++)
	{
		// Free all the peaks in the actThresh structure.
		for(int j = 0; j < ata[i]->nps; j++)
			free( ata[i]->ps[j] );

		free( ata[i]->ps );		// Free the peak array.
		free( ata[i] );			// Free the structure.
	}

	free( ata );	// Free the actThresh array.
}

void freeCounts( Window* wa )
{
	for(int i = 0; i < W_PER_C; i++)
		free( wa[i]->counts );
}

void freeWA( Window** wa )
{
	for(int c = 0; c < N_CHRS; c++)
	{
		// Free each window structure.
		for(int w = 0; w < W_PER_C; w++)
			free( wa[c][w] );
		free( wa[c] );	// Free the array for each chromosome.
	}

	free( wa );		// Free the pointer.
}


//----------------------------------------------------------------------------
// PRIVATE FUNCTIONS
//----------------------------------------------------------------------------
/*
 * This function allocates memory for and initializes the fields of a new
 * 	window structure.  I have purposely neglected to allocate memory for the
 * 	counts field because it would require a lot of memory and the field is
 * 	only needed for one chromosome at a time.
 * Outputs a pointer to the new structure.
 */
Window newWindow( void )
{
	// Allocate memory for the structure.
	Window w = safe_malloc( sizeof(*w) );

	// Initialize the fields.
	w->map_fract = 0.0L;
	w->n_reads = 0;
	w->counts = NULL;

	return w;
}

/*
 * This function allocates memory for and initializes the fields of a new
 * 	actThresh structure.
 * Outputs a pointer to the new structure.
 */
ActThresh newActThresh( void )
{
	// Allocate memory for the structure.
	ActThresh at = safe_malloc( sizeof(*at) );

	// Initialize the fields.
	at->flag = 0;
	at->stop = -MAX_GAP - 1;
	at->w = 0;
	at->nps = 0;
	at->size = 10;
	at->pp = 0;

	// Allocate memory for the array.
	at->ps = safe_malloc( at->size * sizeof(Peak) );

	return at;
}

/*
 * This function walks the position and height through each threshold.  It
 * 	keeps a running count of the number of peaks for each threshold in the
 * 	window structure and stores the start and end positions of the peaks in
 * 	the actThresh array. By storing the counts in the window structure, they
 * 	can be more easily accessed during the simulation.
 * wa: the single array of window structures for this chromosome.
 * ata: a pointer to the actThresh array containing the array of actThresh
 * 	structures that contain all the information that differs between
 * 	thresholds.
 * height: the height at this position.
 * pos: the position.
 */
void process( Window* wa, ActThresh* ata, const int height, const int pos )
{
	// This loop runs once for each threshold.
	for(int thresh = MIN_THRESH; thresh <= MAX_THRESH; thresh++)
	{
		// Get the threshold index.
		int ti = thresh - MIN_THRESH;

		// Get the current threshold's structure.
		ActThresh at = ata[ti];

		// This block is entered if the height is above the threshold and we
		// are looking for the starting position of a peak.
		if( (height >= thresh) && (at->flag == 0) )
		{
			// If this peak is further than the maximum gap from the most
			// recently found peak, then this peak is a new peak and must be
			// counted.
			if( pos - at->stop > MAX_GAP )
			{
				// Find the window in which this position is located.
				at->w = pos / W_SIZE;

				// If the window is beyond the window parameters, an error has
				// occurred and this function should return without doing
				// anything.
				if( at->w >= W_PER_C || at->w < 0 )
					return;

				// Increment the count in this window for this threshold.
				wa[at->w]->counts[ti]++;

				// Set the stop position for the previous peak, since that
				// peak is now definitively complete.
				if( at->nps > 0 )
					at->ps[at->nps-1]->stop = at->stop;

				// Insert the new peak into the array.
				insertPeak( at, pos );
			}

			// Whether this is a new peak or not, set the flag to 1 to look
			// for the end of this peak.
			at->flag = 1;
		}

		// If the height is below the threshold and the end of a peak is
		// sought, this is the end of the peak.
		else if( (height < thresh) && (at->flag == 1) )
		{
			// Set the stop field to this position and set the flag to 0.  If
			// the next peak begins more than max_gap away from this peak,
			// then this peak's stop position will be set to the value of the
			// at->stop field.
			at->stop = pos - 1;
			at->flag = 0;

			// Get the window in which this position is located.
			int stopw = at->stop / W_SIZE;

			// If the window does not exist, return.
			if( stopw >= W_PER_C || stopw < 0 )
				return;

			// The at->w field stores the window in which the start position
			// of this peak was located.  If the stop position is in the same
			// window as the start position, don't increment anything in the
			// window array.  Otherwise, keep incrementing the successive
			// window until the window in which this stop position is located
			// is reached.
			while( (at->w + 1) <= stopw )
			{
				at->w++;
				wa[at->w]->counts[ti]++;
			}
		}
	}
}

/*
 * This function allocates memory for and initializes a new peak structure and
 *	stores it in the array.
 * at: a pointer to the actThresh structure in which the peak is to be stored.
 * start: the start position of the peak.
 */
void insertPeak( ActThresh at, const int start )
{
	// Allocate memory for the structure.
	Peak p = safe_malloc( sizeof(*p) );

	// Initialize the fields.
	p->start = start;
	p->stop = -1;

	// Insert the peak into the array.
	at->ps[at->nps] = p;
	at->nps++;

	// If necessary, allocate more memory for the next peak.
	if( at->nps >= at->size )
	{
		at->size *= 2;
		at->ps = safe_realloc( at->ps, at->size * sizeof(Peak) );
	}
}
