#!/usr/bin/perl
#
# find_patterns_gibb : A Gibbs sampler to find patterns. Allows patterns that
# don't require inclusion of all sequences.
#
# Notes
# * $gibbs_state contains the CENTERS of the motifs; the motifs extend from
#   [$center - $width/2, $center + $width/2 - 1]. We assume that $width is
#   divisibly by 2.
#
################################################################################
BEGIN {
    push (@INC, "$ENV{PWD}/scripts/fast_pmf_cycle");
    push (@INC, "$ENV{PWD}/scripts/euclidean_dist");
    srand(0);
}
$| = 1;

use strict;
use Carp;
use Math::CDF;
use euclidean_dist;
use fast_pmf_cycle;
use DBI;

#
# Database settings
#
my $DB_NAME = "chromasig;host=redivivus.cacr.caltech.edu";
my $DB_USER = "chromasig";
my $DB_PASS = "Caan0che";

#
# constants
#
my @MODS = qw(H3K9ac H3K27me3 TAF1 Long-Non-PolyA-CAGE H3K4me3 H3K36me3 H3K4me1 Control CTCF DNase-Seq-UW H3K4me2 H3K27ac H4K20me1);

my $NONE_VAL = -9999;

my $TRUE = 1;
my $FALSE = 0;
my $DEBUG = 0;
my $DEBUG2 = 0;
my $DEBUG3 = 0;

my $MAX_WANDERING_DIST = 10; # in 100-bp bins, half of this per side
my $STD_FACTOR = 5;
my $SEED_PROFILE_SIZE;
my $MAX_SEED_SIZE = 20;

my $NUM_INIT_CYCLES = 0;
my $NUM_CYCLES = 5;

my $PRIOR_MODEL = 0.9;
my $PRIOR_BG    = 0.1;

my $BACKGROUND_BUFFER = 20000;

my %ID_TO_MARK = ( 1 => "Broad-GM12878-Rep1-H3K9ac.36mers.rds",
		2 => "Broad-GM12878-Rep1-H3K27me3.36mers.rds",
		3 => "GM12878-TAF1-SL804.rds",
		4 => "GM12878-Cytosolic-Long-Non-PolyA-CAGE.rds",
		5 => "Broad-GM12878-Rep1-H3K4me3.36mers.rds",
		6 => "Broad-GM12878-Rep1-H3K36me3.36mers.rds",
		7 => "Broad-GM12878-Rep1-H3K4me1.36mers.rds",
		8 => "Broad-GM12878-Rep1-Control.36mers.rds",
		9 => "Broad-GM12878-Rep1-CTCF.36mers.rds",
		10 => "GM12878-DNase-Seq-UW-Rep2.rds",
		11 => "Broad-GM12878-Rep1-H3K4me2.36mers.rds",
		12 => "Broad-GM12878-Rep1-H3K27ac.36mers.rds",
		13 => "Broad-GM12878-Rep1-H4K20me1.36mers.rds",");

my %CHR_TO_ID = ( chr1  =>  1,
		  chr2  =>  2,
		  chr3  =>  3,
		  chr4  =>  4,
		  chr5  =>  5,
		  chr6  =>  6,
		  chr7  =>  7,
		  chr8  =>  8,
		  chr9  =>  9,
		  chr10 => 10,
		  chr11 => 11,
		  chr12 => 12,
		  chr13 => 13,
		  chr14 => 14,
		  chr15 => 15,
		  chr16 => 16,
		  chr17 => 17,
		  chr18 => 18,
		  chr19 => 19,
		  chr20 => 20,
		  chr21 => 21,
		  chr22 => 22,
		  chrX  => 23,
		  chrY  => 24);

################################################################################
#
# MAIN
#
################################################################################
MAIN : {
    my ($filtered_file, $width, $max_wandering_dist,
	$std_factor, $bg_prior) = @ARGV;
    if ((not defined $filtered_file) ||
	(not defined $width) ||
	(not defined $max_wandering_dist) ||
	(not defined $std_factor) ||
	(not defined $bg_prior)) {
	die ("Usage: find_patterns_gibb_test_27_mysql.pl <filtered file> <width> <max wandering dist> <std factor> <bg prior>\n");
    }

    if ($width % 2 != 0) {
	die ("width must be divisible by 2\n");
    }
    if ($max_wandering_dist % 2 != 0) {
	die ("max_wandering_dist must be divisible by 2\n");
    }
    $MAX_WANDERING_DIST = $max_wandering_dist;
    $STD_FACTOR = $std_factor;
    $PRIOR_BG = $bg_prior;
    $PRIOR_MODEL = (1 - $PRIOR_BG) / 2;

    $SEED_PROFILE_SIZE = scalar(@MODS) * ($width + 1);

    # read data
    my $filtered_data = read_filtered($filtered_file);
    my $filtered_size = scalar(@$filtered_data);
    my $test_data = get_test_data();

    # get background stats
    my $background_stats = get_background_dist_global( $test_data, $filtered_data );

    # initialize C code
    fast_pmf_cycle::init( scalar(@MODS), $width, $MAX_WANDERING_DIST, $PRIOR_MODEL, $PRIOR_BG, $STD_FACTOR);
    fast_pmf_cycle::set_DEBUG(0);

    my %done;
    my $done = \%done;
    my $cluster = 0;
    while ($filtered_size - scalar(keys %$done) > $MAX_SEED_SIZE) {
	print STDERR ("\ncluster = $cluster\n"); #print ("\ncluster = $cluster\n");

	# initialize gibbs sampler
	my $gibbs_state = init_gibbs_seed( $filtered_data, $done, $test_data, $width, $background_stats );
	my $copy_gibbs_state = copy_gibbs_state( $gibbs_state );
	my $not_done_ids = get_not_done_ids($filtered_size, $done);
	my @seed_ids = keys (%$gibbs_state);

	for(my $cycle = 1 ; $cycle <= $NUM_CYCLES ; $cycle++) {
	    print STDERR ("\ncycle = $cycle\n"); #print ("\ncycle = $cycle\n");

	    # init variables for cycle
	    my $top_vote_tally;
	    my %new_state;
	    my $new_state = \%new_state;
	    my $motif_dist = compute_motif_full($gibbs_state, -1, $test_data, $width, $background_stats);
	    my $iter = 0;

	    while (1) {
		my ($exclude_id, $is_init, $is_first) = get_exclude_id($cycle, $iter,\@seed_ids, $not_done_ids);
		if ($exclude_id == -1) {
		    last;
		}
		print_status($iter);

		# compute probability distribution for selecting a site
		my ($top_pol, $top_index) = get_best_alignment($exclude_id, $filtered_data, $test_data, $width);

		my $en = $filtered_data->[$exclude_id]->{en};
		my $original_center = $filtered_data->[$exclude_id]->{loc};

		if (!$is_init) {
		    $top_vote_tally->{$top_index}++;
		}
		update_new_state( $filtered_data, $exclude_id, $new_state, $top_index, $top_pol );
		$iter++;
	    }

	    # reset gibbs
	    $gibbs_state = $new_state;

	    # gibbs state too small, so this seed was bad. Remove all elements and try again.
	    if (scalar (keys %$gibbs_state) <= scalar(@seed_ids)) {
		print STDERR ("bad seed, skipping\n");
		update_done($copy_gibbs_state, $done);
		last;
	    }
	}

	# print cluster, update done
	if (scalar (keys %$gibbs_state) > scalar(@seed_ids)) {

	    print ("\ncluster = $cluster\n");
	    print_cycle_summary($NUM_CYCLES, $gibbs_state, $test_data, $width, $background_stats);
	    print_cluster($gibbs_state, $cluster);
	    update_done($gibbs_state, $done);
	    $cluster++;
	}
    }
}

################################################################################
#
# get_best_alignment
#
################################################################################
sub get_best_alignment {
    my ($exclude_id, $filtered_data, $test_data, $width) = @_;

    # check args
    if (not defined $exclude_id) {
	Carp::confess ("exclude_id must be defined\n");
    }
    if (not defined $filtered_data) {
	Carp::confess ("filtered_data must be defined\n");
    }
    if (not defined $test_data) {
	Carp::confess ("test_data must be defined\n");
    }
    if (not defined $width) {
	Carp::confess ("width must be defined\n");
    }

    # load excluded region for C code
    my $en = $filtered_data->[$exclude_id]->{en};
    my $original_center = $filtered_data->[$exclude_id]->{loc};

#    if ($DEBUG3) {
#	print("\nprofile:\n");
#	foreach my $mod (@MODS) {
#	    print("\nmod: $mod\n");
#	    for(my $i = -$MAX_WANDERING_DIST/2 - $width/2 ;
#		$i <= $MAX_WANDERING_DIST/2 + $width/2 ;
#		$i++) {
#
#		my $loc = $original_center + 100 * $i;
#		my $logr = $test_data->{$mod}->{$en}->{$loc};
#		if ($logr eq "none") {
#		    $logr = $NONE_VAL;
#		}
#
#		print(join("\t", "", $i, $loc, $logr) . "\n");
#	    }
#	}
#    }

    my $data = read_test_data($test_data, $en, $original_center, ($MAX_WANDERING_DIST + $width) * 100 / 2);
    if ($DEBUG3) {
	print("\nprofile:\n");
	foreach my $mod (@MODS) {
	    print("\nmod: $mod\n");
	    for(my $i = -$MAX_WANDERING_DIST/2 - $width/2 ;
		$i <= $MAX_WANDERING_DIST/2 + $width/2 ;
		$i++) {

		my $loc = $original_center + 100 * $i;
		my $logr = $data->{$mod}->{$loc} || 0;
		if ($logr eq "none") {
		    $logr = $NONE_VAL;
		}

		print(join("\t", "", $i, $loc, $logr) . "\n");
	    }
	}
    }

#    my $offset = -1 * (-$MAX_WANDERING_DIST/2 - $width/2);
#    for (my $mod_i = 0 ; $mod_i < scalar(@MODS) ; $mod_i++) {
#	for(my $loc_i = -$MAX_WANDERING_DIST/2 - $width/2 ;
#	    $loc_i <= $MAX_WANDERING_DIST/2 + $width/2 ;
#	    $loc_i++) {
#
#	    my $mod = $MODS[$mod_i];
#	    my $loc = $original_center + 100 * $loc_i;
#	    my $logr = $test_data->{$mod}->{$en}->{$loc};
#	    if ($logr eq "none") {
#		$logr = $NONE_VAL;
#	    }
#
#	    my $new_loc_i = $loc_i + $offset;
#	    fast_pmf_cycle::set_region($mod_i, $new_loc_i, $logr + 0.0);
#	}
#    }

    my $offset = -1 * (-$MAX_WANDERING_DIST/2 - $width/2);
    for (my $mod_i = 0 ; $mod_i < scalar(@MODS) ; $mod_i++) {
	for(my $loc_i = -$MAX_WANDERING_DIST/2 - $width/2 ;
	    $loc_i <= $MAX_WANDERING_DIST/2 + $width/2 ;
	    $loc_i++) {

	    my $mod = $MODS[$mod_i];
	    my $loc = $original_center + 100 * $loc_i;
	    my $logr = $data->{$mod}->{$loc} || 0;
#	    if ($logr eq "none") {
#		$logr = $NONE_VAL;
#	    }

	    my $new_loc_i = $loc_i + $offset;
	    fast_pmf_cycle::set_region($mod_i, $new_loc_i, $logr + 0.0);
	}
    }

    # call C code
    fast_pmf_cycle::compute_pmf();
    my $exclude = fast_pmf_cycle::get_votes();
    if ($exclude) {
	return (1, "EXCLUDE");
    }
    fast_pmf_cycle::tally_votes();
    return (fast_pmf_cycle::get_return_pol(), fast_pmf_cycle::get_return_loc());
}

################################################################################
#
# print_cycle_summary
#
################################################################################
sub print_cycle_summary {
    my ($cycle, $gibbs_state, $test_data, $width, $background_stats) = @_;

    # check args
    if (not defined $cycle) {
	Carp::confess ("cycle must be defined\n");
    }
    if (not defined $gibbs_state) {
	Carp::confess ("gibbs_state must be defined\n");
    }
    if (not defined $test_data) {
	Carp::confess ("test_data must be defined\n");
    }
    if (not defined $width) {
	Carp::confess ("width must be defined\n");
    }
    if (not defined $background_stats) {
	Carp::confess ("background_stats must be defined\n");
    }

#    # output summary
#    print("top vote tally:\n");
#    my $total = 0;
#    foreach my $center_index (sort {$a <=> $b } keys %$top_vote_tally) {
#	print("    $center_index : " . $top_vote_tally->{$center_index} . "\n");
#	if ($center_index ne "EXCLUDE") {
#	    $total += $top_vote_tally->{$center_index};
#	}
#    }
#    print("total number classified: $total\n");

    # print motif at end
    print("\n\nmotif at end of $cycle cycles:\n");
    get_running_stats ( $gibbs_state, $test_data, $width, $background_stats, $TRUE );
}

################################################################################
#
# print_cluster
#
################################################################################
sub print_cluster {
    my ($gibbs_state, $cluster) = @_;

    # check args
    if (not defined $gibbs_state) {
	Carp::confess ("gibbs_state must be defined\n");
    }
    if (not defined $cluster) {
	Carp::confess ("cluster must be defined\n");
    }

    foreach my $id (keys %$gibbs_state) {
	print(join("\t",
		   $gibbs_state->{$id}->{en},
		   $gibbs_state->{$id}->{loc},
		   $gibbs_state->{$id}->{pol},
		   $cluster) . "\n");
    }
}

################################################################################
#
# copy_gibbs_state: copies the gibbs state
#
################################################################################
sub copy_gibbs_state {
    my ($gibbs_state) = @_;

    # check args
    if (not defined $gibbs_state) {
	Carp::confess ("gibbs_state must be defined\n");
    }

    my $new_state;
    foreach my $id (keys %$gibbs_state) {
	$new_state->{$id}->{en}  = $gibbs_state->{$id}->{en};
	$new_state->{$id}->{loc} = $gibbs_state->{$id}->{loc};
	$new_state->{$id}->{pol} = $gibbs_state->{$id}->{pol};
    }

    return $new_state;
}

################################################################################
#
# update_new_state
#
################################################################################
sub update_new_state {
    my ($filtered_data, $exclude_id, $new_state, $top_index, $top_pol) = @_;

    # check args
    if (not defined $filtered_data) {
	Carp::confess ("filtered_data must be defined\n");
    }
    if (not defined $exclude_id) {
	Carp::confess ("exclude_id must be defined\n");
    }
    if (not defined $new_state) {
	Carp::confess ("new_state must be defined\n");
    }
    if (not defined $top_index) {
	Carp::confess ("top_index must be defined\n");
    }
    if (not defined $top_pol) {
	Carp::confess ("top_pol must be defined\n");
    }

    if ($top_index eq "EXCLUDE") {
	return;
    }

    my $en = $filtered_data->[$exclude_id]->{en};
    my $original_center = $filtered_data->[$exclude_id]->{loc};

    my $new_center = $original_center + 100 * $top_index;
    $new_state->{$exclude_id}->{en} = $en;
    $new_state->{$exclude_id}->{loc} = $new_center;
    $new_state->{$exclude_id}->{pol} = $top_pol;
}

################################################################################
#
# update_done: updates the $done hashref to reflect the elements that have been
# visited
#
################################################################################
sub update_done {
    my ($gibbs_state, $done) = @_;

    # check args
    if (not defined $gibbs_state) {
	Carp::confess ("gibbs_state must be defined\n");
    }
    if (not defined $done) {
	Carp::confess ("done must be defined\n");
    }

    foreach my $id (keys %$gibbs_state) {
	$done->{$id} = 1;
    }
}

################################################################################
#
# get_not_done_ids
#
################################################################################
sub get_not_done_ids {
    my ($filtered_size, $done) = @_;

    # check args
    if (not defined $filtered_size) {
	Carp::confess ("filtered_size must be defined\n");
    }
    if (not defined $done) {
	Carp::confess ("done must be defined\n");
    }

    my @not_done;
    for(my $i=0 ; $i < $filtered_size ; $i++) {
	if (!$done->{$i}) {
	    push(@not_done, $i);
	}
    }

    return \@not_done;
}

################################################################################
#
# get_exclude_id
#
################################################################################
sub get_exclude_id {
    my ($cycle, $iter, $seed_ids, $not_done_ids) = @_;

    # check args
    if (not defined $cycle) {
	Carp::confess ("cycle must be defined\n");
    }
    if (not defined $iter) {
	Carp::confess ("iter must be defined\n");
    }
    if (not defined $seed_ids) {
	Carp::confess ("seed_ids must be defined\n");
    }
    if (not defined $not_done_ids) {
	Carp::confess ("not_done_ids must be defined\n");
    }

    my $not_done_ids_size = scalar(@$not_done_ids);
    my $seed_size = scalar(@$seed_ids);
    my $factor = ($cycle == 1) ? $NUM_INIT_CYCLES : 0;
#    my $factor = $NUM_INIT_CYCLES;

    if ($iter < $factor * $seed_size) {
	my $i = $iter % $seed_size;
	return ($seed_ids->[$i], $TRUE, $i==0);
    } else {
	my $i = $iter - $factor * $seed_size;
	my $id = ($i < $not_done_ids_size) ? $not_done_ids->[$i] : -1;
	return ($id, $FALSE, $i==0);
#	return -1;
    }
}

################################################################################
#
# init_gibbs_seed: initializes the gibbs sampler with the best seed of
# $SEED_SIZE elements.
#
# For simplicity, we assume for now that all polarities are positive. The
# Gibbs sampler will fix this later.
#
################################################################################
sub init_gibbs_seed {
    my ($filtered_data, $done, $test_data, $width, $background_stats) = @_;

    # check args
    if (not defined $filtered_data) {
	Carp::confess ("filtered_data must be defined\n");
    }
    if (not defined $done) {
	Carp::confess ("done must be defined\n");
    }
    if (not defined $test_data) {
	Carp::confess ("test_data must be defined\n");
    }
    if (not defined $width) {
	Carp::confess ("width must be defined\n");
    }
    if (not defined $background_stats) {
	Carp::confess ("background_stats must be defined\n");
    }

    print STDERR ("initializing clusters\n");

    # initialize clusters
    my $clusters_hashref;
    my $clusters_arrayref;
    for (my $id=0 ; $id < scalar(@$filtered_data) ; $id++) {
	if (!$done->{$id}) {
	    $clusters_hashref->{$id} = 1;
	    push(@$clusters_arrayref, $id);
	}
    }
    my $clusters_size = scalar(@$clusters_arrayref);

    # get all profiles and norms
    my $profiles_c;
    my $norm;
    for(my $i=0 ; $i < $clusters_size ; $i++) {
	my $id = $clusters_arrayref->[$i];
	$profiles_c->{$id} = get_profile_c($id, $filtered_data, $test_data, $width);

	my $e;
	$e->{id} = $id;
	$e->{norm} = euclidean_dist::euclidean_norm($profiles_c->{$id},
						    $SEED_PROFILE_SIZE);

#	print("norm = " . $e->{norm} . "\n\n");

	push(@$norm, $e);
    }

    # rank sites by euclidean norm
    my @sorted_norm = sort {$b->{norm} <=> $a->{norm}} @$norm;

    my $num_sites = 100;
    # choose $num_sites sites at evenly spaced intervals in top 25 percentile and perform
    # tournament sort
#    my $percentile = int(0.10 * $clusters_size);
#    my $percentile = int(0.75 * $clusters_size);
    my $percentile = 0;
    my $interval = int(0.25 * $clusters_size / $num_sites);
    if ($interval == 0) {
	$interval = 1;
    }

    my $best_seed = undef;
    my $best_score = undef;
    for(my $i=0 ; ($i < $num_sites) && ($i < $clusters_size) ; $i++) {
	print STDERR (".");
	if ($i % 10 == 0) {
	    print STDERR ("\t$i\n");
	}

	my $id = $sorted_norm[$percentile + $i * $interval]->{id};
#	print("i = $i, id = $id\n");

	my $seed = tournament_sort($id, $profiles_c, $clusters_arrayref, $filtered_data);
	my $score = get_seed_score($seed, $test_data, $width, $background_stats);

#	print("score = $score\n");

#	my ($seed, $ave_dist) = tournament_sort($id, $profiles_c,
#						$clusters_arrayref, $filtered_data);
#	my $norm = $sorted_norm[$percentile + $i * $interval]->{norm};
#	my $score = $norm / $ave_dist;

#	if ($best_score < $score) {
	if ((not defined $best_score) ||
	    ($best_score < $score)) {
	    $best_score = $score;
	    $best_seed = $seed;
	}

#	die ("haha");
    }

    if ($DEBUG) {
	print("best seed score = $best_score\n");
    }

#    print("best seed:\n");
#    print("size of seed = " . scalar(keys %$best_seed) . "\n");
#    get_running_stats ( $best_seed, $test_data, $width, $background_stats, $TRUE );    

    # clean up
    foreach my $id (keys %$profiles_c) {
	euclidean_dist::delete_doubleArray( $profiles_c->{$id} );
    }

    return $best_seed;
}

################################################################################
#
# get_seed_score
#
################################################################################
sub get_seed_score {
    my ($seed, $test_data, $width, $background_stats) = @_;

    # check args
    if (not defined $seed) {
	Carp::confess ("seed must be defined\n");
    }
    if (not defined $test_data) {
	Carp::confess ("test_data must be defined\n");
    }
    if (not defined $width) {
	Carp::confess ("width must be defined\n");
    }
    if (not defined $background_stats) {
	Carp::confess ("background_stats must be defined\n");
    }

    my $logr_sums = get_running_stats($seed, $test_data, $width, $background_stats);
    my $motif_dist = compute_motif($seed, -1, $test_data, $width,
				   $logr_sums, $background_stats);

#    if ($DEBUG) { print("\npossible seed:\n"); }
    my $score = 0;
    foreach my $mod (@MODS) {
	my $ave_std = 0;
	my @abs_means;
	for(my $motif_index = -$width/2 ; $motif_index <= $width/2 ; $motif_index++) {
	    my $m = abs($motif_dist->{$mod}->{$motif_index}->{mean});
	    my $s =     $motif_dist->{$mod}->{$motif_index}->{std};
	    $ave_std += $s;
	    push(@abs_means, $m);
	}
	$ave_std /= scalar(@abs_means);

#	my $symmetry_sum_left = 0;
#	my $symmetry_sum_right = 0;
#	for(my $i=0 ; $i < $width/2 ; $i++) {
#	    $symmetry_sum_left  += $abs_means[$i];
#	    $symmetry_sum_right += $abs_means[scalar(@abs_means) - 1 - $i];
#	}
#
#	# symmetry factor is max at 0.5, but min approaches 0
#	my $symmetry_factor;
#	if ($symmetry_sum_left > $symmetry_sum_right) {
#	    $symmetry_factor = 1 - $symmetry_sum_left  / ($symmetry_sum_left + $symmetry_sum_right);
#	} else {
#	    $symmetry_factor = 1 - $symmetry_sum_right / ($symmetry_sum_left + $symmetry_sum_right);
#	}

	# highest to lowest
	@abs_means = sort {$b <=> $a} @abs_means;

	my $differential_signal_factor = 0;
	my $num_elements = int(scalar(@abs_means) / 2);
	for(my $i=0 ; $i < $num_elements ; $i++) {
	    $differential_signal_factor += $abs_means[$i];
	    $differential_signal_factor -= $abs_means[scalar(@abs_means) - 1 - $i];
	}
#	$score += ($max_mean - $min_mean) / $ave_std;
#	$score /= $ave_std;

	$score += $differential_signal_factor / $ave_std;
#	$score += $symmetry_factor * $differential_signal_factor / $ave_std;

#	$score += $best_score;
#	print ("\n");
    }

#    if ($DEBUG) { print("score = $score\n"); }
    return $score;
}

################################################################################
#
# tournament_sort
#
################################################################################
sub tournament_sort {
    my ($id, $profiles_c, $clusters_arrayref, $filtered_data) = @_;

    # check args
    if (not defined $id) {
	Carp::confess ("id must be defined\n");
    }
    if (not defined $profiles_c) {
	Carp::confess ("profiles_c must be defined\n");
    }
    if (not defined $clusters_arrayref) {
	Carp::confess ("clusters_arrayref must be defined\n");
    }
    if (not defined $filtered_data) {
	Carp::confess ("filtered_data must be defined\n");
    }

    # filter out id from clusters_arrayref
    my $original;
    foreach my $e_id (@$clusters_arrayref) {
	if ($id != $e_id) {
	    push(@$original, $e_id);
	}
    }

    # randomize array ordering
    fisher_yates_shuffle($original);

#    print("distances\n");

    # get distances to $id
    my $distances;
    foreach my $e_id (@$original) {
	my $dist = euclidean_dist::euclidean_dist( $profiles_c->{$id},
						   $profiles_c->{$e_id},
						   $SEED_PROFILE_SIZE );
	$distances->{$e_id} = $dist;
#	print(join("\t", $e_id, $dist) . "\n");
    }

    # perform sort
    while (scalar(@$original >= $MAX_SEED_SIZE)) {
	my $new;
	for(my $i=0 ; $i < scalar(@$original) ; $i += 2) {
	    my $e_id_1 = $original->[$i];
	    my $e_id_2 = $original->[$i+1];

	    if ((not defined $e_id_2) ||
		($distances->{$e_id_1} < $distances->{$e_id_2})) {
		push(@$new, $e_id_1);
	    } else {
		push(@$new, $e_id_2);
	    }
	}

	$original = $new;
    }

    # create seed
    push(@$original, $id);
    my $seed;
    foreach my $e_id (@$original) {
	$seed->{$e_id}->{en}  = $filtered_data->[$e_id]->{en};
	$seed->{$e_id}->{loc} = $filtered_data->[$e_id]->{loc};
	$seed->{$e_id}->{pol} = 1;
    }

    return $seed;
}

################################################################################
#
# fisher_yates_shuffle
#
################################################################################
sub fisher_yates_shuffle {
    my ($array) = @_;

    # check args
    if (not defined $array) {
	Carp::confess ("array must be defined\n");
    }

    my $i;
    for ($i = @$array; --$i; ) {
        my $j = int rand ($i+1);
        next if $i == $j;
        @$array[$i,$j] = @$array[$j,$i];
    }
}

################################################################################
#
# get_profile_c: averages the entries in the ids, returning the profile over
# the full length of the window; the object returned is a C object
#
################################################################################
sub get_profile_c {
    my ($id, $filtered_data, $test_data, $width) = @_;

    # check args
    if (not defined $id) {
	Carp::confess ("id must be defined\n");
    }
    if (not defined $filtered_data) {
	Carp::confess ("filtered_data must be defined\n");
    }
    if (not defined $test_data) {
	Carp::confess ("test_data must be defined\n");
    }
    if (not defined $width) {
	Carp::confess ("width must be defined\n");
    }

    my $average_c = euclidean_dist::new_doubleArray( $SEED_PROFILE_SIZE );

    my $en = $filtered_data->[$id]->{en};
    my $center = $filtered_data->[$id]->{loc};
#    for(my $mod_index = 0 ; $mod_index < scalar(@MODS) ; $mod_index++) {
#	my $mod = $MODS[$mod_index];
#
#	for (my $i = -$width/2 ; $i <= $width/2 ; $i++) {
#	    my $index_c = $mod_index * ($width + 1) + ($i + $width/2);
#	    my $val = 0.0 + $test_data->{$mod}->{$en}->{$center + 100 * $i};
#	    euclidean_dist::doubleArray_setitem($average_c, $index_c, $val);
#	}
#    }

    my $data = read_test_data($test_data, $en, $center, $width * 100 / 2);
    for(my $mod_index = 0 ; $mod_index < scalar(@MODS) ; $mod_index++) {
	my $mod = $MODS[$mod_index];

	for (my $i = -$width/2 ; $i <= $width/2 ; $i++) {
	    my $index_c = $mod_index * ($width + 1) + ($i + $width/2);
	    my $val = 0.0 + ($data->{$mod}->{$center + 100 * $i} || 0);
	    euclidean_dist::doubleArray_setitem($average_c, $index_c, $val);
	}
    }

    return $average_c;
}

################################################################################
#
# print_status
#
################################################################################
sub print_status {
    my ($exclude_id) = @_;

    # check args
    if (not defined $exclude_id) {
	Carp::confess ("exclude_id must be defined\n");
    }

    # print stats
    if ($exclude_id != 0) {
	if ($exclude_id % 10 == 0) {
	    print STDERR (".");
	}
	if ($exclude_id % 100 == 0) {
	    print STDERR ("\t$exclude_id\n");
	}
    }
}

################################################################################
#
# compute_motif_full
#
################################################################################
sub compute_motif_full {
    my ($gibbs_state, $exclude_id, $test_data, $width, $background_stats) = @_;

    # check args
    if (not defined $gibbs_state) {
	Carp::confess ("gibbs_state must be defined\n");
    }
    if (not defined $exclude_id) {
	Carp::confess ("exclude_id must be defined\n");
    }
    if (not defined $width) {
	Carp::confess ("width must be defined\n");
    }
    if (not defined $test_data) {
	Carp::confess ("test_data must be defined\n");
    }
    if (not defined $background_stats) {
	Carp::confess ("background_stats must be defined\n");
    }

    my $motif_logr_sums  = get_running_stats ($gibbs_state, $test_data,
					      $width, $background_stats);
    my $motif = compute_motif($gibbs_state, $exclude_id, $test_data, $width,
			      $motif_logr_sums, $background_stats);

    # load motif into C
    for (my $mod_i = 0 ; $mod_i < scalar(@MODS) ; $mod_i++) {
	my $mod = $MODS[$mod_i];

	# motif stats
	for(my $motif_index = -$width/2; $motif_index <= $width/2; $motif_index++) {
	    my $mean = $motif->{$mod}->{$motif_index}->{mean};
	    my $std  = $motif->{$mod}->{$motif_index}->{std};

	    my $new_motif_index = $motif_index + $width/2;
	    fast_pmf_cycle::set_motif_mean($mod_i, $new_motif_index, $mean);
	    fast_pmf_cycle::set_motif_std ($mod_i, $new_motif_index, $std);
	}

	# background stats
	my $bg_mean = $background_stats->{$mod}->{mean};
	fast_pmf_cycle::set_bg_mean($mod_i, $bg_mean);

	my $bg_std  = $background_stats->{$mod}->{std};
	fast_pmf_cycle::set_bg_std($mod_i, $bg_std);
    }    
    fast_pmf_cycle::set_motif_mean_of_std();

    return $motif;
}

################################################################################
#
# compute_motif: computes the distribution at each residue of the motif
#
# The returned $motif_dist is a hash with keys in [-$width/2, $width/2-1]
#
################################################################################
sub compute_motif {
    my ($gibbs_state, $exclude_id, $test_data, $width,
	$motif_logr_sums, $background_stats) = @_;

    # check args
    if (not defined $gibbs_state) {
	Carp::confess ("gibbs_state must be defined\n");
    }
    if (not defined $exclude_id) {
	Carp::confess ("exclude_id must be defined\n");
    }
    if (not defined $width) {
	Carp::confess ("width must be defined\n");
    }
    if (not defined $test_data) {
	Carp::confess ("test_data must be defined\n");
    }
    if (not defined $motif_logr_sums) {
	Carp::confess ("motif_logr_sums must be defined\n");
    }
    if (not defined $background_stats) {
	Carp::confess ("background_stats must be defined\n");
    }

    if ($DEBUG3) {
	print("\nexclude_id = $exclude_id\n");
	if (defined $gibbs_state->{$exclude_id}) {
	    print("  in gibbs state\n");
	    print("  en  = " . $gibbs_state->{$exclude_id}->{en} . "\n");
	    print("  loc = " . $gibbs_state->{$exclude_id}->{loc} . "\n");
	    print("  pol = " . $gibbs_state->{$exclude_id}->{pol} . "\n");
	} else {
	    print("  not in gibbs state\n");
	}
    }

    # copy existing
    my $motif_dist;
    foreach my $mod (@MODS) {
	for(my $motif_index = -$width/2 ; $motif_index <= $width/2 ; $motif_index++) {
	    $motif_dist->{$mod}->{$motif_index}->{mean} =
		$motif_logr_sums->{$mod}->{$motif_index}->{sum};

	    $motif_dist->{$mod}->{$motif_index}->{std} =
		$motif_logr_sums->{$mod}->{$motif_index}->{sum2};
	}
    }

    # update if necessary
    my $num_elements_offset;
    if (defined $gibbs_state->{$exclude_id}) {
	my $en     = $gibbs_state->{$exclude_id}->{en};
	my $center = $gibbs_state->{$exclude_id}->{loc};
	my $pol    = $gibbs_state->{$exclude_id}->{pol};

#	foreach my $mod (@MODS) {
#	    for(my $motif_index = -$width/2 ; $motif_index <= $width/2 ; $motif_index++) {
#		my $index = $motif_index * $pol;
#
#		my $val = $test_data->{$mod}->{$en}->{$center + 100 * $index};
#		if ($val ne "none") {
#		    $motif_dist->{$mod}->{$motif_index}->{mean} -= $val;
#		    $motif_dist->{$mod}->{$motif_index}->{std} -= $val * $val;
#		    $num_elements_offset->{$mod}->{$motif_index} = -1;
#		} else {
#		    $num_elements_offset->{$mod}->{$motif_index} = 0;
#		}
#	    }
#	}

	my $data = read_test_data($test_data, $en, $center, $width * 100 / 2);
	foreach my $mod (@MODS) {
	    for(my $motif_index = -$width/2 ; $motif_index <= $width/2 ; $motif_index++) {
		my $index = $motif_index * $pol;

		my $val = $data->{$mod}->{$center + 100 * $index} || 0;
		if ($val ne "none") {
		    $motif_dist->{$mod}->{$motif_index}->{mean} -= $val;
		    $motif_dist->{$mod}->{$motif_index}->{std} -= $val * $val;
		    $num_elements_offset->{$mod}->{$motif_index} = -1;
		} else {
		    $num_elements_offset->{$mod}->{$motif_index} = 0;
		}
	    }
	}
    }

    # end finding mean
    foreach my $mod (@MODS) {
	for(my $motif_index = -$width/2 ; $motif_index <= $width/2 ; $motif_index++) {
	    my $num_elements = $motif_logr_sums->{$mod}->{$motif_index}->{count};

	    if ($num_elements == 1) {
#		$motif_dist->{$mod}->{$motif_index}->{mean} /= $num_elements;
#		$motif_dist->{$mod}->{$motif_index}->{std}  = $background_stats->{$mod}->{std};
		die ("zero element motif\n");
		next;
	    } elsif ($num_elements == 0) {
#		$motif_dist->{$mod}->{$motif_index}->{mean} = $background_stats->{$mod}->{mean};
#		$motif_dist->{$mod}->{$motif_index}->{std}  = $background_stats->{$mod}->{std};
		die ("negative element motif\n");
		next;
	    }

	    # element was excluded, so update count at this mod and column if necessary
	    if (defined $gibbs_state->{$exclude_id}) {
		$num_elements += $num_elements_offset->{$mod}->{$motif_index};
	    }

	    $motif_dist->{$mod}->{$motif_index}->{mean} /= $num_elements;

	    # currently motif_dist->{std} holds sum(x^2), so then we compute
	    # sqrt( (sum(x^2) - N*(E(X))^2)/(N-1) )
	    $motif_dist->{$mod}->{$motif_index}->{std} -= $num_elements * 
		                                          $motif_dist->{$mod}->{$motif_index}->{mean} * 
		                                          $motif_dist->{$mod}->{$motif_index}->{mean};
	    $motif_dist->{$mod}->{$motif_index}->{std} /= ($num_elements - 1);
	    $motif_dist->{$mod}->{$motif_index}->{std} = sqrt($motif_dist->{$mod}->{$motif_index}->{std});
	}
    }

    return $motif_dist;
}

################################################################################
#
# init_gibbs_seed_file
#
################################################################################
sub init_gibbs_seed_file {
    my ($file) = @_;

    # check args
    if (not defined $file) {
	Carp::confess ("file must be defined\n");
    }

    my $seed;
    my $id = 0;
    open(FILE, $file) || Carp::confess("could not open file ($file)\n");
    foreach my $line (<FILE>) {
	chomp $line;
	my ($en, $loc, $pol) = split(/\t/, $line);
	$seed->{$id}->{en} = $en;
	$seed->{$id}->{loc} = $loc;
	$seed->{$id}->{pol} = ($pol eq "+") ? 1 : -1;

	$id++;
    }
    close(FILE) || Carp::confess("could not close file ($file)\n");

    print STDERR ("done init with $id elements\n");

    return $seed;
}

################################################################################
#
# read_filtered
#
################################################################################
sub read_filtered {
    my ($file) = @_;

    # check args
    if (not defined $file) {
	Carp::confess ("file must be defined\n");
    }

    my $filtered;
    open(FILE, $file) || Carp::confess("could not open file ($file)\n");
    foreach my $line (<FILE>) {
	chomp $line;
	my ($en, $loc) = split(/\t/, $line);

	my $element;
	$element->{en} = $en;
	$element->{loc} = $loc;

	push(@$filtered, $element);
    }
    close(FILE) || Carp::confess("could not close file ($file)\n");

    print STDERR ("filtered_size = " . scalar(@$filtered) . "\n");

    return $filtered;
}

################################################################################
#
# get_test_data
#
################################################################################
sub get_test_data {
    my $test_data = DBI->connect("DBI:mysql:$DB_NAME","$DB_USER","$DB_PASS");
    return $test_data;
}

################################################################################
#
# get_running_stats: computes
# * the sum of all logr values for each position of the motif for each
#   modification
# * the squared sum of all logr values for each position of the motif for each
#   modification
# * the number of non-null elements at each position
#
# The returned $motif_dist is a hash with keys in [-$width/2, $width/2-1]
#
################################################################################
sub get_running_stats {
    my ($gibbs_state, $test_data, $width, $background_stats, $print) = @_;

    # check args
    if (not defined $gibbs_state) {
	Carp::confess ("gibbs_state must be defined\n");
    }
    if (not defined $width) {
	Carp::confess ("width must be defined\n");
    }
    if (not defined $test_data) {
	Carp::confess ("test_data must be defined\n");
    }
    if (not defined $background_stats) {
	Carp::confess ("test_data must be defined\n");
    }

    # pseudocounts
    my $logr_sums;
    foreach my $mod (@MODS) {
	for(my $motif_index = -$width/2 ; $motif_index <= $width/2 ; $motif_index++) {
	    my $val = 0.5;
	    $logr_sums->{$mod}->{$motif_index}->{sum}  += $val;
	    $logr_sums->{$mod}->{$motif_index}->{sum2} += $val * $val;
	    $logr_sums->{$mod}->{$motif_index}->{count}++;
	}
    }

    # start finding mean
    foreach my $id (keys %{ $gibbs_state }) {
	my $en     = $gibbs_state->{$id}->{en};
	my $center = $gibbs_state->{$id}->{loc};
	my $pol    = $gibbs_state->{$id}->{pol};

#	foreach my $mod (@MODS) {
#	    for(my $motif_index = -$width/2 ; $motif_index <= $width/2 ; $motif_index++) {
#		my $index = $motif_index * $pol;
#		my $val = $test_data->{$mod}->{$en}->{$center + 100 * $index};
#
#		if ($val ne "none") {
#		    $logr_sums->{$mod}->{$motif_index}->{sum}  += $val;
#		    $logr_sums->{$mod}->{$motif_index}->{sum2} += $val * $val;
#		    $logr_sums->{$mod}->{$motif_index}->{count}++;
#		}
#	    }
#	}

	my $data = read_test_data($test_data, $en, $center, $width * 100 / 2);
	foreach my $mod (@MODS) {
	    for(my $motif_index = -$width/2 ; $motif_index <= $width/2 ; $motif_index++) {
		my $index = $motif_index * $pol;
		my $val = $data->{$mod}->{$center + 100 * $index} || 0;

		$logr_sums->{$mod}->{$motif_index}->{sum}  += $val;
		$logr_sums->{$mod}->{$motif_index}->{sum2} += $val * $val;
		$logr_sums->{$mod}->{$motif_index}->{count}++;
	    }
	}
    }

    if ($DEBUG || $print) {
#	print ("seed = \n");
#	foreach my $mod (@MODS) {
#	    foreach my $id (keys %{ $gibbs_state }) {
#		my $en     = $gibbs_state->{$id}->{en};
#		my $center = $gibbs_state->{$id}->{loc};
#		my $pol    = $gibbs_state->{$id}->{pol};
#
#		my @vals;
#		for(my $motif_index = -$width/2 ; $motif_index <= $width/2 ; $motif_index++) {
#		    my $index = $motif_index * $pol;
#		    push(@vals, $test_data->{$mod}->{$en}->{$center + 100 * $index});
#		}
#
#		print(join("\t", $mod, $id, $pol, @vals) . "\n");
#	    }
#	}
#
	my $motif_dist = compute_motif($gibbs_state, -1,
				       $test_data, $width,
				       $logr_sums,
				       $background_stats);

	foreach my $mod (@MODS) {
	    print ("mod = $mod\n");
	    for(my $motif_index = -$width/2 ; $motif_index <= $width/2 ; $motif_index++) {
		print (join("\t",
			    $motif_index,
			    $motif_dist->{$mod}->{$motif_index}->{mean},
			    $motif_dist->{$mod}->{$motif_index}->{std}
			    ) . "\n");
	    }
	    print ("\n");
	}
    }

    return $logr_sums;
}

################################################################################
#
# get_background_dist_global: computes the global background distribution
#
################################################################################
sub get_background_dist_global {
    my ($test_data, $filtered_data) = @_;

    # check args
    if (not defined $test_data) {
	Carp::confess ("test_data must be defined\n");
    }
    if (not defined $filtered_data) {
	Carp::confess ("filtered__data must be defined\n");
    }

    my $nums;
#    my $num_blank = 0;
#    my $num_not_blank = 0;
#    foreach my $mod (@MODS) {
#	foreach my $en (keys %{ $test_data->{$mod} }) {
#	    foreach my $loc (keys %{ $test_data->{$mod}->{$en} }) {
#
#		if ($test_data->{$mod}->{$en}->{$loc} eq "none") {
#		    $num_blank++;
#		    #$test_data->{$mod}->{$en}->{$loc} = 0;
#		} else {
#		    $num_not_blank++;
#		    push(@{ $nums->{$mod} }, $test_data->{$mod}->{$en}->{$loc});
#		}
#	    }
#	}
#    }
#
#    print STDERR ("num blanks = " . $num_blank / scalar(@MODS) . "\n");
#    print STDERR ("num not blank = " . $num_not_blank / scalar(@MODS) . "\n");

    my $background_stats;
    my $count = scalar(@$filtered_data) * ((2 * $BACKGROUND_BUFFER) / 100 + 1);

    print STDERR ("computing background mean\n");
    for(my $i=0 ; $i < scalar(@$filtered_data); $i++) {
	if ($i % 100 == 0) {
	    print STDERR (".");
	}
	if ($i % 1000 == 0) {
	    print STDERR (" $i\n");
	}

	my $en = $filtered_data->[$i]->{en};
	my $original_center = $filtered_data->[$i]->{loc};

	my $data = read_test_data($test_data, $en, $original_center, $BACKGROUND_BUFFER);
	for(my $loc = $original_center - $BACKGROUND_BUFFER ;
	    $loc <=   $original_center + $BACKGROUND_BUFFER ;
	    $loc += 100) {

	    foreach my $mod (@MODS) {
		my $val = $data->{$mod}->{$loc} || 0;
		$background_stats->{$mod}->{mean} += $val;
	    }
	}
    }
    foreach my $mod (@MODS) {
	$background_stats->{$mod}->{mean} /= $count;
    }

    print STDERR ("\n\ncomputing background std\n");
    for(my $i=0 ; $i < scalar(@$filtered_data); $i++) {
	if ($i % 100 == 0) {
	    print STDERR (".");
	}
	if ($i % 1000 == 0) {
	    print STDERR (" $i\n");
	}

	my $en = $filtered_data->[$i]->{en};
	my $original_center = $filtered_data->[$i]->{loc};

	my $data = read_test_data($test_data, $en, $original_center, $BACKGROUND_BUFFER);
	for(my $loc = $original_center - $BACKGROUND_BUFFER ;
	    $loc <=   $original_center + $BACKGROUND_BUFFER ;
	    $loc += 100) {

	    foreach my $mod (@MODS) {
		my $val = $data->{$mod}->{$loc} || 0;
		my $diff = $val - $background_stats->{$mod}->{mean};
		$background_stats->{$mod}->{std} += $diff * $diff;
	    }
	}
    }
    foreach my $mod (@MODS) {
	$background_stats->{$mod}->{std} /= $count - 1;
	$background_stats->{$mod}->{std} = sqrt($background_stats->{$mod}->{std});
    }

    if ($DEBUG) {
	print STDERR ("background:\n");
	foreach my $mod (@MODS) {
	    print STDERR ("  mod = $mod:\t" . 
			  "mean = " . $background_stats->{$mod}->{mean} . ", " .
			  "std = " . $background_stats->{$mod}->{std} . "\n");
	}
    }

    return $background_stats;
}

################################################################################
#
# read_test_data
#
################################################################################
sub read_test_data {
    my ($test_data, $chr, $loc, $buffer) = @_;

    # check args
    if (not defined $test_data) {
        die ("test_data must be defined\n");
    }
    if (not defined $chr) {
        die ("chr must be defined\n");
    }
    if (not defined $loc) {
        die ("loc must be defined\n");
    }
    if (not defined $buffer) {
        die ("buffer must be defined\n");
    }

    my $chr_id = $CHR_TO_ID{$chr};
    my $left  = $loc - $buffer;
    my $right = $loc + $buffer;

    my $query = "SELECT loc, mark_id, val FROM gm12878 WHERE chr_id = $chr_id AND loc BETWEEN $left AND $right";
    my $query_handle = $test_data->prepare($query);
    $query_handle->execute();

    my $data;
    while(my($loc, $mark_id, $val) = $query_handle->fetchrow_array()) {
	my $mark = $ID_TO_MARK{$mark_id};
	$data->{$mark}->{$loc} = $val;
    }

    return $data;
}
