#!/usr/bin/env perl
# DO NOT EDIT this file in any installed-bin location --
# edit kent/src/hg/snp/hapmapLd/deQuestionPhase.pl instead.

# $Id: deQuestionPhase.pl,v 1.1 2007/10/29 23:45:05 angie Exp $

use warnings;
use strict;

sub usage {
  die "usage: deQuestionPhase.pl inBase outDir
inBase is the root of the three PHASE files (.phase.gz, _legend.txt.gz, and
_sample.txt.gz).  outDir is the directory into which 3 files of the same
names will be written, after SNPs that have a '?' for any sample have been
removed from phase and legend.  sample is unmodified, but copied into outDir
for completeness.
\n";
}

my $inBase = shift || &usage;
my $outDir = shift || &usage;
&usage if @ARGV;

$inBase =~ /^(.*\/)?([\w_\.+-]+)$/
  || die "inBase $inBase doesn't look right.\t";
my $inRoot = $2;

my $inPhaseName = "$inBase.phase.gz";
die "Required file $inPhaseName does not exist.\n" if (! -e $inPhaseName);
my $inLegendName = "${inBase}_legend.txt.gz";
die "Required file $inLegendName does not exist.\n" if (! -e $inLegendName);
my $inSampleName = "${inBase}_sample.txt.gz";
die "Required file $inSampleName does not exist.\n" if (! -e $inSampleName);

warn "Opening $inPhaseName at " . `date`;
open(PHASE, "zcat $inPhaseName |")
  || die "Couldn't open pipe from zcat $inPhaseName: $!\n";
my @phase;
my @qOffsets;
my $snpCount;
while (<PHASE>) {
  chomp;
  my @snpCodes = split;
  my $wordCount = scalar(@snpCodes);
  if (! defined $snpCount) {
    $snpCount = $wordCount;
  } elsif ($snpCount != $wordCount) {
    die "Inconsistent column count in $inPhaseName.  Previously saw " .
      "$snpCount, but see $wordCount at line $..\n";
  }
  for (my $i = 0;  $i < $wordCount;  $i++) {
    if ($snpCodes[$i] eq '?') {
      push @qOffsets, $i;
    } elsif ($snpCodes[$i] ne '0' && $snpCodes[$i] ne '1') {
      die "Expected to find only 0, 1 or ? SNP codes in $inPhaseName, but " .
	"got '$snpCodes[$i]', (0-based) column $i of (1-based) line $..\n";
    }
  }
  push @phase, \@snpCodes;
}
close(PHASE);
warn "Read all data from $inPhaseName at " . `date`;
die "Empty $inPhaseName?\t" if (! $snpCount);

# Reverse numeric sort @qOffsets so that we can remove higher-offset
# elements first, avoiding the need for offset correction.  Then uniqify.
my @tmp = sort {$b <=> $a} @qOffsets;
@qOffsets = ();
my $prevOffset;
foreach my $offset (@tmp) {
  next if (defined $prevOffset && $prevOffset == $offset);
  push @qOffsets, $offset;
  $prevOffset = $offset;
}
my $numYanked = scalar(@qOffsets);
warn "Will splice out $numYanked offsets: " . join(", ", @qOffsets) . ".\n";

# Remove the ?-tainted SNPs from each row of @phase:
my $newSnpCount = $snpCount - $numYanked;
foreach my $ref (@phase) {
  my @snpCodes = @{$ref};
  foreach my $offset (@qOffsets) {
    splice @snpCodes, $offset, 1;
  }
  $ref = \@snpCodes;
  my $rcc = scalar(@{$ref});
  if ($rcc != $newSnpCount) {
    die "Expected to have $newSnpCount columns in phase output, but looks " .
      "like at least one column has $rcc.\t";
  }
}
warn "Original snp count is $snpCount, " .
  "new snp count is $newSnpCount.\n";
warn "Done with splicing phase data, writing phase output at " . `date`;

# Write new phase output.
my $outPhaseName = "$outDir/$inRoot.phase.gz";
open(PHASE, "| gzip -c > $outPhaseName")
  || die "Can't open pipe to gzip -c > $outPhaseName: $!\n";
foreach my $ref (@phase) {
  print PHASE join(' ', @{$ref}) . "\n";
}
close(PHASE);
@phase = ();
warn "Done with phase at " . `date`;

# Read in legend in a big gulp and remove rows by using splice same way
# as above.
open(LEGEND, "zcat $inLegendName |")
  || die "Can't open pipe from zcat $inLegendName: $!\n";
my $legendHeader = <LEGEND>;
my @legendRows = <LEGEND>;
close(LEGEND);
my $lrc = scalar(@legendRows);
if ($lrc != $snpCount) {
  die "Expected $inLegendName to have $snpCount rows after the 1-line " .
    "header, but got $lrc rows.\t";
}
foreach my $offset (@qOffsets) {
  splice @legendRows, $offset, 1;
}
$lrc = scalar(@legendRows);
if ($lrc != $newSnpCount) {
  die "Expected to end up with $newSnpCount non-header rows of legend " .
    "output, but got $lrc.\t";
}

# Write new legend output.
my $outLegendName = "$outDir/${inRoot}_legend.txt.gz";
open(LEGEND, "| gzip -c > $outLegendName")
  || die "Can't open pipe to gzip -c > $outLegendName: $!\n";
print LEGEND $legendHeader, @legendRows;
close(LEGEND);
warn "Done with legend at " . `date`;

# Sample file is unchanged -- just copy it.
my $outSampleName = "$outDir/${inRoot}_sample.txt.gz";
(system("cp $inSampleName $outSampleName") == 0)
  || die "Can't cp $inSampleName $outSampleName: $!\n";
warn "Done copying sample at " . `date`;

