#!/usr/bin/env perl
# DO NOT EDIT this file in any installed-bin location --
# edit kent/src/hg/snp/hapmapLd/mergeHaploviewLD.pl instead.

# $Id: mergeHaploviewLD.pl,v 1.1 2007/10/29 23:47:21 angie Exp $

use warnings;
use strict;

sub usage {
  die "usage: mergeHaploviewLD.pl mapFile outFile.LD.gz
Assumes that mapFile is uncompressed and ends in .map, and that
mapFileRoot.???.txt.LD[.gz] exist in same directory as mapFile
-- natural result of splitGenotype.pl followed by Haploview.
Puts gzipped output to outFile.LD.gz.\n\n";
}

my $mapFileName = shift || &usage;
my $outFileName = shift || &usage;
&usage if (@ARGV);

open(MAP, "$mapFileName")
  || die "Could not open mapFile $mapFileName: $!\n";
open(OUT, "| gzip -c > $outFileName")
  || die "Can't open pipe to gzip -c > $outFileName: $!";

my $baseDir = $mapFileName;
$baseDir =~ s#/[^/]+$##;


sub getMappedSnps() {
  my ($nextMapHeader, %mappedSnps);
  while (<MAP>) {
    if (/^# \S+$/) {
      $nextMapHeader = $_;
      last;
    } elsif (/^(rs\d+)$/) {
      $mappedSnps{$1} = 1;
    } else {
      die "Can't parse line $. of $mapFileName:\n$_\t";
    }
  }
  return ($nextMapHeader, \%mappedSnps);
}

my $mapHeader = <MAP>;
my $ldHeader;
my $done = 0;
while ($mapHeader) {
  die "mapFile $mapFileName header not in expected format  " .
      "[# genotypes file]:\n$mapHeader\t"
    if ($mapHeader !~ /^# ([\w\.+-]+)$/);
  my $mapped = $1;
  $mapped =~ s/\.txt/.txt.LD/;
  my $ldChunkFileName = "$baseDir/$mapped";
  die "Can't find expected LD chunk file $ldChunkFileName.\n"
    if (! -s $ldChunkFileName);
  my $ldChunkFile;
  if ($ldChunkFileName =~ /\.gz$/) {
    open($ldChunkFile, "zcat $ldChunkFileName |")
      || die "Couldn't open pipe from zcat $ldChunkFileName: $!\n";
  } else {
    open($ldChunkFile, "$ldChunkFileName")
      || die "Couldn't open $ldChunkFileName: $!\n";
  }
  ($mapHeader, my $mappedSnps) = &getMappedSnps;
  while (<$ldChunkFile>) {
    if (/^L1\s+L2/) {
      if (! defined $ldHeader) {
	$ldHeader = $_;
	print OUT;
      }
      next;
    } elsif ($_ =~/^(rs\d+)\s+rs\d+\s+/) {
      if ($mappedSnps->{$1}) {
	print OUT;
      }
    } else {
      die "Can't parse line $. of $ldChunkFileName:\n$_\t";
    }
  }
  close($ldChunkFile);
}
close(MAP);

