#!/usr/bin/env perl
# DO NOT EDIT this file in any installed-bin location --
# edit kent/src/hg/snp/hapmapLd/makeDccAndLdBed.pl instead.

# $Id: makeDccAndLdBed.pl,v 1.1 2007/10/29 23:46:06 angie Exp $

# Based on Daryl Thomas's makeDcc.pl and makeLdBed.pl scripts in
# makeDb/doc/hg17.txt.  Combined here for fewer passes through the
# large files, and extended to add several new fields to the bed+:
# T-int and precomputed average scores for dense mode.

# Translates HapMap genotypes (or PHASE legend) and Haploview LD output
# into the HapMap DCC's downloads/ld_data/ format and UCSC's hapmapLd2
# (bed4+) format.  Works on one {chrom, pop} at a time -- suitable for
# a big cluster run.

use warnings;
use strict;

sub usage {
  die "usage: makeDccAndLdBed.pl genotypes.txt.gz LD.gz ld_chr_pop.txt.gz db.pop_chr.bed.gz
First arg can also be a PHASE _legend.txt.gz file.\n";
}


sub min ($$) {
  my ($a, $b) = @_;
  return ($a < $b) ? $a : $b;
}

sub encodeDprime($) {
  my $val = shift @_;
  my $ret;
  if ( ($val > 1) || ($val < -1) ) {
    die "Dprime value ($val) is out of range [-1,1]";
  } elsif ($val >= 0) {
    $ret = ord('a') + $val*9;
  } else {
    $ret = ord('A') - $val*9;
  }
  return chr($ret);
}

sub encodeRsquared($) {
  my $val = shift @_;
  if ( ($val > 1) || ($val < 0) ) {
    die "R^2 value ($val) is out of range [0,1]";
  }
  return encodeDprime($val);
}

sub encodeLod($$) {
  my ($lod, $dPrime) = @_;
  my $ret = ord('a');
  if ($lod >= 2) {		# high LOD
    if (abs($dPrime) < 0.5) {
      $ret = ord('y');
    }				# high LOD, low D'  -> pink
    else {
      $ret += min((int($lod - abs($dPrime) - 1.5)), 9) ;
    }
  } elsif (abs($dPrime) > 0.99) {
    $ret = ord('z');
  }				# high D', low LOD  -> blue
  return chr($ret);
}


############### MAIN ################

my $geno   = shift || &usage;
my $ld     = shift || &usage;
my $outDcc = shift || &usage;
my $outBed = shift || &usage;
&usage if (@ARGV);

$geno =~ m/^(.*\/)?genotypes_(chr[0-9MXY]+)_([A-Z+]+)_/
  || die "Can't parse chrom and population from filename $geno";
my ($chrom, $pop) = ($2, $3);
$outDcc =~ s/$/.gz/ if $outDcc !~ /\.gz$/;
$outBed =~ s/$/.gz/ if $outBed !~ /\.gz$/;

die "Genotype/legend file $geno does not exist.\n" if (! -s $geno);
die "LD file $ld does not exist.\n" if (! -s $ld);

open(GENO,"zcat $geno |" ) || die "can't open pipe from zcat $geno: $!";
open(LD,  "zcat $ld   |" ) || die "can't open pipe from zcat $ld: $!";

my $tmpDcc = `mktemp -p /scratch/tmp makeDccAndLdBed.XXXXXX`; chomp $tmpDcc;
my $tmpBed = `mktemp -p /scratch/tmp makeDccAndLdBed.XXXXXX`; chomp $tmpBed;
open(DCC, "| gzip -c > $tmpDcc" )
  || die "can't open pipe to gzip -c > $tmpDcc: $!";
open(BED, "| gzip -c > $tmpBed" )
  || die "can't open pipe to gzip -c > $tmpBed: $!";

# All we need from the genotypes/legend file is the SNP positions:
my %pos;
my $posField = ($geno =~ /[_\.]legend.txt/) ? 1 : 3;
<GENO>;				#ignore header
while (<GENO>) {
  chomp;
  my @fields = split /\s+/;
  $pos{$fields[0]} = $fields[$posField];
}
close(GENO);

# Add position data to LD data to produce DCC and hapmapLd2 formats:
my ($prevChromStart, $prevChromEnd, $prevName, $prevTInt);
my $ldCount = 0;
my ($dprimeList, $rsquaredList, $lodList, $tInt) = ("", "", "", undef);
my (%sumDprime, %sumRsquared, %sumLod);
<LD>;				#ignore header;
while (<LD>) {
  chomp;
  my ($snp1, $snp2, $dprime, $lod, $rsquared, undef, undef, undef, $tInt) =
    split /\t/;
  my $chromStart = $pos{$snp1};
  my $chromEnd   = $pos{$snp2};
  die unless (defined $chromStart && defined $chromEnd);
  # DCC format is nice and simple:
  print DCC "$chromStart $chromEnd $pop $snp1 $snp2 $dprime $rsquared $lod\n";

  # For hapmapLd2 (bed4+), we distill multiple lines down to a single line
  # per $snp1, with encoded scores.
  my $name = $snp1;
  if ($tInt =~ /^[0-9\.]+$/) {
    $tInt = int($tInt / 100);
    $tInt = 5 if ($tInt > 5);
    $tInt = chr(ord('c') + $tInt);
  } elsif ($tInt eq '-') {
    $tInt = undef;
  } else {
    die "Can't parse tInt '$tInt', line $. of $ld.\n";
  }
  if (defined $prevChromStart && $prevChromStart != $chromStart) {
    if ($ldCount > 0) {
      $prevChromStart--;
      my $avgDprime   = encodeDprime($sumDprime{$prevName}/$ldCount);
      my $avgRsquared = encodeRsquared($sumRsquared{$prevName}/$ldCount);
      my $avgLod      = encodeLod($sumLod{$prevName}/$ldCount,
				  $sumDprime{$prevName}/$ldCount);
      $prevTInt = 'a' if (! defined $prevTInt);
      print BED join("\t", $chrom, $prevChromStart, $prevChromEnd, $prevName,
		     $ldCount, $dprimeList, $rsquaredList, $lodList,
		     $avgDprime, $avgRsquared, $avgLod, $prevTInt) . "\n";
    }
    delete $sumDprime{$prevName};
    delete $sumRsquared{$prevName};
    delete $sumLod{$prevName};
    $ldCount      = 1;
    $dprimeList   = encodeDprime($dprime);
    $rsquaredList = encodeRsquared($rsquared);
    $lodList      = encodeLod($lod, $dprime);
    $sumDprime{$name}   = abs($dprime);
    $sumRsquared{$name} = $rsquared;
    $sumLod{$name}      = $lod;
    $sumDprime{$snp2}   += abs($dprime);
    $sumRsquared{$snp2} += $rsquared;
    $sumLod{$snp2}      += $lod;
    $prevTInt           = undef;
  } elsif ($chromEnd - $chromStart < 250000) {
    $prevChromEnd     = $chromEnd;
    $ldCount++;
    $dprimeList   .= encodeDprime($dprime);
    $rsquaredList .= encodeRsquared($rsquared);
    $lodList      .= encodeLod($lod, $dprime);
    $sumDprime{$name}   += abs($dprime);
    $sumRsquared{$name} += $rsquared;
    $sumLod{$name}      += $lod;
    $sumDprime{$snp2}   += abs($dprime);
    $sumRsquared{$snp2} += $rsquared;
    $sumLod{$snp2}      += $lod;
  }
  ($prevChromStart, $prevChromEnd, $prevName) =
    ($chromStart, $chromEnd, $name);
  $prevTInt = $tInt if (defined $tInt);
}
close(LD);
close(DCC);

if ($ldCount > 0) {
  $prevChromStart--;
  my $avgDprime   = encodeDprime($sumDprime{$prevName}/$ldCount);
  my $avgRsquared = encodeRsquared($sumRsquared{$prevName}/$ldCount);
  my $avgLod      = encodeLod($sumLod{$prevName}/$ldCount,
			      $sumDprime{$prevName}/$ldCount);
  $prevTInt = 'a' if (! defined $prevTInt);
  print BED join("\t", $chrom, $prevChromStart, $prevChromEnd, $prevName,
		 $ldCount, $dprimeList, $rsquaredList, $lodList,
		 $avgDprime, $avgRsquared, $avgLod, $prevTInt) . "\n";
}
close(BED);

(system("mv $tmpDcc $outDcc") == 0) || die "'mv $tmpDcc $outDcc' failed: $!\n";
(system("mv $tmpBed $outBed") == 0) || die "'mv $tmpBed $outBed' failed: $!\n";
