#!/usr/bin/env perl
# DO NOT EDIT this file in any installed-bin location --
# edit kent/src/hg/snp/hapmapLd/checkLDSnpOrder.pl instead.

# $Id: checkLDSnpOrder.pl,v 1.1 2007/10/29 23:44:41 angie Exp $

# Our encoding and display software assumes that the N characters in
# a list are for consecutive SNPs after the current SNP -- no gaps or
# order-swaps.  Verify that assumption...

use warnings;
use strict;


sub usage {
  die "usage: write me
";
}

my $inFile = shift || &usage;
&usage if @ARGV;

if ($inFile =~ /\.gz$/) {
  open(IN, "zcat $inFile |")
    || die "Can't open pipe from zcat $inFile: $!\n";
} else {
  open(IN, $inFile)
    || die "Can't open $inFile for reading: $!\n";
}

my @snps;
my @snpsPerSnp;
my %snpIndex;
my $i = 0;
my $pairCount = 0;
while (<IN>) {
  next unless (/^(rs\d+)\s+(rs\d+)\s+/);
  my ($snp1, $snp2) = ($1, $2);
  if (! defined $snps[0]) {
    die if ($i != 0);
    $snps[0] = $snp1;
  } elsif ($snps[$i] ne $snp1) {
    $snps[++$i] = $snp1;
  }
  $snpIndex{$snp1} = $i;
  push @{$snpsPerSnp[$i]}, $snp2;
  $pairCount++;
}
close(IN);

warn "Got $i SNPs in first column, $pairCount pairs.\n";
for ($i = 0;  $i < @snps;  $i++) {
  my $snp = $snps[$i];
  my $i2 = $snpIndex{$snp};
  my $snp2 = $snps[$i2];
  if ($snp ne $snp2 || $i != $i2) {
    die "DOH: $i -> $snp -> $i2 -> $snp2\n";
  }
}

my $errCount = 0;
my %alreadyNeverSaw;
for ($i = 0;  $i < @snpsPerSnp;  $i++) {
  my $sRef = $snpsPerSnp[$i];
  my $firstColSnp = $snps[$i];
  my $prevRank = $snpIndex{$firstColSnp};
  foreach my $snp (@{$snpsPerSnp[$i]}) {
    my $rank = $snpIndex{$snp};
    if (! defined $rank) {
      if (! defined $alreadyNeverSaw{$snp}) {
	warn "Never saw $snp(???) in first column.  " .
	  "First seen in second column of $firstColSnp($i).\n";
	$errCount++;
	$alreadyNeverSaw{$snp} = 1;
      }
      next;
    }
    my $prevSnp = $snps[$prevRank];
    if ($rank == 1+$prevRank) {
      $prevRank = $rank;
    } elsif ($rank > $prevRank) {
      warn "Gap in order: $prevSnp($prevRank) followed by $snp($rank) " .
	"in second column of $firstColSnp($i).\n";
      $prevRank = $rank;
      $errCount++;
    } elsif ($rank == $prevRank) {
      warn "$snp($rank) appears in second column of $firstColSnp($i) " .
	"more than once.\n";
      $errCount++;
    } else {
      warn "OUT OF ORDER: $snp($rank) precedes $prevSnp($prevRank) in " .
	"first column but follows it in second column mapping " .
	"from $firstColSnp($i).\n";
      $errCount++;
    }
    die "Quitting after 100th warning.\n" if ($errCount > 100);
  }
}
exit 1 if ($errCount > 0);

