#!/usr/bin/env perl

# encodeValidate.pl - validate an ENCODE data submission generated by the
#                       automated submission pipeline
#
# Verifies that all files and metadata are present and of correct formats
# Creates a load file (load.ra) and track configuration (trackDb.ra) for the datasets
#
# Returns 0 if validation succeeds
#
# Error reporting:
#
# We die immediately (with a human readable message) when internal errors are encountered (e.g. file I/O errors or misconfiguration).
#
# In order to facilitate debugging of often very large file uploads, we try to accumulate multiple user errors (e.g. DAF, DAS or
# file syntax errors) before die'ing with a message with a list of errors.

# DO NOT EDIT the /cluster/bin/scripts copy of this file --
# edit the CVS'ed source at:
# $Header: /projects/compbio/cvsroot/kent/src/hg/encode/encodeValidate/doEncodeValidate.pl,v 1.233 2010/06/09 23:52:42 vsmalladi Exp $

use warnings;
use warnings FATAL => 'all';
use strict;

#use DataBrowser qw(browse);
use File::stat;
use File::Basename;
use File::Temp qw/ tempfile tempdir /;
use Getopt::Long;
use English;
use Carp qw(cluck);
use Cwd;
use IO::File;
use File::Basename;

use FindBin qw($Bin);
use lib "$Bin";
use Encode;
use HgAutomate;
use HgDb;
use RAFile;
use SafePipe;

use vars qw/
    $opt_allowReloads
    $opt_configDir
    $opt_fileType
    $opt_metaDataOnly
    $opt_outDir
    $opt_skipAll
    $opt_skipAutoCreation
    $opt_skipOutput
    $opt_skipValidateFiles
    $opt_skipValidateFastQ
    $opt_validateDaf
    $opt_validateFile
    $opt_sendEmail
    $opt_verbose
    $opt_timing
    /;

# Global variables
# Package-level state shared by the validators and format checkers below.
our $submitPath;        # full path of data submission directory
our $configPath;        # full path of configuration directory
our $outPath;           # full path of output directory
our %terms;             # controlled vocabulary, indexed by type and term
our %tags;              # controlled vocabulary, indexed by tag
our $time0 = time;      # reference time for doTime(); reset at the end of every doTime() call
our $timeStart = time;  # time captured when the script is loaded
our %chromInfo;         # chromInfo from assembly for chrom validation
our %chromSizes;
our $maxBedRows=80_000_000; # number of rows to allow in a bed-type file
our %tableNamesUsed;
our ($fields, $daf);    # $daf is the DAF hash; validateBigWig reads $daf->{assembly}
our $SORT_BUF = " -S 5G ";  # NOTE(review): looks like a sort(1) buffer-size option; its use is not visible in this part of the file
our $assembly;          # assembly name; interpolated into reference-sequence paths and the ldHgGene command line

sub usage {
    # Print the command-line help text to STDERR and exit with status 1.
    # The heredoc interpolates $Encode::dafVersion, $Encode::loadFile and
    # $Encode::trackFile, so the text reflects the loaded Encode module.
    # NOTE(review): $opt_timing, $opt_sendEmail and $opt_skipValidateFastQ are
    # declared in the `use vars` list but are not described in this help text.
    print STDERR <<END;
usage: encodeValidate.pl pipeline-instance project-submission-dir

The pipeline instance variable is a switch that changes the behavior of doEncodeValidate.
The changes if the instance is:

standard
    allows use of hg19 and mm9 databases only

anything else
    allows use of the encodeTest database only

Current dafVersion is: $Encode::dafVersion

Creates the following output files: $Encode::loadFile, $Encode::trackFile 

options:
    -allowReloads       Allow reloads of existing tables
    -configDir=dir      Path of configuration directory, containing
                        metadata .ra files (default: submission-dir/../config)
    -database=assembly  Specify an assembly; necessary only when using -validateFile
    -fileType=type      used only with validateFile option; e.g. narrowPeak
    -metaDataOnly       Process DAF/DDF and just update the projects.metadata field;
                        equal to -allowReloads -skipAll
    -skipAll            Turn on all "-skip..." options
    -skipAutoCreation   Tells script skip creating the auto-created files (e.g. RawSignal, PlusRawSignal, MinusRawSignal)
                        this can save you a lot of time when you are debugging and re-running the script on large projects
    -skipOutput         Don't write the various output files
    -skipValidateFiles  Tells script skip the file validation step; to save a lot of time during testing
    -validateDaf        exit after validating DAF file (project-submission-dir is the DAF file name).
    -validateFile       exit after validating file (project-submission-dir is the file name;
                        requires -fileType option as well)
    -verbose=num        Set verbose level to num (default 1).
    -outDir=dir         Path of output directory, for validation files
                        (default: submission-dir/out)
END
# Help is only shown on a usage error, hence the non-zero exit status.
exit 1;
}

sub pushError
# Append zero or more error messages to the array referenced by the first
# argument.  When at least one message is supplied, the additions are also
# logged through HgAutomate::verbose at level 2.
{
    my $errors = shift;
    my @new = @_;
    if(scalar(@new)) {
        push @{$errors}, @new;
        my $listing = join("\n\t", @new);
        HgAutomate::verbose(2, "pushing errors:\n\t" . $listing . "\n");
    }
}

sub doTime
# Report (via warn) the number of seconds elapsed since the previous call to
# this function, or since the script was loaded, then restart the clock.
#   $msg   - label for the report (defaults to "")
#   $lines - optional line count; when positive, a lines/sec rate is appended
{
    my ($msg, $lines) = @_;
    $msg   ||= "";
    $lines ||= 0;
    my $elapsed = time() - $time0;
    my $rate = "";
    if ($lines > 0) {
        # Clamp to one second so the rate computation never divides by zero.
        $elapsed = 1 if $elapsed < 1;
        $rate = "  ($lines lines, " . (int($lines / $elapsed)) . " lines/sec)";
    }
    warn("# $msg : $elapsed secs" . $rate);
    $time0 = time();
}

sub dieTellWrangler
# Abort the program with the supplied message followed by a line asking the
# user to contact the ENCODE staff.
{
    my $msg = $_[0];
    $msg .= "Please contact the ENCODE staff at encode-staff\@soe.ucsc.edu\n";
    die($msg);
}

############################################################################
# Validators for DDF columns -- extend when adding new metadata fields
#
# validators should return list of errors encountered (empty list means no errors were found).
#
# validator callbacks are called thus:
#
# validator(value, track, daf);
#
# value is value in DDF column
# track is track/view value
# daf is daf hash

# Dispatch table: maps a field name to the code ref that validates its value.
#
# History: "OrControl" validation is no longer applied across the board; it is
# now used only for the antibody and control fields.  The "OrNone" validator
# was added so that white-listed fields may carry the literal value 'None'.
#
# NOTE(review): the 'default' entry is presumably the fallback for fields that
# have no entry of their own; the lookup code is outside this part of the file.

our %validators = (
    files => \&validateFiles,
    view => \&validateControlledVocab,
    labVersion => \&validateNoValidation,
    origAssembly => \&validateNoValidation,
    controlId => \&validateNoValidation,
    labExpId => \&validateNoValidation,
    labProtocolId => \&validateNoValidation,
    softwareVersion => \&validateNoValidation,
    accession => \&validateNoValidation,
    replicate => \&validateNoValidation,
    rank => \&validateNoValidation,
    fragLength => \&validateNoValidation,
    setType => \&validateSetType,
    cell => \&validateControlledVocabOrNone,
    insertLength => \&validateControlledVocabOrNone,
    antibody => \&validateControlledVocabOrControl,
    control => \&validateControlledVocabOrControl,
    ripAntibody => \&validateControlledVocabOrNone,
    ripTgtProtein => \&validateControlledVocabOrNone,
    treatment => \&validateControlledVocabOrNone,
    protocol => \&validateControlledVocabOrNone,
    phase => \&validateControlledVocabOrNone,
    restrictionEnzyme => \&validateControlledVocabOrNone,
    obtainedBy => \&validateObtainedBy,
    md5sum => \&validateNoValidation,
    bioRep => \&validateNoValidation,
    tissueSourceType => \&validateControlledVocabOrNone,
    spikeInPool => \&validateNoValidation,
    readType => \&validateControlledVocabOrNone,
    region => \&validateControlledVocabOrNone,
    default => \&validateControlledVocab,
    );

# standard validators (required or optional for all projects)

sub validateFiles {
    # Validate an array of file names (or globs), ordered by part.
    #
    # Each entry of @$files is glob-expanded; every resulting file must exist,
    # be non-empty and readable, contain none of the characters "'`| and pass
    # the format check for the track's type ($daf->{TRACKS}{$track}{type}).
    # $cell and $sex are handed through to checkDataFormat (sex comes from the
    # DDF and is needed to validate BAM files for mouse tissues).
    #
    # Returns the empty list when there are no errors (or when
    # -skipValidateFiles is set); otherwise one string holding every error
    # message, each terminated by a newline.
    my ($files, $type, $track, $daf, $cell,$sex) = @_;
    my (@expanded, @errors);
    my $regex = "\`\|\\\|\|\"\|\'";
    doTime("beginning validateFiles") if $opt_timing;

    # Expand globs; a pattern that matches nothing is reported as missing.
    for my $pattern (@{$files}) {
        my @matches = glob $pattern;
        unless (@matches) {
            pushError(\@errors, "File '$pattern' does not exist (possibly bad glob?)");
            next;
        }
        push(@expanded, @matches);
    }
    HgAutomate::verbose(3, "     Track: $track    Files: " . join (' ', @expanded) . "\n");
    return () if $opt_skipValidateFiles;

    for my $file (@expanded) {
        my ($fbase,$dir,$suf) = fileparse($file, ".gz");
        # Check if the file has been replaced with an unzipped version
        # This check is also done where we auto create the RawSignal view from the Alignments
        if ($suf eq ".gz" and ! -e $file and -s "$dir/$fbase") {
            $file = "$dir/$fbase";
        }
        if($file =~ /($regex)/) {
            # Suspicious characters are rejected because the file name is
            # later used in shell commands.
            pushError(\@errors, "File '$file' has invalid characters; files cannot contain following characters: \"'`|");
            next;
        }
        if(!-e $file) {
            pushError(\@errors, "File \'$file\' does not exist");
            next;
        }
        if(!(-s $file)) {
            pushError(\@errors, "File \'$file\' is empty");
            next;
        }
        if(!(-r $file)) {
            pushError(\@errors, "File \'$file\' is un-readable");
            next;
        }
        pushError(\@errors, checkDataFormat($daf->{TRACKS}{$track}{type}, $file, $cell,$sex));
    }
    # Only rebinds the local variable; the caller's array is not modified.
    $files = \@expanded;
    doTime("done validateFiles") if $opt_timing;

    return () unless @errors;
    return join("", map { "$_\n" } @errors);
}

sub validateDatasetName {
# Dataset names are accepted as-is: always reports no errors.
    return;
}

sub validateDataType {
# Data types are accepted as-is: always reports no errors.
    return;
}

sub validateRawDataAcc {
# Raw data accessions are not checked: always reports no errors.
    return;
}

sub validateNoValidation {
# Validator for fields that need no checking: always reports no errors.
    return;
}

sub validateSetType {
# A setType value is valid only when it is exactly 'exp' or 'input'.
# Returns the empty list when valid, otherwise a one-element list holding the
# error message.
    my ($val, $type) = @_;
    return () if $val eq 'exp' || $val eq 'input';
    return ("Controlled Vocabulary \'$type\' value \'$val\' must be either \'exp\' or \'input\'");
}


# project-specific validators
sub validateControlledVocabOrNone {
# Accept the literal value "None", or any value that is a known term of the
# given type in %terms.  The DDF type 'cell' is looked up under the CV name
# 'Cell Line'.  Returns the empty list when valid, else a one-element list
# holding the error message.
    my ($val, $type) = @_;

    # correction for how cell is termed in the CV
    $type = 'Cell Line' if $type eq 'cell';

    return () if $val eq "None";
    return () if defined($terms{$type}->{$val});
    return ("Controlled Vocabulary \'$type\' value \'$val\' is not known");
}

sub validateControlledVocabOrControl {
# Validate a value against the controlled vocabulary in %terms.
#   $val  - value from the DDF column
#   $type - field name; 'antibody' is looked up under the CV name 'Antibody'
#           and may alternatively be a term of type 'control'
# Returns the empty list when the value is known, otherwise a one-element
# list holding the error message.
    my ($val, $type) = @_;
    my $known;
    if ($type eq 'antibody') {
        $type = 'Antibody';
        # Test each lookup with defined() on its own: defined(A || B) would
        # test the truthiness of A instead of whether it is defined.
        $known = defined($terms{$type}->{$val}) || defined($terms{'control'}->{$val});
    } else {
        $known = defined($terms{$type}->{$val});
    }
    return $known ? () : ("Controlled Vocabulary \'$type\' value \'$val\' is not known");
}

sub validateControlledVocab {
# Generic controlled-vocabulary validator.
# The type must have an entry under $terms{'typeOfTerm'} and that entry must
# carry a 'cvDefined' field.  When cvDefined is "no" the value is not checked;
# otherwise the value must be a known term of that type.
# Returns the empty list when valid, else a one-element list with the error.
    my ($val, $type) = @_;

    my $typeInfo = $terms{'typeOfTerm'}->{$type};
    unless (defined $typeInfo) {
        return ("Controlled Vocabulary \'$type\' is not a defined type");
    }

    my $cvDefined = $typeInfo->{'cvDefined'};
    unless (defined $cvDefined) {
        return ("Controlled Vocabulary \'$type\' has no cvDefined field");
    }

    return validateNoValidation() if $cvDefined eq "no";

    return () if defined($terms{$type}->{$val});
    return ("Controlled Vocabulary \'$type\' value \'$val\' is not known");
}

sub validateObtainedBy {
# The value must be a known term of CV type 'lab'; $type is used only in the
# error message.  Returns the empty list when valid, else a one-element list
# with the error.
    my ($val,$type) = @_;
    return () if defined($terms{'lab'}->{$val});
    return ("Controlled Vocabulary \'$type\' value \'$val\' is not known");
}


############################################################################
# Format checkers - check file format for given types; extend when adding new
# data formats
#
# Some of the checkers use regular expressions to validate syntax of the files.
# Others pass the first 10 lines to utility loaders; the latter approach has:
# advantages:
#      checks semantics as well as syntax
# disadvantages:
#      only checks the beginning of the file
#      some of the loaders tolerate invalid files (but give incorrect results)

# Dispatch table: maps a data format (type) name to the code ref that checks
# files of that format.  Several bed-derived types share validateBed, gff
# shares the GTF checker, and the retired formats (fasta, wig, bedGraph,
# tagAlign, pairedTagAlign) map to replacedBy* subs that only return a message
# naming the replacement format.
our %formatCheckers = (
    bigWig => \&validateBigWig,
    bigBed => \&validateBigBed,
    bam => \&validateBam,
    bed => \&validateBed,
    bedCluster => \&validateBed,
    bedLogR => \&validateBed,
    bedRnaElements => \&validateBed,
    bedRrbs => \&validateBed,
    narrowPeak => \&validateNarrowPeak,
    broadPeak => \&validateBroadPeak,
    gappedPeak => \&validateBed,
    fastq => \&validateFastQ,
    csfasta => \&validateCsfasta,
    csqual  => \&validateCsqual,
    genePred => \&validateGene,
    gtf => \&validateGtf,
    gff => \&validateGtf,
    txt  => \&validateFreepass,
    pdf  => \&validateFreepass,
    document => \&validateFreepass,
    fasta  => \&replacedByFastQ,
    wig => \&replacedByBigWig,
    bedGraph => \&replacedByBigWig,
    tagAlign => \&replacedByBam,
    pairedTagAlign => \&replacedByBam,
    );

# Regex fragment for a floating point number: optional sign, optional integer
# digits, optional '.', at least one digit, optional exponent.  A form with a
# trailing dot and no following digit (e.g. "1.") does not match.
my $floatRegEx = "[-+]?[0-9]*\\.?[0-9]+([eE][-+]?[0-9]+)?";
# Earlier variants, kept for reference:
# my $floatRegEx = "[+-]?(?:\\.\\d+|\\d+(?:\\.\\d+|[eE]{1}?[+-]{1}?\\d+))";  # Tim's attempt
# my $floatRegEx = "[+-]?(?:\\.\\d+|\\d+(?:\\.\\d+|))";                      # Original
# Regex fragment for each basic field type.
# NOTE(review): no user of %typeMap is visible in this part of the file.
my %typeMap = (int => "[+-]?\\d+", uint => "\\d+", float => $floatRegEx, string => "\\S+");

sub validateFreepass
# "Validator" for types that get no content check (txt, pdf, document).
# The file is only opened through Encode::openUtil and closed again.
# Always returns the empty list.
{
    my ($path, $file, $type) = @_;
    doTime("beginning validateFreepass") if $opt_timing;

    # Open and immediately close the file; its content is not inspected.
    Encode::openUtil($file, $path)->close();

    HgAutomate::verbose(2, "File \'$file\' free pass on validation\n");
    doTime("done validateFreepass") if $opt_timing;
    return ();
}


sub validateBed {
# Validate a BED or BED-plus file by running the external validateFiles
# utility on "$path/$file".
#
#   $path, $file - directory and file name of the file to check
#   $type        - format name: "bedN" (optionally with whitespace before N)
#                  or one of the named bed-plus types listed below
#   $cell, $sex  - passed to getInfoFiles() to choose the chrom.sizes file;
#                  $sex defaults to "M" when not supplied
#
# Returns the empty list on success, otherwise a single error string.
    my ($path, $file, $type, $cell, $sex) = @_;
    doTime("beginning validateBed") if $opt_timing;
    if ($type =~ m/bed\s*(\d+)/){
        # Copy the capture right away instead of relying on $1 surviving the
        # second (possibly failing) match.
        my $fieldCount = $1;
        unless ($fieldCount =~ m/^(3|4|5|6|8|9|12)$/) {
            return "ENCODE does not accept bed$fieldCount, please change the field type to match one of the standard bed types (3,4,5,6,8,9,12).\n";
        }
    }
    # Named bed-plus types and the validateFiles -type value each maps to.
    my %bedPlusTypes = (
        bedRnaElements => "bed6+3",
        bedLogR => "bed9+1",
        bedRrbs => "bed9+2",
        gappedPeak => "bed12+3",
        bedCluster => "bed11+1"
    );
    my $paramList = validationSettings("validateFiles","$type");
    my $cmdtype = exists($bedPlusTypes{$type}) ? $bedPlusTypes{$type} : $type;
    $cmdtype =~ s/\s+//g;
    unless ($sex) {
        $sex = "M";
    }
    my ($infoFile) = getInfoFiles($cell, $sex);

    # Bed-plus types need an autoSql (.as) description.  Look in
    # $configPath/autoSql first, then in $configPath itself.  Previously a
    # file found only at the second location suppressed the error but was
    # never passed to validateFiles.
    my $asFile = "";
    if (exists($bedPlusTypes{$type})) {
        my ($asPath) = grep { -e $_ } ("$configPath/autoSql/$type.as", "$configPath/$type.as");
        unless (defined $asPath) {
            return "Can't find .as file for type $type";
        }
        $asFile = "-as=$asPath";
    }

    my $cmd = "validateFiles $paramList -type=$cmdtype $asFile -chromInfo=$infoFile $path/$file";
    my $safe = SafePipe->new(CMDS => ["$cmd"]);
    my $err = $safe->exec();
    if($err) {
        #uncomment for web based debug
        #my $errorPrefix = "type = $cmdtype\ninfoFile = $infoFile\ncmd = $cmd\nparamList = $paramList\n";
        my $errorlog = "ERROR: failed validateBed : " . $safe->stderr() . "\n" . "End\n";
        return("$errorlog\n\nfailed validateBed for '$file'");
    }

    HgAutomate::verbose(2, "File \'$file\' passed bed validation\n");
    doTime("done validateBed") if $opt_timing;

    return ();
}

sub validateGtf {
# Validate a GTF/GFF file by converting it to genePred with
# "gtfToGenePred -simple" and then running the result through validateGene().
#
# Work files doEncodeValidate.gtf.err and doEncodeValidate.gtf.bed are written
# to $path.  The error file is removed after a successful conversion; the
# converted file is removed once validateGene() reports no errors.
#
# Dies if the input is gzipped or if the conversion fails (the converter's
# output is included in the die message).  Otherwise returns whatever
# validateGene() returns.
    my ($path, $file, $type) = @_;
    my $errFile = "$path/doEncodeValidate.gtf.err";
    doTime("beginning validateGtf") if $opt_timing;
    my $filePath = defined($path) ? "$path/$file" : $file;
    my $outFile = "$path/doEncodeValidate.gtf.bed";
    if(Encode::isZipped($filePath)) {
        # XXXX should be modified to handle zipped files.
        die "We don't currently support gzipped gtf files\n";
    }
    HgAutomate::verbose(2, "validateGtf(path=$path,file=$file,type=$type)\n");
    # XXXX Add support for $opt_quick
    my $err = system ( "gtfToGenePred -simple $filePath $outFile >$errFile 2>&1");
    if ($err) {
        print STDERR  "File \'$file\' failed GTF validation\n";
        # Three-argument open with a lexical handle: the file name is never
        # parsed for a mode, and the handle is closed before we die.
        open(my $errFh, '<', $errFile) or die "ERROR: Can't open gtfToGenePred error file \'$errFile\': $!\n";
        my @err = <$errFh>;
        close($errFh);
        die "@err\n";
    }
    unlink $errFile;
    HgAutomate::verbose(2, "File \'$file\' passed gtfToGenePred conversion \n");
    doTime("done validateGtf") if $opt_timing;
    # The converted file already carries its full path, hence the undef path.
    my @res = validateGene(undef,$outFile,$type);
    if (scalar(@res)==0) { # no errors so remove the temp .bed file
        HgAutomate::verbose(2, "File \'$file\' passed gtf gene validation \n");
        unlink $outFile;
    }
    return @res;
}

sub validateGene {
# Validate a gene file by piping it, minus track/browser header lines, into
# "ldHgGene -out=genePred.tab -genePredExt $assembly testTable stdin".
# The command runs inside $outPath and its output is captured in
# $outPath/validateGene.out.
#
#   $path - directory of the file, or undef when $file is already a full path
#   $file - file name
#   $type - unused here
#
# Dies if the file is gzipped or if the pipeline fails (the captured output is
# included in the die message).  Returns the empty list on success.
    my ($path, $file, $type) = @_;
    my $outFile = "validateGene.out";
    doTime("beginning validateGene") if $opt_timing;
    my $filePath = defined($path) ? "$path/$file" : $file;
    if(Encode::isZipped($filePath)) {
        # XXXX should be modified to handle zipped files.
        die "We don't currently supporte gzipped gene files\n";
    }
    # XXXX Add support for $opt_quick
    # The alternation is grouped so that '^' applies to both words; the
    # ungrouped form '^track|browser' dropped every line that contained
    # "browser" anywhere.
    my $err = system ("cd $outPath; egrep -v '^(track|browser)' $filePath | ldHgGene -out=genePred.tab -genePredExt $assembly testTable stdin >$outFile 2>&1");
    if ($err) {
        print STDERR  "File \'$file\' failed GFF validation\n";
        # Three-argument open with a lexical handle, closed before we die.
        open(my $errFh, '<', "$outPath/$outFile") or die "ERROR: Can't open GFF validation file \'$outPath/$outFile\': $!\n";
        my @err = <$errFh>;
        close($errFh);
        die "@err\n";
    } else {
        HgAutomate::verbose(2, "File \'$file\' passed GFF validation\n");
    }
    doTime("done validateGene") if $opt_timing;
    return ();
}

sub replacedByBam {
# tagAlign and pairedTagAlign were replaced by BAM for ENCODE (Jan 2011).
# Always returns a one-element list telling the submitter to use BAM.
# After training the labs, this code should be removed (remove in Jan 2013)
    my $type = $_[2];
    my $advice = "Files of type \'$type\' should be submitted as type BAM";
    return ($advice);
}

sub replacedByBigWig {
# wig and bedGraph were replaced by bigWig for ENCODE (Jan 2011).
# Always returns a one-element list telling the submitter to use bigWig.
# After training the labs, this code should be removed (remove in Jan 2013)
    my $type = $_[2];
    my $advice = "Files of type \'$type\' should be submitted as type bigWig";
    return ($advice);
}

sub replacedByFastQ {
# fasta was replaced by fastQ for ENCODE (Jan 2011).
# Always returns a one-element list telling the submitter to use fastQ.
# After training the labs, this code should be removed (remove in Jan 2013)
    my $type = $_[2];
    my $advice = "Files of type \'$type\' should be submitted as type fastQ";
    return ($advice);
}

sub getInfoFiles
# Choose the reference chrom.sizes and .2bit files for the current $assembly.
#
#   $cell - cell term, looked up under $terms{'Cell Line'}; may be undef
#   $sex  - sex from the DDF; when defined it overrides the sex recorded for
#           the cell line (needed for tissues, where one DAF serves samples
#           of both sexes)
#
# Returns ($infoFile, $twoBitFile).  The female files are returned only when
# the cell is known and the effective sex is "F"; in every other case,
# including an unknown or undefined cell, the male files are returned.
{
    my ($cell,$sex) = @_;
    my $downloadDir = "/hive/groups/encode/dcc/pipeline/downloads/$assembly/referenceSequences";
    my $infoFile =  "$downloadDir/male.$assembly.chrom.sizes";
    my $twoBitFile =  "$downloadDir/male.$assembly.2bit";

    # An undefined cell is treated like an unknown one.  Without this guard
    # the hash lookup raises an uninitialized-value warning, which is fatal
    # under "use warnings FATAL => 'all'".
    if (not defined $cell or not defined $terms{'Cell Line'}->{$cell}) {
        return ($infoFile, $twoBitFile);
    }

    # A sex supplied by the caller always wins over the one recorded for the
    # cell line, so the former Tissue-only branch was redundant.
    my $cellLineSex = defined($sex) ? $sex : $terms{'Cell Line'}->{$cell}->{'sex'};

    # The cell line may carry no 'sex' field; fall back to the male files
    # rather than comparing an undefined value.
    if (defined $cellLineSex and $cellLineSex eq "F")  {
        $infoFile =  "$downloadDir/female.$assembly.chrom.sizes";
        $twoBitFile =  "$downloadDir/female.$assembly.2bit";
    }
    return ($infoFile, $twoBitFile);
}

sub validateNarrowPeak
# Run the external validateFiles utility with -type=narrowPeak on $file,
# using the chrom.sizes file that getInfoFiles() picks for $cell/$sex.
# Returns the empty list on success; on failure the tool's stderr is printed
# to STDERR and a one-element list with a short message is returned.
{
    my ($path, $file, $type, $cell,$sex) = @_;
    # validate chroms, chromSize, etc.
    my $settings = validationSettings("validateFiles","narrowPeak",$assembly);
    my ($chromSizes) = getInfoFiles($cell, $sex);
    my $pipe = SafePipe->new(CMDS => ["validateFiles -chromInfo=$chromSizes $settings -type=narrowPeak $file"]);
    return () unless $pipe->exec();

    print STDERR  "ERROR: failed validateNarrowPeak : " . $pipe->stderr() . "\n";
    # don't show end-user pipe error(s)
    return("failed validateNarrowPeak for '$file'");
}

sub validateBroadPeak
# Run the external validateFiles utility with -type=broadPeak on $file,
# using the chrom.sizes file that getInfoFiles() picks for $cell/$sex.
# Returns the empty list on success; on failure the tool's stderr is printed
# to STDERR and a one-element list with a short message is returned.
{
    my ($path, $file, $type, $cell,$sex) = @_;
    # validate chroms, chromSize, etc.
    my $settings = validationSettings("validateFiles","broadPeak",$assembly);
    my ($chromSizes) = getInfoFiles($cell, $sex);
    my $pipe = SafePipe->new(CMDS => ["validateFiles -chromInfo=$chromSizes $settings -type=broadPeak $file"]);
    return () unless $pipe->exec();

    print STDERR  "ERROR: failed validateBroadPeak : " . $pipe->stderr() . "\n";
    # don't show end-user pipe error(s)
    return("failed validateBroadPeak for '$file'");
}

sub validateFastQ
# Run the external validateFiles utility with -type=fastq on $file (the file
# name is double-quoted on the command line).
#
# Format background:
#   Syntax per http://maq.sourceforge.net/fastq.shtml
#   Quality scores: http://maq.sourceforge.net/qual.shtml -- fastq produced
#   directly by Solexa carries a 'solexa' score, Sanger fastq a 'PHRED' score;
#   the two pages show how to convert between them.
#
# Returns the empty list on success; on failure the tool's stderr is printed
# to STDERR and a one-element list with a short message is returned.
{
    my ($path, $file, $type) = @_;
    my $settings = validationSettings("validateFiles","fastq");
    my $pipe = SafePipe->new(CMDS => ["validateFiles $settings -type=fastq \"$file\""]);
    return () unless $pipe->exec();

    print STDERR  "ERROR: failed validateFastQ : " . $pipe->stderr() . "\n";
    # don't show end-user pipe error(s)
    return("failed validateFastQ for '$file'");
}

sub validateCsfasta
# Run the external validateFiles utility with -type=csfasta on $file.
#
# Format background:
#   Syntax per http://marketing.appliedbiosystems.com/mk/submit/SOLID_KNOWLEDGE_RD?_JS=T&rd=dm
#   Sample records (files may start with '#' comment lines):
#     >461_19_90_F3
#     T203033330010111011221200302001
#   Files from GIS have headers like:
#     >920_22_656_F3,1.-152654094.1.35.35.0###,19.43558664.1.35.35.0###
#     T01301010111200210102321210100112312
#
# Returns the empty list on success; on failure the tool's stderr is printed
# to STDERR and a one-element list with a short message is returned.
{
    my ($path, $file, $type) = @_;
    doTime("beginning validateCsfasta") if $opt_timing;
    HgAutomate::verbose(2, "validateCsfasta($path,$file,$type)\n");
    my $settings = validationSettings("validateFiles","csfasta");
    my $pipe = SafePipe->new(CMDS => ["validateFiles $settings -type=csfasta $file"]);
    my $failed = $pipe->exec();
    if ($failed) {
        print STDERR  "ERROR: failed validateCsfasta : " . $pipe->stderr() . "\n";
        # don't show end-user pipe error(s)
        return("failed validateCsfasta for '$file'");
    }
    HgAutomate::verbose(2, "File \'$file\' passed $type validation\n");
    doTime("done validateCsfasta") if $opt_timing;
    return ();
}


sub validateBam
# Validate a BAM file in two steps: build its index with "samtools index",
# then run the external validateFiles utility with -type=BAM against the
# chrom.sizes and .2bit files that getInfoFiles() picks for $cell/$sex.
# Returns the empty list on success; when either step fails its stderr is
# printed to STDERR and a one-element list with a short message is returned.
{
    my ($path, $file, $type, $cell,$sex) = @_;
    doTime("beginning validateBam") if $opt_timing;
    HgAutomate::verbose(2, "validateBam($path,$file,$type)\n");
    my $settings = validationSettings("validateFiles","bam");
    my ($chromSizes, $genome2bit) = getInfoFiles($cell, $sex);

    # Step 1: index the BAM file
    my $indexer = SafePipe->new(CMDS => ["samtools index $file"]);
    my $indexFailed = $indexer->exec();
    if ($indexFailed) {
        print STDERR  "ERROR: failed samtools index : " . $indexer->stderr() . "\n";
        # don't show end-user pipe error(s)
        return("failed validateBam for '$file'");
    }

    # Step 2: content validation
    my $checker = SafePipe->new(CMDS => ["validateFiles $settings -type=BAM -chromInfo=$chromSizes -genome=$genome2bit $file"]);
    my $checkFailed = $checker->exec();
    if ($checkFailed) {
        print STDERR  "ERROR: failed validateBam : " . $checker->stderr() . "\n";
        # don't show end-user pipe error(s)
        return("failed validateBam for '$file'");
    }
    HgAutomate::verbose(2, "File \'$file\' passed $type validation\n");
    doTime("done validateBam") if $opt_timing;
    return ();
}

sub validateBigBed
# Validate a bigBed file by converting it to a temporary BED file with
# bigBedToBed and running validateBed() on the result as type "bed3+".
#
#   $path, $file, $type - as for the other format checkers
#   $cell, $sex         - passed through to validateBed()
#
# Returns the empty list on success.  On failure the details are printed to
# STDERR (with the temporary file name replaced by $file) and a one-element
# list with a short message is returned.
{
    my ($path, $file, $type, $cell, $sex) = @_;
    doTime("Beginning validateBigBed") if $opt_timing;
    HgAutomate::verbose(2, "validateBigBed($path,$file,$type)\n");
    # The temporary file is removed when $fh goes out of scope.
    my $fh = File::Temp->new(UNLINK => 1);
    $fh->unlink_on_destroy( 1 );
    my $tempfilename = $fh->filename;
    my $safe = SafePipe->new(CMDS => ["bigBedToBed $file $tempfilename"]);
    if(my $err = $safe->exec()) {
        print STDERR  "ERROR: failed validateBigBed : " . $safe->stderr() . "\n";
        return("failed validateBigBed for '$file'");
    }
    my ($tmpfile, $basedir) = fileparse($tempfilename);
    # Scalar assignment: undef when validateBed reports no error.
    my $bedError = validateBed($basedir, $tmpfile, "bed3+", $cell, $sex);
    if ($bedError) {
        # \Q..\E makes the temp file name match literally rather than as a
        # regular expression.
        $bedError =~ s/\Q$tmpfile\E/$file/g;
        print STDERR "ERROR: failed validateBigBed : " . $bedError . "\n";
        return("failed validateBigBed for '$file'");
    }
    HgAutomate::verbose(2, "File \'$file\' passed validateBigBed\n");
    doTime("done validateBigBed") if $opt_timing;
    return ();
}


sub validateBigWig
# Run the external validateFiles utility with -type=bigWig on $file, passing
# the assembly from the global DAF hash ($daf->{assembly}) as -chromDb.
# Returns the empty list on success; on failure the tool's stderr is printed
# to STDERR and a one-element list with a short message is returned.
{
    my ($path, $file, $type) = @_;
    doTime("beginning validateBigWig") if $opt_timing;
    HgAutomate::verbose(2, "validateBigWig($path,$file,$type)\n");
    my $settings = validationSettings("validateFiles","bigWig");
    my $pipe = SafePipe->new(CMDS => ["validateFiles $settings -type=bigWig -chromDb=$daf->{assembly} $file"]);
    my $failed = $pipe->exec();
    if ($failed) {
        print STDERR  "ERROR: failed validateBigWig : " . $pipe->stderr() . "\n";
        # don't show end-user pipe error(s)
        return("failed validateBigWig for '$file'");
    }
    HgAutomate::verbose(2, "File \'$file\' passed $type validation\n");
    doTime("done validateBigWig") if $opt_timing;
    return ();
}


sub validateCsqual
# Run the external validateFiles utility with -type=csqual on $file.
#
# Format background:
#   Syntax per http://marketing.appliedbiosystems.com/mk/submit/SOLID_KNOWLEDGE_RD?_JS=T&rd=dm
#   Sample records (files may start with '#' comment lines):
#     >461_19_90_F3
#     20 10 8 13 8 10 20 7 7 24 15 22 21 14 14 8 11 15 5 20 6 5 8 22 6 24 3 16 7 11
#     >461_19_209_F3
#     16 8 5 12 20 24 19 8 13 17 11 23 8 24 8 7 17 4 20 8 29 7 3 16 3 4 8 20 17 9
#
# Returns the empty list on success; on failure the tool's stderr is printed
# to STDERR and a one-element list with a short message is returned.
{
    my ($path, $file, $type) = @_;
    doTime("beginning validateCsqual") if $opt_timing;
    HgAutomate::verbose(2, "validateCsqual($path,$file,$type)\n");
    my $settings = validationSettings("validateFiles","csqual");
    my $pipe = SafePipe->new(CMDS => ["validateFiles $settings -type=csqual $file"]);
    my $failed = $pipe->exec();
    if ($failed) {
        print STDERR  "ERROR: failed validateCsqual : " . $pipe->stderr() . "\n";
        # don't show end-user pipe error(s)
        return("failed validateCsqual for '$file'");
    }
    HgAutomate::verbose(2, "File \'$file\' passed $type validation\n");
    doTime("done validateCsqual") if $opt_timing;
    return ();
}

# sub validatePsl
# We are not accepting this anymore, but I thought that we might want the code as an example (Mar 2012)
# PSL format (for download) from Wold lab.
# EXAMPLE FROM http://genome.ucsc.edu/FAQ/FAQformat#format2
# This adds 2 columns (sequence,<tab>sequence,) to the standard 21 columns
# Only the first 21 are validated
#
# Sample first 6 lines
#psLayout version 3
#
#match   mis-    rep.    N's     Q gap   Q gap   T gap   T gap   strand  Q               Q       Q       Q       T               T       T       T       block   blockSizes      qStarts  tStarts
#        match   match           count   bases   count   bases           name            size    start   end     name            size    start   end     count
#---------------------------------------------------------------------------------------------------------------------------------------------------------------
#71      3       0       0       0       0       0       0       -       HWI-EAS229_75_30DY0AAXX:4:1:0:743/1     75      1       75      chr2    242951149       184181032       184181106       1  74,      0,      184181032,      agccttttacagcaacacctttacctctgctagatctttctgtagctcgtctgaagccatgggggctgggtcag,     agccttttccagcaacacctttacctcttctagatctttctgtagctcttctgaagccatgggggctgggtcag,
#72      2       0       0       0       0       0       0       -       HWI-EAS229_75_30DY0AAXX:7:1:0:713/1     75      1       75      chr14   106368585       49540119        49540193        1  74,      0,      49540119,       cgggtgcgggccgagcagttctccgcacctccggtaaaggttcaggaccgggtgatggtctctgcagcagtcag,     ccggtgcgggccgagcagttctccgcacctccggtaaaggtgcaggaccgggtgatggtctctgcagcagtcag,
#{
#    my ($path, $file, $type) = @_;
#    my $lineNumber = 0;
#    doTime("beginning validatePsl") if $opt_timing;
#    my $fh = Encode::openUtil($file, $path);
#    while(<$fh>) {
#        chomp;
#        $lineNumber++;
#        next if $lineNumber == 1 and m/^psLayout version \d+/; # check first line
#        next if $lineNumber == 2 and m/^$/;
#        next if $lineNumber == 3 and m/^match/;
#        next if $lineNumber == 4 and m/^\s+match/;
#        next if $lineNumber == 5 and m/^------/;
#        die "Failed $type validation, file '$file'; line $lineNumber: line=[$_]\n"
#            unless m/^(\d+)\t(\d+)\t(\d+)\t(\d+)\t(\d+)\t(\d+)\t(\d+)\t(\d+)\t([+-][+-]?)\t([A-Za-z0-9:>\|\/_-]+)\t(\d+)\t(\d+)\t(\d+)\t(\w+)\t(\d+)\t(\d+)\t(\d+)\t(\d+)\t([0-9,]+)\t([0-9,]+)\t([0-9,]+)/;
#        last if($opt_quick && $lineNumber >= $quickCount);
#    }
#    $fh->close();
#    HgAutomate::verbose(2, "File \'$file\' passed $type validation\n");
#    doTime("done validatePsl", $lineNumber) if $opt_timing;
#    return ();
#}


############################################################################
# Misc subroutines

sub validateDafField {
    # Validate one DAF value using the validator registered for its field type.
    #
    #   $type - field name; spaces are replaced by underscores before lookup
    #   $val  - value to check (the log line tolerates an undefined value)
    #   $daf  - parsed DAF, handed through to the validator unchanged
    #
    # Returns the deprecation message from isDeprecated() when there is one;
    # otherwise returns whatever the selected validator returns.
    my ($type, $val, $daf) = @_;
    $type =~ s/ /_/g;

    my $shownValue = defined($val) ? $val : "";
    HgAutomate::verbose(4, "Validating $type: " . $shownValue . "\n");

    # A deprecated term is reported as-is; the validator is not run for it.
    my $deprecated = isDeprecated($type, $val);
    return($deprecated) if $deprecated;

    # Types without a dedicated validator fall back to the 'default' entry
    # (original note: the default considers the term controlled vocabulary).
    # The track argument is always the empty string for DAF fields.
    my $validator = $validators{$type} ? $validators{$type} : $validators{'default'};
    return $validator->($val, $type, "", $daf);
}

sub isDeprecated {
    # Report whether a controlled-vocabulary term is flagged as deprecated.
    #
    #   $type - vocabulary type; the DDF/DAF name "cell" is looked up as "Cell Line"
    #   $val  - term to look up (may be undefined)
    #
    # Returns a human readable message when the global %terms entry for the
    # term carries a 'deprecated' key; returns an empty list (undef in scalar
    # context) otherwise.
    my ($type, $val) = @_;

    # An undefined value cannot be a registered term. Using it as a hash key
    # would also raise an "uninitialized" warning, which is fatal in this script.
    return () unless defined($val);

    if ($type eq "cell") {
        $type = "Cell Line";
    }

    # Look the type up on its own first, so that probing an unknown type does
    # not autovivify an empty $terms{$type} entry in the shared vocabulary.
    my $termsOfType = $terms{$type};
    return () unless ref($termsOfType) eq 'HASH';

    my $entry = $termsOfType->{$val};
    if (ref($entry) eq 'HASH' && exists($entry->{'deprecated'})) {
        return("Controlled Vocabulary '$val' is deprecated: $entry->{'deprecated'}");
    }
    return ();
}

sub validateDdfHeader {
    # Check the DDF header line against the DAF variables and the controlled
    # vocabulary.
    #
    #   $_[0] - array ref: column names from the DDF header line
    #   $_[1] - hash ref: controlled vocabulary; only its 'typeOfTerm' entry is
    #           consulted. It is passed in explicitly so this sub does not
    #           depend on the global %terms.
    #   $_[2] - array ref: variables declared in the DAF; may be undefined for
    #           DAFs that declare no variables
    #
    # Returns an array ref of error messages (empty when the header is valid).
    # None of the arguments are modified.
    my ($headerRef, $cvRef, $variablesRef) = @_;
    my @localerrors;

    # Every DAF variable must appear as a DDF column.
    my %ddfHash = map {$_ => 1} @{$headerRef};
    foreach my $reqVar (@{ $variablesRef || [] }) {
        unless (exists($ddfHash{$reqVar})) {
            push @localerrors, "The required variable '$reqVar' is defined in the DAF, but is not in the DDF header.";
        }
    }

    # Every DDF column must be a known type of term in the vocabulary.
    my $typeOfTerm = $cvRef->{'typeOfTerm'} || {};
    foreach my $headerName (@{$headerRef}) {
        # Work on a copy: the foreach variable aliases the caller's array element.
        my $column = $headerName;
        # "cell" and "antibody" are registered under different names in the CV.
        if ($column eq "cell") {
            $column = "cellType";
        }
        if ($column eq "antibody") {
            $column = "Antibody";
        }
        unless (defined $typeOfTerm->{$column}) {
            push @localerrors, "The term '$column' is not in the Controlled Vocabulary";
        }
    }

    return (\@localerrors);

}

sub validateDdfField {
    # Validate one DDF value using the validator registered for its field type.
    #
    #   $type  - field (column) name; spaces are replaced by underscores
    #   $val   - value from the DDF line
    #   $track - view the DDF line belongs to
    #   $daf   - parsed DAF
    #   $cell, $sex - passed only to type-specific validators (added by Venkat
    #                 to accommodate tissues for mouse)
    #
    # Returns the deprecation message from isDeprecated() when there is one;
    # otherwise returns whatever the selected validator returns.
    my ($type, $val, $track, $daf, $cell, $sex) = @_;
    $type =~ s/ /_/g;

    my $shownValue = defined($val) ? $val : "";
    HgAutomate::verbose(4, "Validating $type: " . $shownValue . "\n");

    # A deprecated term is reported as-is; the validator is not run for it.
    my $deprecated = isDeprecated($type, $val);
    return($deprecated) if $deprecated;

    # No dedicated validator: use the default one, which does not receive
    # $cell/$sex (original note: it considers the term controlled vocabulary).
    unless ($validators{$type}) {
        return $validators{'default'}->($val, $type, $track, $daf);
    }
    return $validators{$type}->($val, $type, $track, $daf, $cell, $sex);
}

sub checkDataFormat {
    # Validate a data file with the checker registered for its format.
    #
    #   $format - declared file type, e.g. "bed 6", "bedGraph 4", "bam"
    #   $file   - file name, handed to the checker together with $submitPath
    #   $cell, $sex - passed through to the checker (added by Venkat to
    #                 accommodate tissues for mouse)
    #
    # Returns the checker's result, evaluated in the caller's context, or an
    # error string when no checker is registered for the format.
    my ($format, $file, $cell,$sex) = @_;
    HgAutomate::verbose(3, "Checking data format for $file: $format\n");

    # The checker receives the full declared type; the lookup key drops the
    # trailing field count ("bed 6" -> "bed", "bedGraph 4" -> "bedGraph").
    my $type = $format;
    if ($format =~ m/(bed)\s*\d/) {
        $format = $1;
    }
    if ($format =~ m/(bedGraph) (\d+)/) {
        $format = $1;
    }

    my $checker = $formatCheckers{$format};
    $checker || return "Data format \'$format\' is unknown\n";

    # Run the checker before logging completion (the log call used to sit
    # after the return and could never execute). The call is made in the same
    # context our caller used, so the returned value is unchanged.
    if (wantarray) {
        my @errors = $checker->($submitPath, $file, $type, $cell,$sex);
        HgAutomate::verbose(3, "Done checking data format for $file: $format\n");
        return @errors;
    }
    my $error = $checker->($submitPath, $file, $type, $cell,$sex);
    HgAutomate::verbose(3, "Done checking data format for $file: $format\n");
    return $error;
}

sub ddfKey
{
# Build the key that identifies a DDF line by its DAF variables,
# e.g. "antibody=$antibody;cell=$cell" for ChIP-Seq data.
#
#   $fields           - hash ref of the DDF line (column name => value)
#   $ddfHeader        - not used by this sub
#   $daf              - parsed DAF; 'variables' and 'variableArray' are read
#   $includeReplicate - when true and the line has a replicate, the bare
#                       replicate value is appended as a final component
#
# Returns the key string, or undef when the DAF declares no variables.
    my ($fields, $ddfHeader, $daf, $includeReplicate) = @_;

    # Some dafs have no variables, eg, Sanger Gencode
    return undef unless defined($daf->{variables});

    my $delim = ";";

    # One "name=value" component per variable, in sorted variable order.
    my @components;
    for my $variable (sort @{$daf->{variableArray}}) {
        push @components, "$variable=" . $fields->{$variable};
    }
    my $key = join($delim, @components);

    if ($includeReplicate && defined($fields->{replicate})) {
        $key = $key . $delim . $fields->{replicate};
    }
    return $key;
}

sub isDownloadOnly {
    # Decide whether a view is offered for download only (1) or is also
    # loaded for display in the browser (0).
    #
    #   $view  - view name as declared in the DAF
    #   $grant - not used by this sub
    #   $lab   - not used by this sub
    #   $daf   - parsed DAF; TRACKS->{$view} and dataType are read
    #
    # An explicit 'downloadOnly' setting on the view always decides: 'yes'
    # gives 1, any other defined value gives 0. The hard-coded rules below
    # apply only when the view has no such setting.
    my ($view, $grant, $lab, $daf) = @_;

    my $explicit = $daf->{TRACKS}->{$view}->{downloadOnly};
    if (defined($explicit)) {
        return ($explicit eq 'yes') ? 1 : 0;
    }

    # fastqs are explicitly always download only
    return 1 if lc($daf->{TRACKS}->{$view}->{type}) eq 'fastq';

    # Kept for backwards compatibility: RawData, RawData2, ... and Comparative
    # views are never loaded (e.g. Riken has RawData and RawData2 for
    # colorspace fasta and quality files; Wold has RawData, RawData[2-7]).
    return 1 if $view =~ m/^RawData\d*$/;
    return 1 if $view eq 'Comparative';

    # Alignments are download only for DNA-type experiments; for other data
    # types they are loaded (RNA folks like to see their RNAs).
    if ($view eq 'Alignments') {
        my $dataType = lc($daf->{dataType});
        my $isDnaType = ($dataType =~m/^chip.*$/ or $dataType =~m/^dna.*$/
                         or $dataType =~m/^faire.*$/ or $dataType =~m/^methyl.*$/);
        return 1 if $isDnaType;
    }

    # Nothing matched: the view is shown in the browser.
    return 0;
}

sub printCompositeTdbSettings {
# Prints out trackDb.ra settings for the composite track.
#
#   first argument - filehandle (glob) the stanza is written to
#   $daf           - parsed DAF (lab, grant, dataType, group, TRACKS, variables,
#                    variableArray are read)
#   %ddfSets       - DDF entries keyed by ddfKey ("var=term;var=term"); each
#                    value holds a VIEWS hash keyed by view name
#
# Dies when a variable's term is not registered in the controlled vocabulary
# (global %terms) or is registered without a tag.
    local *OUT_FILE = shift;
    my ($daf,%ddfSets) = @_;

    my $compositeTrack = Encode::compositeTrackName($daf);

    print OUT_FILE "track $compositeTrack\n";
    print OUT_FILE "compositeTrack on\n";

    my $setting    = "subGroup1 view Views";
    my $visDefault = "visibilityViewDefaults ";
    # Collect every view that is present in at least one DDF set and is shown
    # in the browser. Each view is listed exactly once, no matter how many
    # DDF sets supply it.
    for my $view (keys %{$daf->{TRACKS}}) {
        next unless grep { defined($ddfSets{$_}{VIEWS}{$view}) } keys %ddfSets;
        next if isDownloadOnly($view, $daf->{grant}, $daf->{lab}, $daf);

        $setting = $setting . " " . $view . "=" . $view;
        $visDefault = $visDefault . " " . $view . "=";
        if($view eq "Peaks") {
            $visDefault = $visDefault . "dense";
        } elsif($view eq "Signal") {
            $visDefault = $visDefault . "full";
        } else {
            $visDefault = $visDefault . "hide";
        }
    }
    print OUT_FILE "shortLabel " . $daf->{lab} . " " . $daf->{dataType} . "\n"; # Default to  lab datatype
    print OUT_FILE "longLabel ENCODE " . $daf->{lab} . " " . $daf->{grant} . " " . $daf->{dataType} . "\n";  # Default to lab grant datatype
    my $group =  "regulation"; # default (common case for ENCODE)
    if (defined($daf->{group})) {
        $group = $daf->{group};
    }
    print OUT_FILE "group $group\n";
    print OUT_FILE $setting . "\n"; # "subGroup1 view Views Peaks=Peaks Signal=Signal RawSignal=Raw_Signal\n";

    # Need to create N subgroups with M members each
    if (defined($daf->{variables})) {
        my $grpNo = 1;          # subGroup1 is the view group; variables start at 2
        my $sortOrder = "sortOrder ";
        my $dimensions = "dimensions";
        my $controlledVocab = "controlledVocabulary encode/cv.ra";
        my %usedTags = ();      # tags already emitted, shared across all variables
        my @variables = @{$daf->{variableArray}};
        for my $variable (@variables) {
            $grpNo++;
            my $groupVar = $variable;
            my $cvTypeVar = $variable;
            # special names for cell and antibody
            if ($variable eq "cell") {
                $groupVar = "cellType";
                $cvTypeVar = "Cell Line";
            }
            if ($variable eq "antibody") {
                $groupVar = "factor";
                $cvTypeVar = "Antibody";
            }
            # Only the first three variables get a dimension: chr(88..90) is X, Y, Z
            if($grpNo < 5) {
                $dimensions .= " dimension" . chr(86 + $grpNo) . "=" . $groupVar;
            }
            $sortOrder = "$sortOrder$groupVar=+ ";
            $controlledVocab = "$controlledVocab $groupVar";
            # TODO: This template could use typeOfTerms from cv.ra and substitute "label" as subGroup2 cell Cell_Line term1=label1 term2=label2
            $setting = "subGroup$grpNo $groupVar " . ucfirst($groupVar);
            $setting = "subGroup$grpNo $groupVar " . "Cell_Line" if $variable eq "cell";
            for my $key (keys %ddfSets) {
                my @pairs = split(';', $key);
                for my $pair (@pairs) {
                    my ($var, $term) = split('=', $pair);
                    if ($var eq $variable) {
                        next if ($term eq "None");
                        # Look the term up under its own type first, then as
                        # a control, then as a lab.
                        my $tag;
                        if (defined($terms{$cvTypeVar}->{$term})) {
                            $tag=$terms{$cvTypeVar}->{$term}->{"tag"};
                        } else {
                            if (defined($terms{"control"}->{$term})) {
                                $tag=$terms{"control"}->{$term}->{"tag"};
                            }
                            elsif (defined($terms{"lab"}->{$term})) {
                                $tag=$terms{"lab"}->{$term}->{"tag"};
                            } else {
                                die "'$term' is not a registered '$cvTypeVar' term\n";
                            }
                        }
                        # A registered term without a tag would otherwise
                        # abort below with an "uninitialized value" warning.
                        if (!defined($tag)) {
                            die "'$term' is registered but has no tag in the controlled vocabulary\n";
                        }
                        if (!defined($usedTags{$tag})) {
                            # suppress dups, requested by Brian
                            $setting = "$setting $tag=$term";
                            $usedTags{$tag} = $term;
                        }
                    }
                }
            }
            print OUT_FILE $setting . "\n";     # "subGroup2\cellType Cell_Line ???\n;
        }
        $setting = $sortOrder . "view=+";
        print OUT_FILE $dimensions . "\n";         # "dimensions  dimensionX=cellType dimensionY=factor"
        print OUT_FILE $setting . "\n";         # "sortOrder cellType=+ factor=+ view=+\n";
        print OUT_FILE $controlledVocab . "\n"; # "controlledVocabulary encode/cv.ra cellType factor\n";
    }
    print OUT_FILE "dragAndDrop subTracks\n";
    print OUT_FILE $visDefault . "\n";          #"visibilityViewDefaults Peaks=dense Signal=full RawSignal=hide\n";
    print OUT_FILE "priority 0\n";
    print OUT_FILE "type bed 3\n";
    print OUT_FILE "wgEncode 1\n\n";
}

sub validationSettings {
    # Query the DAF 'validationSettings' line, e.g.
    #   "validationSettings allowReloads;validateFiles.tagAlign:mmCheckOnInN=100,mismatches=3"
    #
    #   $type     - either "validateFiles" or the name of a flag such as "allowReloads"
    #   $fileType - file type whose validateFiles parameters are wanted
    #   $genome   - assembly name; when set, genome/chromDb parameters are added
    #
    # Returns:
    #   - 0 whenever -metaDataOnly is in effect
    #   - for "validateFiles": the parameter string for the validateFiles
    #     program, always ending in " -doReport"
    #   - for any other $type: 1 when the flag is listed; otherwise "" if the
    #     sub was called with more than one argument, else 0
    # Dies when the DAF has no validationSettings and -validateFile is not in use.
    my ($type, $fileType, $genome ) = @_;

    return 0 if $opt_metaDataOnly;

    unless ($daf->{validationSettings} || $opt_validateFile) {
        die "Must specify validationSettings in daf\n";
    }

    # With -validateFile the DAF settings are ignored altogether.
    my @settings;
    @settings = split('\;', $daf->{validationSettings}) unless $opt_validateFile;

    # Simple flag lookup.
    if ($type ne "validateFiles") {
        foreach my $flag (@settings) {
            next unless $flag eq $type;
            HgAutomate::verbose(2, "validationSettings $type found\n");
            return 1;
        }
        return (scalar(@_) > 1) ? "" : 0;
    }

    # Build the validateFiles parameter list.
    my $checkChromLen  = 1;
    my $checkAlignment = 1;
    my $paramList = "";
    SETTING: foreach my $setting (@settings) {
        next SETTING unless $setting =~ /^validateFiles\./;
        my @pair = split('\:',$setting,2);          # "validateFiles.<subType>" : "<params>"
        my @subTypes = split('\.',$pair[0],2);
        # Only settings for the "bam" sub-type are honoured.
        next SETTING unless $subTypes[1] eq "bam";
        next SETTING unless $fileType eq $subTypes[1];
        foreach my $param (split('\,',$pair[1])) {
            if ($param eq "ignoreAlignment") {
                $checkAlignment = 0;
            } elsif ($param eq "ignoreChromLen") {
                $checkChromLen = 0;
            } else {
                $paramList .= " -" . $param;
            }
        }
        last SETTING;   # the first matching setting is the only one used
    }

    if ($genome) {
        $paramList .= " -genome=/cluster/data/$genome/$genome.2bit" if $checkAlignment;
        $paramList .= " -chromDb=$genome" if $checkChromLen;
    }
    HgAutomate::verbose(2, "validationSettings $type $fileType params:$paramList\n") if $paramList ne "";
    return $paramList . " -doReport";
}

sub makeDownloadTargetFileName {
# Build the download file name for a table from the table name, the data
# type and the source file(s).
#
#   $tablename   - table name; becomes the root of the file name
#   $type        - data type, e.g. "bam", "bed 6", "fastq", "document"
#   $srcFilesRef - array ref of source file names
#
# Returns the target file name with all spaces removed.
# Dies when several bam/bigWig/bigBed files are given.
    my ($tablename, $type, $srcFilesRef) = @_;
    my @srcFiles = @{$srcFilesRef};
    HgAutomate::verbose(2, "makeDownloadTargetFileName ( $tablename, $type )\n");

    my $isBinaryType    = ($type eq "bam") || ($type eq "bigWig") || ($type eq "bigBed");
    my $hasSeveralFiles = (@srcFiles > 1);

    if ($hasSeveralFiles && $isBinaryType) {
        die "Cannot concatenate '$type' files";
    }

    my $target;
    if ($isBinaryType) {
        # bam/bigWig/bigBed keep their type as the only suffix
        $target = "$tablename.$type";
    } else {
        # "bed N" types all use the plain "bed" suffix
        my $fileType = ($type =~ /^bed /) ? "bed" : $type;

        if ($hasSeveralFiles) {
            # fastq and document sets will be tarred; everything else is cat'ed and gzipped
            my $packing = (($type eq "fastq") || ($type eq "document")) ? "tgz" : "gz";
            $target = "$tablename.$fileType.$packing";
        } else {
            my $srcFile = $srcFiles[0];

            # Special effort for single docs which will have the suffix they came in with:
            # everything after the first dot of the source name replaces the type.
            if ($type eq "document") {
                my ($root, @suffixParts) = split(/\./,$srcFile);
                if (@suffixParts) {
                    $fileType = join(".", @suffixParts);
                }
            }

            if (Encode::isTarZipped($srcFile)) {
                # do not add .tgz twice when $fileType already includes it
                $target = Encode::isTarZipped(".$fileType")
                        ? "$tablename.$fileType"
                        : "$tablename.$fileType.tgz";
            } elsif (Encode::isZipped($srcFile) && Encode::isZipped(".$fileType")) {
                $target = "$tablename.$fileType"; # $fileType includes .gz
            } else {
                # default of single file will be gz (even for a document or fastq)
                $target = "$tablename.$fileType.gz";
            }
        }
    }
    $target =~ s/ //g;  # removes spaces which should already be gone!
    return $target;
}

############################################################################
# Main

# if you want to use a different path for executed binaries, this
# is how you do it
# $ENV{PATH} = "/cluster/home/braney/bin/x86_64:" . $ENV{PATH};

my @ddfHeader;         # list of field names on the first line of DDF file
my %ddfHeader = ();    # convenience hash version of @ddfHeader (maps name to field index)
my @ddfLines = ();     # each line in DDF (except for fields header); value is a hash; e.g. {files => 'foo.bed', cell => 'HeLa-S3', ...}
my %ddfSets = ();      # info about DDF entries broken down by ddfKey
my %ddfReplicateSets = ();     # info about DDF entries broken down by ddfKey (including replicate)
my $wd = cwd();

my $ok = GetOptions("allowReloads",
                    "configDir=s",
                    "fileType=s",
                    "metaDataOnly",
                    "outDir=s",
                    "timing",
                    "skipAll",
                    "skipAutoCreation",
                    "justFileDb",
                    "skipOutput",
                    "skipValidateFiles",
                    "skipValidateFastQ",
                    "validateDaf",
                    "validateFile",
                    "sendEmail",
                    "verbose=i",
                    "database=s" => \$assembly
                    );
usage() if (!$ok);
$opt_verbose = 1 if (!defined $opt_verbose);
$opt_sendEmail = 0 if (!defined $opt_sendEmail);

if($opt_skipAll) {
    $opt_skipAutoCreation = $opt_skipOutput = $opt_skipValidateFiles = 1;
}

if($opt_metaDataOnly) {
    $opt_skipAutoCreation = $opt_skipOutput = $opt_skipValidateFiles = 1;
    $opt_allowReloads = 1;
}

usage() if (scalar(@ARGV) < 2);

# Get command-line args
my $pipelineInstance = $ARGV[0];     # currently not used
my $submitDir = $ARGV[1];

$ENV{TMPDIR} = $Encode::tempDir;

if($opt_validateFile) {
    if(!$opt_fileType) {
        die "Error: -fileType argument is required when using -validateFile\n";
    }
    if(!$assembly) {
        die "Error: -database argument is required when using -validateFile\n";
    }
    my $db = HgDb->new(DB => $assembly);
    $db->getChromInfo(\%chromInfo);
    $db->getChromSizes(\%chromSizes);
    if(my @errors = checkDataFormat($opt_fileType, $submitDir)) {
        die "Invalid file: " . join(", ", @errors) . "\n";
    } else {
        exit(0);
    }
}

# Determine submission, configuration, and output directory paths
HgAutomate::verbose(2, "Validating submission in directory \'$submitDir\'\n");
if ($submitDir =~ /^\/.*/) {
    $submitPath = $submitDir;
} else {
    $submitPath = "$wd/$submitDir";
}
HgAutomate::verbose(4, "Submission directory path: \'$submitPath\'\n");

if (defined $opt_configDir) {
    if ($opt_configDir =~ /^\//) {
        $configPath = $opt_configDir;
    } else {
        $configPath = "$wd/$opt_configDir";
    }
} else {
    $configPath = "$submitPath/../config"
}
if(!(-d $configPath)) {
    die "configPath '$configPath' is invalid; Can't find the config directory\n";
}
HgAutomate::verbose(4, "Config directory path: \'$configPath\'\n");

if (defined $opt_outDir) {
    if ($opt_outDir =~ /^\//) {
        $outPath = $opt_outDir;
    } else {
        $outPath = "$wd/$opt_outDir";
    }
} else {
    $outPath = "$submitPath/out"
}
HgAutomate::verbose(4, "Output directory path: '$outPath'; submitPath: '$submitPath'\n");

if(!$opt_validateDaf) {
    # Change dir to submission directory
    if(!chdir($submitPath)) {
        die ("SYS ERR; Can't change to submission directory \'$submitPath\': $OS_ERROR\n");
    }
    HgAutomate::verbose(3, "Creating output in directory \'$outPath\'\n");
    if(!(-d $outPath)) {
        # Use low-precedence 'or': with '||' the die was attached to $outPath
        # (always true), so a failed mkdir went unnoticed.
        mkdir($outPath) or die ("SYS ERR: Can't create out directory \'$outPath\': $OS_ERROR\n");
    }
}

# labs is now in fact the list of grants (labs are w/n grants, and are not currently validated).
$fields = Encode::getFields($configPath);

if($opt_validateDaf) {
    if(-f $submitDir) {
        Encode::parseDaf($submitDir,  $fields, $pipelineInstance);
    } else {
        Encode::getDaf($submitDir, $fields, $pipelineInstance);
    }
    print STDERR "DAF is valid\n";
    exit(0);
}

$daf = Encode::getDaf($submitDir, $fields, $pipelineInstance);
$assembly = $daf->{assembly};

my $db = HgDb->new(DB => $daf->{assembly});
$db->getChromInfo(\%chromInfo);
$db->getChromSizes(\%chromSizes);

# Add the variables in the DAF file to the required fields list
if (defined($daf->{variables})) {
    for my $variable (keys %{$daf->{variableHash}}) {
        $fields->{$variable}{required} = 1;
        $fields->{$variable}{file} = 'ddf';
    }
}

# make replicate column required when appropriate.
my $hasReplicates = 0;
#my $maxOrder = 0; Removing order for view level as this is not being used for prioritization
for my $view (keys %{$daf->{TRACKS}}) {
    $hasReplicates += $daf->{TRACKS}{$view}{hasReplicates};
#    if($daf->{TRACKS}{$view}{order} > $maxOrder) {
#        $maxOrder = $daf->{TRACKS}{$view}{order}
#    }
}

if($hasReplicates) {
    $fields->{replicate}{required} = 1;
}

# DAF may contain option to allow Reloads
if(validationSettings("allowReloads")) {
    $opt_allowReloads = 1;
}
if(validationSettings("skipAutoCreation")) {
    $opt_skipAutoCreation = 1;
}
if(validationSettings("skipValidateFiles")) {
    $opt_skipValidateFiles = 1;
}
if(validationSettings("skipOutput")) {
    $opt_skipAutoCreation = $opt_skipOutput = $opt_skipValidateFiles = 1;
}

# Open dataset descriptor file (DDF)
my @glob = glob "*.DDF";
push(@glob, glob "*.ddf");
my $ddfFile = Encode::newestFile(@glob);
die "ERROR: Can't find DDF file\n" unless -e $ddfFile;
my $ddfFileTime = (stat($ddfFile))->ctime;
my ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) = gmtime($ddfFileTime);
my $dateSubmitted    = sprintf("%04d-%02d-%02d", 1900 + $year, $mon + 1, $mday);

HgAutomate::verbose(2, "Using newest DDF file \'$ddfFile\'\n");
my $lines = Encode::readFile($ddfFile);

my $ddfLineNumber = 0;
# Get header containing column names
while(@{$lines}) {
    my $line = shift(@{$lines});
    $ddfLineNumber++;
    # remove leading and trailing spaces and newline
    $line =~ s/^\s+//;
    $line =~ s/\s+$//;
    # ignore empty lines and comments
    next if $line =~ /^$/;
    next if $line =~ /^#/;
    if($line !~ /\t/) {
        die "ERROR: The DDF header has no tabs; the DDF is required to be tab delimited\n";
    }
    @ddfHeader = split(/\t/, $line);
    for (my $i=0; $i < @ddfHeader; $i++) {
        $ddfHeader{$ddfHeader[$i]} = $i;
    }
    last;
}

%terms = Encode::getControlledVocab($configPath);

#my @errors = Encode::validateFieldList(\@ddfHeader, $fields, 'ddf');

#the ddf header should not validate against fields.ra, so it now validates against the CV
my @errors = @{&validateDdfHeader(\@ddfHeader, \%terms, $daf->{variableArray})};


# Special cases to handle conditionally required fields:
# the DDF must have a 'controlId' column for these hg19 composites.
if(!defined($ddfHeader{controlId})) {
    # Compare the assembly name ($db is the HgDb object and never equals
    # "hg19") and use string comparison for the composite suffix ('==' on
    # these strings is a fatal "isn't numeric" warning in this script).
    my $ddfAssembly     = defined($daf->{assembly}) ? $daf->{assembly} : "";
    my $compositeSuffix = defined($daf->{compositeSuffix}) ? $daf->{compositeSuffix} : "";
    if($ddfAssembly eq "hg19") {
        if($compositeSuffix eq "HaibTfbs"
        || $compositeSuffix eq "SydhTfbs"
        || $compositeSuffix eq "SydhHistone") {
            push(@errors, "field 'controlId' not defined");
        }
    }
}

if(@errors) {
    die "ERROR in DDF '$ddfFile':\n" . join("\n", @errors) . "\n";
}


my @variables;
if (defined($daf->{variables})) {
    @variables = @{$daf->{variableArray}};
} else {
    # Hubbard Sanger Gencode project has no variables
    @variables = ();
}

# Now testing daf lab and dataType.
pushError(\@errors, validateDafField("grant", $daf->{grant}, $daf));
pushError(\@errors, validateDafField("lab", $daf->{lab}, $daf));
pushError(\@errors, validateDafField("dataType", $daf->{dataType}, $daf));
my %metadataHash;

# Process lines in DDF file. Create a list with one entry per line;
# the entry is field/value hash (fields per @ddfHeader).

while (@{$lines}) {
    my $line = shift(@{$lines});
    $ddfLineNumber++;
    my $errorPrefix = "DDF lineNumber $ddfLineNumber:";
    HgAutomate::verbose(2, "Parsing ddf line $ddfLineNumber\n");

    $line =~ s/^\s+//;
    $line =~ s/\s+$//;
    next if $line =~ /^#/;
    next if $line =~ /^$/;

    if($line !~ /\t/) {
        pushError(\@errors, "$errorPrefix line has no tabs; the DDF is required to be tab delimited");
        next;
    }
    
    # Check whether a line after the DDF header has more values than the header has fields.
    # Previously this caused a runtime error: $i is incremented once per value on the line, so a
    # surplus value (which can only occur at the end) had no matching index in @ddfHeader and the program died.
    # Now an error is pushed onto @errors and the DDF line is skipped.
    my @linetest = split "\t", $line;
    my $linamt = scalar(@linetest);
    my $ddfamt = scalar(@ddfHeader);
    if ($linamt > $ddfamt){
        pushError(\@errors, "$errorPrefix has too many fields Line:$linamt DDF:$ddfamt");
        next;
    }
    my $i = 0;
    my %line;
    for my $val (split('\t', $line)) {
        if($ddfHeader[$i] ne "files" && $val =~ / / ) {
            if($val !~ /^\"/ || $val !~ /\"$/ ) {  # Only if not already quoted
                $val =~ s/\"/\\"/g if $val =~ /\"/;
                $val = '"' . $val . '"';
            }
        }
        if($ddfHeader[$i] ne "files" && $val =~ /\;/ ) { # Replace ; with .
            $val =~ s/;/\./g;
        }
        $line{$ddfHeader[$i]} = $val;
        $i++;
    }
    if(my @tmp = Encode::validateValueList(\%line, $fields, 'ddf')) {
        pushError(\@errors, $errorPrefix . "\n" . join("\n", @tmp));
        next;
    }
    my $view = $line{view};
    HgAutomate::verbose(2,"Parsing $view\n");
    if($daf->{TRACKS}{$view}) {
        my $files = $line{files};
        if($fields->{replicate}{required}) {
            my $replicate = $line{replicate};
            if($daf->{TRACKS}{$view}{hasReplicates} && (!defined($replicate) || !length($replicate))) {
                pushError(\@errors, "$errorPrefix missing replicate number for view '$view'");
            }
        }
        my @filenames;
        for(split(',', $files)) {
            # Use glob explicitly so our error messages have the list of files actually used.
            if(my @glob = glob) {
                push(@filenames, @glob);
            } else {
                push(@filenames, $_);
            }
        }
        $line{files} = \@filenames;
        my @metadataErrors;
        
        
        for my $field (keys %line) {
            # The next two conditionals check whether a field value is blank: if the field is required, record an error;
            # if not, skip validation and pass the blank value through.
            if ($line{$field} eq "" && !($fields->{$field}{required})){
                next;
            } elsif ($line{$field} eq "" && $fields->{$field}{required}){
                push (@errors, "Missing value for required field '$field' on ddf line $ddfLineNumber");
                next;
            }
            my $cell = $line{cell};
            my $sex = $line{sex};
            my $category;
            if (defined $terms{'Cell Line'}->{$cell}) {
                $category = $terms{'Cell Line'}->{$cell}->{'category'};
            }
            if (defined $category && $category eq "Tissue" && not defined $sex) {
                push (@errors, "Cell '$cell' is a tissue; the sex must be defined in the DDF.");
            }
            my $mdbError = validateDdfField($field, $line{$field}, $view, $daf, $cell, $sex, \%terms);
            if ($mdbError) {
                push(@metadataErrors, $mdbError);
            }
        }
        
        if(@metadataErrors) {
            pushError(\@errors, @metadataErrors);
        } else {
            # avoid spurious errors by not putting invalid lines into %ddfSets
            # ddfKey returnes undef if there are no variables defined
            if (defined(ddfKey(\%line, \%ddfHeader, $daf, 1))) {
            $ddfSets{ddfKey(\%line, \%ddfHeader, $daf, 0)}{VIEWS}{$view} = \%line;
            $ddfReplicateSets{ddfKey(\%line, \%ddfHeader, $daf, 1)}{VIEWS}{$view} = \%line;
            my $str = join(", ", map($line{$_}, sort(@variables)));
                    if (defined($daf->{dataVersion}) && $daf->{dataVersion} > 1) {
                        $str .= ", V" . $daf->{dataVersion};
                    }
            $metadataHash{$str} = 1;
            }
        }
        push(@ddfLines, \%line);
    } else {
        pushError(\@errors, "$errorPrefix undefined view '$view'");
    }
    HgAutomate::verbose(2, "End of parsing ddf line $ddfLineNumber\n");
}

my $tmpCount = 1;

if(!@errors) {
    # Look for missing required views and create missing, optional views,
    # but don't bother if we have already encountered errors.
    # Could also look for replicate inconsistency here (e.g. Alignments for replicate 3 but not fastq for replicate 3).

    for my $key (keys %ddfSets) {
        for my $view (keys %{$daf->{TRACKS}}) {
            if($daf->{TRACKS}{$view}{required}) {
                if(!defined($ddfSets{$key}{VIEWS}{$view})) {
                    pushError(\@errors, "view '$view' missing for $key");
                }
            }
        }
    }

    doTime("beginning ddfReplicateSets loop") if $opt_timing;
    for my $key (keys %ddfReplicateSets) {
        # create missing optional views (e.g. ChIP-Seq RawSignal or transcriptome project PlusRawSignal and MinusRawSignal)
        # note this loop assumes these are on a per replicate basis.
        # Also note that any project (like transcriptome) that doesnt have replicates should also use
        # this for their auto-create signals.
        HgAutomate::verbose(2, "ddfReplicateSets loop key=[$key] aln=[".(defined($ddfReplicateSets{$key}{VIEWS}{Alignments}))."] rawsig=[".(defined($ddfReplicateSets{$key}{VIEWS}{RawSignal}))."]\n");
        if($daf->{noAutoCreate} && ( $daf->{noAutoCreate} eq "no") #We are no longer AutoCreateing, we only create under duress, 9-17-10
        && defined($ddfReplicateSets{$key}{VIEWS}{Alignments})
        ##&& !defined($ddfReplicateSets{$key}{VIEWS}{RawSignal})   ## No longer create RawSignals
        && !defined($ddfReplicateSets{$key}{VIEWS}{PlusRawSignal})
        && !defined($ddfReplicateSets{$key}{VIEWS}{MinusRawSignal})
        && ($daf->{dataType} ne 'MethylSeq')) {
            # Make a list of the PlusRawSignal/MinusRawSignal or RawSignals we are going to have to make
            my @newViews = ();
            #push @newViews, "RawSignal" if $daf->{TRACKS}{RawSignal}{order};  ## No longer create RawSignals
            # Code is never triggered, but if triggered we no longer use the order field for track prioritization.
            #push @newViews, "PlusRawSignal" if $daf->{TRACKS}{PlusRawSignal}{order};
            #push @newViews, "MinusRawSignal" if $daf->{TRACKS}{MinusRawSignal}{order};

            foreach my $newView (@newViews) #loop around making them
            {
                my $alignmentLine = $ddfReplicateSets{$key}{VIEWS}{Alignments};
                # Time to check for fragLength by replicate (in alignments line) (die, don't just push error and build anyway)
                if($newView eq "RawSignal") {
                    if(!defined($alignmentLine->{fragLength})) {
                        if(!defined($daf->{medianFragmentLength})) {
                            die (\@errors, "Missing fragLength field for building $daf->{dataType} '$newView' for replicate $alignmentLine->{replicate}\nThe fragLength is required and is the median fragment length used in generating this replicate.\n");
                        } else { # Letting medianFramentLength stand in for per relicate fragLength
                            $alignmentLine->{fragLength} = $daf->{medianFragmentLength};
                        }
                    }
                    if ($alignmentLine->{fragLength} < 0 || $alignmentLine->{fragLength} > 10000) {
                        die (\@errors, "Missing or invalid fragLength field for building $daf->{dataType} '$newView' for replicate $alignmentLine->{replicate}\nThe fragLength is required and is the median fragment length used in generating this replicate.\n");
                    }
                }
                my %line = %{$alignmentLine};
                $line{view} = $newView;
                $line{type} = 'wig';
                $ddfReplicateSets{$key}{VIEWS}{$newView} = \%line;
                my @unzippedFiles = ();
                doTime("beginning unzipping replicates files for view [$newView] key=[$key]") if $opt_timing;
                for my $file (@{$alignmentLine->{files}}) {
                    # Unzip any zipped files - only works if they are with .gz suffix
                    my ($fbase,$dir,$suf) = fileparse($file, ".gz");
                    if ($suf eq ".gz") {
                        # If the zipped file exists then unzip it (do this each time, in case zip file is updated
                        # This check is also done above at the stage where we are testign the files in the ddf exist
                        if (-s $file) {
                            my $err = system("gunzip -c $file > $dir/$fbase");
                            if ($err) {
                                die ("File \'$file\' failed gunzip $file to [$dir/$fbase]\n");
                            }
                            HgAutomate::verbose(2, "File \'$file\' gunzipped to \'$fbase\'\n");
                        }
                        if ( ! -s "$dir/$fbase") {
                            die ("Unzipped file \'$fbase\' does not exist (or is empty) for DDF file \'$file\'\n");
                        }
                        push @unzippedFiles, "$dir/$fbase";
                    } else {
                        push @unzippedFiles, $file;
                    }
                }
                doTime("done unzipping replicates files") if $opt_timing;
                $alignmentLine->{files} = \@unzippedFiles;
                # Now we can safely sort these files as none are zipped
                my $files = join(" ", @{$alignmentLine->{files}});
                my $tmpFile = $Encode::autoCreatedPrefix . $newView. "$tmpCount.bed"; # add the type of view to the name
                $tmpCount++;
                if($opt_skipAutoCreation) {
                    HgAutomate::verbose(2, "Skipping auto-creating view '$newView' for key '$key'\n");
                } else {
                    HgAutomate::verbose(2, "Auto-creating view '$newView' for key '$key' in file '$tmpFile'\n");
                    doTime("beginning Auto-create of view $newView in file $tmpFile") if $opt_timing;

                    # XXXX gzip before saving to disk?
                    my @cmds;
                    my $sortFiles;
                    if(defined($alignmentLine->{fragLength}) && $alignmentLine->{fragLength} != 0) {
                        push(@cmds, "/cluster/bin/x86_64/bedExtendRanges $daf->{assembly} $alignmentLine->{fragLength} $files");
                        $sortFiles = " -";
                        # sorting stdin, so have to sort in mem (and control how much mem we use)
                        push @cmds, "sort $SORT_BUF -T $Encode::tempDir -k1,1 -k2,2n $sortFiles";
                    } else {
                        $sortFiles = $files;
                        # sort each file in place, controling mem usage, then do merge sort
                        my @sortList = split(/\s+/, $sortFiles);
                        foreach my $f (@sortList) {
                            my $err = system("sort $SORT_BUF -T $Encode::tempDir -k1,1 -k2,2n -o $f $f ");
                            if ($err) {
                                die ("File \'$f\' failed sort\n");
                            }
                            HgAutomate::verbose(2, "File \'$f\' sorted\n");
                        }
                        # Now do the mergesort in the pipeline
                        push @cmds, "sort -m $SORT_BUF -T $Encode::tempDir -k1,1 -k2,2n $sortFiles";
                    }
                    push @cmds, "grep -v -E \"^track\" ";
                    push @cmds, "gawk '\$6 == \"+\" {print}'" if $newView eq "PlusRawSignal";
                    push @cmds, "gawk '\$6 == \"-\" {print}'" if $newView eq "MinusRawSignal";
                    push @cmds, "bedItemOverlapCount $daf->{assembly} stdin";
                    my $safe = SafePipe->new(CMDS => \@cmds, STDOUT => $tmpFile, DEBUG => $opt_verbose - 1);
                    if(my $err = $safe->exec()) {
                        print STDERR  "ERROR: failed auto bedItemOverlap creation of bedGraph for $key" . $safe->stderr() . "\n";
                        # don't show end-user pipe error(s)
                        pushError(\@errors, "failed creation of wiggle for '$key'");
                    }
                    doTime("done Auto-create of view $newView") if $opt_timing;
                }
                $line{files} = [$tmpFile];
                push(@ddfLines, \%line);
            }  # End foreach newView loop
        }
    } # End replicate sets loop
    doTime("done ddfReplicateSets loop") if $opt_timing;
}

# Name of the composite track this submission belongs to, and whether trackDb
# already has an entry for it (a missing entry is created further down).
my $compositeTrack = Encode::compositeTrackName($daf);
### No good reason to make this an error.  Composite entry can be added when subtracks are 1st added to trackDb.
#if(!$db->quickQuery("select count(*) from trackDb where tableName = ?", $compositeTrack)) {
#    pushError(\@errors, "Missing composite track '$compositeTrack'; please contact your data wrangler");
#}
my $compositeExists = $db->quickQuery("select count(*) from trackDb where tableName = ?", $compositeTrack);

# Report all accumulated user errors at once and stop.
if(@errors) {
    # Collapse identical errors into one entry.  The first occurrence of each
    # message is kept in place so the report follows the order in which the
    # problems were found (taking hash keys would scramble that order).
    my %seen;
    @errors = grep { !$seen{$_}++ } @errors;

    my $prefix = @errors > 1 ? "Error(s)" : "Error";
    die "$prefix:\n\n" . join("\n\n", @errors) . "\n";
}

# After this point, we don't use @errors and just die immediately.

# Validate files and metadata fields in all ddfLines using controlled
# vocabulary.  Create load.ra file for loader and trackDb.ra file for wrangler.
doTime("beginning out files") if $opt_timing;

# Open the three output handles.  Every handle is always opened (to /dev/null
# when its output is suppressed) so later prints need no special casing.
# Three-argument open keeps the path from being interpreted as part of the
# mode, and every open is checked.
if($opt_skipOutput) {
    open(LOADER_RA, '>>', '/dev/null') || die "SYS ERROR: Can't write '/dev/null'; error: $!\n";
    open(TRACK_RA, '>>', '/dev/null') || die "SYS ERROR: Can't write '/dev/null'; error: $!\n";

} else {
    open(LOADER_RA, '>', "$outPath/$Encode::loadFile") || die "SYS ERROR: Can't write \'$outPath/$Encode::loadFile\' file; error: $!\n";
    open(TRACK_RA, '>', "$outPath/$Encode::trackFile") || die "SYS ERROR: Can't write \'$outPath/$Encode::trackFile\' file; error: $!\n";

}
# The mdb file is written for normal runs and for -metaDataOnly runs.
if($opt_metaDataOnly || !$opt_skipOutput) {
    open(MDB_TXT, '>', "$outPath/$Encode::mdbFile") || die "SYS ERROR: Can't write \'$outPath/$Encode::mdbFile\' file; error: $!\n";
} else {
    open(MDB_TXT, '>>', '/dev/null') || die "SYS ERROR: Can't write '/dev/null'; error: $!\n";
}

# Create a composite track entry if the trackDb.ra entry was not found
if(!$opt_skipOutput && !$compositeExists) {
    printCompositeTdbSettings(*TRACK_RA,$daf,%ddfSets);
}

# XXXX Calculation of priorities still needs work; we currently don't account for multiple experiments in the same DDF.
# It may in fact be too much work to do automatic calculation of priorities (i.e. the wrangler may have to do it manually).

# Highest priority among existing subtracks of this composite (0 when there are none).
# The LIKE pattern is bound as a parameter, like the other trackDb queries in this
# script, rather than being interpolated into the SQL text.
my $priority = $db->quickQuery("select max(priority) from trackDb where settings like ?", "%subTrack $compositeTrack%") || 0;
$ddfLineNumber = 1;

# Submission id; filled in from $submitDir inside the loop below.
my $subId = 0;
# Main output loop: one pass per DDF line, writing its entries to the
# MDB_TXT, LOADER_RA and TRACK_RA handles.
foreach my $ddfLine (@ddfLines) {
    $ddfLineNumber++;
    my $diePrefix = "ERROR on DDF lineNumber $ddfLineNumber:";
    my $view = $ddfLine->{view};
    # Every view used in the DDF must have a DAF track entry with a type.
    my $type = $daf->{TRACKS}{$view}{type} || die "Missing DAF entry for view '$view'\n";
    my $sql = $daf->{TRACKS}{$view}{sql};
    my $lab = $daf->{lab};
    
    # The next 3 variables feed generateLongLabel(): the DAF dataType, and the
    # 'label' fields of the controlled vocabulary entries for that dataType and this view.
    my $longlabeldatatype = $daf->{dataType};
    my $longlabelprefix = ${${$terms{'dataType'}}{$longlabeldatatype}}{'label'};
    my $longlabelview = ${${$terms{'view'}}{$view}}{'label'};
    
    # Start the metadata string with the fixed project/DAF fields.
    my $metadata = "project=wgEncode grant=$daf->{grant} lab=$lab";
    $metadata .= " dataType=$daf->{dataType}";
    # cell and antibody come first so the leading fields have a stable order
    if($ddfLine->{cell}) {
        $metadata .= " cell=$ddfLine->{cell}";
    }
    if($ddfLine->{antibody}) {
        $metadata .= " antibody=$ddfLine->{antibody}";
    }
    # Fields that are either emitted explicitly (above or further down) or
    # never belong in the metadata string.
    my %handledElsewhere = map { $_ => 1 }
        qw(files cell antibody view replicate origAssembly labVersion softwareVersion);
    # Everything else on the DDF line that has a true value is appended as key=value.
    for my $field (keys %{$ddfLine}) {
        next if $handledElsewhere{$field};
        my $fieldValue = $ddfLine->{$field};
        next unless $fieldValue;
        $metadata .= " $field=$fieldValue";
    }
    # ChIPseq submissions get default setType and controlId terms when the
    # DDF line does not supply them.
    if($daf->{dataType} =~/ChIPseq/i) {
        unless($ddfLine->{setType}) {
            # An antibody containing "Input" marks the line as an input set
            $metadata .= ($ddfLine->{antibody} =~/Input/i) ? " setType=input" : " setType=exp";
        }
        unless($ddfLine->{controlId}) {
            # controlId is cell[/treatment]/Input[/protocol][/control]
            my $controlId = $ddfLine->{cell};
            if($ddfLine->{treatment}) {
                $controlId .= "/" . $ddfLine->{treatment};
            }
            $controlId .= "/Input"; # default antibody for ChIPseq is "Input"
            if($ddfLine->{protocol}) {
                $controlId .= "/" . $ddfLine->{protocol};
            }
            if($ddfLine->{control}) {
                $controlId .= "/" . $ddfLine->{control};
            }
            $metadata .= " controlId=$controlId";
        }
    }
    # Extend meta-data for mouse to input sex,strain and age information from CV without labs needing to input
    # meta-data in DDF for cell lines and primary cell lines.
    # Applies only to mm9 cells whose CV 'category' is not 'Tissue'; a value given on the DDF line wins over the CV.
    # NOTE(review): these nested lookups on %terms look like they can autovivify the
    # $terms{'Cell Line'}{<cell>} entry for a cell that is not in the CV; later
    # defined() tests on that entry may depend on this -- confirm before restructuring.
    if( ($daf->{assembly} eq 'mm9') && ($terms{'Cell Line'}->{$ddfLine->{cell}}->{'category'} ne 'Tissue')) {
       $metadata .= " sex=$terms{'Cell Line'}->{$ddfLine->{cell}}->{'sex'}" if !$ddfLine->{sex};
       $metadata .= " strain=$terms{'Cell Line'}->{$ddfLine->{cell}}->{'strain'}" if !$ddfLine->{strain};
       $metadata .= " age=$terms{'Cell Line'}->{$ddfLine->{cell}}->{'age'}" if !$ddfLine->{age};
    }
    # Fields held back from the generic key=value loop so they appear in a fixed order here.
    $metadata .= " view=$view";
    # replicate is only recorded for views the DAF marks as having replicates
    $metadata .= " replicate=$ddfLine->{replicate}" if $ddfLine->{replicate} && $daf->{TRACKS}{$view}{hasReplicates};
    $metadata .= " labVersion=$ddfLine->{labVersion}" if $ddfLine->{labVersion};
    $metadata .= " softwareVersion=$ddfLine->{softwareVersion}" if $ddfLine->{softwareVersion};
    $metadata .= " origAssembly=$ddfLine->{origAssembly}" if $ddfLine->{origAssembly};
    # dataVersion comes from the Encode module; mm9 uses its own value.
    if ($daf->{assembly} eq "mm9"){
        $metadata .= ' dataVersion="' . $Encode::mouseDataVersion . '"';
    } else {
        $metadata .= ' dataVersion="' . $Encode::dataVersion .'"';
    }
    # subId: prefer digits at the very end of $submitDir, else the first run of digits in it.
    # If neither matches, $subId keeps its previous value (initially 0).
    if($submitDir =~ /(\d+)$/) {
        $subId = $1;
    } elsif($submitDir =~ /(\d+)/) {
        $subId = $1;
    }
    $metadata .= " subId=$subId";
    # A DAF dataVersion above 1 must be accompanied by a dataVersionComment.
    if (defined($daf->{dataVersion}) && $daf->{dataVersion} > 1) {
        die "Need dataVersionComment in DAF when dataVersion is supplied\n" if (!defined($daf->{dataVersionComment}));
        $metadata .= ' submittedDataVersion="' . "V$daf->{dataVersion}" . " - $daf->{dataVersionComment}" . '"';
    }

    HgAutomate::verbose(2, "  View: $view\n");
    # $replicate stays undefined unless both the submission and this view use
    # replicates; when they do, the DDF value must be a positive number.
    my $replicate;
    if($hasReplicates && $daf->{TRACKS}{$view}{hasReplicates}) {
        $replicate = $ddfLine->{replicate};
        die "$diePrefix invalid or missing replicate value\n"
            unless defined($replicate) && $replicate > 0;
    }
    # The table name starts as the composite track name; variable tags, the
    # view and the replicate are appended further down.
    my $tableName = "$compositeTrack";
    # Abbreviations used for well-known views in table names and short labels
    my %shortViewMap = (
        Peaks          => 'Pk',
        Signal         => 'Sig',
        RawSignal      => 'Raw',
        Alignments     => 'Aln',
        PlusRawSignal  => 'PlusRaw',
        MinusRawSignal => 'MinusRaw',
    );

    # Make sure the DAF entry carries a (possibly empty) shortLabelPrefix and
    # use it as the starting short label.
    $daf->{TRACKS}{$view}{shortLabelPrefix} = ""
        unless defined($daf->{TRACKS}{$view}{shortLabelPrefix});
    my $shortLabel = $daf->{TRACKS}{$view}{shortLabelPrefix};

    # The long label is no longer assembled here; generateLongLabel() builds it
    # when the trackDb stanza is written.
    my $subGroups        = "view=$view";
    my $pushQDescription = "";
    my $species;
    my $tier1            = 0;
    # Filled from the per-line variables hash below; declared here because that
    # hash goes out of scope before the long label is generated.
    my %longlabelvars;
    # When the submission defines experiment variables (@variables), use this
    # line's values for them to extend $tableName, build the short label suffix
    # and pushQ description, and assemble the subGroups setting.
    if (@variables) {
        # %hash: variable name => value on this DDF line
        my %hash = map { $_ => $ddfLine->{$_} } @variables;
        #copied over here
        %longlabelvars = %hash;
        # Append the CV tag of each variable's value to the table name, in @variables order.
        for my $var (@variables) {
            # Map the DDF variable name to the CV type it is stored under in %terms
            my $cvTypeVar = $var;
            if ($var eq "antibody") {
                $cvTypeVar = "Antibody";
            } elsif ($var eq "cell") {
                $cvTypeVar = "Cell Line";
            } elsif ($var eq "obtainedBy") {
                $cvTypeVar = "lab";
            }
            # Values not found under their own CV type are looked up under 'control'
            if(!defined($terms{$cvTypeVar}->{$hash{$var}})) {
                $cvTypeVar = "control";
            }
            my $val = $terms{$cvTypeVar}->{$hash{$var}}->{'tag'};
            # Normalize the tag to one leading capital followed by lower case
            $val = ucfirst(lc($val));
            if($val ne 'None') {  # Special control term does not show up in the name!
                # trailing + => Pos, - => Neg (e.g. H9ES-AFP+)
                $val =~ s/\+$/Pos/;
                $val =~ s/\-$/Neg/;
                $tableName = $tableName . $val;
            }
        }

        # Pick the short label suffix and pushQ description from the first
        # combination of variables that is present on this line.
        my $shortSuffix = "";
        #longsuffix is deprecated, as the long label is being generated by a new subroutine
        if($hash{'antibody'} && $hash{'cell'}) {
            $pushQDescription = "$hash{'antibody'} in $hash{'cell'}";
            $shortSuffix = "$hash{'cell'} $hash{'antibody'}";
        } elsif($hash{'ripAntibody'} && $hash{'ripTgtProtein'} && $hash{'cell'}) {

            $pushQDescription = "";
            $shortSuffix = "$hash{'cell'} $hash{'ripTgtProtein'} $hash{'ripAntibody'}";
        } elsif($hash{'rnaExtract'} && $hash{'localization'} && $hash{'cell'}) {
            $shortSuffix = "$hash{'cell'} $hash{'localization'} $hash{'rnaExtract'}";
            # mapAlgorithm is appended without a separating space
            if ($hash{'mapAlgorithm'}) {
                $shortSuffix = $shortSuffix . $hash{'mapAlgorithm'};
            }
            $pushQDescription = "";
        } elsif($hash{'freezeDate'}) {
            $shortSuffix = $hash{'freezeDate'};
            $pushQDescription = "";
        } elsif ($hash{"species"}) {
            # NOTE(review): this first assignment is overwritten by the last line of this branch
            $pushQDescription = "$hash{'species'}";
            $shortSuffix = "$hash{'species'}";
            # $species, when set, replaces the DAF assembly in the load.ra stanza
            $species = "$hash{'species'}";
            $pushQDescription = "$view $daf->{dataType}";
        } elsif ($hash{"cell"}) {
            $pushQDescription = "$hash{'cell'}";
            $shortSuffix = "$hash{'cell'}";
            # Tier 1 cell lines; their subtracks are not written with "off" in trackDb.ra
            $tier1 = 1 if ($hash{'cell'} eq 'GM12878' || $hash{'cell'} eq 'K562' || $hash{'cell'} eq 'H1hESC');
        } else {
            # The message still mentions longSuffix although no such variable exists in this block
            warn "Warning: variables undefined for pushQDescription,shortSuffix,longSuffix\n";
        }
        # Add the abbreviated view and the replicate number to the suffix
        if(defined($shortViewMap{$view})) {
            $shortSuffix .= " " . $shortViewMap{$view};
        }
        if(defined($replicate)) {
            $shortSuffix .= " $replicate";
            $pushQDescription .= " Replicate $replicate";
        }
        # With a shortLabelPrefix the suffix goes in parentheses after it; otherwise the suffix is the label
        if($shortSuffix) {
            $shortLabel = $shortLabel ? "$shortLabel ($shortSuffix)" : $shortSuffix;
        }
        
       
        # Build the subGroups setting: one group=tag pair per variable, in sorted variable order.
        for my $var (sort keys %hash) {
            # The var name is over-ridden for antibody and cell, for historical reasons
            my $groupVar = $var;
            my $cvTypeVar = $groupVar;
            # handle inconsistent naming for antibody & cell type
            if ($var eq "antibody") {
                $groupVar = "factor";
                $cvTypeVar = "Antibody";
            } elsif ($var eq "cell") {
                $groupVar = "cellType";
                $cvTypeVar = "Cell Line";
            } elsif ($var eq "obtainedBy") {
              # Original author's note: it is unclear why subGroups prints correctly
              # with this obtainedBy check and gives an "uninitialized" error without
              # it. The behavior is odd since there is no $var of obtainedBy in the cv.ra
                 $cvTypeVar = "lab";
            }
            # Same fallback as above: unknown values are looked up under 'control'
            if(!defined($terms{$cvTypeVar}->{$hash{$var}})) {
                $cvTypeVar = "control";
            }
            $subGroups .= " $groupVar=$terms{$cvTypeVar}->{$hash{$var}}->{'tag'}";
        }
        #Venkat: Commented out the below line such that if any lab has replicates the replicate number will be placed
        # in the subGroups setting. The below code was found to be too specific, however if there are any problems
        # I have left the code in so that we can easily add it back in.
        #  if(defined($replicate) && ($daf->{lab} eq "HudsonAlpha" || $daf->{lab} eq "Uw") || $daf->{lab} eq "Gis") {
        if (defined($replicate)) {
            $subGroups .= " rep=rep$replicate"; # UGLY special casing
        }
    }

    # Finish the table name: abbreviated view (or the view itself when there is
    # no abbreviation), then "Rep<N>" for replicated views.
    $tableName .= defined($shortViewMap{$view}) ? $shortViewMap{$view} : $view;
    $tableName .= "Rep$replicate" if defined($replicate);

    # mysql doesn't allow hyphens in table names and our naming convention doesn't allow underbars; to be
    # safe, we strip non-alphanumerics.
    $tableName =~ s/[^A-Za-z0-9]//g;

    # Restriction date derived from the DDF file time, formatted as YYYY-MM-DD
    # (the year and month values are offset by 1900 and 1 respectively).
    my (undef, undef, undef, $rMDay, $rMon, $rYear) = Encode::restrictionDate($ddfFileTime);
    my $dateUnrestricted = sprintf("%04d-%02d-%02d", 1900 + $rYear, $rMon + 1, $rMDay);


    # dataVersion means the tableName must be different (append Vn), and the old metadata should be used for dateSubmitted and dateUnrestricted
    if(defined($daf->{dataVersion}) && $daf->{dataVersion} > 1) {
        my $prevTableName = "$tableName";
        # Find old metadata to lookup dateSubmitted and dateUnrestricted.
        # Look for the most recent earlier version first (V<n-1> down to V2),
        # then fall back to the unversioned table name.
        my $prevTableFound = 0;
        for (my $preVer=$daf->{dataVersion} - 1; $preVer > 1; $preVer--) {
            $prevTableFound = $db->quickQuery("select count(*) from trackDb where tableName = ?", $prevTableName . "V$preVer");
            if($prevTableFound) {
                $prevTableName .= "V$preVer";
                last;
            }
        }
        # All lookups bind the table name as a parameter instead of
        # interpolating it into the SQL text.
        if(!$prevTableFound) {
            $prevTableFound = $db->quickQuery("select count(*) from trackDb where tableName = ?", $prevTableName);
        }
        if($prevTableFound) {
            my $oldSettings = $db->quickQuery("select settings from trackDb where tableName = ?", $prevTableName);
            # Only the contents of the "metadata" settings line are of interest;
            # a missing settings value is treated like a missing metadata line.
            if(defined($oldSettings) && $oldSettings =~ m/metadata (.*?)\n/) {
                my ( $tagRef, $valRef ) = Encode::metadataLineToArrays($1);
                my @tags = @{$tagRef};
                my @vals = @{$valRef};
                # Walk the parallel tag/value arrays up to the first false tag
                my $tix = 0;
                while($tags[$tix]) {
                    if($tags[$tix] eq "dateUnrestricted") {
                        $dateUnrestricted = $vals[$tix];
                    } elsif($tags[$tix] eq "dateSubmitted") {
                        # Record the current date as the resubmission date and
                        # carry the original submission date forward.
                        # NOTE(review): $dateSubmitted is not declared inside this loop, so
                        # once overwritten here the old date is what later DDF lines see as
                        # the "current" date -- confirm whether that is intended.
                        $metadata .= " dateResubmitted=$dateSubmitted";
                        $dateSubmitted = $vals[$tix];
                    }
                    $tix++;
                }
            }
        }
        # Now finally complete the real tableName
        $tableName = $tableName . "V" . $daf->{dataVersion};
    }
    # Delayed adding these terms to metadata so that resubmissions could have the looked up term
    $metadata .= " dateSubmitted=$dateSubmitted";
    $metadata .= " dateUnrestricted=$dateUnrestricted";

    # We should add attic terms to the mdb.txt.  At least those that we can recognize.
    # At most one attic term is added; the first matching rule wins.
    my $atticTerm = "";
    my $trackConf = $daf->{TRACKS}{$view};
    if ($trackConf->{auxiliary} && $trackConf->{auxiliary} eq "yes") {
        $atticTerm = " attic=auxValid";
    }
    elsif ($ddfLine->{display} && $ddfLine->{display} eq "no") {
        $atticTerm = " attic=auxExp";
    }
    elsif ($type eq "document") {
        # FIXME: at this point, the pipeline is unprepared to deal with "sup" which is placed in "supplemental" subdir.
        $atticTerm = ($trackConf->{supplemental} && $trackConf->{supplemental} eq "yes")
                   ? " attic=sup"
                   : " attic=auxSup";
    }
    $metadata .= $atticTerm;

    # Abbreviate known long name components.  These must be substitutions
    # (s///): matching against a "/from/to/" string never changes $tableName.
    $tableName =~ s/Utaustin/Uta/;  # Special case for certain transgressors
    if(length($tableName) > 64) {
        # Apply further abbreviations one at a time, stopping as soon as the
        # name fits within the 64 character limit.
        my @abbreviations = (
            [ 'Hudsonalpha', 'Haib'  ],   # Special case for certain transgressors
            [ 'Sunyalbany',  'Sunya' ],
            [ 'Alignments',  'Aln'   ],
            [ 'Signal',      'Sig'   ],
            [ 'Control',     'Ctrl'  ],
        );
        for my $abbreviation (@abbreviations) {
            last if length($tableName) <= 64;
            my ($longForm, $shortForm) = @{$abbreviation};
            $tableName =~ s/$longForm/$shortForm/;
        }
        die "Table name [$tableName] too long, must be <= 64 chars, got [".length($tableName)."]\n" if length($tableName) > 64;
    }

    # Two data sets in one submission must never map to the same table name.
    dieTellWrangler("System Error: identical tableName '$tableName' was generated by multiple data sets\n")
        if $tableNamesUsed{$tableName}++;

    # Unless reloads are allowed, refuse a table that already exists.
    if(!$opt_allowReloads && $db->tableExist( $tableName)) {
        die "view '$view' has already been loaded as track '$tableName'\nPlease contact your wrangler if you need to reload this data\n";
    }

    # Pipeline instances other than "beta" and "standard" get the instance name
    # and the current directory's base name appended to the table name.
    chomp($submitDir = `pwd`);
    if ($pipelineInstance ne "beta" and $pipelineInstance ne "standard") {
        $tableName .= "_$pipelineInstance" . "_" . basename($submitDir);
    }

    # Name of the download file for this data set and the directory it goes to.
    my $downloadDir = Encode::downloadDir($daf);
    my $targetFile  = makeDownloadTargetFileName($tableName, $type, \@{$ddfLine->{files}} );
    # Unless reloads are allowed, refuse a download file that already exists.
    if(!$opt_allowReloads && (-e "$downloadDir/$targetFile")) {
        die "view '$view' has already been loaded as file '$downloadDir/$targetFile'\nPlease contact your wrangler if you need to reload this data\n";
    }
    # XXXX Move the decision about which views have tracks into the DAF?
    # Already this is used in 2 places so made it a function,
    # would be better in the DAF except we'd have to go change all the DAFs :(
    my $downloadOnly = isDownloadOnly($view, $daf->{grant}, $daf->{lab}, $daf);

    # The type with its spaces removed
    (my $fileType = $type) =~ s/ //g;

    $metadata .= " composite=$compositeTrack";
    # Only data sets that get a track carry a tableName term
    $metadata .= " tableName=$tableName" unless $downloadOnly;
    # BAM files are accompanied by an index file named <target>.bai, which is
    # listed next to the target file.  The dot before "bam" is escaped so only
    # names that really end in ".bam" qualify.
    my $baifile;
    if ($targetFile =~ m/\S+\.bam$/) {
        $baifile = $targetFile . ".bai";
    }

    # One metadata line per data set goes to the mdb file.
    if ($baifile) {
        print MDB_TXT sprintf("metadata %s fileName=%s,%s\n", $metadata, $targetFile, $baifile);
    } else {
        print MDB_TXT sprintf("metadata %s fileName=%s\n", $metadata, $targetFile);
    }
    # Write this data set's stanza to the load file; a blank line ends the stanza.
    print LOADER_RA "tablename $tableName\n" . "view $view\n" . "type $type\n";
    print LOADER_RA "sql $sql\n" if $sql;
    # A species taken from the DDF variables replaces the DAF assembly
    my $assemblyLine = $species ? "assembly $species\n" : "assembly $daf->{assembly}\n";
    print LOADER_RA $assemblyLine;
    print LOADER_RA "files @{$ddfLine->{files}}\n";
    print LOADER_RA "downloadOnly $downloadOnly\n" . "pushQDescription 1\n" . "targetFile $targetFile\n";
    print LOADER_RA "\n";

        
    # Write the trackDb.ra stanza for this subtrack; download-only data sets get none.
    if(!$downloadOnly) {
        print TRACK_RA "        track $tableName\n";
        if ($tier1 == 1) {
            # default to only Tier1 subtracks visible.  Wrangler should review if this is
            #   correct for the track
            print TRACK_RA "        parent " . $compositeTrack . "View" . $view . "\n";
        } else {
            print TRACK_RA "        parent " . $compositeTrack . "View" . $view . " off\n";
        }
        print TRACK_RA "        shortLabel $shortLabel\n";

        # Generate the long label - not all the passed variables are needed or used.
        # The aim is a label of the form
        #   Cell (age strain treatment protocol antibody control localization rnaExtract readType insertLength) dataType View (Rep) from ENCODE/lab
        # where only the parenthesized items that are present get used,
        # e.g. NHEK cell longPolyA RNA-seq Transcript Gencode V7 Rep 5 from ENCODE/CSHL
        my $longLabel = generateLongLabel($lab, \%longlabelvars, $replicate, $longlabelprefix, $longlabeldatatype, $longlabelview);
        print TRACK_RA "        longLabel $longLabel\n";
        print TRACK_RA "        subGroups $subGroups\n";

        # The trackDb type line depends on the DAF type of the view.
        if($type eq 'wig') {
            my $placeHolder = Encode::wigMinMaxPlaceHolder($tableName);
            print TRACK_RA "        type $type $placeHolder\n";
        } elsif($type eq 'bigWig') {
            # The first line of "bigWigInfo -minMax" output completes the type line.
            my $tmpFile = $Encode::autoCreatedPrefix . $type;
            my @cmds = ("bigWigInfo -minMax @{$ddfLine->{files}}");
            my $safe = SafePipe->new(CMDS => \@cmds, STDOUT => $tmpFile, DEBUG => $opt_verbose - 1);
            if(my $err = $safe->exec()) {
                print STDERR  "ERROR: failed bigWigInfo: " . $safe->stderr() . "\n";
                # don't show end-user pipe error(s).  @errors is no longer examined at
                # this stage, so the failure has to stop the run here instead of being
                # queued and silently lost.
                die "$diePrefix failed creation of trackDb\n";
            }
            my $lines = Encode::readFile($tmpFile);
            my $line = shift(@{$lines});
            if(!defined($line)) {
                die "$diePrefix bigWigInfo produced no output for view '$view'\n";
            }
            print TRACK_RA "        type bigWig " . $line . "\n";

        } elsif($type eq 'gtf') { # GTF is converted to and loaded as genePred
            print TRACK_RA "        type genePred\n";
        } elsif($type eq 'tagAlign') { # tagAligns are bed 6 but with column called 'sequence' instead of 'name'
            print TRACK_RA "        type bed 6\n";
        } else {
            print TRACK_RA "        type $type\n";
        }

        # Optional settings, written only when the DDF line has a non-empty value
        if(defined($ddfLine->{accession}) && length($ddfLine->{accession}) > 0) {
            print TRACK_RA sprintf("        accession %s\n",$ddfLine->{accession});
        }
        if(defined($ddfLine->{origAssembly}) && length($ddfLine->{origAssembly}) > 0) {
            print TRACK_RA sprintf("        origAssembly %s\n",$ddfLine->{origAssembly});
        }
        # color track by color setting for cell type in cv.ra
        if(defined($ddfLine->{cell})) {
            if(defined($terms{'Cell Line'}->{$ddfLine->{cell}}->{'color'})) {
                print TRACK_RA sprintf("        color %s\n",
                        $terms{'Cell Line'}->{$ddfLine->{cell}}->{'color'});
            }
        }
        # Trailing comment line recording the submission id and date; a blank line ends the stanza
        print TRACK_RA sprintf("        # subId=%s dateSubmitted=%s\n", $subId,$dateSubmitted);
        print TRACK_RA "\n";
    }
}
# Close the output files.  Buffered write errors may only surface at close
# time, so a failed close is treated as a system error like a failed open.
close(LOADER_RA) || die "SYS ERROR: Can't close load file output; error: $!\n";
close(TRACK_RA) || die "SYS ERROR: Can't close trackDb file output; error: $!\n";
close(MDB_TXT) || die "SYS ERROR: Can't close mdb file output; error: $!\n";

doTime("done out files") if $opt_timing;

# When the submission path ends in a numeric id and its parent directory name
# contains "_<instance>", record a summary of this submission in the projects
# table of the matching encpipeline_<instance> database.
if($submitPath =~ /(\d+)$/) {
    my $id = $1;
    if(dirname($submitPath) =~ /_(\w+)/) {
        my $instance = $1;
        # XXXX rubyDb logic s/d probably be moved to Encode.pm
        my $rubyDb = HgDb->new(DB => "encpipeline_$instance");
        # The keys of %metadataHash are stored joined by "; " together with their count
        my @tmp = keys %metadataHash;
        my $count = scalar(@tmp);
        my $metadata = join("; ", @tmp);
        HgAutomate::verbose(2, "Updating id '$id'; metdata: '$metadata'; count: '$count'\n");
        $rubyDb->execute("update projects set count = ?, metadata = ?, db = ?, lab = ?, data_type = ?, track = ? where id = ?",
             $count, $metadata,
             $daf->{assembly}, $daf->{lab}, $daf->{dataType}, $compositeTrack, $id);
    }
}

# generateLongLabel - build the trackDb longLabel text for one subtrack.
#
# Positional arguments:
#   $lab        lab name; a trailing "-m" is removed
#   $varsRef    hash ref of variable name => value for this data set
#   $replicate  replicate number; when false, no "Rep <n>" is added
#   $prefix     label text placed before the view
#   $datatype   data type name (accepted for compatibility; not used in the label)
#   $view       label text for the view
#
# Returns "<cell>[ <variables in fixed order>] <prefix> <view>[ Rep <n>] from ENCODE/<lab>"
# with underscores turned into spaces.  Labels longer than 80 characters get
# " #too long length = <n>" appended, and every label ends in " #autogenerated".
#
# Undefined arguments and undefined variable values are treated as empty/missing.
# The caller's variables are never modified.
sub generateLongLabel {
    my ($lab, $varsRef, $replicate, $prefix, $datatype, $view) = @_;

    # Default undefined scalars on the local copies, before they are used
    # (assigning through @_ would change the caller's variables and leave the
    # locals undefined).
    for my $arg ($lab, $replicate, $prefix, $datatype, $view) {
        $arg = "" unless defined($arg);
    }
    my %vars = defined($varsRef) ? %{$varsRef} : ();

    #takes off -m if the lab does mouse also
    $lab =~ s/\-m$//g;

    #the order that the edv's should come in after cell
    my @order = qw (age strain treatment protocol antibody control localization rnaExtract readType insertLength);

    #always a cell first
    my $longlabel = defined($vars{'cell'}) ? "$vars{'cell'}" : "";
    # Variables are added in @order; anything not listed there never appears in the label.
    foreach my $key (@order){
        #if the particular track doesn't have an EDV, then skip it
        next unless defined($vars{$key});
        #don't put anything that matches none to the label
        next if lc($vars{$key}) =~ m/none/;
        $longlabel = $longlabel . " $vars{$key}";
    }
    #switch for yes or no on replicate
    if ($replicate){
        $longlabel = $longlabel . " $prefix $view Rep $replicate from ENCODE/$lab";
    }
    else {
        $longlabel = $longlabel . " $prefix $view from ENCODE/$lab";
    }

    #turn all _ into spaces
    $longlabel =~ s/\_/ /g;

    #length checker: flag (but do not shorten) labels over 80 characters
    my $llength = length ($longlabel);
    if ($llength > 80){
        $longlabel = $longlabel . " #too long length = $llength";
    }

    $longlabel = $longlabel . " #autogenerated";
    return $longlabel;

}


# Final timing report and successful exit.  $time0 is rewound to the script's
# start time first -- presumably so doTime reports the total run time; confirm
# against doTime.
$time0 = $timeStart;
if ($opt_timing) {
    doTime("done. ");
}
exit 0;