#!/bin/env python

import sys
import optparse
import os
import re
import subprocess
import md5

#### Classes ###################################################################

#### Functions #################################################################

def get_file_set(path, regexp):
    """Collect the names of the entries in directory `path` matching `regexp`.

    The pattern is applied with re.match, so it is anchored at the start of
    each entry name only.  The result is a set of bare entry names (no
    directory component), exactly as os.listdir reports them.
    """
    matches = re.compile(regexp).match
    return set(entry for entry in os.listdir(path) if matches(entry))

def table_exists(database, table):
    """Report whether `table` exists in `database`.

    Runs ``hgsql <database> -e "DESC <table>;"`` and treats a zero exit
    status as "the table exists".  Any other outcome -- including hgsql not
    being available -- yields False.

    Raises:
        ValueError: if the table name contains a double quote.
    """
    if "\"" in table:
        raise ValueError("table name contains a \"")
    # Pass an argument list instead of a shell string: table names are derived
    # from file names, and characters such as $, ` or ; must never reach a
    # shell.
    command = ["hgsql", database, "-e", "DESC %s;" % table]
    try:
        proc = subprocess.Popen(command, stdout=subprocess.PIPE,
                                         stdin=subprocess.PIPE,
                                         stderr=subprocess.PIPE)
    except OSError:
        # With a shell, a missing hgsql showed up as a non-zero exit status;
        # keep reporting that case as "table not found".
        return False
    proc.communicate()  # drain the pipes and wait for hgsql to finish
    return proc.returncode == 0

def parse_version(name):
    """Split a name such as tableNameV4 into its stem and version number.

    Returns a (stem, version) pair.  A name that does not end in V<digits>
    is returned unchanged with version 1.
    """
    found = re.match("(.*)V([0-9]+)$", name)
    if found is not None:
        stem, digits = found.groups()
        return stem, int(digits)
    return name, 1

def prev_version(name, version):
    """Return the versioned name that precedes a name/version pair.

    For example ("table", 3) gives "tableV2".

    Raises:
        ValueError: if `version` is 1 or lower, since there is nothing before
            the first version (and a result like "tableV-1" would not be a
            valid versioned name).
    """
    if version <= 1:
        raise ValueError("there is no previous version of table %s" % name)
    return "%sV%d" % (name, version - 1)

def next_version(name, version):
    """Build the versioned name that follows a name/version pair."""
    successor = version + 1
    return "%sV%d" % (name, successor)

def get_gbdb_pathname(database, table_name):
    """Extract the gbdb pathname stored in a gbdb table.

    The first row of `table_name` is queried through hgsql, trying the column
    ``file`` first and ``fileName`` second.  The output of the first query
    that exits with status 0 is returned with trailing whitespace stripped
    (this is an empty string when the query succeeds but yields no row).

    Returns None when neither query succeeds, or when hgsql cannot be run.
    """
    for column in ("file", "fileName"):
        query = "SELECT %s FROM %s LIMIT 1;" % (column, table_name)
        # Argument list, no shell: the table name comes from a file name and
        # must not be subject to shell expansion.
        command = ["hgsql", database, "--skip-column-names", "-e", query]
        try:
            proc = subprocess.Popen(command, stdout=subprocess.PIPE,
                                             stdin=subprocess.PIPE,
                                             stderr=subprocess.PIPE)
        except OSError:
            # hgsql is not runnable; previously the shell reported this as a
            # failing exit status for both queries.
            return None
        output = proc.communicate()[0]
        if proc.returncode == 0:
            return output.rstrip()
    return None

def _md5_digest(path, block_size=2**24):
    """Return the MD5 digest of the file at `path`, read in binary blocks."""
    try:
        import hashlib
        digest = hashlib.md5()
    except ImportError:
        # Interpreters older than 2.5 have no hashlib; fall back to the md5
        # module imported at the top of this file.
        digest = md5.new()
    handle = open(path, "rb")
    try:
        block = handle.read(block_size)
        while block:    # an empty read marks end of file
            digest.update(block)
            block = handle.read(block_size)
    finally:
        handle.close()
    return digest.digest()


def same_file(x, y, do_md5=True):
    """Check whether two paths hold the same file, cheapest test first.

    The checks are, in order:
      1. os.path.samefile says both paths are the same file -> True
      2. the sizes differ -> False
      3. `do_md5` is false -> True (equal size is taken as good enough)
      4. otherwise the MD5 digests of the contents are compared

    Errors from the os.path calls or from opening the files (for example a
    missing file) propagate to the caller.
    """
    if os.path.samefile(x, y):
        return True
    if os.path.getsize(x) != os.path.getsize(y):
        return False
    if not do_md5:
        return True
    return _md5_digest(x) == _md5_digest(y)

def remove_duplicates(list):
    """Return a new list with the distinct items of `list`.

    Items are kept in the order of their first occurrence, so the result is
    deterministic for a given input.  Items must be hashable.  The input is
    not modified.
    """
    # NOTE: the parameter name shadows the builtin; it is kept so existing
    # callers (including keyword callers) continue to work.
    seen = set()
    unique = []
    for item in list:
        if item not in seen:
            seen.add(item)
            unique.append(item)
    return unique


#### Main ######################################################################

# File types, i.e. the "type" part of a release file named
# name.type[.compression], grouped by how the browser stores them.
# Files that have both a downloadable file and a database table:
_TABLE_AND_FILE_TYPES = re.compile(r"^(narrowPeaks|narrowPeak|broadPeak|gtf|bedRnaElements|bedRrbs|bedGraph\d+|bed\d*|bedLogR|pairedTagAlign|shortFrags|peptideMapping)$")
# Files whose table points at a file under gbdb:
_GBDB_TYPES = re.compile(r"^(wig|tagAlign|bigWig|bam)$")
# Files that are downloadable only:
_FILE_ONLY_TYPES = re.compile(r"^(matrix|pdf|fastq|fasta|rpkm|bowtie|psl|csqual|csfasta|junction|fpkm|fpkm1|fpkm2|insDistrib|insDist|junction|txt|tar)$")


def _emit(text="", stream=None):
    """Write `text` followed by a newline to `stream` (stdout by default)."""
    if stream is None:
        stream = sys.stdout
    stream.write(text + "\n")


def _emit_sorted(items, stream=None):
    """Write the items in sorted order, one per line."""
    for item in sorted(items):
        _emit(item, stream)


def _warn(warnings, message):
    """Record a warning for the report and echo it on stderr."""
    warnings.append(message)
    _emit(message, sys.stderr)


def _new_bucket():
    """Return empty tables/files/gbdbs lists for one category of files."""
    return {"tables": [], "files": [], "gbdbs": []}


def _split_release_filename(filename):
    """Split name.type[.compression] into (name, type), validating the shape."""
    parts = filename.split(".")
    if len(parts) == 2:
        name, file_type = parts
        compression = None
    elif len(parts) == 3:
        name, file_type, compression = parts
    else:
        raise ValueError("cannot parse file name %s: expected name.type[.compression]" % filename)
    if compression not in (None, "gz", "tgz"):
        raise ValueError("unexpected compression %s of file %s" % (compression, filename))
    return name, file_type


def _collect_gbdb(database, table, suffix, filename, bucket, warnings):
    """Add the table's gbdb path plus `suffix` to the bucket, or warn if absent."""
    gbdb_path = get_gbdb_pathname(database, table)
    if gbdb_path is None:
        _warn(warnings, "could not find gbdb path in table %s.%s" % (database, table))
    elif os.path.exists(gbdb_path + suffix):
        bucket["gbdbs"].append(gbdb_path + suffix)
    else:
        _warn(warnings, "could not find %s file for %s" % (gbdb_path + suffix, filename))


def _collect_table(database, table, file_type, bucket, warnings):
    """Add the table to the bucket if it exists, else warn; return whether it exists."""
    if table_exists(database, table):
        bucket["tables"].append(table)
        return True
    _warn(warnings, "table %s does not exist, from filetype %s" % (table, file_type))
    return False


def _classify_release_file(filename, name, file_type, database, path_prefix,
                           bucket, warnings):
    """File one release file (and its table/gbdb companions) into the bucket."""
    download = path_prefix + filename
    if _GBDB_TYPES.match(file_type):
        bucket["files"].append(download)
        if file_type == "bam":
            # a bam file travels with its .bai index, both as a download and
            # under gbdb
            if os.path.exists(download + ".bai"):
                bucket["files"].append(download + ".bai")
            else:
                _warn(warnings, "could not find .bai file for bam file %s" % filename)
            _collect_gbdb(database, name, ".bai", filename, bucket, warnings)
        if _collect_table(database, name, file_type, bucket, warnings):
            _collect_gbdb(database, name, "", filename, bucket, warnings)
    elif _TABLE_AND_FILE_TYPES.match(file_type):
        bucket["files"].append(download)
        _collect_table(database, name, file_type, bucket, warnings)
    elif _FILE_ONLY_TYPES.match(file_type):
        bucket["files"].append(download)
    else:
        raise ValueError("unknown type %s of file %s" % (file_type, filename))


def main(argv=None):
    """ Generate a human readable file describing the changes between two
        releases of an ENCODE track.

        argv is the full argument vector including the program name in
        argv[0]; it defaults to sys.argv.  The positional arguments are
        ``database current_release (prev_release|-)`` where "-" means there
        is no previous release.

        The report is written to stdout and warnings are echoed on stderr.
        With --files / --tables the new files / new tables are additionally
        dumped to the given paths.  The parsed options are stored in the
        module-level global ``options``.

        Exits with status 10 (after printing the help text) unless exactly
        three positional arguments are given.  Raises ValueError for a file
        whose name cannot be split into name.type[.compression], whose
        compression is not gz/tgz, or whose type is unknown.
    """
    global options

    if argv is None:
        argv = sys.argv

    # parse the args
    parser = optparse.OptionParser(usage="%prog [options] database current_release (prev_release|-)",
        version="%prog 0.9")
    parser.add_option("-v", "--verbose", dest="verbose", action="store_true", default=False)
    parser.add_option("-t", "--composite-name", dest="composite_name",
        help="the name of the composite track by default this is the name of the current directory",
        metavar="N", default=None)
    parser.add_option("-n", "--track-name", dest="track_name",
        help="the English name of track, by default this is \"ENCODE [composite-name]\"", metavar="N", default=None)
    parser.add_option("--files", dest="files_path", help="dump list of new files to F", metavar="F")
    parser.add_option("--tables", dest="tables_path", help="dump list of new tablesto F", metavar="F")
    parser.add_option("--disable-md5-checks", dest="do_md5_checks", action="store_false", help="disable MD5 checks on files", default=True)

    # honour the argv handed to us instead of always re-reading sys.argv
    (options, args) = parser.parse_args(argv[1:])

    # output a usage message
    if len(args) != 3:
        parser.print_help()
        sys.exit(10)

    # default composite name is the current directory name
    if options.composite_name is None:
        options.composite_name = os.path.basename(os.getcwd())

    # default track name is "ENCODE composite_name"
    if options.track_name is None:
        options.track_name = "ENCODE %s" % options.composite_name

    # get the positional args
    database, current_release_dir, prev_release_dir = args

    # locations quoted in the report; every listed file gets the full path of
    # the current release as prefix
    hgdownload_dir = "/usr/local/apache/htdocs-hgdownload/goldenPath/%s/encodeDCC/%s/" \
            % (database, options.composite_name)
    rr_dir = "/usr/local/apache/htdocs/goldenPath/%s/encodeDCC/%s/" \
            % (database, options.composite_name)
    path_prefix = "/usr/local/apache/htdocs-hgdownload/goldenPath/%s/encodeDCC/%s/%s/" \
            % (database, options.composite_name, current_release_dir)

    # generate the list of files
    file_pattern = r".*\.(gz|bigWig|bam|tgz)$"  # the files we even consider
    current_files = get_file_set(current_release_dir, file_pattern)
    if prev_release_dir == "-":
        prev_files = set()
        prev_release_dir = ""
    else:
        prev_files = get_file_set(prev_release_dir, file_pattern)

    # form the three derived sets
    removed_files = prev_files - current_files
    unchanged_files = current_files & prev_files
    new_files = current_files - prev_files

    warnings = []
    unchanged = _new_bucket()
    removed = _new_bucket()
    new = _new_bucket()

    if not options.do_md5_checks:
        warnings.append("Use of MD5 checksums to verify unchanged files has been disabled.")

    # process the unchanged files (sorted so warnings come out in a stable order)
    for f in sorted(unchanged_files):
        name, file_type = _split_release_filename(f)

        # we don't deal with revisions yet
        # NOTE(review): new_files holds full file names (with extensions)
        # while next_version() yields a bare name, so this membership test
        # looks like it can never match -- confirm before relying on it.
        stem, version = parse_version(name)
        if next_version(stem, version) in new_files:
            raise ValueError("Unimplemented: newer version of %s found, can't deal with this yet" % name)

        # check to make sure the files are really the same
        if not same_file(os.path.join(current_release_dir, f),
                         os.path.join(prev_release_dir, f),
                         options.do_md5_checks):
            _warn(warnings, "file %s in %s and %s don't appear to be the same (type=%s)" % \
                (name, current_release_dir, prev_release_dir, file_type))

        _classify_release_file(f, name, file_type, database, path_prefix,
                               unchanged, warnings)

    # process the removed and the new files
    for filenames, bucket in ((removed_files, removed), (new_files, new)):
        for f in sorted(filenames):
            name, file_type = _split_release_filename(f)
            _classify_release_file(f, name, file_type, database, path_prefix,
                                   bucket, warnings)

    # check the list of addition files
    # NOTE(review): existence is tested relative to the working directory,
    # not the release directory -- presumably the script is run from there.
    addition_files = []
    for f in ["README.txt", "files.txt", "md5sum.txt"]:
        if os.path.exists(f):
            addition_files.append(path_prefix + f)
        else:
            _warn(warnings, "addition file %s not found" % f)

    categories = (("unchanged", unchanged), ("removed", removed), ("new", new))

    # output some basic stats (taken before duplicates are removed)
    if options.verbose:
        _emit("Counts:", sys.stderr)
        for label, bucket in categories:
            for kind in ("tables", "files", "gbdbs"):
                _emit("  %s %s: %d" % (label, kind, len(bucket[kind])), sys.stderr)
        _emit("  additional files: %d" % len(addition_files), sys.stderr)

    for label, bucket in categories:
        for kind in ("tables", "files", "gbdbs"):
            bucket[kind] = remove_duplicates(bucket[kind])
    addition_files = remove_duplicates(addition_files)

    # if asked, save the list of new files
    if options.files_path:
        new_files_file = open(options.files_path, "w")
        try:
            _emit_sorted(new["files"], new_files_file)
            _emit("", new_files_file)
            _emit_sorted(new["gbdbs"], new_files_file)
        finally:
            new_files_file.close()

    # if asked, generate list of new tables
    if options.tables_path:
        new_tables_file = open(options.tables_path, "w")
        try:
            _emit_sorted(new["tables"], new_tables_file)
            _emit("", new_tables_file)
        finally:
            new_tables_file.close()

    # generate the header
    _emit("# generated with %s" % parser.get_version())
    _emit("This is a %s of the \"%s\"" % (current_release_dir, options.track_name))
    _emit("The composite track is %s" % options.composite_name)

    if len(warnings) > 0:
        warn_header = "# WARNINGS "
        _emit(warn_header + "#" * (60 - len(warn_header)))
        c = 1
        for w in warnings:
            _emit("%0d - %s" % (c, w))
            c += 1
        _emit("#" * 60)

    # legend; spelled out line by line so the trailing blanks stay visible
    _emit("\n".join((
        "",
        "Categories of tables and files('):",
        "A) Untouched - are on public browser and should remain",
        "B) Deprecated - are currently on RR but will no longer be needed and should not be referenced by the public site.",
        "   NOTE: NO FILES SHOULD BE REMOVED from the downloads directory on hgdownloads (RR).  ",
        "   This list is provided for completeness. Any files marked here as in gbdb may be eliminated.",
        "C) New - are only currently on test but will need to be pushed to the RR.",
        "D) Additional items of note",
        "",
    )))

    # some summary counts of current files, i.e. new + untouched
    _emit("Summary total counts for %s (new+untouched):" % current_release_dir)
    _emit("    Tables: %d" % (len(unchanged["tables"]) + len(new["tables"])))
    _emit("    Files: %d" % (len(unchanged["files"]) + len(new["files"]) + len(addition_files)))
    _emit("    Gbdbs: %d" % (len(unchanged["gbdbs"]) + len(new["gbdbs"])))
    _emit()

    # untouched list
    _emit("A) Untouched Tables (%d):" % len(unchanged["tables"]))
    _emit_sorted(unchanged["tables"])
    _emit()

    _emit("A') Untouched Files (%d downloadables, %d gbdbs):" % (len(unchanged["files"]),
                                                                len(unchanged["gbdbs"])))
    if prev_release_dir == "":
        # without a previous release nothing can be unchanged
        assert len(unchanged["files"]) == 0
        assert len(unchanged["gbdbs"]) == 0
    else:
        _emit("    current location on alpha:")
        _emit("      " + path_prefix)
        _emit("    on RR:")
        _emit("      " + hgdownload_dir)
        _emit()
        _emit_sorted(unchanged["files"])
        _emit()
        _emit_sorted(unchanged["gbdbs"])
    _emit()

    # eliminated list
    _emit("B) Deprecated tables (%d):" % len(removed["tables"]))
    _emit_sorted(removed["tables"])
    _emit()

    _emit("B') Deprecated files (%d downloadables, %d gbdbs):" % (len(removed["files"]),
                                                                 len(removed["gbdbs"])))
    if prev_release_dir == "":
        # without a previous release nothing can have been removed
        assert len(removed["files"]) == 0
        assert len(removed["gbdbs"]) == 0
    else:
        _emit("    NOTE: NO FILES SHOULD BE REMOVED from the downloads directory on hgdownloads (RR).")
        _emit("    This list is provided for completeness. Any files marked here as in gbdb may be eliminated.")
        _emit("    current location on alpha:")
        _emit("      /usr/local/apache/htdocs-hgdownload/goldenPath/%s/encodeDCC/%s/%s/" % (database, options.composite_name, prev_release_dir))
        _emit("    on RR:")
        _emit("      " + hgdownload_dir)
        _emit()
        _emit_sorted(removed["files"])
        _emit()
        _emit_sorted(removed["gbdbs"])
    _emit()

    # new list
    _emit("C) New tables (%d):" % len(new["tables"]))
    _emit_sorted(new["tables"])
    _emit()

    _emit("C') New files (%d downloadables, %d gbdbs):" % (len(new["files"]),
                                                          len(new["gbdbs"])))
    _emit("    current location on alpha:")
    _emit("      " + path_prefix)
    _emit("    NOT on RR but must be placed in:")
    _emit("      " + rr_dir)
    _emit()
    _emit_sorted(new["files"])
    _emit()
    _emit_sorted(new["gbdbs"])
    _emit()

    _emit("D) Additional items:")
    _emit("    current location on alpha:")
    _emit("      " + path_prefix)
    _emit("    should be placed on the RR in (overwritting any existing copy):")
    _emit("      " + rr_dir)
    _emit()
    for f in addition_files:
        _emit(f)

#### Module ####################################################################

# Run main() only when this file is executed as a script; main()'s return
# value (None on a normal run) is handed to sys.exit as the exit status.
if __name__ == "__main__":
    sys.exit(main())
