/* Copyright (C) 2011 The Regents of the University of California * See kent/LICENSE or http://genome.ucsc.edu/license/ for licensing information. */ /* wigAsciiToBinary - convert ascii Wiggle data to binary file * plus, produces file read for load into database * * The ascii data file is simply two columns, white space separated: * first column: offset within chromosome, 1 relative closed end * second column: any real data value * * Each data block will have its own specific scale computed * * This first pass will assume the scores are specified for * a single nucleotide base. If the offset sequence has * missing data in it, the missing data will be set to the * "WIG_NO_DATA" indication. (WIG_NO_DATA == 128) * * The binary data file is simply one unsigned char value per byte. * The suffix for this binary data file is: .wib - wiggle binary * * The second file produced is a bed-like format file, with chrom, * chromStart, chromEnd, etc. See hg/inc/wiggle.h for structure * definition. The important thing about this file is that it will * have the pointers into the binary data file and definitions of * where the bytes belong. This file extension is: .wig - wiggle * * Holes in the data are allowed. A command line argument can * specify the actual span of a data point == how many nucleotide * bases the value is representative of. * * There is a concept of a "bin" size which means how many values * should be maintained in a single table row entry. This can be * controlled by command line argument. Holes in the input data * will be filled with NO_DATA values up to the size of the current * "bin". If the missing data continues past the next "bin" size * number of data points, that table row will be skipped. Thus, it * is only necessary to have table rows which contain valid data. * The "bin" size is under control with a command line argument. */ #include "common.h" #include "options.h" #include "linefile.h" #define WIG_NO_DATA 128 #define MAX_BYTE_VALUE 127 /* command line option specifications */ static struct optionSpec optionSpecs[] = { {"offset", OPTION_LONG_LONG}, {"binsize", OPTION_LONG_LONG}, {"dataSpan", OPTION_LONG_LONG}, {"chrom", OPTION_STRING}, {"wibFile", OPTION_STRING}, {"name", OPTION_STRING}, {"minVal", OPTION_FLOAT}, {"maxVal", OPTION_FLOAT}, {"obsolete", OPTION_BOOLEAN}, {NULL, 0} }; /* FEATURE REQUEST - allow specification of columns in data file * where the chrom offset and data value can be found. This would * allow any type of column data file to work. Also, it may be * necessary in the future to allow a third column of data input * which would specify the chromosome this data belongs to, or even * a fourth column which names the data element, although these * extra columns and subsequent processing would require quite a * bit of work. It is by far much easier to do pre-processing of * mixed data like that to break it up into separate files for each * chromosome and named feature. */ static long long add_offset = 0; /* to allow "lifting" of the data */ static long long binsize = 1024; /* # of data points per table row */ static long long dataSpan = 1; /* bases spanned per data point */ static char * chrom = (char *) NULL; /* to specify chrom name */ static char * wibFile = (char *) NULL; /* to name the .wib file */ static char * name = (char *) NULL; /* to specify feature name */ static double minVal = BIGNUM; /* minimum value that is allowed in data. */ static double maxVal = BIGNUM; /* maximum value that is allowed in data. */ static boolean trimVals = FALSE;/*should the data be trimmed by min & maxVal?*/ static boolean obsolete = FALSE;/*use this program even though it is obsolete*/ static void usage() /* Explain how to use program. */ { errAbort( "wigAsciiToBinary - convert ascii Wiggle data to binary file\n" "usage: wigAsciiToBinary [options] \n" "\toptions:\n" "\t - list of files to process, (use: stdin for data from pipe)\n" "\t-offset=N - add N to all coordinates, default 0\n" "\t-binsize=N - # of points per database row entry, default 1024\n" "\t-dataSpan=N - # of bases spanned for each data point, default 1\n" "\t-chrom=chrN - this data is for chrN. Use with twoColumn output\n" "\t-wibFile=chrN - to name the .wib output file\n" "\t-name= - to name the feature, default chrN or\n" "\t\t-chrom= specified\n" "\t-verbose=N - display process while underway (only N=2 does something)\n" "\t-minVal=N - minimum allowable data value, values will be capped here.\n" "\t-maxVal=N - maximum allowable data value, values will be capped here.\n" "\t-obsolete - Use this program even though it is obsolete.\n" "If the name of the input files are of the form: chrN.<....> this will\n" "\tset the output file names. Otherwise use the -wibFile option.\n" "Generally the ascii files have one to three columns.\n" "\t or \n" "\t or\n" "\t\n" "If the sequence position is not specified in a line, it is assumed to be\n" "one past the previous position. If the sequence name is not specified\n" "it is the same as the previous line.\n" ); } static unsigned long long bincount = 0; /* to count binsize for wig file */ static unsigned long long chromStart = 0; /* for table row data */ static unsigned long long chromEnd = 0; /* for table row data */ static FILE *wigout; /* file handle for wiggle database file */ static FILE *binout; /* file handle for binary data file */ static char *chromName = (char *) NULL; /* pseudo bed-6 data to follow */ static char featureName[254]; /* the name of this feature */ static unsigned long long rowCount = 0; /* to count rows output */ static off_t fileOffsetBegin = 0; /* where this bin started in binary */ static char *binfile = (char *) NULL; /* file name of binary data file */ static unsigned long long Offset = 0; /* from data input */ static off_t fileOffset = 0; /* where are we in the binary data */ static double *data_values = (double *) NULL; /* all the data values read in for this one row */ static unsigned char * validData = (unsigned char *) NULL; /* array to keep track of what data is valid */ static double overallUpperLimit; /* for the complete set of data */ static double overallLowerLimit; /* for the complete set of data */ /* * The table rows need to be printed in a couple of different * places in the code. * Definition of this row format can be found in hg/inc/wiggle.h * and hg/lib/wiggle.as, etc ... */ static void output_row() { double lowerLimit = 1.0e+300; double upperLimit = -1.0e+300; double dataRange = 0.0; double sumData = 0.0; double sumSquares = 0.0; unsigned long long i; unsigned long long validCount = 0; /* number of valid data points */ if (bincount) { /* scan the data and determine stats for it */ for (i=0; i < bincount; ++i) { if (validData[i]) { ++validCount; if (data_values[i] > upperLimit) upperLimit = data_values[i]; if (data_values[i] < lowerLimit) lowerLimit = data_values[i]; sumData += data_values[i]; sumSquares += data_values[i] * data_values[i]; } } if (validCount < 1) { errAbort("ERROR: empty row being produced at row: %d\n", rowCount); } dataRange = upperLimit - lowerLimit; if (upperLimit > overallUpperLimit) overallUpperLimit = upperLimit; if (lowerLimit < overallLowerLimit) overallLowerLimit = lowerLimit; /* With limits determined, can now scale and output the values * compressed down to byte data values for the binary file */ for (i=0; i < bincount; ++i) { int byteOutput; /* should be unsigned char, but fputc() wants int */ if (validData[i]) { if (dataRange > 0.0) { byteOutput = MAX_BYTE_VALUE * ((data_values[i] - lowerLimit)/dataRange); } else { byteOutput = 0; } } else { byteOutput = WIG_NO_DATA; } fputc(byteOutput,binout); /* output a data byte */ } chromEnd = chromStart + (bincount * dataSpan); verbose(2, "row: %llu %llu-%llu total: %llu valid: %llu, miss: %llu, file offset: %llu\n", rowCount, chromStart+add_offset, chromEnd+add_offset, bincount, validCount, bincount-validCount, fileOffsetBegin); fprintf( wigout, "%s\t%llu\t%llu\t%s.%llu\t%llu\t%llu\t%llu\t%s\t%g\t%g\t%llu\t%g\t%g\n", chromName, chromStart+add_offset, chromEnd+add_offset, featureName, rowCount, dataSpan, bincount, (unsigned long long)fileOffsetBegin, basename(binfile), lowerLimit, dataRange, validCount, sumData, sumSquares); ++rowCount; } bincount = 0; /* to count up to binsize */ for (i=0; i < binsize; ++i) /* reset valid indicator */ validData[i] = FALSE; chromStart = Offset + dataSpan; verbose(2, "next chromStart = %llu = %llu + %llu\n", chromStart, Offset, dataSpan); fileOffsetBegin = fileOffset; } /* static void output_row() */ void badFormat(struct lineFile *lf) /* Complain about format. */ { errAbort("Badly formatted line %d of %s", lf->lineIx, lf->fileName); } void wigAsciiToBinary( int argc, char *argv[] ) { int i = 0; /* general purpose int counter */ struct lineFile *lf; /* for line file utilities */ char * fileName; /* the basename of the input file */ char *line = (char *) NULL; /* to receive data input line */ char *words[4]; /* to split data input line */ int wordCount = 0; /* result of split */ int validLines = 0; /* counting only lines with data */ unsigned long long previousOffset = 0; /* for data missing detection */ double dataValue = 0.0; /* from data input */ char *wigfile = (char *) NULL; /* file name of wiggle database file */ boolean firstInChrom; /* Is this the first line in chromosome? */ /* for each input data file */ for (i = 1; i < argc; ++i) { verbose(2, "translating file: %s\n", argv[i]); fileName = basename(argv[i]); if (name) /* Is the name of this feature specified ? */ { safef( featureName, sizeof(featureName) - 1, "%s", name); } if (chrom) /* Is the chrom name specified ? */ { chromName = cloneString(chrom); if (! name) /* that names the feature too if not already */ safef( featureName, sizeof(featureName) - 1, "%s", chrom); } /* Name mangling to determine output file name */ if (wibFile) /* when specified, simply use it */ { binfile = addSuffix(wibFile, ".wib"); wigfile = addSuffix(wibFile, ".wig"); } else { /* not specified, construct from input names */ if (startsWith("chr",fileName)) { char *tmpString; tmpString = cloneString(fileName); chopSuffix(tmpString); binfile = addSuffix(tmpString, ".wib"); wigfile = addSuffix(tmpString, ".wig"); if (! chrom) /* if not already taken care of */ chromName = cloneString(tmpString); if (! name && ! chrom) /* if not already done */ safef(featureName, sizeof(featureName) - 1, "%s", tmpString); freeMem(tmpString); } else { errAbort("Can not determine output file name, no -wibFile specified\n"); } } verbose(2, "output files: %s, %s\n", binfile, wigfile); validLines = 0; /* to count only lines with data */ rowCount = 0; /* to count rows output */ bincount = 0; /* to count up to binsize */ fileOffset = 0; /* current location within binary data file */ fileOffsetBegin = 0;/* location in binary data file where this bin starts*/ firstInChrom = TRUE; freeMem(data_values); freeMem(validData); data_values = needMem( (size_t) (binsize * sizeof(double))); validData = needMem( (size_t) (binsize * sizeof(unsigned char))); overallLowerLimit = 1.0e+300; /* for the complete set of data */ overallUpperLimit = -1.0e+300; /* for the complete set of data */ binout = mustOpen(binfile,"w"); /* binary data file */ wigout = mustOpen(wigfile,"w"); /* table row definition file */ lf = lineFileOpen(argv[i], TRUE); /* input file */ while (lineFileNextReal(lf, &line)) { boolean readingFrameSlipped; char *valEnd; char *val; ++validLines; wordCount = chopByWhite(line, words, ArraySize(words)); if (wordCount == 1) { Offset += 1; val = words[0]; } else if (wordCount == 2) { Offset = atoll(words[0]) - 1; val = words[1]; } else if (wordCount == 3) { char *newChrom = words[0]; boolean sameChrom = (chromName == NULL || sameString(chromName, newChrom)); Offset = atoll(words[1]) - 1; val = words[2]; if (!sameChrom) { output_row(); firstInChrom = TRUE; freez(&chromName); } if (chromName == NULL) chromName = cloneString(newChrom); } else { val = NULL; badFormat(lf); } if (Offset < 0) errAbort("Illegal offset %llu at line %d of %s", Offset+1, lf->lineIx, lf->fileName); dataValue = strtod(val, &valEnd); if(trimVals) { dataValue = max(minVal, dataValue); dataValue = min(maxVal, dataValue); } if ((*val == '\0') || (*valEnd != '\0')) errAbort("Not a valid float at line %d: %s\n", lf->lineIx, val); /* see if this is the first time through, establish chromStart */ if (firstInChrom) { chromStart = Offset; verbose(2, "first offset: %llu\n", chromStart); } else if (!firstInChrom && (Offset <= previousOffset)) errAbort("ERROR: chrom positions not in order. line %d of %s\n" "previous: %llu >= %llu <-current", lf->lineIx, lf->fileName, previousOffset+1, Offset+1); /* if we are working on a zoom level and the data is not exactly * spaced according to the span, then we need to put each value * in its own row in order to keep positioning correct for these * data values. The number of skipped bases has to be an even * multiple of dataSpan */ readingFrameSlipped = FALSE; if (!firstInChrom && (dataSpan > 1)) { int skippedBases; int spansSkipped; skippedBases = Offset - previousOffset; spansSkipped = skippedBases / dataSpan; if ((spansSkipped * dataSpan) != skippedBases) readingFrameSlipped = TRUE; } if (readingFrameSlipped) { verbose(2, "data not spanning %llu bases, prev: %llu, this: %llu, at line: %d\n", dataSpan, previousOffset, Offset, lf->lineIx); output_row(); chromStart = Offset; /* a full reset here */ } /* Check to see if data is being skipped */ else if ( (!firstInChrom) && (Offset > (previousOffset + dataSpan)) ) { unsigned long long off; unsigned long long fillSize; /* number of bytes */ verbose(2, "missing data offsets: %llu - %llu\n", previousOffset+1,Offset-1); /* If we are just going to fill the rest of this bin with * no data, then may as well stop here. No need to fill * it with nothing. */ fillSize = (Offset - (previousOffset + dataSpan)) / dataSpan; verbose(2, "filling NO_DATA for %llu bytes at bincount: %llu\n", fillSize, bincount); if (fillSize + bincount >= binsize) { verbose(2, "completing a bin due to NO_DATA for %llu bytes, only %llu - %llu = %llu to go\n", fillSize, binsize, bincount, binsize - bincount); verbose(2, "Offset: %llu, previousOffset: %llu\n", Offset, previousOffset); output_row(); chromStart = Offset; /* a full reset here */ } else { fillSize = 0; /* fill missing data with NO_DATA indication */ for (off = previousOffset + dataSpan; off < Offset; off += dataSpan) { ++fillSize; ++fileOffset; ++bincount; /* count scores in this bin */ if (bincount >= binsize) break; } verbose(2, "filled NO_DATA for %llu bytes at bincount: %llu\n", fillSize, bincount); /* If that finished off this bin, output it * This most likely should not happen here. The * check above: if (fillSize + bincount >= binsize) * should have caught this case already. */ if (bincount >= binsize) { output_row(); chromStart = Offset; /* a full reset here */ } } } /* With perhaps the missing data taken care of, back to the * real data. */ data_values[bincount] = dataValue; validData[bincount] = TRUE; ++fileOffset; ++bincount; /* count scores in this bin */ /* Is it time to output a row definition ? */ if (bincount >= binsize) { output_row(); } previousOffset = Offset; firstInChrom = FALSE; } /* reading file input loop end */ /* Done with input file, any data points left in this bin ? */ if (bincount) { output_row(); } verbose(2, "fini: %s, read %d lines, table rows: %llu, data bytes: %lld\n", argv[i], lf->lineIx, rowCount, fileOffset); verbose(1, "data limits: [%g:%g], range: %g\n", overallLowerLimit, overallUpperLimit, overallUpperLimit - overallLowerLimit); lineFileClose(&lf); fclose(binout); fclose(wigout); freeMem(binfile); freeMem(wigfile); freeMem(chromName); binfile = (char *) NULL; wigfile = (char *) NULL; chromName = (char *) NULL; } return; } int main( int argc, char *argv[] ) { optionInit(&argc, argv, optionSpecs); obsolete = optionExists("obsolete"); if (! obsolete) { verbose(1,"ERROR: This loader is obsolete. Please use: 'wigEncode'\n"); verbose(1,"\t(use -obsolete flag to run this encoder despite it being obsolete)\n"); errAbort("ERROR: wigAsciiToBinary is obsolete, use 'wigEncode' instead"); } if (argc < 2) usage(); add_offset = optionLongLong("offset", 0); binsize = optionLongLong("binsize", 1024); dataSpan = optionLongLong("dataSpan", 1); chrom = optionVal("chrom", NULL); wibFile = optionVal("wibFile", NULL); name = optionVal("name", NULL); minVal = optionFloat("minVal", -1 * INFINITY); maxVal = optionFloat("maxVal", INFINITY); trimVals = optionExists("minVal") || optionExists("maxVal"); verbose(2, "options: -verbose, offset= %llu, binsize= %llu, dataSpan= %llu\n", add_offset, binsize, dataSpan); if (chrom) { verbose(2, "-chrom=%s\n", chrom); } if (wibFile) { verbose(2, "-wibFile=%s\n", wibFile); } if (name) { verbose(2, "-name=%s\n", name); } wigAsciiToBinary(argc, argv); exit(0); }