/* bedGraphToBigWig - Convert a bedGraph program to bigWig.. */ /* Copyright (C) 2013 The Regents of the University of California * See kent/LICENSE or http://genome.ucsc.edu/license/ for licensing information. */ #include "common.h" #include "obscure.h" #include "memalloc.h" #include "linefile.h" #include "localmem.h" #include "hash.h" #include "options.h" #include "sqlNum.h" #include "dystring.h" #include "cirTree.h" #include "sig.h" #include "zlibFace.h" #include "bPlusTree.h" #include "bbiFile.h" #include "bwgInternal.h" #include "bigWig.h" #include "portable.h" char *version = "2.10"; // when changing, change in bedToBigBed, bedGraphToBigWig, and wigToBigWig /* Version history from 2.8 on at least - * 2.10 - allow chromsomes to be in non-lexicographic order * 2.9 - ability to specify chromAlias bigBed as chromSizes file * 2.8 sync up version numbers with bedToBigBed */ static int blockSize = 256; static int itemsPerSlot = 1024; static boolean doCompress = FALSE; static int maxGigs = 100; // Maximum number of gigs to allocate in one block. // Undocumented on purpose. static boolean sizesIsBb = FALSE; void usage() /* Explain usage and exit. */ { errAbort( "bedGraphToBigWig v %s - Convert a bedGraph file to bigWig format (bbi version: %d).\n" "usage:\n" " bedGraphToBigWig in.bedGraph chrom.sizes out.bw\n" "where in.bedGraph is a four column file in the format:\n" " \n" "and chrom.sizes is a two-column file/URL: \n" "and out.bw is the output indexed big wig file.\n" "If the assembly is hosted by UCSC, chrom.sizes can be a URL like\n" " http://hgdownload.soe.ucsc.edu/goldenPath//bigZips/.chrom.sizes\n" "or you may use the script fetchChromSizes to download the chrom.sizes file.\n" "If not hosted by UCSC, a chrom.sizes file can be generated by running\n" "twoBitInfo on the assembly .2bit file.\n" "The input bedGraph file must be sorted, use the unix sort command:\n" " LC_ALL=C sort -k1,1 -k2,2n unsorted.bedGraph > sorted.bedGraph\n" "The LC_ALL=C variable activates case-sensitive sorting.\n" "options:\n" " -blockSize=N - Number of items to bundle in r-tree. Default %d\n" " -itemsPerSlot=N - Number of data points bundled at lowest level. Default %d\n" " -sizesIsBb -- If set, the chrom.sizes file is assumed to be a bigBed file.\n" " -unc - If set, do not use compression." , version, bbiCurrentVersion, blockSize, itemsPerSlot ); } static struct optionSpec options[] = { {"blockSize", OPTION_INT}, {"itemsPerSlot", OPTION_INT}, {"sizesIsBb", OPTION_BOOLEAN}, {"unc", OPTION_BOOLEAN}, {"maxGigs", OPTION_INT}, {NULL, 0}, }; struct sectionItem /* An item in a section of a bedGraph. */ { bits32 start, end; /* Position in chromosome, half open. */ float val; /* Single precision value. */ }; void writeSections(struct bbiChromUsage *usageList, struct lineFile *lf, int itemsPerSlot, struct bbiBoundsArray *bounds, int sectionCount, FILE *f, int resTryCount, int resScales[], int resSizes[], boolean doCompress, bits32 *retMaxSectionSize) /* Read through lf, chunking it into sections that get written to f. Save info * about sections in bounds. */ { int maxSectionSize = 0; struct bbiChromUsage *usage = usageList; int itemIx = 0, sectionIx = 0; bits32 reserved32 = 0; UBYTE reserved8 = 0; struct sectionItem items[itemsPerSlot]; struct sectionItem *lastB = NULL; bits32 resEnds[resTryCount]; int resTry; for (resTry = 0; resTry < resTryCount; ++resTry) resEnds[resTry] = 0; struct dyString *stream = dyStringNew(0); /* remove initial browser and track lines */ lineFileRemoveInitialCustomTrackLines(lf); for (;;) { /* Get next line of input if any. */ char *row[5]; int rowSize = lineFileChopNext(lf, row, ArraySize(row)); /* Figure out whether need to output section. */ boolean sameChrom = FALSE; if (rowSize > 0) sameChrom = sameString(row[0], usage->name); if (itemIx >= itemsPerSlot || rowSize == 0 || !sameChrom) { /* Figure out section position. */ bits32 chromId = usage->id; bits32 sectionStart = items[0].start; bits32 sectionEnd = items[itemIx-1].end; /* Save section info for indexing. */ assert(sectionIx < sectionCount); struct bbiBoundsArray *section = &bounds[sectionIx++]; section->offset = ftell(f); section->range.chromIx = chromId; section->range.start = sectionStart; section->range.end = sectionEnd; /* Output section header to stream. */ dyStringClear(stream); UBYTE type = bwgTypeBedGraph; bits16 itemCount = itemIx; dyStringWriteOne(stream, chromId); // chromId dyStringWriteOne(stream, sectionStart); // start dyStringWriteOne(stream, sectionEnd); // end dyStringWriteOne(stream, reserved32); // itemStep dyStringWriteOne(stream, reserved32); // itemSpan dyStringWriteOne(stream, type); // type dyStringWriteOne(stream, reserved8); // reserved dyStringWriteOne(stream, itemCount); // itemCount /* Output each item in section to stream. */ int i; for (i=0; istart); dyStringWriteOne(stream, item->end); dyStringWriteOne(stream, item->val); } /* Save stream to file, compressing if need be. */ if (stream->stringSize > maxSectionSize) maxSectionSize = stream->stringSize; if (doCompress) { size_t maxCompSize = zCompBufSize(stream->stringSize); char compBuf[maxCompSize]; int compSize = zCompress(stream->string, stream->stringSize, compBuf, maxCompSize); mustWrite(f, compBuf, compSize); } else mustWrite(f, stream->string, stream->stringSize); /* If at end of input we are done. */ if (rowSize == 0) break; /* Set up for next section. */ itemIx = 0; if (!sameChrom) { usage = usage->next; assert(usage != NULL); if (!sameString(row[0], usage->name)) errAbort("Error - read %s, expecting %s on line %d in file %s\n", row[0], usage->name, lf->lineIx, lf->fileName); assert(sameString(row[0], usage->name)); lastB = NULL; for (resTry = 0; resTry < resTryCount; ++resTry) resEnds[resTry] = 0; } } /* Parse out input. */ lineFileExpectWords(lf, 4, rowSize); bits32 start = lineFileNeedNum(lf, row, 1); bits32 end = lineFileNeedNum(lf, row, 2); float val = lineFileNeedDouble(lf, row, 3); /* Verify that inputs meets our assumption - that it is a sorted bedGraph file. */ if (start > end) errAbort("Error - start (%u) after end (%u) line %d of %s", start, end, lf->lineIx, lf->fileName); if (lastB != NULL) { if (lastB->start > start) errAbort("Error - bedGraph not sorted on start line %d of %s", lf->lineIx, lf->fileName); if (lastB->end > start) errAbort("Error - overlapping regions in bedGraph line %d of %s", lf->lineIx, lf->fileName); } /* Do zoom counting. */ for (resTry = 0; resTry < resTryCount; ++resTry) { bits32 resEnd = resEnds[resTry]; if (start >= resEnd) { resSizes[resTry] += 1; resEnds[resTry] = resEnd = start + resScales[resTry]; } while (end > resEnd) { resSizes[resTry] += 1; resEnds[resTry] = resEnd = resEnd + resScales[resTry]; } } /* Save values in output array. */ struct sectionItem *b = &items[itemIx]; b->start = start; b->end = end; b->val = val; lastB = b; itemIx += 1; } assert(sectionIx == sectionCount); *retMaxSectionSize = maxSectionSize; } static struct bbiSummary *bedGraphWriteReducedOnceReturnReducedTwice(struct bbiChromUsage *usageList, int fieldCount, struct lineFile *lf, bits32 initialReduction, bits32 initialReductionCount, int zoomIncrement, int blockSize, int itemsPerSlot, boolean doCompress, struct lm *lm, FILE *f, bits64 *retDataStart, bits64 *retIndexStart, struct bbiSummaryElement *totalSum) /* Write out data reduced by factor of initialReduction. Also calculate and keep in memory * next reduction level. This is more work than some ways, but it keeps us from having to * keep the first reduction entirely in memory. */ { struct bbiSummary *twiceReducedList = NULL; bits32 doubleReductionSize = initialReduction * zoomIncrement; struct bbiChromUsage *usage = usageList; struct bbiSummary oneSummary, *sum = NULL; struct bbiBoundsArray *boundsArray, *boundsPt, *boundsEnd; boundsPt = AllocArray(boundsArray, initialReductionCount); boundsEnd = boundsPt + initialReductionCount; *retDataStart = ftell(f); writeOne(f, initialReductionCount); boolean firstRow = TRUE; struct bbiSumOutStream *stream = bbiSumOutStreamOpen(itemsPerSlot, f, doCompress); /* remove initial browser and track lines */ lineFileRemoveInitialCustomTrackLines(lf); for (;;) { /* Get next line of input if any. */ char *row[5]; int rowSize = lineFileChopNext(lf, row, ArraySize(row)); /* Output last section and break if at end of file. */ if (rowSize == 0 && sum != NULL) { bbiOutputOneSummaryFurtherReduce(sum, &twiceReducedList, doubleReductionSize, &boundsPt, boundsEnd, lm, stream); break; } /* Parse out row. */ char *chrom = row[0]; bits32 start = sqlUnsigned(row[1]); bits32 end = sqlUnsigned(row[2]); float val = sqlFloat(row[3]); /* Update total summary stuff. */ bits32 size = end-start; if (firstRow) { totalSum->validCount = size; totalSum->minVal = totalSum->maxVal = val; totalSum->sumData = val*size; totalSum->sumSquares = val*val*size; firstRow = FALSE; } else { totalSum->validCount += size; if (val < totalSum->minVal) totalSum->minVal = val; if (val > totalSum->maxVal) totalSum->maxVal = val; totalSum->sumData += val*size; totalSum->sumSquares += val*val*size; } /* If new chromosome output existing block. */ if (differentString(chrom, usage->name)) { usage = usage->next; bbiOutputOneSummaryFurtherReduce(sum, &twiceReducedList, doubleReductionSize, &boundsPt, boundsEnd, lm, stream); sum = NULL; } /* If start past existing block then output it. */ else if (sum != NULL && sum->end <= start) { bbiOutputOneSummaryFurtherReduce(sum, &twiceReducedList, doubleReductionSize, &boundsPt, boundsEnd, lm, stream); sum = NULL; } /* If don't have a summary we're working on now, make one. */ if (sum == NULL) { oneSummary.chromId = usage->id; oneSummary.start = start; oneSummary.end = start + initialReduction; if (oneSummary.end > usage->size) oneSummary.end = usage->size; oneSummary.minVal = oneSummary.maxVal = val; oneSummary.sumData = oneSummary.sumSquares = 0.0; oneSummary.validCount = 0; sum = &oneSummary; } /* Deal with case where might have to split an item between multiple summaries. This * loop handles all but the final affected summary in that case. */ while (end > sum->end) { verbose(3, "Splitting start %d, end %d, sum->start %d, sum->end %d\n", start, end, sum->start, sum->end); /* Fold in bits that overlap with existing summary and output. */ bits32 overlap = rangeIntersection(start, end, sum->start, sum->end); sum->validCount += overlap; if (sum->minVal > val) sum->minVal = val; if (sum->maxVal < val) sum->maxVal = val; sum->sumData += val * overlap; sum->sumSquares += val*val * overlap; bbiOutputOneSummaryFurtherReduce(sum, &twiceReducedList, doubleReductionSize, &boundsPt, boundsEnd, lm, stream); size -= overlap; /* Move summary to next part. */ sum->start = start = sum->end; sum->end = start + initialReduction; if (sum->end > usage->size) sum->end = usage->size; sum->minVal = sum->maxVal = val; sum->sumData = sum->sumSquares = 0.0; sum->validCount = 0; } /* Add to summary. */ sum->validCount += size; if (sum->minVal > val) sum->minVal = val; if (sum->maxVal < val) sum->maxVal = val; sum->sumData += val * size; sum->sumSquares += val*val * size; } bbiSumOutStreamClose(&stream); /* Write out 1st zoom index. */ int indexOffset = *retIndexStart = ftell(f); assert(boundsPt == boundsEnd); cirTreeFileBulkIndexToOpenFile(boundsArray, sizeof(boundsArray[0]), initialReductionCount, blockSize, itemsPerSlot, NULL, bbiBoundsArrayFetchKey, bbiBoundsArrayFetchOffset, indexOffset, f); freez(&boundsArray); slReverse(&twiceReducedList); return twiceReducedList; } void bedGraphToBigWig(char *inName, char *chromSizes, char *outName) /* bedGraphToBigWig - Convert a bedGraph program to bigWig.. */ { mustBeReadableAndRegularFile(inName); verboseTimeInit(); struct lineFile *lf = lineFileOpen(inName, TRUE); int minDiff = 0, i; double aveSize = 0; bits64 bedCount = 0; bits32 uncompressBufSize = 0; struct bbiChromUsage *usageList; if (sizesIsBb) usageList = bbiChromUsageFromBedFileAlias(lf, chromSizes, NULL, &minDiff, &aveSize, &bedCount, FALSE); else { struct hash *chromSizesHash = bbiChromSizesFromFile(chromSizes); verbose(2, "%d chroms in %s\n", chromSizesHash->elCount, chromSizes); usageList = bbiChromUsageFromBedFile(lf, chromSizesHash, NULL, &minDiff, &aveSize, &bedCount, FALSE); } verboseTime(2, "pass1"); verbose(2, "%d chroms in %s, minDiff=%d, aveSize=%g, bedCount=%lld\n", slCount(usageList), inName, minDiff, aveSize, bedCount); /* Write out dummy header, zoom offsets. */ FILE *f = mustOpen(outName, "wb"); bbiWriteDummyHeader(f); bbiWriteDummyZooms(f); /* Write out dummy total summary. */ struct bbiSummaryElement totalSum; ZeroVar(&totalSum); bits64 totalSummaryOffset = ftell(f); bbiSummaryElementWrite(f, &totalSum); /* Write out chromosome/size database. */ bits64 chromTreeOffset = ftell(f); bbiWriteChromInfo(usageList, blockSize, f); /* Set up to keep track of possible initial reduction levels. */ int resScales[bbiMaxZoomLevels], resSizes[bbiMaxZoomLevels]; int resTryCount = bbiCalcResScalesAndSizes(aveSize, resScales, resSizes); /* Write out primary full resolution data in sections, collect stats to use for reductions. */ bits64 dataOffset = ftell(f); bits64 sectionCount = bbiCountSectionsNeeded(usageList, itemsPerSlot); writeOne(f, sectionCount); struct bbiBoundsArray *boundsArray; AllocArray(boundsArray, sectionCount); lineFileRewind(lf); bits32 maxSectionSize = 0; writeSections(usageList, lf, itemsPerSlot, boundsArray, sectionCount, f, resTryCount, resScales, resSizes, doCompress, &maxSectionSize); verboseTime(2, "pass2"); /* Write out primary data index. */ bits64 indexOffset = ftell(f); cirTreeFileBulkIndexToOpenFile(boundsArray, sizeof(boundsArray[0]), sectionCount, blockSize, 1, NULL, bbiBoundsArrayFetchKey, bbiBoundsArrayFetchOffset, indexOffset, f); verboseTime(2, "index write"); /* Declare arrays and vars that track the zoom levels we actually output. */ bits32 zoomAmounts[bbiMaxZoomLevels]; bits64 zoomDataOffsets[bbiMaxZoomLevels]; bits64 zoomIndexOffsets[bbiMaxZoomLevels]; /* Call monster zoom maker library function that bedToBigBed also uses. */ int zoomLevels = bbiWriteZoomLevels(lf, f, blockSize, itemsPerSlot, bedGraphWriteReducedOnceReturnReducedTwice, 4, doCompress, indexOffset - dataOffset, usageList, resTryCount, resScales, resSizes, zoomAmounts, zoomDataOffsets, zoomIndexOffsets, &totalSum); /* Figure out buffer size needed for uncompression if need be. */ if (doCompress) { int maxZoomUncompSize = itemsPerSlot * sizeof(struct bbiSummaryOnDisk); uncompressBufSize = max(maxSectionSize, maxZoomUncompSize); } /* Go back and rewrite header. */ rewind(f); bits32 sig = bigWigSig; bits16 version = bbiCurrentVersion; bits16 summaryCount = zoomLevels; bits16 reserved16 = 0; bits32 reserved32 = 0; bits64 reserved64 = 0; /* Write fixed header */ writeOne(f, sig); writeOne(f, version); writeOne(f, summaryCount); writeOne(f, chromTreeOffset); writeOne(f, dataOffset); writeOne(f, indexOffset); writeOne(f, reserved16); // fieldCount writeOne(f, reserved16); // definedFieldCount writeOne(f, reserved64); // autoSqlOffset writeOne(f, totalSummaryOffset); writeOne(f, uncompressBufSize); writeOne(f, reserved64); // nameIndexOffset assert(ftell(f) == 64); /* Write summary headers with data. */ verbose(2, "Writing %d levels of zoom\n", zoomLevels); for (i=0; i 1) printVmPeak(); return 0; }