//-------------------------------------------------------------------------------- // File: TRFScorer.cs // Author: Timothy O'Connor // © Copyright University of Queensland, 2012-2014. All rights reserved. // License: //-------------------------------------------------------------------------------- namespace Genomics { using System; using System.Collections.Generic; using System.Linq; using System.IO; using Shared; using Tools; using Data; /// /// Utility methods for scoring or transforming ChIP-seq peak or other data at collections of genomic locations. /// public class TRFScorer { /// /// Initializes a new instance of the class. /// public TRFScorer() { } /// /// Wrapper of indexed scoring data /// public class IndexedScores { /// /// Gets or sets scoring index. /// /// The index. public Dictionary Index { get; set; } /// /// Gets or sets the scores. /// /// The scores. public List Scores { get; set; } } public class ScoreEntry { public string Tss { get; set; } public List PromoterScores { get; set; } public List Locicores { get; set; } public int LocusCount { get; set; } public double Exp { get; set; } } /// /// Score type. /// public enum ScoreType { /// /// Scores at Locus regions /// Locus, /// /// Scores in the promoter region near the TSS /// Tss } /// /// Missing score action. /// public enum MissingScoreAction { /// /// Use raw scores. /// None, /// /// Fill missing scores with expected value /// Expected, } /// /// Scoring function. /// public enum ScoringFunction { /// /// Raw summation of all associated peak heights. /// Sum, /// /// The shuffled sum. /// ShuffledSum, /// /// Weighted summation of all associated peak heights. /// CorrelationWeightedSum, /// /// The shuffled correlation weighted sum. /// ShuffledCorrelationWeightedSum, /// /// The confidence weighted sum. /// ConfidenceWeightedSum, /// /// The shuffled confidence weighted sum. /// ShuffledConfidenceWeightedSum, /// /// The distance weighted function. /// DistanceWeightedSum, } /// /// Evaluates a genomic range. /// /// The range. /// Items. /// Start. /// End. /// Initial value. /// Pre start tester. /// Pre end tester. /// Evaluator. /// The 1st type parameter. public static T EvaluateRange( List items, int start, int end, T initialValue, Func PreStartTester, Func PreEndTester, Func Evaluator) { int i = FindRangeStart(items, start, PreStartTester); T val = initialValue; if (i > 0 && !PreStartTester(items[i-1], start)) { Console.WriteLine("Pos {0}; start {1}; end {2}", items[i-1], start, end); } int length = end - start + 1; while (i < items.Count && PreEndTester(items[i], end)) { //Console.WriteLine("Pos {0}; start {1}; end {2}", items[i].Mid, start, end); val = Evaluator(val, length, items[i]); i++; } return val; } /// /// Gets the closest location to a given position outside the given range /// /// The closes exclude range. /// Items. /// Position. /// Range. public static Genomics.Location GetClosestExcludeRange(List items, int position, int range) { List filteredItems = items .Where(x => Math.Abs(x.Start - position) > range && Math.Abs(x.End - position) > range) .ToList(); return GetClosestMid(filteredItems, position); } /// /// Intersects the two location sets returing the left items that overlap anything from the right /// /// The left. /// F1. /// F2. /// Chr. public static List IntersectLeft(BedFile f1, IAnnotation f2, string chr, int buffer) { int maxFeatureSize = Math.Max(f1.MaxLocationSize.Max(x => x.Value), f2.MaxFeatureSize); int numFlankingBins = maxFeatureSize / (2 * BedFile.IndexSize) + 1; if (!f1.ChromosomeIndexedLocations.ContainsKey(chr) || !f2.IndexedTranscriptLocations.ContainsKey(chr)) { //Console.WriteLine("Invalid chromosome: " + chr); return new List(); } return f1.ChromosomeIndexedLocations [chr].Select(x => { return x.Value .Where(loc => Stats.Sequence(x.Key - numFlankingBins, x.Key + numFlankingBins, 1) .Any(bin => f2.IndexedTranscriptLocations[chr].ContainsKey(bin) && f2.IndexedTranscriptLocations[chr][bin] .Any(l2 => loc.Overlaps(buffer == 0 ? l2 : new Genomics.Location{ Start = l2.Start - buffer, End = l2.End + buffer})))) .ToList(); }) .SelectMany(x => x) .ToList(); } /// /// Intersects the two location sets returing the left items that overlap anything from the right /// /// The left. /// F1. /// F2. /// Chr. public static List IntersectLeft(IAnnotation f1, BedFile f2, string chr, int buffer) { int maxFeatureSize = Math.Max(f2.MaxLocationSize.Max(x => x.Value), f1.MaxFeatureSize); int numFlankingBins = maxFeatureSize / (2 * BedFile.IndexSize) + 1; if (!f1.IndexedTranscriptLocations.ContainsKey(chr) || !f2.ChromosomeIndexedLocations.ContainsKey(chr)) { //Console.WriteLine("Invalid chromosome: " + chr); return new List(); } return f1.IndexedTranscriptLocations [chr].Select(x => { return x.Value .Where(loc => Stats.Sequence(x.Key - numFlankingBins, x.Key + numFlankingBins, 1) .Any(bin => f2.ChromosomeIndexedLocations[chr].ContainsKey(bin) && f2.ChromosomeIndexedLocations[chr][bin] .Any(l2 => loc.Overlaps(buffer == 0 ? l2 : new Genomics.Location{ Start = l2.Start - buffer, End = l2.End + buffer})))) .ToList(); }) .SelectMany(x => x) .ToList(); } /// /// Intersects the two location sets returing the left items that overlap anything from the right /// /// The left. /// F1. /// F2. /// Chr. public static List IntersectLeft(BedFile f1, BedFile f2, string chr, int buffer) { int maxFeatureSize = Math.Max(f1.MaxLocationSize.Max(x => x.Value), f2.MaxLocationSize.Max(x => x.Value)); int numFlankingBins = maxFeatureSize / (2 * BedFile.IndexSize) + 1; if (!f1.ChromosomeIndexedLocations.ContainsKey(chr) || !f2.ChromosomeIndexedLocations.ContainsKey(chr)) { //Console.WriteLine("Invalid chromosome: " + chr); return new List(); } return f1.ChromosomeIndexedLocations [chr].Select(x => { return x.Value .Where(loc => Stats.Sequence(x.Key - numFlankingBins, x.Key + numFlankingBins, 1) .Any(bin => f2.ChromosomeIndexedLocations[chr].ContainsKey(bin) && f2.ChromosomeIndexedLocations[chr][bin] .Any(l2 => loc.Overlaps(buffer == 0 ? l2 : new Genomics.Location{ Start = l2.Start - buffer, End = l2.End + buffer})))) .ToList(); }) .SelectMany(x => x) .ToList(); } /// /// Gets locations that overlap a given item /// /// The overlaps. /// Item. /// Indexed locations. /// Bin size. /// Max feature size. public static List GetOverlaps( Location item, Dictionary>> indexedLocations, int binSize, int maxFeatureSize, Func overlap = null) { maxFeatureSize = Math.Max(maxFeatureSize, item.End - item.Start); int numFlankingBins = maxFeatureSize / (2 * BedFile.IndexSize) + 1; int itemStartBin = item.Start / binSize; int itemEndBin = item.End / binSize; if (!indexedLocations.ContainsKey(item.Chromosome)) { return null; } var overlappingItems = Stats.Sequence(itemStartBin - numFlankingBins, itemEndBin + numFlankingBins, 1) .Where(bin => indexedLocations[item.Chromosome].ContainsKey(bin)) .Select(bin => indexedLocations[item.Chromosome][bin] .Where(x => overlap == null ? item.Overlaps(x) : overlap(item, x))) .Where(x => x.Any()) .SelectMany(x => x) .ToList(); return overlappingItems; } /// /// Gets the closest. /// /// The closest. /// Items. /// Selector. /// Position. public static Genomics.Location GetClosest(List items, Func selector, int position) { int index = FindRangeStart(items, position, (item, start) => selector(item) < start); if (index == 0) { return items [0]; } if (index >= items.Count - 1) { return items [items.Count - 1]; } int leftDist = Math.Abs(selector(items [index - 1]) - position); int centerDist = Math.Abs(selector(items [index]) - position); int rightDist = Math.Abs(selector(items [index + 1]) - position); if (leftDist < centerDist && leftDist < rightDist) { return items [index - 1]; } if (centerDist < rightDist) { return items[index]; } return items[index + 1]; } /// /// Gets the closest upstream or downstream location. /// Items must be sorted by start or end if searching downstream or upstream, respectively. /// /// The closest upstream and downstream. /// Position. /// Items. /// Strand. public static Location GetClosestDirectional( int position, List items, bool upstream) { int index = FindRangeStart(items, position, (item, start) => item.Start < start); if (index == 0) { return upstream ? null : items[0]; } if (index >= items.Count - 1) { return upstream ? items [items.Count - 1] : null; } return upstream ? items[index - 1] : items[index]; } /// /// Get closes location on a chromosome to a given position. /// The items must be sorted by position. /// /// The closest. /// Items. /// Position. public static Genomics.Location GetClosestMid(List items, int position) { return GetClosest(items, x => x.Mid, position); } /// /// Get closes location to a given position /// /// The closest. /// Items. /// Position. public static Genomics.Location GetClosestMinDistance(List items, int position, int minDistance) { int index = FindRangeStart(items, position, (item, start) => item.Mid < start); if (index == 0) { return items [0]; } if (index >= items.Count - 1) { return items [items.Count - 1]; } int leftDist = Math.Abs(items [index - 1].Mid - position); int centerDist = Math.Abs(items [index].Mid - position); int rightDist = Math.Abs(items [index + 1].Mid - position); if (leftDist < centerDist && leftDist < rightDist) { return items [index - 1]; } if (centerDist < rightDist) { return items[index]; } return items[index + 1]; } /// /// Creates the tss scores. /// /// The tss scores. /// Tss data. /// Peak file. /// D0. /// Max range. public static Dictionary CreateTssScores( Dictionary tssData, BedFile peakFile, double D0, List maxRange) { if (maxRange.Count > 1 && D0 != 0) { throw new Exception("Scoring with exponential decay only allows single promoter range value"); } int upstreamDistance = maxRange[0]; int dowstreamDistance = maxRange[0]; if (maxRange.Count == 2) { dowstreamDistance = maxRange[1]; } return CreateScores( tssData, peakFile, (items, location) => TRFScorer.EvaluateRange( items, location.Start - (location.Strand == "+" ? upstreamDistance : dowstreamDistance), location.End + (location.Strand == "+" ? dowstreamDistance : upstreamDistance), 0, (item, start) => item.Mid < start, (item, end) => item.Mid <= end, (aggregateScore, length, peak) => aggregateScore + (D0 == 0 ? peak.Score : Math.Exp(-(double)Math.Abs(peak.Mid - location.Start) / D0) * peak.Score)) ); } /// /// Creates the tss scores. /// /// The tss scores. /// Tss data. /// Peak file. /// D0. /// Max range. public static Dictionary CreateTssChangeScores( Dictionary tssData, BedFile peakFile1, BedFile peakFile2, double D0, List maxRange) { if (maxRange.Count > 1 && D0 != 0) { throw new Exception("Scoring with exponential decay only allows single promoter range value"); } var scoreset1 = CreateTssScores(tssData, peakFile1, D0, maxRange); var scoreset2 = CreateTssScores(tssData, peakFile2, D0, maxRange); return scoreset1.Keys.Select(x => new { Key = x, Value = scoreset1[x] / scoreset2[x] }) .ToDictionary(x => x.Key, x => x.Value); } /// /// Creates the Locus scores for all Loci /// /// The Locus scores. /// Locus file. /// Peak file. /// If set to true normalize. public static Dictionary CreateLocicores( BedFile LocusFile, BedFile peakFile, bool normalize) { var temp = CreateScores( LocusFile.Locations, peakFile, (items, location) => TRFScorer.EvaluateRange( items, location.Start, location.End - 1, 0, (item, start) => item.End < start, (item, end) => item.Start <= end, (aggregateScore, length, peak) => aggregateScore + peak.Score / (normalize ? length : 1))); return temp; } /*public static Dictionary CreateLocicores( BedFile LocusFile, BedFile peakFile) { CreateScores( LocusFile.Locations, peakFile, (items, location) => GetOverlaps(location, peakFile.ChromosomeIndexedLocations, BedFile.IndexSize, peakFile.MaxLocationSize).Sum(x => x.Score)); }*/ /// /// Creates the Locus scores for all Loci /// /// The Locus scores. /// Locus file. /// /// /// If set to true normalize. public static Dictionary CreateLocusChangeScores( BedFile LocusFile, BedFile peakFile1, BedFile peakFile2, bool normalize) { var scoreset1 = CreateLocicores(LocusFile, peakFile1, normalize); var scoreset2 = CreateLocicores(LocusFile, peakFile2, normalize); return scoreset1.Keys.Select(x => new { Key = x, Value = scoreset1[x] / scoreset2[x] }) .ToDictionary(x => x.Key, x => x.Value); } /// /// Creates a single Locus score. /// /// The Locus score. /// Location. /// Peak file. public static double CreateLocicore( Genomics.Location location, BedFile peakFile) { if (peakFile.ChromosomeOrderedLocations.ContainsKey(location.Chromosome)) { int n = 0; return TRFScorer.EvaluateRange( peakFile.ChromosomeOrderedLocations[location.Chromosome], location.Start, location.End - 1, 0, (item, start) => item.End < start, (item, end) => item.Start <= end, (aggregateScore, length, peak) => { return (aggregateScore * (n-1) + peak.Score) / n; }); } return 0.0; } /// /// Creates the scores. /// /// The scores. /// Locations. /// Peak file. /// Scorer. public static Dictionary CreateScores( Dictionary locations, BedFile peakFile, Func, Genomics.Location, double> Scorer) { var peaks = peakFile.GetOrderedLocations(x => x.Mid); return (from location in locations where peaks.ContainsKey(location.Value.Chromosome) select new { Name = location.Key, Score = Scorer(peaks [location.Value.Chromosome], location.Value) }).ToDictionary(x => x.Name, x => x.Score); } /// /// Creates index of TRF score files for efficient loading /// /// The index. /// Locations. /// Type. /// Cell line. public static Dictionary CreateIndex(Dictionary locations, ScoreType scoreType, string dataSource) { var data = locations.Keys.Select((x, i) => new { Name = x, Index = i }); using (TextWriter tw = Helpers.CreateStreamWriter(string.Format("../temp/GenomicFeatures/{0}.{1}.index", scoreType, dataSource))) { tw.WriteLine(string.Join("\n", data.Select(x => string.Format("{0}\t{1}\n", x.Name, x.Index)))); } return data.ToDictionary(x => x.Name, x => x.Index); } /// /// Retrieves a score index for efficient loading of DRM or TSS scores /// /// The index. /// Type. /// Cell line. public static Dictionary ReadIndex(ScoreType scoreType, string tissue) { using (TextReader tr = new StreamReader(string.Format("../temp/GenomicFeatures/{0}.{1}.index", scoreType, tissue))) { return tr.ReadToEnd().Split('\n') .Where(line => !string.IsNullOrWhiteSpace(line)) .Select(line => { var fields = line.Split('\t').Select(y => y.Trim()).ToArray(); return new { Name = fields[0], Index = int.Parse(fields[1]) }; }) .ToDictionary(x => x.Name, x => x.Index); } } /// /// Writes the score name value pairs. /// /// File name. /// Scores. /// Used items. /// The 1st type parameter. public static void WriteScoreNameValuePairs(string fileName, Dictionary scores, HashSet usedItems) { using (TextWriter tw = Helpers.CreateStreamWriter(fileName)) { if (usedItems == null) { tw.WriteLine(scores.Count); tw.WriteLine(string.Join("\n", scores.Select(x => string.Format("{0}\t{1}", x.Key, x.Value)))); } else { var data = scores.Where(x => usedItems.Contains(x.Key)); tw.WriteLine(data.Count()); tw.WriteLine(string.Join("\n", data.Select(x => string.Format("{0}\t{1}", x.Key, x.Value)))); } } } /// /// Reads the score name value pairs. /// /// The score name value pairs. /// Index. /// File name. /// Value parser. /// The 1st type parameter. public static TValueType[] ReadScoreNameValuePairs(Dictionary index, string fileName, Func ValueParser) { using (TextReader tr = new StreamReader(fileName)) { tr.ReadLine(); int itemCount = index.Count(); GC.Collect(); TValueType[] array = new TValueType[itemCount]; int tabIndex = 0; int newLineIndex = 0; int lineStartIndex = 0; string lines = tr.ReadToEnd(); int i; do { tabIndex = lines.IndexOf('\t', tabIndex + 1); if (tabIndex > 0) { newLineIndex = lines.IndexOf('\n', tabIndex); // string t = lines.Substring(lineStartIndex, tabIndex - lineStartIndex ); i = index[lines.Substring(lineStartIndex, tabIndex - lineStartIndex )]; array[i] = ValueParser(lines.Substring(tabIndex + 1, newLineIndex - tabIndex - 1)); lineStartIndex = newLineIndex + 1; } } while (tabIndex > 0 && newLineIndex > 0); return array; } } /// /// Reads the score name value pairs. /// /// The score name value pairs. /// File name. /// Value parser. /// The 1st type parameter. public static Dictionary ReadScoreNameValuePairs(string fileName, Func ValueParser) { using (TextReader tr = new StreamReader(fileName)) { tr.ReadLine(); // Dispose of count return tr.ReadToEnd().Split('\n').Where(line => !string.IsNullOrWhiteSpace(line)).Select(line => { string[] fields = line.Split('\t'); return new { Key = fields [0], Value = ValueParser(fields [1]) }; }) .ToDictionary(x => x.Key, x => x.Value); } } public static void WriteSingleIndexedScore( ScoreType scoreType, string peakName, string peakFile, string tissue, string rnaSource, Dictionary locations) { var peaks = new BedFile(peakFile, BedFile.Bed6Plus4Layout); TRFScorer.CreateIndex(locations, scoreType, string.Join(".", new List { peakName, tissue, rnaSource })); string baseFileName = Path.GetFileNameWithoutExtension(peakFile); TRFScorer.WriteScoreNameValuePairs( Utilities.GetLocicoreFileName(baseFileName, Utilities.TFScoreSource(peakName, tissue, rnaSource)), TRFScorer.CreateScores( locations, peaks, (items, location) => TRFScorer.GetOverlaps( location, peaks.ChromosomeIndexedLocations, BedFile.IndexSize, peaks.MaxLocationSize[location.Chromosome]) .Sum(x => double.Parse(x.Data[BedFile.Bed6Plus4Layout.SignalValue]))), null); } public static void WriteScoreEntries(string fileName, List scores, List chipSets) { Func, List, double, string[]> tableLine = (tss, tssScores, Locicores, expression) => { return new string[] { tss } .Concat(tssScores != null ? tssScores.Select(x => x.ToString()) : new List()) .Concat(Locicores != null ? Locicores.Select(x => x.ToString()) : new List()) .Concat(new string[] { expression.ToString() }) .ToArray(); }; var promoterColumns = scores.First().PromoterScores != null ? chipSets.Select(x => string.Format("Tss_{0}", x.Trim().Split('/').Last().Split('.').First().Split('-')[0])) : new List(); var LocusColumns = scores.First().Locicores != null ? chipSets.Select(x => string.Format("Drm_{0}", x.Trim().Split('/').Last().Split('.').First().Split('-')[0])) : new List(); Tables.ToNamedTsvFile( fileName, scores.Select(x => tableLine(x.Tss, x.PromoterScores, x.Locicores, x.Exp)), promoterColumns.Concat(LocusColumns).Concat(new string[] { "esc" })); } /// /// Creates the tss Locus and combined table. /// /// The tss Locus and combined table. /// Locus map. /// Tss scores. /// Locus scores. /// Expression. /// Out file name. /// Chip sets. public static int CreateTssLocusAndCombinedTable( Dictionary> LocusMap, TRFScorer.IndexedScores tssScores, TRFScorer.IndexedScores Locicores, Dictionary expression, string outFileName, List chipSets) { var rawScores = LocusMap .Where(mapEntry => expression.ContainsKey(mapEntry.Key) && tssScores.Index.ContainsKey(mapEntry.Key)) .Select(mapEntry => { var atTss = tssScores.Scores.Select(chipData => chipData[tssScores.Index[mapEntry.Key]]).ToList(); var atLoci = Locicores.Scores.Select(chipData => mapEntry.Value.Where(x => Locicores.Index.ContainsKey(x)).Sum(Locus => chipData[Locicores.Index[Locus]])).ToList(); return new { Tss = mapEntry.Key, PromoterScores = atTss, Locicores = atLoci, LocusCount = mapEntry.Value.Count(x => Locicores.Index.ContainsKey(x)), Exp = expression[mapEntry.Key] }; }) .ToList(); var expectedLocicores = rawScores[0].Locicores.Select((x, i) => rawScores.Average(y => y.Locicores[i])).ToList(); var scores = rawScores.Select(x => { var atLoci = x.LocusCount == 0 ? expectedLocicores : x.Locicores; var total = x.PromoterScores.Select((f, i) => f + atLoci[i]); return new { x.Tss, x.PromoterScores, Locicores = atLoci, CombinedScores = total, x.Exp }; }) .ToList(); Console.WriteLine("Number of TSSes: {0}", scores.Count); using (TextWriter tw = Helpers.CreateStreamWriter(outFileName)) { // table format tw.WriteLine("{0}\t{1}\t{2}\tesc", string.Join("\t", chipSets.Select(x => string.Format("Tss_{0}", x.Trim().Split('/').Last().Split('.').First().Split('-')[0]))), string.Join("\t", chipSets.Select(x => string.Format("Drm_{0}", x.Trim().Split('/').Last().Split('.').First().Split('-')[0]))), string.Join("\t", chipSets.Select(x => string.Format("TssDrm_{0}", x.Trim().Split('/').Last().Split('.').First().Split('-')[0])))); foreach (var gene in scores) { tw.WriteLine( "{0}\t{1}\t{2}\t{3}\t{4}", gene.Tss, string.Join("\t", gene.PromoterScores), string.Join("\t", gene.Locicores), string.Join("\t", gene.CombinedScores), gene.Exp); } } return scores.Count; } /// /// Creates the tss Locus and combined table. /// /// The tss Locus and combined table. /// Locus map. /// Tss scores. /// Locus scores. /// Expression. /// Out file name. /// Chip sets. public static int CreateLocusTable( TRFScorer.MissingScoreAction action, TssRegulatoryMap LocusMap, TRFScorer.IndexedScores Locicores, Dictionary expression, string outFileName, List chipSets, Func, TRFScorer.IndexedScores, int, double> Locicorer) { var rawScores = LocusMap .Where(mapEntry => expression.ContainsKey(mapEntry.Key)) .Select(mapEntry => { var atLoci = Locicores.Scores .Select((chipData, index) => Locicorer(mapEntry.Value.Values, Locicores, index)) .ToList(); return new TRFScorer.ScoreEntry { Tss = mapEntry.Key, Locicores = atLoci, LocusCount = mapEntry.Value.Values.Count(link => Locicores.Index.ContainsKey(link.LocusName)), Exp = expression[mapEntry.Key] }; }) .ToList(); List scores; if (action == TRFScorer.MissingScoreAction.Expected) { var expectedLocicores = rawScores[0].Locicores.Select((x, i) => rawScores.Average(y => y.Locicores[i])).ToList(); scores = rawScores.Select(x => { var atLoci = x.LocusCount == 0 ? expectedLocicores : x.Locicores; return new TRFScorer.ScoreEntry { Tss = x.Tss, Locicores = atLoci, Exp = x.Exp }; }) .ToList(); } else { scores = rawScores; } Console.WriteLine("Number of TSSes: {0}", scores.Count); using (TextWriter tw = Helpers.CreateStreamWriter(outFileName)) { // table format tw.WriteLine("{0}\tesc", string.Join("\t", chipSets.Select(x => string.Format("Drm_{0}", x.Trim().Split('/').Last().Split('.').First().Split('-')[0])))); foreach (var gene in scores) { tw.WriteLine( "{0}\t{1}\t{2}", gene.Tss, string.Join("\t", gene.Locicores), gene.Exp); } } return scores.Count; } /// /// Creates table of TSS and Locus data for regression using flat indexed tss and drm scores /// For TSSes with no Loci linked, the expected score is filled in /// /// The count of rows in the table. /// Locus map. /// Tss index. /// Tss scores. /// Locus index. /// Locus scores. /// Expression. /// Out file name. /// Chip sets. public static int CreateTable( Dictionary> LocusMap, Dictionary tssIndex, List tssScores, Dictionary LocusIndex, List Locicores, Dictionary expression, string outFileName, List chipSets) { var rawScores = LocusMap .Where(mapEntry => expression.ContainsKey(mapEntry.Key) && tssIndex.ContainsKey(mapEntry.Key)) .Select(mapEntry => new { Tss = mapEntry.Key, PromoterScores = tssScores.Select(chipData => chipData[tssIndex[mapEntry.Key]]), Locicores = Locicores.Select(chipData => mapEntry.Value.Where(x => LocusIndex.ContainsKey(x)).Sum(Locus => chipData[LocusIndex[Locus]])).ToList(), LocusCount = mapEntry.Value.Count(x => LocusIndex.ContainsKey(x)), Exp = expression[mapEntry.Key] }) .ToList(); var expectedLocicores = rawScores[0].Locicores.Select((x, i) => rawScores.Average(y => y.Locicores[i])).ToList(); var scores = rawScores.Select(x => new { x.Tss, x.PromoterScores, Locicores = x.LocusCount == 0 ? expectedLocicores : x.Locicores, x.Exp }) .ToList(); //Console.WriteLine("Number of TSSes: {0}", scores.Count); using (TextWriter tw = Helpers.CreateStreamWriter(outFileName)) { // table format tw.WriteLine("{0}\t{1}\tesc", string.Join("\t", chipSets.Select(x => string.Format("Tss_{0}", x.Trim().Split('/').Last().Split('.').First().Split('-')[0]))), string.Join("\t", chipSets.Select(x => string.Format("Drm_{0}", x.Trim().Split('/').Last().Split('.').First().Split('-')[0])))); foreach (var gene in scores) { tw.WriteLine("{0}\t{1}\t{2}\t{3}", gene.Tss, string.Join("\t", gene.PromoterScores), string.Join("\t", gene.Locicores), gene.Exp); } } return scores.Count; } /// /// convert non finite values to NA /// /// The filter double. /// The x coordinate. private static string NAFilterDouble(double x) { if (double.IsNaN(x)) // || double.IsInfinity(x)) { return "0"; } return x.ToString(); } /// /// Creates table of TSS and DRM data for regression using flat indexed tss and drm scores /// /// The table. /// Drm map. /// Tss index. /// Tss scores. /// Drm index. /// Drm scores. /// Expression. /// Out file name. /// Chip sets. public static int CreateTableFilterNA( Dictionary> drmMap, Dictionary tssIndex, List tssScores, Dictionary drmIndex, List drmScores, Dictionary expression, string outFileName, List chipSets) { var v = drmMap .Where(mapEntry => expression.ContainsKey(mapEntry.Key) && tssIndex.ContainsKey(mapEntry.Key)) .Select(mapEntry => new { Tss = mapEntry.Key, TssOnlyScore = tssScores.Select(chipData => chipData[tssIndex[mapEntry.Key]]), Score = drmScores.Select(chipData => mapEntry.Value.Where(drm => drmIndex.ContainsKey(drm)).Sum(drm => chipData[drmIndex[drm]])), Exp = expression[mapEntry.Key] }); Console.WriteLine("Number of TSSes: {0}", v.Count()); using (TextWriter tw = Helpers.CreateStreamWriter(outFileName)) { // table format tw.WriteLine("{0}\t{1}\tesc", string.Join("\t", chipSets.Select(x => string.Format("Tss_{0}", x.Trim().Split('/').Last().Split('.').First().Split('-')[0]))), string.Join("\t", chipSets.Select(x => string.Format("Drm_{0}", x.Trim().Split('/').Last().Split('.').First().Split('-')[0])))); foreach (var gene in v) { tw.WriteLine( "{0}\t{1}\t{2}\t{3}", gene.Tss, string.Join("\t", gene.TssOnlyScore.Select(x => NAFilterDouble(x))), string.Join("\t", gene.Score.Select(x => NAFilterDouble(x))), gene.Exp); } } return v.Count(); } /// /// Reads the single indexed score. /// /// The single indexed score. /// Score type. /// Peak name. /// Peak file. /// Cell line. /// Rna source. public static IndexedScores ReadSingleIndexedScore( ScoreType scoreType, string peakName, string peakFile, string tissue, string rnaSource) { string scoreSource = Utilities.TFScoreSource(peakName, tissue, rnaSource); string baseFileName = Path.GetFileNameWithoutExtension(peakFile); var index = TRFScorer.ReadIndex(scoreType, scoreSource); return new IndexedScores { Index = index, Scores = new List { TRFScorer.ReadScoreNameValuePairs( index, Utilities.GetLocicoreFileName(baseFileName, scoreSource), x => double.Parse(x)) } }; } /// /// Finds the range start. /// /// The range start. /// Items. /// Start. /// Pre start tester. /// The 1st type parameter. private static int FindRangeStart(List items, int start, Func PreStartTester) { int lower = 0; int upper = items.Count; while (upper > lower) { int mid = (lower + upper) / 2; if (PreStartTester(items[mid], start)) { lower = mid + 1; } else { upper = mid; } } return upper; } /// /// Loci the summation scorer. /// /// The summation scorer. /// Map entry. /// Locus scores. /// Index. public static double LociummationScorer( TssRegulatoryMap map, IEnumerable mapEntry, TRFScorer.IndexedScores Locicores, int index) { return mapEntry .Where(link => Locicores.Index.ContainsKey(link.LocusName)) .Sum(link => Locicores.Scores[index][Locicores.Index[link.LocusName]]); } public static double LocihuffledSummationScorer( TssRegulatoryMap map, IEnumerable mapEntry, TRFScorer.IndexedScores Locicores, int index) { return mapEntry .Where(link => Locicores.Index.ContainsKey(link.LocusName)) .Sum(link => Locicores.Scores[index][Stats.RNG.Next(Locicores.Scores[index].Length)]); } /// /// Creates a weighted Locus score /// /// The weighted scorer. /// Map entry. /// Locus scores. /// Index. public static double LocusCorrelationWeightedScorer( TssRegulatoryMap map, IEnumerable mapEntry, TRFScorer.IndexedScores Locicores, int index) { return mapEntry .Where(link => Locicores.Index.ContainsKey(link.LocusName)) .Sum(link => Math.Pow(link.Correlation, 2) * Locicores.Scores[index][Locicores.Index[link.LocusName]]); } public static double[] LocihuffledCorrelationWeightedScorerSample { get; set; } /// /// Creates a weighted Locus score /// /// The weighted scorer. /// Map entry. /// Locus scores. /// Index. public static double LocihuffledCorrelationWeightedScorer( TssRegulatoryMap map, IEnumerable mapEntry, TRFScorer.IndexedScores Locicores, int index) { if (LocihuffledCorrelationWeightedScorerSample == null) { LocihuffledCorrelationWeightedScorerSample = map.Links.Select(x => x.Correlation).ToArray(); } var correlations = mapEntry.Select(x => x.Correlation).ToArray(); //return mapEntry // .Where(link => Locicores.Index.ContainsKey(link.LocusName)) // .Sum(link => Math.Pow(correlations[Stats.RNG.Next(correlations.Length)], 2) * Locicores.Scores[index][Locicores.Index[link.LocusName]]); return mapEntry .Where(link => Locicores.Index.ContainsKey(link.LocusName)) .Sum(link => Math.Pow(correlations[Stats.RNG.Next(correlations.Length)], 2) * Locicores.Scores[index][Locicores.Index[link.LocusName]]); } /// /// Creates a weighted Locus score /// /// The weighted scorer. /// Map entry. /// Locus scores. /// Index. public static double LocusConfidenceWeightedScorer( TssRegulatoryMap map, IEnumerable mapEntry, TRFScorer.IndexedScores Locicores, int index) { return mapEntry .Where(link => Locicores.Index.ContainsKey(link.LocusName)) .Sum(link => -Math.Log10(Math.Max(link.ConfidenceScore, 1e-20)) * Locicores.Scores[index][Locicores.Index[link.LocusName]]); } public static double[] LocihuffledConfidenceWeightedScorerSample { get; set; } /// /// Creates a weighted Locus score /// /// The weighted scorer. /// Map entry. /// Locus scores. /// Index. public static double LocihuffledConfidenceWeightedScorer( TssRegulatoryMap map, IEnumerable mapEntry, TRFScorer.IndexedScores Locicores, int index) { //var correlations = mapEntry.Select(x => x.Correlation).ToArray(); if (LocihuffledConfidenceWeightedScorerSample == null) { LocihuffledConfidenceWeightedScorerSample = map.Links.Select(x => x.Correlation).ToArray(); } return mapEntry .Where(link => Locicores.Index.ContainsKey(link.LocusName)) .Sum(link => -Math.Log10( Math.Max(LocihuffledConfidenceWeightedScorerSample[Stats.RNG.Next(LocihuffledConfidenceWeightedScorerSample.Length)], 1e-20)) * Locicores.Scores[index][Locicores.Index[link.LocusName]]); } /// /// Loci the distance weighted scorer. /// /// The distance weighted scorer. /// Map entry. /// Locus scores. /// Index. public static double LocusDistanceWeightedScorer( TssRegulatoryMap map, IEnumerable mapEntry, TRFScorer.IndexedScores Locicores, int index) { return mapEntry .Where(link => Locicores.Index.ContainsKey(link.LocusName)) .Sum(link => Math.Exp(-link.AbsLinkLength / (1000)) * Locicores.Scores[index][Locicores.Index[link.LocusName]]); } /// /// Selects the scorer. /// /// The scorer. /// Requested scoring function. public static Func, TRFScorer.IndexedScores, int, double> SelectScorer(ScoringFunction scoring) { switch (scoring) { case ScoringFunction.Sum: return LociummationScorer; case ScoringFunction.ShuffledSum: return LocihuffledSummationScorer; case ScoringFunction.CorrelationWeightedSum: return LocusCorrelationWeightedScorer; case ScoringFunction.ShuffledCorrelationWeightedSum: return LocihuffledCorrelationWeightedScorer; case ScoringFunction.DistanceWeightedSum: return LocusDistanceWeightedScorer; case ScoringFunction.ConfidenceWeightedSum: return LocusConfidenceWeightedScorer; case ScoringFunction.ShuffledConfidenceWeightedSum: return LocihuffledConfidenceWeightedScorer; default: return null; } } /// /// Creates the tss Locus and combined table. /// /// The tss Locus and combined table. /// Locus map. /// Tss scores. /// Locus scores. /// Expression. /// Out file name. /// Chip sets. public static List ScoreMap( MissingScoreAction action, TssRegulatoryMap LocusMap, TRFScorer.IndexedScores tssScores, TRFScorer.IndexedScores Locicores, Dictionary expression, Func, TRFScorer.IndexedScores, int, double> Locicorer) { var rawScores = LocusMap .Where(mapEntry => expression.ContainsKey(mapEntry.Key)) .Select(mapEntry => { var atTss = tssScores != null ? tssScores.Scores.Select(chipData => chipData[tssScores.Index[mapEntry.Key]]).ToList() : null; var atLoci = Locicores != null ? Locicores.Scores .Select((chipData, index) => Locicorer(LocusMap, mapEntry.Value.Values, Locicores, index)) .ToList() : null; return new ScoreEntry { Tss = mapEntry.Key, PromoterScores = atTss, Locicores = atLoci, LocusCount = mapEntry.Value.Values.Count(link => Locicores.Index.ContainsKey(link.LocusName)), Exp = expression[mapEntry.Key] }; }) .ToList(); List scores; if (action == MissingScoreAction.Expected) { var expectedLocicores = rawScores[0].Locicores.Select((x, i) => rawScores.Average(y => y.Locicores[i])).ToList(); scores = rawScores.Select(x => { var atLoci = x.LocusCount == 0 ? expectedLocicores : x.Locicores; return new ScoreEntry { Tss = x.Tss, Locicores = atLoci, Exp = x.Exp }; }) .ToList(); } else { scores = rawScores; } return scores; } } }