namespace Genomics { using System; using System.Collections.Generic; using System.Linq; using Shared; using Tools; public class TssRegulatoryMap : RegulatoryMap { public TssRegulatoryMap() { } public TssRegulatoryMap(Dictionary> source) : base (source) { } /// /// Initializes a new instance of the class. /// /// Source links. public TssRegulatoryMap(IEnumerable source) : base(source.ToLookup(x => x.TranscriptName, x => x) .ToDictionary(x => x.Key, x => x .ToLookup(y => y.LocusName, y => y) .ToDictionary(y => y.Key, y => y.First()))) { } /// /// Convert the specified source dictionary to a map /// /// A regulatory map /// Source links. public static TssRegulatoryMap Convert(Dictionary> source) { return new TssRegulatoryMap(source); } /// /// Convert the specified source key-value pairs to a map /// /// A regulatory map /// Source links. public static TssRegulatoryMap Convert(IEnumerable>> source) { return new TssRegulatoryMap(source.ToDictionary(x => x.Key, x => x.Value)); } /// /// Converts a flat list of links into a regulatory map. /// /// Source links. public static implicit operator TssRegulatoryMap(List source) { return new TssRegulatoryMap(source); } /// /// Gets the transcripts in the map /// /// The transcripts. public override HashSet Transcripts { get { return Helpers.CheckInit( ref this.transcripts, () => new HashSet(this.Keys)); } } /// /// Gets the Loci in the map /// /// The Loci. public override HashSet Loci { get { return Helpers.CheckInit( ref this.loci, () => new HashSet(this.Values.Select(x => x.Keys).SelectMany(x => x))); } } /// /// Gets the tss degree. /// /// The tss degree. public Dictionary TssDegree { get { return Helpers.CheckInit( ref this.tssDegree, () => this.ToDictionary(x => x.Key, x => x.Value.Count)); } } /// /// Converts to genes. /// /// The to genes. /// Expression data. public TssRegulatoryMap ConvertToGenes(IExpressionData expression) { return Convert(expression.Transcripts .Where(x => this.ContainsKey((MapLink.Tss)x.Key)) .ToLookup( x => new MapLink.Tss(x.Value.AlternateName), x => this[x.Value.Name].Values) .ToDictionary( x => x.Key, x => new Dictionary( x.SelectMany(y => y) //// x.Key is gene name .ToLookup(y => y.LocusName, y => y) .ToDictionary( y => y.Key, y => { var transcriptLink = y.OrderBy(z => z.ConfidenceScore).First(); return new MapLink { ConfidenceScore = transcriptLink.ConfidenceScore, Correlation = transcriptLink.Correlation, TranscriptName = transcriptLink.TranscriptName, TssName = x.Key, GeneName = x.Key, LinkLength = transcriptLink.LinkLength, LocusName = transcriptLink.LocusName, Strand = expression.Genes[x.Key].Strand, }; })))); } public TssRegulatoryMap ConvertToGenes() { return Convert(this.Links.ToLookup(x => new MapLink.Tss(x.GeneName), x => x) .ToDictionary( x => x.Key, x => new Dictionary(x.ToLookup(y => y.LocusName, y => y).ToDictionary( y => y.Key, y => { var transcriptLink = x.OrderBy(z => z.ConfidenceScore).First(); return new MapLink { ConfidenceScore = transcriptLink.ConfidenceScore, Correlation = transcriptLink.Correlation, TranscriptName = transcriptLink.TranscriptName, TssName = x.Key, GeneName = x.Key, LinkLength = transcriptLink.LinkLength, LocusName = transcriptLink.LocusName, Strand = transcriptLink.Strand, }; })))); } /// /// Gets the TSS link degree distribution /// /// The degree distribution. public List TssDegreeDistribution() { ////var distribution = this.Links //// .GroupBy(x => x.TranscriptName) // Get links by transcript //// .Select(x => x.Count()) // Count the transcript links //// .GroupBy(x => x, x => x) // Group the link counts //// .ToDictionary(x => x.Key, x => x.Count()); // Count the frequency of link counts var distribution = this.TssDegree.ToLookup(x => x.Value, x => x).ToDictionary(x => x.Key, x => x.Count()); var maxDegree = this.TssDegree.Values.Max(); ////int maxDegree = distribution.Keys.Max(); var degreeDistribution = new List(); for (int i = 0; i <= maxDegree; i++) { degreeDistribution.Add(distribution.ContainsKey(i) ? distribution[i] : 0); } return degreeDistribution; } /// /// Removes the highly connected tsses. /// /// The highly connected tsses. /// Minimum count. public TssRegulatoryMap RemoveHighlyConnectedTsses(int minCount) { var tsses = new HashSet(this.TssDegree.Where(x => x.Value < minCount).Select(x => x.Key)); return Convert(this.Where(x => tsses.Contains(x.Key))); } /// /// Gets the map for a given maximum range /// /// The range. /// Max range. public TssRegulatoryMap ApplyRange(int maxRange) { return Convert(this.Select(x => new { Key = x.Key, Value = x.Value.Where(y => y.Value.AbsLinkLength <= maxRange).ToDictionary(y => y.Key, y => y.Value) }) .Where(x => x.Value.Count > 0) .ToDictionary(x => x.Key, x => x.Value)); } /// /// Creates a map with links shorter than the range removed /// /// The range. /// Minimum range. public TssRegulatoryMap RemoveRange(int minRange) { return Convert(this.Select(x => new { Key = x.Key, Value = x.Value.Where(y => y.Value.AbsLinkLength >= minRange).ToDictionary(y => y.Key, y => y.Value) }) .Where(x => x.Value.Count > 0) .ToDictionary(x => x.Key, x => x.Value)); } /// /// Removes Loci fromthe range relative to the TSS. May be a fixed distance or a relative upstream/downstream distance /// /// The range. /// Minimum range. public TssRegulatoryMap RemoveRange(List minRange) { if (minRange.Count == 1) { return this.RemoveRange(minRange[0]); } else if (minRange.Count == 2) { return Convert(this.Select(x => new { Key = x.Key, Value = x.Value.Where(y => (y.Value.LocusPosition < 0 && y.Value.LocusPosition < minRange[0]) || (y.Value.LocusPosition >= 0 && y.Value.LocusPosition > minRange[1])) .ToDictionary(y => y.Key, y => y.Value) }) .Where(x => x.Value.Count > 0) .ToDictionary(x => x.Key, x => x.Value)); } else { throw new Exception("Invalid number genomic range: " + string.Join(", ", minRange)); } } /// /// Gets the nearest neighbor map for the nearest N neighbors /// /// The nearest neighbor map. /// Number of nearest neighbors. public TssRegulatoryMap GetNearestNeighborMap(int n) { return new TssRegulatoryMap(this.Select( x => x.Value.Values.OrderBy(y => y.AbsLinkLength).Take(n)) .SelectMany(x => x)); } /// /// Gets the map for given threshold. /// /// The map for threshold. /// Confidence threshold. public TssRegulatoryMap ApplyThreshold(double threshold) { return Convert(this.Select(x => new { Key = x.Key, Value = x.Value.Where(y => y.Value.ConfidenceScore <= threshold).ToDictionary(y => y.Key, y => y.Value) }) .Where(x => x.Value.Count > 0) .ToDictionary(x => x.Key, x => x.Value)); } /// /// Gets the map for given threshold. /// /// The map for threshold. /// Confidence threshold. public TssRegulatoryMap ApplyThresholdExclusive(double threshold) { return Convert(this.Select(x => new { Key = x.Key, Value = x.Value.Where(y => y.Value.ConfidenceScore < threshold).ToDictionary(y => y.Key, y => y.Value) }) .Where(x => x.Value.Count > 0) .ToDictionary(x => x.Key, x => x.Value)); } /// /// Applies the correlation. /// /// A map with a minimum correlation. /// Correlation threshols. public TssRegulatoryMap ApplyCorrelation(double correlation) { return Convert(this.Select(x => new { Key = x.Key, Value = x.Value.Where(y => y.Value.Correlation > correlation).ToDictionary(y => y.Key, y => y.Value) }) .Where(x => x.Value.Count > 0) .ToDictionary(x => x.Key, x => x.Value)); } /// /// Gets the map for given threshold. /// /// The map for threshold. /// Confidence threshold. public TssRegulatoryMap RemoveThreshold(double threshold) { return Convert(this.Select(x => new { Key = x.Key, Value = x.Value.Where(y => y.Value.ConfidenceScore >= threshold).ToDictionary(y => y.Key, y => y.Value) }) .Where(x => x.Value.Count > 0) .ToDictionary(x => x.Key, x => x.Value)); } /// /// Creates a Locus-centric NN map included only TSSes which are targeted by a Locus /// /// The centric NN map. public LocusRegulatoryMap LocusCentricNNMap() { return new LocusRegulatoryMap(this.Reverse().GetNearestNeighborMap(1).Links); } /// /// Reverse this instance. /// /// A regulatory map with Loci as keys public LocusRegulatoryMap Reverse() { return new LocusRegulatoryMap(this.Links .ToLookup(x => x.LocusName, x => x) .ToDictionary(x => x.Key, x => x.ToDictionary(y => y.TranscriptName, y => y))); } /// /// Samples the map. If there are 5000 sample failures, then the number of length bins is reduced by 2 and more /// sampling is redone to add more maps. /// /// The map. /// Map basis to mimic in the sampled map. /// Length bins. /// Number of sampled maps to generate public TssRegulatoryMap[] SampleMap(TssRegulatoryMap basis, int lengthBins, int sampleCount) { List sampleMaps = new List(); while (lengthBins > 0 && sampleMaps.Count < sampleCount) { var newRound = this.SampleMapInternal(basis, lengthBins, sampleCount - sampleMaps.Count); sampleMaps.AddRange(newRound); lengthBins -= 2; } if (sampleMaps.Count != sampleCount) { throw new Exception(string.Format("Tried to sample {0} maps but only obtained {1}", sampleCount, sampleMaps.Count)); } return sampleMaps.ToArray(); } /// /// Samples the map. /// /// The map. /// Basis map to emulate in sampling. /// Length bins. /// Number of samples to create. private List SampleMapInternal(TssRegulatoryMap basis, int lengthBins, int sampleCount) { var tssBasisDegree = basis.ToDictionary(x => x.Key, x => x.Value.Count); var basisLinksByLength = basis.Links .OrderBy(x => x.LocusPosition).ToList(); ////var basisUpstreamLinks = basisLinksByLength.Where(x => x.LocusPosition < 0).OrderBy(x => -x.LocusPosition).ToList(); ////var basisDownstreamLinks = basisLinksByLength.Where(x => x.LocusPosition >= 0).ToList(); double basisBinSize = (double)basisLinksByLength.Count / lengthBins; var boundaryIndices = Stats.Sequence(basisBinSize, basisLinksByLength.Count - basisBinSize, basisBinSize); var boundaryLengths = boundaryIndices .Select(i => (basisLinksByLength[i - 1].LocusPosition + basisLinksByLength[i].LocusPosition) / 2) .ToList(); ////Console.WriteLine("Boundary count = {0};\nBin count = {1}\nBoundaries = ({2})", boundaryLengths.Count, lengthBins, string.Join(", ", boundaryLengths)); Func getBin = (length) => { if (length < boundaryLengths[0]) { return 0; } if (length >= boundaryLengths.Last()) { return lengthBins - 1; } for (int i = 1; i < boundaryLengths.Count; i++) { if (length >= boundaryLengths[i - 1] && length < boundaryLengths[i]) { return i; } } throw new Exception("Could not find a bin for length " + length); }; // Bin all links to a tss in the basis map var binnedLinks = this.Links .Where(x => tssBasisDegree.ContainsKey(x.TranscriptName)) .Select(x => new { Bin = getBin(x.LocusPosition), Link = x }) .ToLookup(x => x.Bin, x => x.Link) .ToDictionary(x => x.Key, x => x.ToList()); // Construct bins of all links for each tss separately var perTssBinnedBasisLinks = binnedLinks.Select(x => x.Value .Select(y => new { Bin = x.Key, TranscriptName = y.TranscriptName, Link = y })) .SelectMany(x => x) .ToLookup(x => x.TranscriptName, x => x) .ToDictionary( x => x.Key, x => x.ToLookup(y => y.Bin, y => y.Link).ToDictionary(y => y.Key, y => y.ToList())); // Bin the basis map var binnedBasisLinks = basis.Links .Where(x => tssBasisDegree.ContainsKey(x.TranscriptName)) .Select(x => new { Bin = getBin(x.LocusPosition), Link = x }) .ToLookup(x => x.Bin, x => x.Link) .ToDictionary(x => x.Key, x => x.ToList()); ////Console.WriteLine("Sampleable bin counts\n{0}", string.Join("\n", binnedLinks.OrderBy(x => x.Key).Select(x => x.Key + "\t" + x.Value.Count))); ////Console.WriteLine("Basis bin counts\n{0}", string.Join("\n", binnedBasisLinks.OrderBy(x => x.Key).Select(x => x.Key + "\t" + x.Value.Count))); Func makeMap = () => { // Ensure at least one link per tss sampled evenly from the equal-occupancy bins var tssSampledLinkData = perTssBinnedBasisLinks.Select(x => { var bins = x.Value.Keys.ToList(); var bin = bins[Stats.RNG.Next(bins.Count)]; return new { Bin = bin, Link = x.Value[bin][Stats.RNG.Next(x.Value[bin].Count)] }; }); var tssSampledLinks = new HashSet(tssSampledLinkData.Select(x => x.Link)); var tssSampledBinCounts = tssSampledLinkData.ToLookup(x => x.Bin, x => x).ToDictionary(x => x.Key, x => x.Count()); // Set aside tsses with only one link var tssSingleLinks = tssSampledLinks.Take(basis.OriginDegree.Count(x => x.Value == 1)); var singleLinkTsses = new HashSet(tssSingleLinks.Select(x => x.TranscriptName)); // Construct bins of all links for each tss separately /*var filteredPerTssBinnedBasisLinks = binnedLinks.Select(x => x.Value .Where(y => !singleLinkTsses.Contains(y.TranscriptName)) .Select(y => new { Bin = x.Key, TranscriptName = y.TranscriptName, Link = y })) .SelectMany(x => x) .ToLookup(x => x.TranscriptName, x => x) .ToDictionary( x => x.Key, x => x.ToLookup(y => y.Bin, y => y.Link).ToDictionary(y => y.Key, y => y.ToList())); var tssDoubleSampledLinkData = filteredPerTssBinnedBasisLinks.Select(x => { var bins = x.Value.Keys.ToList(); var bin = bins[Stats.RNG.Next(bins.Count)]; return new { Bin = bin, Link = x.Value[bin][Stats.RNG.Next(x.Value[bin].Count)] }; }); var tssDoubleSampledLinks = new HashSet(tssDoubleSampledLinkData.Select(x => x.Link)); var tssDoubleSampledBinCounts = tssDoubleSampledLinkData.ToLookup(x => x.Bin, x => x).ToDictionary(x => x.Key, x => x.Count()); // Set aside tsses with only one link var tssDoubleLinks = tssDoubleSampledLinks.Take(basis.TssDegree.Count(x => x.Value == 2)); var doubleLinkTsses = new HashSet(tssDoubleLinks.Select(x => x.TranscriptName));*/ // Remove pre-sampled links single links from bins as well as all links to TSSes that will have only one link var filteredBinnedLinks = binnedLinks .ToDictionary( x => x.Key, x => x.Value.Where(y => !tssSampledLinks.Contains(y) && !singleLinkTsses.Contains(y.TranscriptName)) ////&& !tssDoubleSampledLinks.Contains(y) && !doubleLinkTsses.Contains(y.TranscriptName) .ToList()); // Filter from bins var sampledLinks = binnedBasisLinks .Select(x => Stats.SampleWithoutReplacement( x.Value.Count - (tssSampledBinCounts.ContainsKey(x.Key) ? tssSampledBinCounts[x.Key] : 0), ////+ (tssDoubleSampledBinCounts.ContainsKey(x.Key) ? tssDoubleSampledBinCounts[x.Key] : 0) filteredBinnedLinks[x.Key].Count) .Select(i => filteredBinnedLinks[x.Key][i])) .SelectMany(x => x) .ToList(); //// Remove from bin samples enough to account for pre-sampled tss links ////var filteredSampledLinks = Stats.SampleWithoutReplacement(basis.LinkCount - basis.Count, sampledLinks.Count).Select(i => sampledLinks[i]); var sampledMap = new TssRegulatoryMap(tssSampledLinks ////.Concat(tssDoubleSampledLinks) .Concat(sampledLinks)); ////var sampleTssDegree = sampledMap.ToDictionary(x => x.Key, x => x.Value.Count); /*Console.WriteLine("Map tss and link count: {0}, {1} ", basis.Count, basis.LinkCount); Console.WriteLine("Tss degree = {{\n{0}\n}}", string.Join("\n", tssBasisDegree .ToLookup(x => x.Value, x => x) .ToDictionary(x => x.Key, x => x.Count()) .OrderBy(x => x.Key) .Select(x => "\t" + x.Key + "\t" + x.Value))); Console.WriteLine("Sampledtss and link count: {0}, {1}", sampledMap.Count, sampledMap.LinkCount); Console.WriteLine("Sampled Tss degree = {{\n{0}\n}}", string.Join("\n", sampleTssDegree .ToLookup(x => x.Value, x => x) .ToDictionary(x => x.Key, x => x.Count()) .OrderBy(x => x.Key) .Select(x => "\t" + x.Key + "\t" + x.Value)));*/ return sampledMap; }; var sampledMaps = new List(); int count = 0; for (int i = 0; i < sampleCount; i++) { count++; TssRegulatoryMap map = null; bool success = true; try { map = makeMap(); } catch { i--; success = false; } if (count % 100 == 0) { Console.Write("."); } if (count > 1000) { Console.WriteLine("Could not create sample maps in 1,000 tries"); return sampledMaps; } if (success && map != null) { sampledMaps.Add(map); } } return sampledMaps; } } }