//--------------------------------------------------------------------------------
// File: GtfGencodeFile.cs
// Author: Timothy O'Connor
// © Copyright University of Queensland, 2012-2014. All rights reserved.
// License: 
//--------------------------------------------------------------------------------

namespace Genomics
{
    using System;
    using System.Collections.Generic;
    using System.Linq;

    /// <summary>
    /// Gencode GTF reader that keeps only rows whose feature column equals "gene".
    /// </summary>
    public class GtfGencodeGeneFile : GtfGencodeFile
    {
        /// <summary>
        /// Gets the feature name this reader accepts.
        /// </summary>
        protected override string Feature
        {
            get { return "gene"; }
        }

        /// <summary>
        /// Initializes a new instance of the <see cref="GtfGencodeGeneFile"/> class.
        /// </summary>
        /// <param name="filename">Filename.</param>
        public GtfGencodeGeneFile(string filename)
            : base(filename)
        {
        }
    }

    /// <summary>
    /// Gencode GTF reader that keeps only rows whose feature column equals "transcript".
    /// </summary>
    public class GtfGencodeTranscriptFile : GtfGencodeFile
    {
        /// <summary>
        /// Gets the feature name this reader accepts.
        /// </summary>
        protected override string Feature
        {
            get { return "transcript"; }
        }

        /// <summary>
        /// Initializes a new instance of the <see cref="GtfGencodeTranscriptFile"/> class.
        /// </summary>
        /// <param name="filename">Filename.</param>
        public GtfGencodeTranscriptFile(string filename)
            : base(filename)
        {
        }
    }

    /// <summary>
    /// Gtf gencode file.
    /// </summary>
    public abstract class GtfGencodeFile : BedFile
    {
        /// <summary>
        /// Transcript types that may be named in <see cref="ValidTranscriptTypes"/>.
        /// </summary>
        public enum TranscriptType
        {
            protein_coding = 0x01,
            processed_transcript = 0x02,
        }

        /// <summary>
        /// The gencode files already loaded. Key is the filename followed by
        /// "_gene" or "_transcript".
        /// </summary>
        private static readonly Dictionary<string, GtfGencodeFile> GencodeFiles =
            new Dictionary<string, GtfGencodeFile>();

        /// <summary>
        /// Cache of valid transcript sets. Key is the value produced by <see cref="GenerateKey"/>.
        /// </summary>
        private static readonly Dictionary<string, HashSet<string>> staticValidTranscripts =
            new Dictionary<string, HashSet<string>>();

        /// <summary>
        /// Cache of valid gene sets. Key is the value produced by <see cref="GenerateKey"/>.
        /// </summary>
        private static readonly Dictionary<string, HashSet<string>> staticValidGenes =
            new Dictionary<string, HashSet<string>>();

        /// <summary>
        /// Cache of ordered transcript locations. Key is the value produced by <see cref="GenerateKey"/>.
        /// </summary>
        private static readonly Dictionary<string, Dictionary<string, List<Genomics.Location>>> staticOrderedTranscriptLocations =
            new Dictionary<string, Dictionary<string, List<Genomics.Location>>>();

        /// <summary>
        /// Filename of this instance.
        /// </summary>
        private readonly string filename;

        /// <summary>
        /// The valid types backing the <see cref="ValidTranscriptTypes"/> property.
        /// Remains null until the property is assigned.
        /// </summary>
        private TranscriptType[] validTranscriptTypes;

        /// <summary>
        /// Number of rows accepted by <see cref="ParseFields"/> so far; used for progress output.
        /// </summary>
        private int linecount = 0;

        /// <summary>
        /// Initializes a new instance of the <see cref="GtfGencodeFile"/> class.
        /// The layout passed to the base class reads chromosome, start, end, strand and
        /// name (the attribute text) from columns 0, 3, 4, 6 and 8 respectively.
        /// </summary>
        /// <param name="filename">Filename.</param>
        protected GtfGencodeFile(string filename)
            : base(filename, new Layout { Chromosome = 0, Start = 3, End = 4, Strand = 6, Name = 8 })
        {
            this.filename = filename;
        }

        /// <summary>
        /// Gets the feature type to use; rows whose third column differs are skipped when parsing.
        /// </summary>
        protected abstract string Feature { get; }

        /// <summary>
        /// Gets or sets the valid transcript types as enum member names.
        /// </summary>
        /// <value>
        /// The valid transcript types. Assigning an array that holds no non-blank entry
        /// stores an empty filter, which is treated as "every type is valid".
        /// Otherwise every entry must parse as a <see cref="TranscriptType"/> name.
        /// </value>
        public string[] ValidTranscriptTypes
        {
            get
            {
                return this.validTranscriptTypes.Select(x => x.ToString()).ToArray();
            }

            set
            {
                if (!value.Any(x => !string.IsNullOrWhiteSpace(x)))
                {
                    this.validTranscriptTypes = new TranscriptType[] { };
                }
                else
                {
                    this.validTranscriptTypes = value
                        .Select(x => (TranscriptType)Enum.Parse(typeof(TranscriptType), x))
                        .ToArray();
                }
            }
        }

        /// <summary>
        /// Gets the valid transcripts.
        /// </summary>
        /// <value>The names of locations whose feature column is "transcript" and that pass the type filter.</value>
        public HashSet<string> ValidTranscripts
        {
            get
            {
                string validTranscriptKey = this.InitValidFeature("transcript", staticValidTranscripts);
                return staticValidTranscripts[validTranscriptKey];
            }
        }

        /// <summary>
        /// Gets the valid genes.
        /// </summary>
        /// <value>The names of locations whose feature column is "gene" and that pass the type filter.</value>
        public HashSet<string> ValidGenes
        {
            get
            {
                string validGeneKey = this.InitValidFeature("gene", staticValidGenes);
                return staticValidGenes[validGeneKey];
            }
        }

        /// <summary>
        /// Gets the chromosome-ordered locations restricted to those named in <see cref="ValidTranscripts"/>.
        /// </summary>
        /// <value>The ordered transcript locations.</value>
        public Dictionary<string, List<Genomics.Location>> OrderedTranscriptLocations
        {
            get
            {
                string key = this.GenerateKey(this.validTranscriptTypes);

                Dictionary<string, List<Genomics.Location>> ordered;
                if (!staticOrderedTranscriptLocations.TryGetValue(key, out ordered))
                {
                    // Resolve the valid set once instead of once per location.
                    HashSet<string> valid = this.ValidTranscripts;

                    ordered = this.ChromosomeOrderedLocations.ToDictionary(
                        x => x.Key,
                        x => x.Value.Where(y => valid.Contains(y.Name)).ToList());

                    staticOrderedTranscriptLocations.Add(key, ordered);
                }

                return ordered;
            }
        }

        /// <summary>
        /// Gets maximum feature size.
        /// </summary>
        /// <value>The largest value held in the base class's MaxLocationSize collection.</value>
        public int MaxFeatureSize
        {
            get { return this.MaxLocationSize.Max(x => x.Value); }
        }

        /// <summary>
        /// Gets the indexed transcript locations.
        /// </summary>
        /// <value>The base class's chromosome-indexed locations, returned unchanged.</value>
        public Dictionary<string, Dictionary<int, List<Genomics.Location>>> IndexedTranscriptLocations
        {
            get { return this.ChromosomeIndexedLocations; }
        }

        /// <summary>
        /// Returns the cached reader of "gene" rows for a file, creating it on first use.
        /// </summary>
        /// <param name="filename">Filename.</param>
        /// <returns>The gene reader for <paramref name="filename"/>.</returns>
        public static GtfGencodeFile LoadGenes(string filename)
        {
            // The same suffixed key must be used for lookup, insert and read-back;
            // otherwise a second call attempts a duplicate Add and throws.
            string key = filename + "_gene";

            GtfGencodeFile file;
            if (!GencodeFiles.TryGetValue(key, out file))
            {
                file = new GtfGencodeGeneFile(filename);
                GencodeFiles.Add(key, file);
            }

            return file;
        }

        /// <summary>
        /// Returns the cached reader of "transcript" rows for a file, creating it on first use.
        /// </summary>
        /// <param name="filename">Filename.</param>
        /// <returns>The transcript reader for <paramref name="filename"/>.</returns>
        public static GtfGencodeFile LoadTranscripts(string filename)
        {
            // Distinct suffix so gene and transcript readers of one file never collide.
            string key = filename + "_transcript";

            GtfGencodeFile file;
            if (!GencodeFiles.TryGetValue(key, out file))
            {
                file = new GtfGencodeTranscriptFile(filename);
                GencodeFiles.Add(key, file);
            }

            return file;
        }

        /// <summary>
        /// Queries the interface.
        /// </summary>
        /// <param name="t">The requested type.</param>
        /// <returns>
        /// A new annotation proxy over this instance when <paramref name="t"/> is
        /// <see cref="IAnnotation"/>; otherwise whatever the base class returns.
        /// </returns>
        protected override object QueryInterface(Type t)
        {
            if (t == typeof(IAnnotation))
            {
                return new AnnotationProxy(this);
            }

            return base.QueryInterface(t);
        }

        /// <summary>
        /// Parses the fields of one row and appends a location for it to <paramref name="data"/>.
        /// Blank rows, rows starting with '#', and rows whose third column is not
        /// <see cref="Feature"/> are skipped.
        /// </summary>
        /// <param name="fields">Fields.</param>
        /// <param name="layout">Layout.</param>
        /// <param name="data">Data.</param>
        /// <param name="entryCount">Entry count. Not modified by this override.</param>
        protected override void ParseFields(string[] fields, Layout layout, List<Tuple<Genomics.Location, string>> data, ref int entryCount)
        {
            // A blank row has an empty first field; indexing its first character would throw.
            if (fields.Length == 0 || fields[0].Length == 0 || fields[0][0] == '#')
            {
                return;
            }

            if (fields[2] != this.Feature)
            {
                return;
            }

            var nameData = this.ParseNameData(fields[layout.Name]);
            int start = int.Parse(fields[layout.Start]);
            int end = int.Parse(fields[layout.End]);

            // NOTE(review): this helper location has no Strand assigned when
            // DirectionalStart is read below - confirm that is intended.
            var startSite = new Genomics.Location { Start = start, End = end };

            // Progress indicator: print the running count every 10000 accepted rows.
            this.linecount++;
            if (this.linecount % 10000 == 0)
            {
                Console.WriteLine(this.linecount);
            }

            string transcriptId = nameData["transcript_id"];
            string geneName = nameData["gene_id"];

            var location = new Genomics.Location
            {
                Name = transcriptId,
                Chromosome = fields[layout.Chromosome],
                Start = start,
                End = end,
                Strand = fields[layout.Strand],
                Data = fields,
                AdditionalFields = nameData,
                AlternateName = geneName + "." + startSite.DirectionalStart
            };

            data.Add(Tuple.Create(location, transcriptId));
        }

        /// <summary>
        /// Parses the name data producing a dictionary of the named data elements.
        /// Entries are separated by ';' and each entry is split on single spaces into
        /// a key and a value. Double quotes are removed from every value except "level",
        /// and "gene_id" / "transcript_id" values are cut at the first '.'.
        /// When a key occurs more than once only its first value is kept.
        /// </summary>
        /// <returns>The name data.</returns>
        /// <param name="name">Name.</param>
        protected Dictionary<string, string> ParseNameData(string name)
        {
            return name.Split(';')
                .Where(x => !string.IsNullOrWhiteSpace(x))
                .Select(x =>
                {
                    var fields = x.Trim().Split(' ');
                    var value = (fields[0] == "level" ? fields[1] : fields[1].Replace("\"", "")).Trim();

                    if (fields[0] == "gene_id" || fields[0] == "transcript_id")
                    {
                        value = value.Split('.')[0].Trim();
                    }

                    return new { Key = fields[0], Value = value };
                })
                .ToLookup(x => x.Key, x => x.Value)
                .ToDictionary(x => x.Key, x => x.First()); // Only unused fields 'ont' and 'tag' are multiply used per transcript
        }

        /// <summary>
        /// Ensures the set of valid names for a feature is cached and returns its cache key.
        /// </summary>
        /// <param name="feature">Value the third column of a location's row must equal.</param>
        /// <param name="validFeatures">Cache to fill.</param>
        /// <returns>The key under which the set is stored in <paramref name="validFeatures"/>.</returns>
        private string InitValidFeature(string feature, Dictionary<string, HashSet<string>> validFeatures)
        {
            string cacheKey = this.GenerateKey(this.validTranscriptTypes);

            if (!validFeatures.ContainsKey(cacheKey))
            {
                // Snapshot the filter once rather than rebuilding the name array per location.
                var allowedTypes = new HashSet<string>(this.ValidTranscriptTypes);

                // All transcript types are valid when no filter is configured; in that
                // case the "transcript_type" attribute is not consulted at all.
                bool acceptAll = allowedTypes.Count == 0;

                var names = this.Locations
                    .Where(x => x.Value.Data[2] == feature)
                    .Where(x => acceptAll || allowedTypes.Contains(x.Value.AdditionalFields["transcript_type"]))
                    .Select(x => x.Value.Name);

                validFeatures.Add(cacheKey, new HashSet<string>(names));
            }

            return cacheKey;
        }

        /// <summary>
        /// Generates keys for cached data: filename, feature and the sorted type names joined by '.'.
        /// The feature is part of the key so that gene and transcript readers of the
        /// same file do not share cache entries.
        /// </summary>
        /// <returns>The key.</returns>
        /// <param name="types">Types.</param>
        /// <exception cref="InvalidOperationException">The types have not been set.</exception>
        private string GenerateKey(TranscriptType[] types)
        {
            if (types == null)
            {
                throw new InvalidOperationException(
                    "Transcript types not set in annotation implementation " +
                    System.Reflection.MethodInfo.GetCurrentMethod().Name);
            }

            return this.filename + "." + this.Feature + "." +
                string.Join(".", types.OrderBy(x => x).Select(x => x.ToString()));
        }

        /// <summary>
        /// Annotation view that forwards every member to the wrapped <see cref="GtfGencodeFile"/>.
        /// </summary>
        private class AnnotationProxy : IAnnotation
        {
            private readonly GtfGencodeFile data;

            public AnnotationProxy(GtfGencodeFile data)
            {
                this.data = data;
            }

            public override string[] ValidTranscriptTypes
            {
                get { return this.data.ValidTranscriptTypes; }
                set { this.data.ValidTranscriptTypes = value; }
            }

            public override HashSet<string> ValidGenes
            {
                get { return this.data.ValidGenes; }
            }

            public override HashSet<string> ValidTranscripts
            {
                get { return this.data.ValidTranscripts; }
            }

            public override Dictionary<string, List<Genomics.Location>> OrderedTranscriptLocations
            {
                get { return this.data.OrderedTranscriptLocations; }
            }

            public override Dictionary<string, Dictionary<int, List<Genomics.Location>>> IndexedTranscriptLocations
            {
                get { return this.data.IndexedTranscriptLocations; }
            }

            public override int MaxFeatureSize
            {
                get { return this.data.MaxFeatureSize; }
            }

            // Analysis disable AccessToStaticMemberViaDerivedType
            // Analysis disable MemberHidesStaticFromOuterClass
            public override int IndexSize
            {
                get { return GtfGencodeFile.IndexSize; }
            }
            // Analysis restore MemberHidesStaticFromOuterClass
            // Analysis restore AccessToStaticMemberViaDerivedType
        }
    }
}