//--------------------------------------------------------------------------------
// File: GtfGencodeFile.cs
// Author: Timothy O'Connor
// © Copyright University of Queensland, 2012-2014. All rights reserved.
// License:
//--------------------------------------------------------------------------------
namespace Genomics
{
using System;
using System.Collections.Generic;
using System.Linq;
/// <summary>
/// GENCODE GTF reader whose <see cref="Feature"/> is "gene".
/// </summary>
public class GtfGencodeGeneFile : GtfGencodeFile
{
    /// <summary>
    /// Creates the reader; <paramref name="filename"/> is forwarded unchanged to the base constructor.
    /// </summary>
    /// <param name="filename">File name handed to <see cref="GtfGencodeFile"/>.</param>
    public GtfGencodeGeneFile(string filename) : base(filename) { }

    /// <summary>
    /// Gets the feature name for this reader, always "gene".
    /// </summary>
    protected override string Feature
    {
        get { return "gene"; }
    }
}
/// <summary>
/// GENCODE GTF reader whose <see cref="Feature"/> is "transcript".
/// </summary>
public class GtfGencodeTranscriptFile : GtfGencodeFile
{
    /// <summary>
    /// Creates the reader; <paramref name="filename"/> is forwarded unchanged to the base constructor.
    /// </summary>
    /// <param name="filename">File name handed to <see cref="GtfGencodeFile"/>.</param>
    public GtfGencodeTranscriptFile(string filename) : base(filename) { }

    /// <summary>
    /// Gets the feature name for this reader, always "transcript".
    /// </summary>
    protected override string Feature
    {
        get { return "transcript"; }
    }
}
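/// <summary>
/// Base class for readers of GENCODE GTF annotation files. Derived classes supply the
/// feature name (for example "gene" or "transcript") that selects which rows are loaded.
/// </summary>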
public abstract class GtfGencodeFile : BedFile
{
/// <summary>
/// Transcript types that can be selected as a filter.
/// The member names are converted to and from plain strings elsewhere in this class
/// (Enum.Parse / ToString) and compared with the "transcript_type" attribute text,
/// so they must keep their exact spelling.
/// </summary>
public enum TranscriptType
{
    protein_coding = 1,
    processed_transcript = 2
}
/// <summary>
/// Reader instances handed out by LoadGenes and LoadTranscripts, keyed by a string built from the file name.
/// </summary>
private static readonly Dictionary<string, GtfGencodeFile> GencodeFiles = new Dictionary<string, GtfGencodeFile>();

/// <summary>
/// Cache of valid transcript name sets. The key comes from GenerateKey and has the form
/// filename.TranscriptType.TranscriptType...
/// </summary>
private static readonly Dictionary<string, HashSet<string>> staticValidTranscripts = new Dictionary<string, HashSet<string>>();

/// <summary>
/// Cache of valid gene name sets, keyed the same way as <see cref="staticValidTranscripts"/>.
/// </summary>
private static readonly Dictionary<string, HashSet<string>> staticValidGenes = new Dictionary<string, HashSet<string>>();

/// <summary>
/// Cache of the filtered, ordered location lists returned by OrderedTranscriptLocations.
/// Outer key: the GenerateKey string. Inner key: the key of ChromosomeOrderedLocations
/// (presumably the chromosome name; confirm against BedFile).
/// </summary>
// NOTE(review): the generic type arguments in this group were missing from the reviewed text and
// have been restored from how the fields are used in this class; confirm against the base class.
private static readonly Dictionary<string, Dictionary<string, List<Genomics.Location>>> staticOrderedTranscriptLocations = new Dictionary<string, Dictionary<string, List<Genomics.Location>>>();
/// <summary>
/// File name this instance was constructed with; used as the prefix of every cache key.
/// </summary>
private readonly string filename;

/// <summary>
/// Backing field for the ValidTranscriptTypes property. It has no initializer, so it stays
/// null until the property setter has been called.
/// </summary>
private TranscriptType[] validTranscriptTypes;

/// <summary>
/// Gets the feature name that rows must carry in their third column to be kept by ParseFields.
/// </summary>
protected abstract string Feature { get; }
/// <summary>
/// Returns the shared gene-level reader for <paramref name="filename"/>, creating and caching
/// it on the first request.
/// </summary>
/// <param name="filename">File name passed to the <see cref="GtfGencodeGeneFile"/> constructor.</param>
/// <returns>The cached <see cref="GtfGencodeGeneFile"/> for this file name.</returns>
public static GtfGencodeFile LoadGenes(string filename)
{
    // The same key must be used for lookup, insertion and retrieval. Previously the lookup
    // used filename + "_gene" while the entry was stored under the bare file name, so the
    // cache never hit and a second call failed with a duplicate-key ArgumentException.
    string cacheKey = filename + "_gene";

    GtfGencodeFile cached;
    if (!GencodeFiles.TryGetValue(cacheKey, out cached))
    {
        cached = new GtfGencodeGeneFile(filename);
        GencodeFiles.Add(cacheKey, cached);
    }

    return cached;
}
/// <summary>
/// Returns the shared transcript-level reader for <paramref name="filename"/>, creating and
/// caching it on the first request.
/// </summary>
/// <param name="filename">File name passed to the <see cref="GtfGencodeTranscriptFile"/> constructor.</param>
/// <returns>The cached <see cref="GtfGencodeTranscriptFile"/> for this file name.</returns>
public static GtfGencodeFile LoadTranscripts(string filename)
{
    // The same key must be used for lookup, insertion and retrieval. Previously the lookup
    // used filename + "_transcript" while the entry was stored under the bare file name, so
    // the cache never hit and a second call failed with a duplicate-key ArgumentException.
    string cacheKey = filename + "_transcript";

    GtfGencodeFile cached;
    if (!GencodeFiles.TryGetValue(cacheKey, out cached))
    {
        cached = new GtfGencodeTranscriptFile(filename);
        GencodeFiles.Add(cacheKey, cached);
    }

    return cached;
}
/// <summary>
/// Initializes the reader and tells the base class which columns hold which values.
/// The layout numbers are zero-based indexes into the split line: ParseFields reads them
/// as fields[layout.Start], fields[layout.Name] and so on, and the Name column (8) is the
/// attribute text handed to ParseNameData.
/// </summary>
/// <param name="filename">File name forwarded to the base constructor and remembered for cache keys.</param>
protected GtfGencodeFile(string filename)
    : base(filename, new Layout { Chromosome = 0, Start = 3, End = 4, Strand = 6, Name = 8 })
{
    this.filename = filename;
}
/// <summary>
/// Resolves an interface request. A request for <see cref="IAnnotation"/> is answered with a
/// newly created AnnotationProxy wrapping this instance; every other type is delegated to
/// the base implementation.
/// </summary>
/// <param name="t">The requested type.</param>
/// <returns>A new proxy for <see cref="IAnnotation"/>, otherwise whatever the base class returns.</returns>
protected override object QueryInterface(Type t)
{
    if (t != typeof(IAnnotation))
    {
        return base.QueryInterface(t);
    }

    return new AnnotationProxy(this);
}
/// <summary>
/// Gets or sets the transcript-type filter as enum member names.
/// An array with no non-blank entries means "no filter".
/// </summary>
/// <value>
/// The names of the selected <see cref="TranscriptType"/> members. Reading the property before
/// it has been set dereferences a null backing field and therefore throws.
/// </value>
/// <exception cref="ArgumentNullException">The assigned array is null.</exception>
/// <exception cref="ArgumentException">A non-blank entry is not a <see cref="TranscriptType"/> name.</exception>
public string[] ValidTranscriptTypes
{
    get
    {
        return this.validTranscriptTypes.Select(x => x.ToString()).ToArray();
    }

    set
    {
        if (value == null)
        {
            throw new ArgumentNullException("value");
        }

        // Blank entries are ignored in every case. Previously they were tolerated only when
        // the whole array was blank; a blank mixed with real names made Enum.Parse throw.
        // An all-blank (or empty) array still yields an empty filter.
        this.validTranscriptTypes = value
            .Where(x => !string.IsNullOrWhiteSpace(x))
            .Select(x => (TranscriptType)Enum.Parse(typeof(TranscriptType), x))
            .ToArray();
    }
}
/// <summary>
/// Makes sure <paramref name="validFeatures"/> contains the set of location names for the
/// current file and transcript-type filter, building it on first use.
/// </summary>
/// <param name="feature">Feature name that a location's third data column must equal.</param>
/// <param name="validFeatures">Cache to consult and, if necessary, extend.</param>
/// <returns>The cache key under which the set is stored in <paramref name="validFeatures"/>.</returns>
// NOTE(review): the generic type arguments were missing from the reviewed text; they are
// restored here from usage (string keys, sets of location names).
private string InitValidFeature(string feature, Dictionary<string, HashSet<string>> validFeatures)
{
    string cacheKey = GenerateKey(this.validTranscriptTypes);
    if (validFeatures.ContainsKey(cacheKey))
    {
        return cacheKey;
    }

    var candidates = this.Locations.Where(x => x.Value.Data[2] == feature);

    // An empty filter accepts every transcript type, so the "transcript_type" attribute is
    // not read at all in that case (it used to be looked up and then ignored).
    if (this.validTranscriptTypes.Length > 0)
    {
        // Build the accepted names once. The previous predicate went through the
        // ValidTranscriptTypes getter, which allocates a new array, for every location.
        var acceptedTypes = new HashSet<string>(this.validTranscriptTypes.Select(t => t.ToString()));
        candidates = candidates.Where(x => acceptedTypes.Contains(x.Value.AdditionalFields["transcript_type"]));
    }

    validFeatures.Add(cacheKey, new HashSet<string>(candidates.Select(x => x.Value.Name)));
    return cacheKey;
}
/// <summary>
/// Gets the names of all loaded locations whose feature column is "transcript" and that pass
/// the current transcript-type filter. The set is built on first access and cached statically.
/// </summary>
/// <value>
/// The cached set itself, not a copy. Accessing the property before ValidTranscriptTypes has
/// been set throws from GenerateKey.
/// </value>
// NOTE(review): the element type was missing from the reviewed text; string is restored
// because the set is filled with location names and queried with y.Name.
public HashSet<string> ValidTranscripts
{
    get
    {
        return staticValidTranscripts[this.InitValidFeature("transcript", staticValidTranscripts)];
    }
}
/// <summary>
/// Gets the names of all loaded locations whose feature column is "gene" and that pass the
/// current transcript-type filter. The set is built on first access and cached statically.
/// </summary>
/// <value>
/// The cached set itself, not a copy. Accessing the property before ValidTranscriptTypes has
/// been set throws from GenerateKey.
/// </value>
// NOTE(review): the element type was missing from the reviewed text; string is restored
// because the set is filled with location names.
public HashSet<string> ValidGenes
{
    get
    {
        return staticValidGenes[this.InitValidFeature("gene", staticValidGenes)];
    }
}
/// <summary>
/// Gets the entries of ChromosomeOrderedLocations reduced to the locations whose name is in
/// <see cref="ValidTranscripts"/>. The result is built on first access and cached statically
/// under the key produced by GenerateKey.
/// </summary>
/// <value>The cached dictionary itself, not a copy.</value>
// NOTE(review): the generic type arguments were missing from the reviewed text; they are
// restored from usage and should be confirmed against BedFile.ChromosomeOrderedLocations.
public Dictionary<string, List<Genomics.Location>> OrderedTranscriptLocations
{
    get
    {
        var cacheKey = GenerateKey(this.validTranscriptTypes);
        if (!staticOrderedTranscriptLocations.ContainsKey(cacheKey))
        {
            // Resolve the valid-name set once. Reading the property inside the lambda
            // regenerated the cache key and repeated the lookups for every location.
            var validNames = this.ValidTranscripts;

            staticOrderedTranscriptLocations.Add(
                cacheKey,
                this.ChromosomeOrderedLocations.ToDictionary(
                    x => x.Key,
                    x => x.Value.Where(y => validNames.Contains(y.Name)).ToList()));
        }

        return staticOrderedTranscriptLocations[cacheKey];
    }
}
/// <summary>
/// Gets the largest value found among the entries of MaxLocationSize.
/// </summary>
/// <value>The maximum of the per-entry values; Max throws if there are no entries.</value>
public int MaxFeatureSize
{
    get
    {
        int largest = this.MaxLocationSize.Max(entry => entry.Value);
        return largest;
    }
}
/// <summary>
/// Gets the indexed transcript locations. This is a direct pass-through of the
/// ChromosomeIndexedLocations member; no filtering or copying happens here.
/// </summary>
/// <value>The object returned by ChromosomeIndexedLocations.</value>
// NOTE(review): the generic type arguments of the return type are missing from this
// declaration as it stands ("Dictionary>>"). They cannot be determined from this file;
// restore them from the declaration of ChromosomeIndexedLocations.
public Dictionary>> IndexedTranscriptLocations
{
get
{
return this.ChromosomeIndexedLocations;
}
}
/// <summary>
/// Converts one split line into a location and appends it to <paramref name="data"/>.
/// Blank lines, lines starting with '#', and lines whose third column differs from
/// <see cref="Feature"/> are ignored.
/// </summary>
/// <param name="fields">The columns of the current line.</param>
/// <param name="layout">Column indexes for chromosome, start, end, strand and attribute text.</param>
/// <param name="data">Receives the location paired with its transcript id.</param>
/// <param name="entryCount">Not read or modified by this override.</param>
// NOTE(review): the generic type arguments of "data" were missing from the reviewed text; they
// are restored from the tuple that is added below. Confirm against the base class signature.
// NOTE(review): entryCount is never updated here; confirm whether the base class expects it to be.
protected override void ParseFields(string[] fields, Layout layout, List<Tuple<Genomics.Location, string>> data, ref int entryCount)
{
    // Skip blank lines as well as '#' header lines; indexing fields[0][0] on an empty
    // first column used to throw IndexOutOfRangeException.
    if (fields.Length == 0 || string.IsNullOrEmpty(fields[0]) || fields[0][0] == '#')
    {
        return;
    }

    if (fields[2] != this.Feature)
    {
        return;
    }

    var nameData = ParseNameData(fields[layout.Name]);

    // Coordinates are machine-written integers: parse them once, culture-independently.
    int start = int.Parse(fields[layout.Start], System.Globalization.CultureInfo.InvariantCulture);
    int end = int.Parse(fields[layout.End], System.Globalization.CultureInfo.InvariantCulture);

    // NOTE(review): this helper location carries no Strand. If DirectionalStart depends on the
    // strand, the alternate name below may not reflect minus-strand features; confirm.
    var startSite = new Genomics.Location
    {
        Start = start,
        End = end
    };

    // Progress output: one line on the console for every 10000 accepted features.
    linecount++;
    if (linecount % 10000 == 0)
    {
        Console.WriteLine(linecount);
    }

    var transcriptId = nameData["transcript_id"];
    string geneName = nameData["gene_id"];

    var location = new Genomics.Location
    {
        Name = transcriptId,
        Chromosome = fields[layout.Chromosome],
        Start = start,
        End = end,
        Strand = fields[layout.Strand],
        Data = fields,
        AdditionalFields = nameData,
        AlternateName = geneName + "." + startSite.DirectionalStart
    };

    data.Add(new Tuple<Genomics.Location, string>(location, transcriptId));
}

/// <summary>
/// Number of features accepted by ParseFields so far; used only for the progress output.
/// </summary>
private int linecount = 0;
/// <summary>
/// Parses an attribute string of the form <c>key value; key "value"; ...</c> into a dictionary.
/// Double quotes are removed from values (except for the "level" key, which is taken verbatim),
/// and for "gene_id" and "transcript_id" everything from the first '.' onwards is dropped.
/// When a key occurs more than once, the first value wins.
/// </summary>
/// <param name="name">The attribute text of one line.</param>
/// <returns>One entry per distinct attribute key.</returns>
// NOTE(review): the generic type arguments of the return type were missing from the reviewed
// text; string/string is restored because both key and value are built from strings here.
protected Dictionary<string, string> ParseNameData(string name)
{
    var attributes = new Dictionary<string, string>();

    foreach (var rawEntry in name.Split(';'))
    {
        if (string.IsNullOrWhiteSpace(rawEntry))
        {
            continue;
        }

        // Split at the first space only: the key is the first token and the rest is the
        // value. This keeps quoted values that contain spaces intact and tolerates both
        // repeated separators and attributes that have no value at all.
        var parts = rawEntry.Trim().Split(new[] { ' ' }, 2);
        string key = parts[0];
        string rawValue = parts.Length > 1 ? parts[1] : string.Empty;

        string value = (key == "level" ? rawValue : rawValue.Replace("\"", "")).Trim();
        if (key == "gene_id" || key == "transcript_id")
        {
            value = value.Split('.')[0].Trim();
        }

        // Only the otherwise unused fields 'ont' and 'tag' occur several times per entry;
        // the first occurrence is kept.
        if (!attributes.ContainsKey(key))
        {
            attributes.Add(key, value);
        }
    }

    return attributes;
}
/// <summary>
/// Builds the key used by the static caches: the file name followed by the given transcript
/// types in ascending enum order, all joined with '.'.
/// </summary>
/// <param name="types">The active transcript-type filter; must not be null.</param>
/// <returns>The cache key, e.g. <c>file.gtf.protein_coding.processed_transcript</c>.</returns>
/// <exception cref="InvalidOperationException">
/// <paramref name="types"/> is null, i.e. ValidTranscriptTypes has not been set yet.
/// </exception>
// NOTE(review): the key does not include Feature, so a gene reader and a transcript reader
// for the same file and filter share cache entries. Confirm whether that sharing is intended.
private string GenerateKey(TranscriptType[] types)
{
    if (types == null)
    {
        // A specific exception type instead of the bare System.Exception; the message is unchanged.
        throw new InvalidOperationException("Transcript types not set in annotation implementation " + System.Reflection.MethodBase.GetCurrentMethod().Name);
    }

    return this.filename + "." + string.Join(".", types.OrderBy(x => x).Select(x => x.ToString()));
}
/// <summary>
/// Adapter returned by QueryInterface for IAnnotation requests. Every member forwards to the
/// wrapped GtfGencodeFile; the proxy holds no state of its own beyond that reference.
/// </summary>
// NOTE(review): the members are declared with "override", so IAnnotation is presumably an
// abstract class rather than an interface; confirm against its declaration.
// NOTE(review): several return types below have lost their generic type arguments
// ("HashSet", "Dictionary>", "Dictionary>>"); restore them to match IAnnotation.
private class AnnotationProxy : IAnnotation
{
// The reader that all members delegate to.
private readonly GtfGencodeFile data;
/// <summary>
/// Wraps the given reader.
/// </summary>
/// <param name="data">The reader to forward every call to.</param>
public AnnotationProxy(GtfGencodeFile data)
{
this.data = data;
}
// Forwards both reading and writing of the transcript-type filter to the wrapped reader.
override public string[] ValidTranscriptTypes { get { return this.data.ValidTranscriptTypes; } set { this.data.ValidTranscriptTypes = value; } }
// Forwards to the wrapped reader's ValidGenes.
override public HashSet ValidGenes { get { return this.data.ValidGenes; } }
// Forwards to the wrapped reader's ValidTranscripts.
override public HashSet ValidTranscripts { get { return this.data.ValidTranscripts; } }
// Forwards to the wrapped reader's OrderedTranscriptLocations.
override public Dictionary> OrderedTranscriptLocations { get { return this.data.OrderedTranscriptLocations; } }
// Forwards to the wrapped reader's IndexedTranscriptLocations.
override public Dictionary>> IndexedTranscriptLocations { get { return this.data.IndexedTranscriptLocations; } }
// Forwards to the wrapped reader's MaxFeatureSize.
override public int MaxFeatureSize { get { return this.data.MaxFeatureSize; } }
// IndexSize is read as a static member through the GtfGencodeFile type name, not through the
// wrapped instance; the analysis directives below silence the resulting IDE warnings.
// Analysis disable AccessToStaticMemberViaDerivedType
// Analysis disable MemberHidesStaticFromOuterClass
override public int IndexSize { get { return GtfGencodeFile.IndexSize; } }
// Analysis restore MemberHidesStaticFromOuterClass
// Analysis restore AccessToStaticMemberViaDerivedType
}
}
}