/* * vcf_file_output.cpp * * Created on: Aug 28, 2009 * Author: Adam Auton * ($Revision: 249 $) */ #include "vcf_file.h" void vcf_file::output_frequency(const string &output_file_prefix, bool output_counts, bool suppress_allele_output, bool derived) { // Output statistics of frequency at each site if ((has_genotypes == false) | (N_kept_individuals() == 0)) LOG.error("Require Genotypes in VCF file in order to output Frequency Statistics."); LOG.printLOG("Outputting Frequency Statistics...\n"); string output_file = output_file_prefix + ".frq"; if (output_counts) output_file += ".count"; ofstream out(output_file.c_str()); if (!out.is_open()) LOG.error("Could not open output file: " + output_file, 12); if (suppress_allele_output == false) { out << "CHROM\tPOS\tN_ALLELES\tN_CHR\t{ALLELE:"; if (output_counts) out << "COUNT}" << endl; else out << "FREQ}" << endl; } else { if (output_counts) out << "CHROM\tPOS\tN_ALLELES\tN_CHR\t{COUNT}" << endl; else out << "CHROM\tPOS\tN_ALLELES\tN_CHR\t{FREQ}" << endl; } vector allele_counts; unsigned int N_non_missing_chr; unsigned int N_alleles; string vcf_line; vcf_entry e(N_indv); unsigned int aa_idx = 0; for (unsigned int s=0; s freq(N_entries, 0.0); vector allele_counts; vector N_non_missing_chr(N_entries,0); string vcf_line; vcf_entry e(N_indv); for (unsigned int s=0; s 0) freq[s] = allele_counts[1] / double(N_non_missing_chr[s]); else freq[s] = -1; } vector N_sites_included(N_indv, 0); vector N_obs_hom(N_indv, 0); vector N_expected_hom(N_indv, 0.0); pair alleles; for (unsigned int s=0; s::epsilon()) || (1.0 - freq[s] <= numeric_limits::epsilon())) continue; for (unsigned int ui=0; ui 0) { double F = (N_obs_hom[ui] - N_expected_hom[ui]) / double(N_sites_included[ui] - N_expected_hom[ui]); out << indv[ui] << "\t" << N_obs_hom[ui] << "\t"; out.precision(1); out << N_expected_hom[ui] << "\t"; out.precision(5); out << N_sites_included[ui] << "\t" << F << endl; } } out.close(); } void vcf_file::output_hwe(const string &output_file_prefix) { // Output HWE statistics for each site as described in Wigginton, Cutler, and Abecasis (2005) if ((has_genotypes == false) | (N_kept_individuals() == 0)) LOG.error("Require Genotypes in VCF file in order to output HWE Statistics."); // Note this assumes Biallelic SNPs. LOG.printLOG("Outputting HWE statistics (but only for biallelic loci)\n"); string output_file = output_file_prefix + ".hwe"; ofstream out(output_file.c_str()); if (!out.is_open()) LOG.error("Could not open output file: " + output_file, 12); out << "CHR\tPOS\tOBS(HOM1/HET/HOM2)\tE(HOM1/HET/HOM2)\tChiSq\tP" << endl; /* PLINK code: // b11 = Nhom1, b12 = Nhet, b22 = Nhom2 double tot = b11 + b12 + b22; double exp_11 = freq * freq * tot; double exp_12 = 2 * freq * (1-freq) * tot; double exp_22 = (1-freq) * (1-freq) * tot; double chisq = ( (b11-exp_11)*(b11-exp_11) ) / exp_11 + ( (b12-exp_12)*(b12-exp_12) ) / exp_12 + ( (b22-exp_22)*(b22-exp_22) ) / exp_22 ; p = chiprobP(chisq,1); */ double freq; unsigned int b11, b12, b22; double exp_11, exp_12, exp_22; double chisq; double tot; double p; unsigned int precision = out.precision(); vector allele_counts; unsigned int N_non_missing_chr; string vcf_line; vcf_entry e(N_indv); for (unsigned int s=0; s depth_sum(N_indv, 0.0); vector count(N_indv, 0); int depth; string vcf_line; vcf_entry e(N_indv); for (unsigned int s=0; s= 0) { depth_sum[ui] += depth; count[ui]++; } } } } for (unsigned int ui=0; ui max_pos; map min_pos; string vcf_line; string CHROM; int POS; vcf_entry e(N_indv); for (unsigned int s=0; s max_pos[CHROM]) max_pos[CHROM] = POS; } else max_pos[CHROM] = POS; if (min_pos.find(CHROM) != min_pos.end()) { if (POS < min_pos[CHROM]) min_pos[CHROM] = POS; } else min_pos[CHROM] = POS; } } map::iterator it; unsigned int N_bins; map > bins; for (it=max_pos.begin(); it != max_pos.end(); ++it) { CHROM = (*it).first; N_bins = (unsigned int)((max_pos[CHROM] + bin_size) / double(bin_size)); bins[CHROM].resize(N_bins, 0); } unsigned int idx; double C = 1.0 / double(bin_size); for (unsigned int s=0; s 0) output = true; if (output == true) out << CHROM << "\t" << s*bin_size << "\t" << bin_tot << "\t" << bin_tot * C << endl; } } out.close(); double mean_SNP_density = sum1 / sum2 * 1000; LOG.printLOG("Mean SNP density: " + output_log::dbl2str(mean_SNP_density, 5) + " variants / kb\n"); } void vcf_file::output_missingness(const string &output_file_prefix) { // Output missingness by individual and site if ((has_genotypes == false) | (N_kept_individuals() == 0)) LOG.error("Require Genotypes in VCF file in order to output Missingness Statistics."); LOG.printLOG("Outputting Site and Individual Missingness\n"); string output1 = output_file_prefix + ".imiss"; ofstream out1(output1.c_str()); if (!out1.is_open()) LOG.error("Could not open Individual Missingness Output File: " + output1, 3); string output2 = output_file_prefix + ".lmiss"; ofstream out2(output2.c_str()); if (!out2.is_open()) LOG.error("Could not open Site Missingness Output File: " + output2, 4); out1 << "INDV\tN_DATA\tN_GENOTYPES_FILTERED\tN_MISS\tF_MISS" << endl; unsigned int ui, s; vector indv_N_missing(N_indv, 0), indv_N_tot(N_indv, 0); vector indv_N_geno_filtered(N_indv, 0); unsigned int site_N_missing, site_N_tot, site_N_geno_filtered; pair alleles; string vcf_line; vcf_entry e(N_indv); out2 << "CHR\tPOS\tN_DATA\tN_GENOTYPE_FILTERED\tN_MISS\tF_MISS" << endl; for (s=0; s geno1, geno2; string vcf_line, vcf_line2; vcf_entry e(N_indv), e2(N_indv); for (s=0; s<(N_entries-1); s++) { if (include_entry[s] == false) continue; get_vcf_entry(s, vcf_line); e.reset(vcf_line); e.parse_basic_entry(true); if (e.get_N_alleles() != 2) { LOG.one_off_warning("\tLD: Only using biallelic variants."); continue; // Isn't biallelic } for (s2 = s+1; s2 snp_window_size) { s2 = N_entries; // SNPs sorted, so no need to go any further continue; } get_vcf_entry(s2, vcf_line2); e2.reset(vcf_line2); e2.parse_basic_entry(true); if (e.get_CHROM() != e2.get_CHROM()) { s2 = N_entries; // No need to go any further (assuming SNPs are sorted) continue; } if ((e2.get_POS() - e.get_POS()) > bp_window_size) { s2 = N_entries; // No need to go any further (assuming SNPs are sorted) continue; } if (e2.get_N_alleles() != 2) { LOG.one_off_warning("\tLD: Only using biallelic variants."); continue; } x11=0; x12=0; x21=0; x22=0; X=0, X2=0; Y=0; Y2=0; XY=0; chr_count = 0; for (ui=0; ui 0) if ((r2 < min_r2) | (r2 != r2)) continue; out << e.get_CHROM() << "\t" << e.get_POS() << "\t" << e2.get_POS() << "\t" << chr_count << "\t" << r2 << "\t" << D << "\t" << Dprime << "\t" << endl; } } out.close(); } void vcf_file::output_genotype_r2(const string &output_file_prefix, int snp_window_size, int bp_window_size, double min_r2) { // Output pairwise LD statistics, using genotype r^2. This is the same formula as used by PLINK, and is basically the squared // correlation coefficient between genotypes numbered as 0, 1, 2. if ((has_genotypes == false) | (N_kept_individuals() == 0)) LOG.error("Require Genotypes in VCF file in order to output LD Statistics."); unsigned int s, s2; unsigned int ui; LOG.printLOG("Outputting Pairwise LD (bi-allelic only)\n"); string output = output_file_prefix + ".geno.ld"; ofstream out(output.c_str()); if (!out.is_open()) LOG.error("Could not open LD Output File: " + output, 3); out << "CHR\tPOS1\tPOS2\tN_INDV\tR^2" << endl; unsigned int indv_count; double r2; int sx, sy; double X, X2, Y, Y2, XY; double var1, var2, cov12; pair geno1, geno2; string vcf_line, vcf_line2; vcf_entry e(N_indv), e2(N_indv); for (s=0; s<(N_entries-1); s++) { if (include_entry[s] == false) continue; get_vcf_entry(s, vcf_line); e.reset(vcf_line); e.parse_basic_entry(true); if (e.get_N_alleles() != 2) { LOG.one_off_warning("\tgenoLD: Only using biallelic variants."); continue; // Isn't biallelic } for (s2 = s+1; s2 snp_window_size) { s2 = N_entries; // SNPs sorted, so no need to go any further continue; } get_vcf_entry(s2, vcf_line2); e2.reset(vcf_line2); e2.parse_basic_entry(true); if (e2.get_N_alleles() != 2) { LOG.one_off_warning("\tgenoLD: Only using biallelic variants."); continue; // Isn't biallelic } if (e.get_CHROM() != e2.get_CHROM()) { s2 = N_entries; // SNPs sorted, so no need to go any further continue; } if ((e2.get_POS() - e.get_POS()) > bp_window_size) { s2 = N_entries; // SNPs sorted, so no need to go any further continue; } X=0, X2=0; Y=0; Y2=0; XY=0; indv_count = 0; for (ui=0; ui 0) if ((r2 < min_r2) | (r2 != r2)) continue; out << e.get_CHROM() << "\t" << e.get_POS() << "\t" << e2.get_POS() << "\t" << indv_count << "\t" << r2 << endl; } } out.close(); } // TODO - provide similar function for haplotype r2. void vcf_file::output_interchromosomal_genotype_r2(const string &output_file_prefix, double min_r2) { // Output pairwise LD statistics, using genotype r^2. This is the same formula as used by PLINK, and is basically the squared // correlation coefficient between genotypes numbered as 0, 1, 2. if ((has_genotypes == false) | (N_kept_individuals() == 0)) LOG.error("Require Genotypes in VCF file in order to output LD Statistics."); unsigned int s, s2; unsigned int ui; LOG.printLOG("Outputting Interchromosomal Pairwise LD (bi-allelic only)\n"); string output = output_file_prefix + ".interchrom.geno.ld"; ofstream out(output.c_str()); if (!out.is_open()) LOG.error("Could not open LD Output File: " + output, 3); out << "CHR1\tPOS1\tCHR2\tPOS2\tN_INDV\tR^2" << endl; unsigned int indv_count; double r2; int sx, sy; double X, X2, Y, Y2, XY; double var1, var2, cov12; pair geno1, geno2; string vcf_line, vcf_line2; vcf_entry e(N_indv), e2(N_indv); for (s=0; s<(N_entries-1); s++) { if (include_entry[s] == false) continue; get_vcf_entry(s, vcf_line); e.reset(vcf_line); e.parse_basic_entry(true); if (e.get_N_alleles() != 2) { LOG.one_off_warning("\tinterchromLD: Only using biallelic variants."); continue; // Isn't biallelic } for (s2 = s+1; s2 0) if ((r2 < min_r2) | (r2 != r2)) continue; out << e.get_CHROM() << "\t" << e.get_POS() << "\t" << e2.get_CHROM() << "\t" << e2.get_POS() << "\t" << indv_count << "\t" << r2 << endl; } } out.close(); } void vcf_file::output_singletons(const string &output_file_prefix) { // Locate and output singletons (and private doubletons) if ((has_genotypes == false) | (N_kept_individuals() == 0)) LOG.error("Require Genotypes in VCF file in order to output Singletons."); LOG.printLOG("Outputting Singleton Locations\n"); string output = output_file_prefix + ".singletons"; ofstream out(output.c_str()); if (!out.is_open()) LOG.error("Could not open Singleton Output File: " + output, 3); out << "CHROM\tPOS\tSINGLETON/DOUBLETON\tALLELE\tINDV" << endl; unsigned int ui; int a; vector allele_counts; unsigned int N_non_missing_chr; unsigned int N_alleles; pair geno; string allele; string vcf_line; vcf_entry e(N_indv); for (unsigned int s=0; s model_to_idx; model_to_idx["AC"] = 0; model_to_idx["AG"] = 1; model_to_idx["AT"] = 2; model_to_idx["CG"] = 3; model_to_idx["CT"] = 4; model_to_idx["GT"] = 5; string FILTER; string vcf_line; vcf_entry e(N_indv); map > FILTER_to_TsTv; map FILTER_to_Nsites; map::iterator FILTER_to_Nsites_it; for (unsigned int s=0; s > count_to_FILTER; for ( FILTER_to_Nsites_it=FILTER_to_Nsites.begin() ; FILTER_to_Nsites_it != FILTER_to_Nsites.end(); ++FILTER_to_Nsites_it ) { FILTER = (*FILTER_to_Nsites_it).first; int Nsites = (*FILTER_to_Nsites_it).second; count_to_FILTER.push_back(make_pair(Nsites, FILTER)); } sort(count_to_FILTER.begin(), count_to_FILTER.end()); string output = output_file_prefix + ".FILTER.summary"; ofstream out(output.c_str()); if (!out.is_open()) LOG.error("Could not open Filter Summary Output File: " + output, 7); out << "FILTER\tN_VARIANTS\tN_Ts\tN_Tv\tTs/Tv" << endl; for (int i=count_to_FILTER.size()-1; i > -1; i--) { FILTER = count_to_FILTER[i].second; int Ts = FILTER_to_TsTv[FILTER].first; int Tv = FILTER_to_TsTv[FILTER].second; int Nsites = FILTER_to_Nsites[FILTER]; out << FILTER << "\t" << Nsites << "\t"; out << Ts << "\t" << Tv << "\t" << double(Ts)/Tv << endl; } out.close(); } void vcf_file::output_TsTv(const string &output_file_prefix, int bin_size) { // Output Ts/Tv ratios in bins of a given size. LOG.printLOG("Outputting Ts/Tv in bins of " + output_log::int2str(bin_size) + "bp\n"); map model_to_idx; model_to_idx["AC"] = 0; model_to_idx["AG"] = 1; model_to_idx["AT"] = 2; model_to_idx["CG"] = 3; model_to_idx["CT"] = 4; model_to_idx["GT"] = 5; map max_pos; string vcf_line, CHROM; vcf_entry e(N_indv); for (unsigned int s=0; s max_pos[CHROM]) max_pos[CHROM] = e.get_POS(); } else max_pos[CHROM] = e.get_POS(); } } map::iterator it; unsigned int N_bins; map > Ts_counts; map > Tv_counts; for (it=max_pos.begin(); it != max_pos.end(); ++it) { CHROM = (*it).first; N_bins = (unsigned int)((max_pos[CHROM] + bin_size) / double(bin_size)); Ts_counts[CHROM].resize(N_bins, 0); Tv_counts[CHROM].resize(N_bins, 0); } vector model_counts(6,0); double C = 1.0 / double(bin_size); unsigned int idx; string model; for (unsigned int s=0; s Ts_counts, Tv_counts; unsigned int N_kept_indv = N_kept_individuals(); Ts_counts.resize(2*N_kept_indv); Tv_counts.resize(2*N_kept_indv); string vcf_line, model; vcf_entry e(N_indv); map model_to_Ts_or_Tv; model_to_Ts_or_Tv["AC"] = 1; model_to_Ts_or_Tv["CA"] = 1; model_to_Ts_or_Tv["AG"] = 0; // Ts model_to_Ts_or_Tv["GA"] = 0; // Ts model_to_Ts_or_Tv["AT"] = 1; model_to_Ts_or_Tv["TA"] = 1; model_to_Ts_or_Tv["CG"] = 1; model_to_Ts_or_Tv["GC"] = 1; model_to_Ts_or_Tv["CT"] = 0; // Ts model_to_Ts_or_Tv["TC"] = 0; // Ts model_to_Ts_or_Tv["GT"] = 1; model_to_Ts_or_Tv["TG"] = 1; unsigned int idx; vector allele_counts; unsigned int allele_count; unsigned int N_included_indv; for (unsigned int s=0; s > TsTv_counts; double max_qual = -numeric_limits::max(), min_qual=numeric_limits::max(); string vcf_line, model; vcf_entry e(N_indv); map model_to_Ts_or_Tv; model_to_Ts_or_Tv["AC"] = 1; model_to_Ts_or_Tv["CA"] = 1; model_to_Ts_or_Tv["AG"] = 0; // Ts model_to_Ts_or_Tv["GA"] = 0; // Ts model_to_Ts_or_Tv["AT"] = 1; model_to_Ts_or_Tv["TA"] = 1; model_to_Ts_or_Tv["CG"] = 1; model_to_Ts_or_Tv["GC"] = 1; model_to_Ts_or_Tv["CT"] = 0; // Ts model_to_Ts_or_Tv["TC"] = 0; // Ts model_to_Ts_or_Tv["GT"] = 1; model_to_Ts_or_Tv["TG"] = 1; unsigned int idx; double QUAL; for (unsigned int s=0; s max_qual) max_qual = QUAL; if (QUAL < min_qual) min_qual = QUAL; model = e.get_REF() + e.get_ALT_allele(0); if (model_to_Ts_or_Tv.find(model) != model_to_Ts_or_Tv.end()) { idx = model_to_Ts_or_Tv[model]; if (idx == 0) // Ts { TsTv_counts[QUAL].first++; } else if (idx == 1) // Tv; TsTv_counts[QUAL].second++; else LOG.error("Unknown model type\n"); } else LOG.warning("Unknown model type. Not a SNP? " + e.get_CHROM() + ":" + output_log::int2str(e.get_POS()) +"\n"); } } string output = output_file_prefix + ".TsTv.qual"; ofstream out(output.c_str()); if (!out.is_open()) LOG.error("Could not open TsTv by Count Output File: " + output, 7); out << "QUAL_THRESHOLD"; out << "\tN_Ts_LT_QUAL_THRESHOLD\tN_Tv_LT_QUAL_THRESHOLD\tTs/Tv_LT_QUAL_THRESHOLD"; out << "\tN_Ts_GT_QUAL_THRESHOLD\tN_Tv_GT_QUAL_THRESHOLD\tTs/Tv_GT_QUAL_THRESHOLD" << endl; unsigned int N_TsTv = TsTv_counts.size(); vector Ts_sum_below(N_TsTv+1, 0.0), Tv_sum_below(N_TsTv+1, 0.0); vector QUAL_vector(N_TsTv+1, 0.0); QUAL_vector[0] = min_qual; QUAL_vector[N_TsTv] = max_qual; idx = 1; for (map >::iterator it=TsTv_counts.begin(); it != TsTv_counts.end(); ++it) { QUAL = (it->first); double Ts = (it->second).first; double Tv = (it->second).second; Ts_sum_below[idx] = Ts_sum_below[idx-1]+Ts; Tv_sum_below[idx] = Tv_sum_below[idx-1]+Tv; QUAL_vector[idx-1] = QUAL; idx++; } QUAL_vector[N_TsTv] = max_qual; vector Ts_sum_above(N_TsTv+1, 0.0), Tv_sum_above(N_TsTv+1, 0.0); idx = N_TsTv; for (map >::reverse_iterator it=TsTv_counts.rbegin(); it != TsTv_counts.rend(); ++it) { QUAL = (it->first); double Ts = (it->second).first; double Tv = (it->second).second; Ts_sum_above[idx] = Ts_sum_above[idx+1]+Ts; Tv_sum_above[idx] = Tv_sum_above[idx+1]+Tv; idx--; } double Ts_sum, Tv_sum, ratio; for (unsigned int ui=1; ui<(N_TsTv+1); ui++) { QUAL = QUAL_vector[ui-1]; out << QUAL; Ts_sum = Ts_sum_below[ui-1]; Tv_sum = Tv_sum_below[ui-1]; ratio = Ts_sum / Tv_sum; out << "\t" << Ts_sum << "\t" << Tv_sum << "\t" << ratio; Ts_sum = Ts_sum_above[ui+1]; Tv_sum = Tv_sum_above[ui+1]; ratio = Ts_sum / Tv_sum; out << "\t" << Ts_sum << "\t" << Tv_sum << "\t" << ratio; out << endl; } out.close(); } void vcf_file::output_site_quality(const string &output_file_prefix) { // Output per-site quality information. LOG.printLOG("Outputting Quality for Each Site\n"); string output = output_file_prefix + ".lqual"; ofstream out(output.c_str()); if (!out.is_open()) LOG.error("Could not open Site Depth Output File: " + output, 7); out << "CHROM\tPOS\tQUAL" << endl; string vcf_line; vcf_entry e(N_indv); for (unsigned int s=0; s= 0) { sum += depth; sumsq += (depth*depth); n++; } } if (output_mean) { double mean = double(sum) / n; double var = ((double(sumsq) / n) - (mean*mean)) * double(n) / double(n-1); out << mean << "\t" << var << endl; } else out << sum << "\t" << sumsq << endl; } out.close(); } void vcf_file::output_hapmap_fst_from_two_files(const string &output_file_prefix, vcf_file &vcf_fst) { // Calculate, and output, Fst using the formula outlined in HapMap I // Namely: // Fst = 1 - (Pi_within / Pi_combined) // where // Pi_within = sum_j(nchoosek(n_j,2) * sum_i(2*n_ij * x_ij * (1-x_ij) / (n_ij -1))) / sum_j(nchoosek(n_j,2)) // and // Pi_between = sum_i(2*n_i*x_i*(1-x_i) / (n_i - 1)) // where j is the population index, and i is the SNP index LOG.printLOG("Outputting HapMap-style Fst estimates (for bi-allelic only)\n"); if ((has_genotypes == false) | (N_kept_individuals() == 0)) LOG.error("Require Genotypes in VCF file in order to output Fst statistics."); string output = output_file_prefix + ".hapmap.fst"; ofstream out(output.c_str()); if (!out.is_open()) LOG.error("Could not open Fst Output File: " + output, 7); out << "CHROM\tPOS\tHAPMAP_FST" << endl; map, pair > CHROMPOS_to_filepos_pair; map, pair >::iterator CHROMPOS_to_filepos_pair_it; return_site_union(vcf_fst, CHROMPOS_to_filepos_pair); string vcf_line; int n_1, n_2, n_1_choose_2 = 0, n_2_choose_2=0; int last_n_1=-1, last_n_2=-1; unsigned int n_i1, n_i2, n_iT; int N_alleles1, N_alleles2; vector allele_counts1, allele_counts2; double x_i1, x_i2, x_iT; int POS; int s1, s2; double tmp1, tmp2, tmpT; double sum1=0.0, sum2=0.0, sumT=0.0; double Fst; string CHROM; unsigned int N_intersecting_sites = 0; for (CHROMPOS_to_filepos_pair_it=CHROMPOS_to_filepos_pair.begin(); CHROMPOS_to_filepos_pair_it != CHROMPOS_to_filepos_pair.end(); ++CHROMPOS_to_filepos_pair_it) { s1 = CHROMPOS_to_filepos_pair_it->second.first; s2 = CHROMPOS_to_filepos_pair_it->second.second; if ((s1 == -1) || (s2 == -1)) continue; CHROM = CHROMPOS_to_filepos_pair_it->first.first; POS = CHROMPOS_to_filepos_pair_it->first.second; get_vcf_entry(s1, vcf_line); vcf_entry e1(N_indv, vcf_line); vcf_fst.get_vcf_entry(s2, vcf_line); vcf_entry e2(vcf_fst.N_indv, vcf_line); e1.parse_basic_entry(true); e2.parse_basic_entry(true); // Check sites have same alternative alleles N_alleles1 = e1.get_N_alleles(); N_alleles2 = e2.get_N_alleles(); if ((N_alleles1 != 2) || (N_alleles2 != 2)) { LOG.one_off_warning("\tFst: Only using biallelic SNPs."); continue; } if ((N_alleles1 == 2) && (N_alleles2 == 2)) if (e1.get_ALT_allele(0) != e2.get_ALT_allele(0)) { LOG.one_off_warning("\tFst: Only using sites with matching reference alleles."); continue; } e1.parse_genotype_entries(true); e2.parse_genotype_entries(true); // Calculate allele frequencies e1.get_allele_counts(allele_counts1, n_i1, include_indv, include_genotype[s1]); e2.get_allele_counts(allele_counts2, n_i2, vcf_fst.include_indv, vcf_fst.include_genotype[s2]); if ((n_i1 == 0) || (n_i2 == 0)) continue; n_1 = e1.get_N_chr(include_indv, include_genotype[s1]); n_2 = e2.get_N_chr(vcf_fst.include_indv, vcf_fst.include_genotype[s2]); if (last_n_1 != -1) { if ((n_1 != last_n_1) || (n_2 != last_n_2)) { LOG.error("Cannot mix sites with different ploidy. Are you including sex-chromosomes?\n"+ CHROM +":" + output_log::int2str(POS)+"\n"); } } else { last_n_1 = n_1; last_n_2 = n_2; } n_1_choose_2 = n_1 * (n_1 - 1) / 2; n_2_choose_2 = n_2 * (n_2 - 1) / 2; N_intersecting_sites++; x_i1 = allele_counts1[0] / double(n_i1); x_i2 = allele_counts2[0] / double(n_i2); n_iT = (n_i1 + n_i2); x_iT = (allele_counts1[0] + allele_counts2[0]) / double(n_iT); tmp1 = 2 * (n_i1 / (n_i1 - 1.0)) * x_i1 * (1-x_i1); tmp2 = 2 * (n_i2 / (n_i2 - 1.0)) * x_i2 * (1-x_i2); tmpT = 2 * (n_iT / (n_iT - 1.0)) * x_iT * (1-x_iT); Fst = 1.0 - (((n_1_choose_2 * tmp1) + (n_2_choose_2 * tmp2)) / (n_1_choose_2 + n_2_choose_2) / tmpT); out << CHROM << "\t" << POS << "\t" << Fst << endl; sum1 += tmp1; sum2 += tmp2; sumT += tmpT; last_n_1 = n_1; last_n_2 = n_2; } Fst = 1.0 - (((n_1_choose_2 * sum1) + (n_2_choose_2 * sum2)) / (n_1_choose_2 + n_2_choose_2) / sumT); LOG.printLOG("Found " + output_log::int2str(N_intersecting_sites) + " intersecting sites\n"); LOG.printLOG("HapMap-style Fst = " + output_log::dbl2str(Fst, 6) + "\n"); out.close(); } void vcf_file::output_hapmap_fst(const string &output_file_prefix, const vector &indv_files) { // Calculate Fst using individuals in one (rather than two VCF files) // Calculate, and output, Fst using the formula outlined in HapMap I // Namely: // Fst = 1 - (Pi_within / Pi_combined) // where // Pi_within = sum_j(nchoosek(n_j,2) * sum_i(2*n_ij * x_ij * (1-x_ij) / (n_ij -1))) / sum_j(nchoosek(n_j,2)) // and // Pi_between = sum_i(2*n_i*x_i*(1-x_i) / (n_i - 1)) // where j is the population index, and i is the SNP index if (indv_files.size() == 1) { LOG.printLOG("Require at least two populations to estimate Fst. Skipping\n"); return; } if ((has_genotypes == false) | (N_kept_individuals() == 0)) LOG.error("Require Genotypes in VCF file in order to output Fst statistics."); LOG.printLOG("Outputting HapMap-style Fst estimates.\n"); // First, read in the relevant files. vector< vector > indvs_in_pops; unsigned int N_pops = indv_files.size(); indvs_in_pops.resize(N_pops, vector(N_indv, false)); vector all_indv(N_indv,false); map indv_to_idx; for (unsigned int ui=0; ui> tmp_indv; if (indv_to_idx.find(tmp_indv) != indv_to_idx.end()) { indvs_in_pops[ui][indv_to_idx[tmp_indv]]=true; all_indv[indv_to_idx[tmp_indv]]=true; } ss.clear(); } indv_file.close(); } string output = output_file_prefix + ".hapmap.fst"; ofstream out(output.c_str()); if (!out.is_open()) LOG.error("Could not open Fst Output File: " + output, 7); out << "CHROM\tPOS\tHAPMAP_FST" << endl; vcf_entry e(N_indv); string vcf_line; vector allele_counts1; double Fst_tot_num=0.0, Fst_tot_denom=0.0; for (unsigned int s=0; s counts(N_pops, 0); vector pop_N_chr(N_pops, 0); vector pop_N_choose_2(N_pops, 0); for (unsigned int p=0; p &indv_files) { //Implements the bi-allelic version of the Bruce Weir formula, // found on page 178 of Genetic Data Analysis II. if (indv_files.size() == 1) { LOG.printLOG("Require at least two populations to estimate Fst. Skipping\n"); return; } if ((has_genotypes == false) | (N_kept_individuals() == 0)) LOG.error("Require Genotypes in VCF file in order to output Fst statistics."); LOG.printLOG("Outputting Weir and Cockerham Fst estimates.\n"); // First, read in the relevant files. vector< vector > indvs_in_pops; unsigned int N_pops = indv_files.size(); indvs_in_pops.resize(N_pops, vector(N_indv, false)); vector all_indv(N_indv,false); map indv_to_idx; for (unsigned int ui=0; ui> tmp_indv; if (indv_to_idx.find(tmp_indv) != indv_to_idx.end()) { indvs_in_pops[ui][indv_to_idx[tmp_indv]]=true; all_indv[indv_to_idx[tmp_indv]]=true; count++; } ss.clear(); } indv_file.close(); LOG.printLOG("Found " + LOG.int2str(count) + " individuals in file: " + indv_files[ui] + "\n"); } string output = output_file_prefix + ".weir.fst"; ofstream out(output.c_str()); if (!out.is_open()) LOG.error("Could not open Fst Output File: " + output, 7); out << "CHROM\tPOS\tWEIR_AND_COCKERHAM_FST" << endl; vcf_entry e(N_indv); string vcf_line; vector allele_counts1; double snp_Fst; double sum1=0.0, sum2 = 0.0; double sum3=0.0, count = 0.0; for (unsigned int s=0; s n; n.resize(N_pops, 0); vector p; p.resize(N_pops, 0); vector h; h.resize(N_pops, 0); double nbar = 0.0, pbar=0.0, hbar=0.0; //double Csqr=0.0; double ssqr=0.0; double sum_nsqr = 0.0; double n_sum = 0.0; unsigned int N_hom1, N_het, N_hom2; for (unsigned int j=0; j allele_counts; for (unsigned int s=0; s(pairs)); out << e.get_CHROM() << "\t" << e.get_POS() << "\t" << pi << endl; } } // Output Tajima's D // Carlson et al. Genome Res (2005) void vcf_file::output_Tajima_D(const string &output_file_prefix, int window_size) { if (window_size <= 0) return; if ((has_genotypes == false) | (N_kept_individuals() == 0)) LOG.error("Require Genotypes in VCF file in order to output Tajima's D Statistic."); LOG.printLOG("Outputting Tajima's D Statistic...\n"); string output_file = output_file_prefix + ".Tajima.D"; double a1=0.0, a2=0.0, b1, b2, c1, c2, e1, e2; unsigned int n = N_kept_individuals()*2; if (n < 2) LOG.error("Require at least two chromosomes!"); for (unsigned int ui=1; ui max_pos; string vcf_line, CHROM; vcf_entry e(N_indv); for (unsigned int s=0; s max_pos[CHROM]) max_pos[CHROM] = e.get_POS(); } else max_pos[CHROM] = e.get_POS(); } } map::iterator it; unsigned int N_bins; map > > bins; for (it=max_pos.begin(); it != max_pos.end(); ++it) { CHROM = (*it).first; N_bins = (unsigned int)((max_pos[CHROM] + window_size) / double(window_size)); bins[CHROM].resize(N_bins, make_pair(0,0)); } unsigned int idx; double C = 1.0 / double(window_size); vector allele_counts; unsigned int N_non_missing_chr; unsigned int N_alleles; for (unsigned int s=0; s 0.0) && (p < 1.0)) { bins[CHROM][idx].first++; bins[CHROM][idx].second += p * (1.0-p); } } ofstream out(output_file.c_str()); if (!out.is_open()) LOG.error("Could not open output file: " + output_file, 12); out << "CHROM\tBIN_START\tN_SNPS\tTajimaD" << endl; for (it=max_pos.begin(); it != max_pos.end(); ++it) { CHROM = (*it).first; bool output = false; for (unsigned int s=0; s 1) { double pi = 2.0*bins[CHROM][s].second*n/double(n-1); double tw = double(S) / a1; double var = (e1*S) + e2*S*(S-1); D = (pi - tw) / sqrt(var); output = true; } if (S > 0) output = true; if (output == true) out << CHROM << "\t" << s*window_size << "\t" << bins[CHROM][s].first << "\t" << D << endl; } } out.close(); } void vcf_file::output_windowed_nucleotide_diversity(const string &output_file_prefix, int window_size, int window_step) { // Output nucleotide diversity, as calculated in windows. // Average number of pairwise differences in windows. if (window_size <= 0) return; if ((window_step <= 0) || (window_step > window_size)) window_step = window_size; if ((has_genotypes == false) | (N_kept_individuals() == 0)) LOG.error("Require Genotypes in VCF file in order to output Nucleotide Diversity Statistics."); LOG.printLOG("Outputting Windowed Nucleotide Diversity Statistics...\n"); string output_file = output_file_prefix + ".windowed.pi"; // Find maximum position on each chromosome map max_pos; map::iterator it; string vcf_line, CHROM; vcf_entry e(N_indv); for (unsigned int s=0; s max_pos[CHROM]) max_pos[CHROM] = e.get_POS(); } else max_pos[CHROM] = e.get_POS(); } } // Calculate number of bins for each chromosome and allocate memory for them. // Each bin is a vector with four entries: // N_variant_sites: Number of sites in a window that have VCF entries // N_variant_site_pairs: Number of possible pairwise mismatches at polymorphic sites within a window // N_mismatches: Number of actual pairwise mismatches at polymorphic sites within a window // N_polymorphic_sites: number of sites within a window where there is at least 1 sample that is polymorphic with respect to the reference allele unsigned int N_bins; const unsigned int N_variant_sites = 0; const unsigned int N_variant_site_pairs = 1; const unsigned int N_mismatches = 2; const unsigned int N_polymorphic_sites = 3; const vector< unsigned int > empty_vector(4, 0); map > > bins; for (it=max_pos.begin(); it != max_pos.end(); ++it) { CHROM = (*it).first; N_bins = (unsigned int) ceil( (max_pos[CHROM]+1) / double(window_step)); bins[CHROM].resize(N_bins, empty_vector); } // Count polymorphic sites and pairwise mismatches vector allele_counts; unsigned int N_non_missing_chr; unsigned int N_comparisons; for (unsigned int s=0; s::iterator ac = allele_counts.begin(); ac != allele_counts.end(); ++ac) N_site_mismatches += (*ac * (N_non_missing_chr - *ac)); // Place the counts into bins unsigned int pos = e.get_POS(); int first = (int) ceil((pos - window_size)/double(window_step)); if (first < 0) first = 0; int last = (int) ceil(pos/double(window_step)); N_comparisons = N_non_missing_chr * (N_non_missing_chr - 1); for(int idx = first; idx < last; idx++) { bins[CHROM][idx][N_variant_sites]++; bins[CHROM][idx][N_variant_site_pairs] += N_comparisons; bins[CHROM][idx][N_mismatches] += N_site_mismatches; if(allele_counts[0] < (signed)N_non_missing_chr) bins[CHROM][idx][N_polymorphic_sites]++; } } // Calculate and print nucleotide diversity statistics ofstream out(output_file.c_str()); if (!out.is_open()) LOG.error("Could not open output file: " + output_file, 12); out << "CHROM\tBIN_START\tBIN_END\tN_VARIANTS\tPI" << endl; unsigned int N_monomorphic_sites = 0; N_comparisons = (N_indv * (N_indv - 1)); // Number of pairwise comparisons at a monomorphic site unsigned int N_pairs = 0; // Number of pairwise comparisons within a window double pi = 0; for (it=max_pos.begin(); it != max_pos.end(); ++it) { CHROM = (*it).first; for (unsigned int s=0; s 0) || (bins[CHROM][s][N_mismatches] > 0) ) { // This number can be slightly off for the last bin since the // window size can go off the end of the chromosome. N_monomorphic_sites = 2 * (window_size - bins[CHROM][s][N_variant_sites]); // The total number of possible pairwise comparisons is the sum of // pairwise comparisons at polymorphic sites and pairwise // comparisons at monomorphic sites. N_pairs = bins[CHROM][s][N_variant_site_pairs] + (N_monomorphic_sites * N_comparisons); pi = bins[CHROM][s][N_mismatches] / double(N_pairs); out << CHROM << "\t" << s*window_step + 1 << "\t" << (s*window_step + window_size) << "\t" << bins[CHROM][s][N_polymorphic_sites] << "\t" << pi << endl; } } } out.close(); } void vcf_file::output_kept_and_removed_sites(const string &output_file_prefix) { // Output lists of sites that have been filtered (or not). LOG.printLOG("Outputting Kept and Removed Sites...\n"); string output_file1 = output_file_prefix + ".kept.sites"; string output_file2 = output_file_prefix + ".removed.sites"; string vcf_line, CHROM; int POS; vcf_entry e(N_indv); ofstream out1(output_file1.c_str()); if (!out1.is_open()) LOG.error("Could not open output file: " + output_file1, 12); out1 << "CHROM\tPOS" << endl; ofstream out2(output_file2.c_str()); if (!out2.is_open()) LOG.error("Could not open output file: " + output_file2, 12); out2 << "CHROM\tPOS" << endl; for (unsigned int s=0; s alleles; vector s_vector; vector > p_emission; vector > p_trans; ofstream out(output_file.c_str()); if (!out.is_open()) LOG.error("Could not open output file: " + output_file, 12); out << "CHROM\tAUTO_START\tAUTO_END\tN_VARIANTS\tINDV" << endl; // TODO - refactor this so that Entries loop is on the outside. for (unsigned int ui=0; ui 0) { // Assume 1cM/Mb. r = (POS - last_POS) / 1000000.0 / 100.0; // Morgans } double e = (1.0 - exp(-2.0*nGen*r)); double p_trans_auto_to_nonauto = (1.0 - p_auto_prior) * e; //A[1] double p_trans_nonauto_to_auto = p_auto_prior * e; //A[2] double p_trans_auto_to_auto = 1.0 - p_trans_nonauto_to_auto; //A[0] double p_trans_nonauto_to_nonauto = 1.0 - p_trans_auto_to_nonauto; // A[3] vector A(4); A[0] = p_trans_auto_to_auto; A[1] = p_trans_auto_to_nonauto; A[2] = p_trans_nonauto_to_auto; A[3] = p_trans_nonauto_to_nonauto; s_vector.push_back(s); p_trans.push_back(A); last_POS = POS; } // Forward-backward algorithm int N_obs = (int)p_emission.size(); if (N_obs == 0) continue; vector > alpha(N_obs, vector(2,0)); vector > beta(N_obs, vector(2,0)); alpha[0][0] = p_emission[0].first; alpha[0][1] = p_emission[0].second; for (int i=1; i=0; i--) { beta[i][0] = beta[i+1][0] * p_trans[i][0] * p_emission[i].first; beta[i][0] += beta[i+1][1] * p_trans[i][2] * p_emission[i].first; beta[i][1] = beta[i+1][1] * p_trans[i][3] * p_emission[i].second; beta[i][1] += beta[i+1][0] * p_trans[i][1] * p_emission[i].second; while (beta[i][0] + beta[i][1] < 1e-20) { // Renormalise to prevent underflow beta[i][0] *= 1e20; beta[i][1] *= 1e20; } } // Calculate probability of each site being autozygous vector p_auto(N_obs); for (int i=0; ithreshold. // TODO: Also would be good to report heterozygotic SNPs found in homozygotic regions. bool in_auto=false; int start_pos=0, end_pos=0; int N_SNPs = 0; for (int i=0; i p_auto_threshold) { if (in_auto == false) { // Start of autozygous region unsigned int s = s_vector[i]; get_vcf_entry(s, vcf_line); e.reset(vcf_line); e.parse_basic_entry(true); CHROM = e.get_CHROM(); start_pos = e.get_POS(); } N_SNPs++; in_auto = true; } else { if (in_auto == true) { // end of autozygous region unsigned int s = s_vector[i]; get_vcf_entry(s, vcf_line); e.reset(vcf_line); e.parse_basic_entry(true); end_pos = e.get_POS(); if (N_SNPs >= min_SNPs) out << CHROM << "\t" << start_pos << "\t" << end_pos << "\t" << N_SNPs << "\t" << indv[ui] << endl; } in_auto = false; N_SNPs = 0; } } if (in_auto == true) { // Report final region if needed unsigned int s = s_vector[N_obs-1]; get_vcf_entry(s, vcf_line); e.reset(vcf_line); e.parse_basic_entry(true); end_pos = e.get_POS(); if (N_SNPs >= min_SNPs) out << CHROM << "\t" << start_pos << "\t" << end_pos << "\t" << N_SNPs << "\t" << indv[ui] << endl; } } out.close(); } void vcf_file::output_indv_relatedness(const string &output_file_prefix) { // Calculate and output a relatedness statistic based on the method of // Yang et al, 2010 (doi:10.1038/ng.608). Specifically, calculate the // unadjusted Ajk statistic (equation 6 of paper). // Expectation of Ajk is zero for individuals within a populations, and // one for an individual with themselves. if ((has_genotypes == false) | (N_kept_individuals() == 0)) LOG.error("Require Genotypes in VCF file in order to output Individual Relatedness."); LOG.printLOG("Outputting Individual Relatedness\n"); string output = output_file_prefix + ".relatedness"; ofstream out(output.c_str()); if (!out.is_open()) LOG.error("Could not open Individual Relatedness Output File: " + output, 2); out << "INDV1\tINDV2\tRELATEDNESS" << endl; string vcf_line; vcf_entry e(N_indv); vector allele_counts; unsigned int N_alleles, N_non_missing_chr; double freq; pair geno_id; vector > Ajk(N_indv, vector(N_indv, 0.0)); vector > N_sites(N_indv, vector(N_indv, 0.0)); for (unsigned int s=0; s::epsilon()) || (freq >= (1.0-numeric_limits::epsilon()))) continue; vector x(N_indv, -1.0); for (unsigned int ui=0; ui= N_sites) LOG.error("PCA computation requires that there are more sites than individuals."); string vcf_line; vcf_entry e(N_indv); pair geno_id; double x, freq; vector allele_counts; unsigned int N_alleles, N_non_missing_chr; // Store list of included individuals vector included_indvs(N_indvs); unsigned int ui_prime = 0; for (unsigned int ui=0; ui::epsilon()) || (freq >= (1.0-numeric_limits::epsilon()))) continue; double mu = freq*2.0; double div = 1.0 / sqrt(freq * (1.0-freq)); ui_prime = 0; for (unsigned int ui=0; ui -1) { if (use_normalisation == true) M[ui_prime][s_prime] = (x - mu) * div; else M[ui_prime][s_prime] = (x - mu); } ui_prime++; } s_prime++; } // Now construct X = (1/n)MM'. double **X = new double *[N_indvs]; for (unsigned int ui=0; ui 0) { // Output SNP loadings LOG.printLOG("Outputting " + output_log::int2str(SNP_loadings_N_PCs) + " SNP loadings\n"); output = output_file_prefix + ".pca.loadings"; out.open(output.c_str()); if (!out.good()) LOG.error("Could not open Principal Component SNP Loading Output File: " + output, 2); out << "CHROM\tPOS"; for (unsigned int ui=0; ui<(unsigned int)SNP_loadings_N_PCs; ui++) out << "\tGAMMA_" << ui; out << endl; for (unsigned int s=0; s::epsilon()) || (freq >= (1.0-numeric_limits::epsilon()))) continue; vector gamma(SNP_loadings_N_PCs, 0.0); vector a_sum(SNP_loadings_N_PCs, 0.0); ui_prime = 0; for (unsigned int ui=0; ui -1) { for (unsigned int uj=0; uj<(unsigned int)SNP_loadings_N_PCs; uj++) { gamma[uj] += (x * Evecs[ui_prime][uj]); a_sum[uj] += (Evecs[ui_prime][uj]*Evecs[ui_prime][uj]); } } ui_prime++; } out << e.get_CHROM() << "\t" << e.get_POS(); for (unsigned int uj=0; uj<(unsigned int)SNP_loadings_N_PCs; uj++) out << "\t" << gamma[uj] / a_sum[uj]; out << endl; } out.close(); } delete [] Er; delete [] Ei; delete [] Evecs; delete [] X; #endif } void vcf_file::output_indel_hist(const string &output_file_prefix) { string vcf_line; vcf_entry e(N_indv); string allele; unsigned int ref_len, N_alleles; int indel_len, smallest_len, largest_len, snp_count; vector s_vector; string output = output_file_prefix + ".indel.hist"; ofstream out(output.c_str()); if (!out.is_open()) LOG.error("Could not open Indel Hist File: " + output, 7); out << "LENGTH\tCOUNT\tPRCT" << endl; largest_len = 0; smallest_len = 0; snp_count = 0; for (unsigned int s=0; s largest_len) largest_len = indel_len; else if (indel_len < smallest_len) smallest_len = indel_len; } } } double total = s_vector.size() + snp_count; double pct; for (int i=smallest_len; i<=largest_len; i++) { int icount = (int) count (s_vector.begin(), s_vector.end(), i); if (icount > 0) { pct = 100.0*icount/total; out << i << "\t" << icount << "\t" << pct << endl; } else if ((i == 0) and (snp_count>0)) { pct = 100.0*snp_count/total; out << i << "\t" << snp_count << "\t" << pct << endl; } } out.close(); }