/* Functions relevant to Beta, Gamma, and Dirichlet densities, * including simple and mixture Dirichlet priors. * * Contents: * 1. The object for mixture Dirichlet priors * 2. Dirichlet likelihood functions * 3. Sampling from Dirichlets * 4. Reading mixture Dirichlets from files * 5. Unit tests * 6. Test driver * 7. Example * * To-do: * - Fit*() functions should return eslEINVAL on n=0, eslENORESULT * on failure due to small n. Compare esl_gumbel. xref J12/93. * SRE, Wed Nov 27 11:18:12 2013 */ #include "esl_config.h" #include #include #include #include #include "easel.h" #include "esl_fileparser.h" #include "esl_minimizer.h" #include "esl_random.h" #include "esl_stats.h" #include "esl_vectorops.h" #include "esl_dirichlet.h" /***************************************************************** *# 1. The object for mixture Dirichlet priors *****************************************************************/ /* Function: esl_mixdchlet_Create() * * Purpose: Create a new mixture Dirichlet prior with components, * each with parameters. * * Returns: initialized on success. * * Throws: NULL on allocation failure. */ ESL_MIXDCHLET * esl_mixdchlet_Create(int N, int K) { int status; ESL_MIXDCHLET *pri = NULL; int q; ESL_DASSERT1( (N > 0) ); ESL_DASSERT1( (K > 0) ); ESL_ALLOC(pri, sizeof(ESL_MIXDCHLET)); pri->pq = NULL; pri->alpha = NULL; ESL_ALLOC(pri->pq, sizeof(double) * N); ESL_ALLOC(pri->alpha, sizeof(double *) * N); pri->alpha[0] = NULL; ESL_ALLOC(pri->alpha[0], sizeof(double) * N * K); if (pri->alpha[0] == NULL) goto ERROR; // to silence clang static analysis, which gets overzealous about N=0/K=0 -> NULL result for (q = 1; q < N; q++) pri->alpha[q] = pri->alpha[0] + q*K; pri->N = N; pri->K = K; return pri; ERROR: esl_mixdchlet_Destroy(pri); return NULL; } /* Function: esl_mixdchlet_PerfectBipartiteMatchExists() * Synopsis: Given a 2D table representing presence of edges between vertices represented by * the rows and columns, test whether a perfect matching exists. * Note 1: this doesn't find a perfect matching, just checks if one exists. * Note 2: written as a private function for use by esl_mixdchlet_Compare * Incept: TW, Fri Nov 6 14:23:23 EST 2009 [janelia] * * Args: A - 2-dimensional square table representing presence of edges between vertices * N - size of that table * * Returns: if a perfect matching exists; otherwise. */ int esl_mixdchlet_PerfectBipartiteMatchExists(int **A, int N ) { /* Basic idea: -Scan through the rows, and create a matching edge any time a row has only one matching column (i.e. a single column with eslOK value) * This is conservative: if the row isn't matched with this column, no perfect matching is possible. -Repeat, this time scanning columns. -Repeat rows then columns - until no rows or columns are found with a single eslOK value. -If a row or column is found with no possible matches, then no complete matching is possible. -If a point is reached where all rows and all columns have more than one match, I'm pretty sure a perfect matching is guaranteed. - This is unproven; the intuition is that for any imperfect matching an augmenting path should (I think) exist: it will contain an edge from one unmatched element to a matched element, followed by the existing edge from that element to it's mate, followed by a 2nd edge from that mate to another, and so on. It's a O(n^3) algorithm, though it'll typically run fast in practice */ int matched_row[N], matched_col[N]; esl_vec_ISet(matched_row, N, 0); esl_vec_ISet(matched_col, N, 0); int i,j; int unassigned = N; int do_row = 1; // otherwise, do_column while (unassigned > 0) { int changed = 0; for (i=0; i 1), leave it for a later pass } unassigned -= changed; if (changed == 0) { // All had multiple hits, so (I think) we are guaranteed of being able to pick some mapping that will be legal return eslOK; } do_row = 1 - do_row; // flip value } //got here, all mapping must've been done return eslOK; } /* Function: esl_mixdchlet_Compare() * Synopsis: Compare two mixture Dirichlets for equality. * * Purpose: Compare mixture Dirichlet objects and * for equality. For real numbered values, equality * is defined by with a fractional * tolerance . * * Returns: on equality; otherwise. */ int esl_mixdchlet_Compare(ESL_MIXDCHLET *d1, ESL_MIXDCHLET *d2, double tol) { int i,j; int **A = NULL; int status; if (d1->N != d2->N) return eslFAIL; if (d1->K != d2->K) return eslFAIL; //set up a 2-D matrix, to store the pairs of components that meet tolerance requirements ESL_ALLOC(A, d1->N * sizeof(int*)); for (i = 0; i < d1->N; i++) A[i] = NULL; for (i = 0; i < d1->N; i++) ESL_ALLOC(A[i], d1->N * sizeof(int) ); // Fill in matrix - OK if component i from d1 is a viable match with component q from d2 for (i=0; iN; i++) { for (j=0; jN; j++) { A[i][j] = esl_DCompare( d1->pq[i], d2->pq[j], tol); if (A[i][j] == eslOK) A[i][j] = esl_vec_DCompare(d1->alpha[i], d2->alpha[j], d1->K, tol) ; } } /* In most cases, there should be only a one-to-one mapping (if * any), which is easy to test. But in the unlikely case of a * many-to-one mapping, we need to do a little more. The problem * amounts to asking whether there exists a perfect bipartite * matching (aka the marriage problem) */ status = esl_mixdchlet_PerfectBipartiteMatchExists( A, d1->N); /* fallthrough */ ERROR: if (A) { for (i = 0; i < d1->N; i++) if (A[i]) free (A[i]); free (A); } return status; } /* Function: esl_mixdchlet_Copy() * Synopsis: Copy a mixture Dirichlet object. * * Purpose: Copies mixture dirichlet object to . * Both objects are of size and . * is unchanged. * * Returns: on equality; otherwise. */ int esl_mixdchlet_Copy(ESL_MIXDCHLET *d, ESL_MIXDCHLET *d_dst) { int q; if (d->N != d_dst->N) return eslFAIL; if (d->K != d_dst->K) return eslFAIL; esl_vec_DCopy(d->pq, d->N, d_dst->pq); for (q = 0; q < d->N; q++) esl_vec_DCopy(d->alpha[q], d->K, d_dst->alpha[q]); return eslOK; } /* Function: esl_mixdchlet_Destroy() * * Purpose: Free's the mixture Dirichlet . */ void esl_mixdchlet_Destroy(ESL_MIXDCHLET *pri) { if (pri == NULL) return; if (pri->pq != NULL) free(pri->pq); if (pri->alpha != NULL) { if (pri->alpha[0] != NULL) free(pri->alpha[0]); free(pri->alpha); } free(pri); } /* Function: esl_mixdchlet_Dump() * * Purpose: Dump the mixture Dirichlet . */ int esl_mixdchlet_Dump(FILE *fp, ESL_MIXDCHLET *d) { int q; /* counter over mixture components */ int i; /* counter over params */ fprintf(fp, "Mixture Dirichlet: N=%d K=%d\n", d->N, d->K); for (q = 0; q < d->N; q++) { fprintf(fp, "q[%d] %f\n", q, d->pq[q]); for (i = 0; i < d->K; i++) fprintf(fp, "alpha[%d][%d] %f\n", q, i, d->alpha[q][i]); } return eslOK; } /* esl_dirichlet_MixturePosterior() * * Purpose: For a count vector of cardinality , and a * mixture Dirichlet prior . Calculate mix[], * the posterior probability P(q | c) of mixture * component q given the count vector c. Caller must * provide allocated space for , of length . * * Returns: on success, contains posterior probabilities of * the Dirichlet components. */ static int esl_dirichlet_MixturePosterior(double *c, int K, ESL_MIXDCHLET *pri, double *mix) { int q; /* counter over mixture components */ double val; for (q = 0; q < pri->N; q++) { if (pri->pq[q] > 0.0) { esl_dirichlet_LogProbData(c, pri->alpha[q], K, &val); mix[q] = val + log(pri->pq[q]); } else { mix[q] = -HUGE_VAL; } } esl_vec_DLogNorm(mix, pri->N); /* mix[q] is now P(q|c) */ return eslOK; } /* Function: esl_mixdchlet_MPParameters() * * Purpose: Parameter estimation for a count vector of cardinality * , and a mixture Dirichlet prior . Calculates * mean posterior estimates for probability parameters, and * returns them in

. Also returns the posterior probabilities * of each Dirichlet mixture component, $P(q \mid c)$, in . * Caller must provide allocated space for and

, both * of length . * * Returns: on success; contains posterior probabilities of * the Dirichlet components, and

contains mean posterior * probability parameter estimates. * * Throws: if has different cardinality than . */ int esl_mixdchlet_MPParameters(double *c, int K, ESL_MIXDCHLET *pri, double *mix, double *p) { int q; /* counter over mixture components */ int x; double totc; double tota; if (K != pri->K) ESL_EXCEPTION(eslEINCOMPAT, "cvec's K != mixture Dirichlet's K"); /* Calculate mix[], the posterior probability * P(q | c) of mixture component q given the count vector c. */ esl_dirichlet_MixturePosterior(c, K, pri, mix); /* Compute mean posterior estimates for probability parameters */ totc = esl_vec_DSum(c, K); esl_vec_DSet(p, K, 0.); for (x = 0; x < K; x++) for (q = 0; q < pri->N; q++) { tota = esl_vec_DSum(pri->alpha[q], K); p[x] += mix[q] * (c[x] + pri->alpha[q][x]) / (totc + tota); } /* should be normalized already, but for good measure: */ esl_vec_DNorm(p, K); return eslOK; } /* Function: esl_mixdchlet_BILD_score() * * Purpose: Compute the BILD score (sensu Altschul et al PLos Compbio 2010) * for a given count vector of cardinality (alphabet size) , * under a mixture Dirichlet prior , and a background * character distribution , also cardinality K. The score is * in bits. Also computes posterior values for (1) Dirichlet mixture * coefficients ($P(q \mid c)$, performed and returned in a previously * allocated array, ). * * Caller must provide allocated space for (length K), and * (length 1). * * Returns: on success; contains posterior probabilities of * the Dirichlet components, and contains the BILD score of * observation under the prior and bg. * * Throws: if has different cardinality than . */ int esl_mixdchlet_BILD_score(double *c, int K, int N, ESL_MIXDCHLET *pri, double *mix, double *bg, double *sc) { int i; /* counter over mixture components */ int j; double tmp; double val; double totc; double tota; if (K != pri->K) ESL_EXCEPTION(eslEINCOMPAT, "cvec's K != mixture Dirichlet's K"); if (N != pri->N) ESL_EXCEPTION(eslEINCOMPAT, "cvec's N != mixture Dirichlet's N"); /* Calculate mix[], the posterior probability * P(q | c) of mixture component q given the count vector c. */ esl_dirichlet_MixturePosterior(c, K, pri, mix); /* Compute probability of observing the given count vector * under the mixture Dirichlet prior, which depends on the * posterior. */ *sc = 0.0; totc = esl_vec_DSum(c, K); for (i = 0; i < N; i++) { if (mix[i] > 0) { tota = esl_vec_DSum(pri->alpha[i], K); esl_stats_LogGamma(tota, &tmp); val = tmp; esl_stats_LogGamma(tota + totc, &tmp); val -= tmp; for (j = 0; j < K; j++) { esl_stats_LogGamma(pri->alpha[i][j] + c[j], &tmp); val += tmp; esl_stats_LogGamma(pri->alpha[i][j], &tmp); val -= tmp; } *sc += mix[i] * exp(val); } } /* At this point, sc holds the Q value from the Altschul paper. * Get the odds ratio by dividing by the product of background * probabilities for observed counts, (accounting for sequence * weighting). */ for (j = 0; j < K; j++) { *sc /= pow(bg[j], c[j]); } *sc = log(*sc)*eslCONST_LOG2R; return eslOK; } /*---------------- end, ESL_MIXDCHLET ---------------------------*/ /***************************************************************** *# 2. Dirichlet likelihood functions *****************************************************************/ /* Function: esl_dirichlet_LogProbData() * * Purpose: Given an observed count vector $c[0..K-1]$, * and a simple Dirichlet density parameterized by * $\alpha[0..K-1]$; * calculate $\log P(c \mid \alpha)$. * * This is $\int P(c \mid p) P(p \mid \alpha) dp$, * an integral that can be solved analytically. * * Args: c - count vector, [0..K-1] * alpha - Dirichlet parameters, [0..K-1] * K - size of c, alpha vectors * ret_answer - RETURN: log P(c | \alpha) * * Returns: on success, and puts result $\log P(c \mid \alpha)$ * in . */ int esl_dirichlet_LogProbData(double *c, double *alpha, int K, double *ret_answer) { double lnp; double sum1, sum2, sum3; double a1, a2, a3; int x; sum1 = sum2 = sum3 = lnp = 0.0; for (x = 0; x < K; x++) { sum1 += c[x] + alpha[x]; sum2 += alpha[x]; sum3 += c[x]; esl_stats_LogGamma(alpha[x] + c[x], &a1); esl_stats_LogGamma(c[x] + 1., &a2); esl_stats_LogGamma(alpha[x], &a3); lnp += a1 - a2 - a3; } esl_stats_LogGamma(sum1, &a1); esl_stats_LogGamma(sum2, &a2); esl_stats_LogGamma(sum3 + 1., &a3); lnp += a2 + a3 - a1; *ret_answer = lnp; return eslOK; } /* Function: esl_dirichlet_LogProbData_Mixture() * * Purpose: Given an observed count vector $c[0..K-1]$, * and a mixture Dirichlet density parameterized by * $\alpha_1[0..K-1]$ ... $\alpha_N[0..K-1]$, * calculate $\log \sum_i pq_i * P(c \mid \alpha_i)$. * * * Args: c - count vector, [0..K-1] * d - Dirichlet parameters, [0..K-1] * ret_answer - RETURN: log P(c | \alpha) * * Returns: on success, and puts result $\log P(c \mid \alpha)$ * in . * * Throws: on allocation error. Now <*ret_answer> is * <-eslINFINITY>. */ int esl_dirichlet_LogProbData_Mixture(double *c, ESL_MIXDCHLET *d, double *ret_answer) { double *mixq = NULL; double lnp; double val; int q; /* counter over mixture components */ int status; ESL_ALLOC(mixq, sizeof(double)*d->N); for (q = 0; q < d->N; q++) { esl_dirichlet_LogProbData(c, d->alpha[q], d->K, &val); mixq[q] = val + log(d->pq[q]); } lnp = esl_vec_DLogSum(mixq, d->N); free(mixq); *ret_answer = lnp; return eslOK; ERROR: free(mixq); *ret_answer = -eslINFINITY; return status; } /* esl_dirichlet_LogProbDataSet_Mixture() * * Purpose: Given an observed set of count vectors $c[0..N-1][0..K-1]$, * and a mixture Dirichlet density parameterized by * $\alpha_1[0..K-1]$ ... $\alpha_N[0..K-1]$, * calculate $ \sum_n \log \sum_i pq_i * P(c[n] \mid \alpha_i)$. * This is a convenience function, which simply wraps * esl_dirichlet_LogProbData_Mixture * * Args: ntrials - number of count vectors (aka N) * counts - count vector set, [0..N-1][0..K-1] * md - Dirichlet parameters * ret_answer - RETURN: log P(c | \alpha) * * Returns: on success, and puts result $\log P(c \mid \alpha)$ * in . * * Throws: on allocation error. Now <*ret_answer> is * <-eslINFINITY>. */ static int esl_dirichlet_LogProbDataSet_Mixture(int ntrials, double** counts, ESL_MIXDCHLET* md, double *ret_answer) { double val; int i; int status; *ret_answer = 0; for (i = 0; i < ntrials; i++) { if (( status = esl_dirichlet_LogProbData_Mixture(counts[i], md, &val)) != eslOK) goto ERROR; *ret_answer += val; } return eslOK; ERROR: *ret_answer = -eslINFINITY; return status; } /* Function: esl_dirichlet_LogProbProbs() * * Purpose: Given Dirichlet parameter vector and a probability * vector

, both of cardinality ; return * $\log P(p \mid alpha)$. * * Returns: on success, and the result is in . * * Xref: Sjolander (1996) appendix, lemma 2. */ int esl_dirichlet_LogProbProbs(double *p, double *alpha, int K, double *ret_answer) { double sum; /* for Gammln(|alpha|) in Z */ double logp; /* RETURN: log P(p|alpha) */ double val; int x; sum = logp = 0.0; for (x = 0; x < K; x++) if (p[x] > 0.0) /* any param that is == 0.0 doesn't exist */ { esl_stats_LogGamma(alpha[x], &val); logp -= val; logp += (alpha[x]-1.0) * log(p[x]); sum += alpha[x]; } esl_stats_LogGamma(sum, &val); logp += val; *ret_answer = logp; return eslOK; } /*----------- end, Dirichlet likelihood functions ---------------*/ /***************************************************************** * Dirichlet Maximum likelihood fit from counts *****************************************************************/ /* This structure is used to sneak the data into minimizer's generic * (void *) API for all aux data */ struct mixdchlet_data { ESL_MIXDCHLET *d; /* the dirichlet mixture parameters */ double **c; /* count vector array [0..nc-1][0..alphabet_size(d->K)] */ int nc; /* number of count samples */ }; /***************************************************************** * Parameter vector packing/unpacking * * The conjugate gradient code takes a single parameter vector

, * where the values are unconstrained real numbers. * * We have a mixture Dirichlet with two kinds of parameters. * pq_i are mixture coefficients, constrained to be >= 0 and * \sum_i pq_i = 1. alpha^i_x are the Dirichlet parameters * for component i, constrained to be > 0. * * Our p's are therefore not only packed into a single vector; * they're reparameterized to implement the constraints: * for a Dirichlet parameter: * alpha = exp(p) p = log(alpha) * (thus, alpha > 0 for all real p) * * for a mixture coefficient: * pq = exp(-exp(p)) / \sum_a exp(-exp(p_a)) * (thus, 0 < pq < 1 and \sum_a pq_a = 1, for all real p) * * In my hands (ER), this parametrization works better that * pq = exp(p) / \sum_a exp(p_a) * * Conjugate gradients optimizes the

parameter vector, * but we can convert that back out into a Dirichlet answer. * * The packing order is: the first N terms of a parameter vector are * the mixture coefficients pq_i. N different alpha_i vectors follow. * * [0 ... N-1] [0 ... K-1] [0 ... K-1] ... * q's alpha_0 alpha_1 ... * * In both functions below, p, pq, and alpha are all allocated * and free'd by the caller. * p : length N + N*K = N*(K+1) [0.. N*(K+1)-1] * pq : length N, [0..N-1] * alpha : length NxK, [0..N-1][0..K-1]. * * Special cases: * * - For (N >= 1 && K == 1) there is nothing to optimize. * * - For (N == 1 && K > 1) the only variables to optimize are the K alphas * * [0 ... K-1] * alpha * * p : length N*K = N*K [0.. N*K-1] * alpha : length NxK, [0][0..K-1]. * */ static void mixdchlet_pack_paramvector(double *p, int np, ESL_MIXDCHLET *d) { int nq; /* number the mixture components to optimize */ int q; /* counter over mixture components */ int x; /* counter in alphabet size */ nq = (d->N > 1)? d->N : 0; /* the mixture coeficients */ for (q = 0; q < nq; q++) p[q] = log(d->pq[q]); //p[q] = log(-log(d->pq[q])); TW changed to the above; this was causing fit to fail /* the dirichlet parameters */ for (q = 0; q < d->N; q++) for (x = 0; x < d->K; x++) p[nq + q*d->K + x] = log(d->alpha[q][x]); } /* Same as above but in reverse: given parameter vector

, * do appropriate c.o.v. back to desired parameter space, and * update the mixdchlet . */ static void mixdchlet_unpack_paramvector(double *p, int np, ESL_MIXDCHLET *d) { int nq; /* number the mixture components to optimize */ int q; /* counter over mixture components */ int x; /* counter in alphabet size */ nq = (d->N > 1)? d->N : 0; /* the mixture coeficients */ for (q = 0; q < nq; q++) d->pq[q] = exp(p[q]); //d->pq[q] = exp(-exp(p[q])); TW changed to the above; this was causing fit to fail esl_vec_DNorm(d->pq, d->N); /* the dirichlet parameters */ for (q = 0; q < d->N; q++) for (x = 0; x < d->K; x++) d->alpha[q][x] = exp(p[nq + q*d->K + x]); /*esl_mixdchlet_Dump(stdout, d);*/ } /* The log likelihood function to be optimized by ML fitting: * This needs to be careful of a case where a lambda = inf. */ static double mixdchlet_complete_func(double *p, int np, void *dptr) { struct mixdchlet_data *data = (struct mixdchlet_data *) dptr; ESL_MIXDCHLET *d = data->d; double logPsample; double logP = 0.; int m; /* counter over count samples */ mixdchlet_unpack_paramvector(p, np, d); for (m = 0; m < data->nc; m++) { esl_dirichlet_LogProbData_Mixture(data->c[m], d, &logPsample); logP += logPsample; } if (isnan(logP)) esl_fatal("logP is NaN"); return -logP; } /* The gradient of the NLL w.r.t. each free parameter in p. * Modified by ER 11/03/09 to compute derivative of log(alpha) instead of alpha * (committed by TW) */ static void mixdchlet_complete_gradient(double *p, int np, void *dptr, double *dp) { struct mixdchlet_data *data = (struct mixdchlet_data *) dptr; ESL_MIXDCHLET *d = data->d; double sum_alpha; /* \sum_x alpha[q][x] */ double sum_c; /* \sum_x c[m][x] */ double val; /* val is p_q * P(c_m | alpha_q) */ double *valsum; /* valsum is sum_q [p_q * P(c_m | alpha_q)] */ double term; /* term is q * P(alpha_q | c_m) */ double psi1; /* Psi(sum_alpha[q]) */ double psi2; /* Psi(sum_alpha[q] + sum_c[m]) */ double psi3; /* Psi(sum_alpha[q][x]+ c[m][x]) */ double psi4; /* Psi(sum_alpha[q][x]) */ int nq; /* number the mixture components to optimize */ int m; /* counter over count samples */ int q; /* counter over mixture components */ int x; /* counter in alphabet size */ nq = (d->N > 1)? d->N : 0; mixdchlet_unpack_paramvector(p, np, d); /* initialize */ valsum = malloc(sizeof(double) * data->nc); esl_vec_DSet(dp, np, 0.0); /* Some precalculation of sums for efficiency. * valsum is sum_q [p_q * P(c_m | alpha_q)] */ for (m = 0; m < data->nc; m++) esl_dirichlet_LogProbData_Mixture(data->c[m], d, &(valsum[m])); for (q = 0; q < d->N; q++) { sum_alpha = esl_vec_DSum(d->alpha[q], d->K); esl_stats_Psi(sum_alpha, &psi1); /* psi1 = Psi(sum_alpha[q]) */ for (m = 0; m < data->nc; m++) { sum_c = esl_vec_DSum(data->c[m], d->K); esl_stats_Psi(sum_alpha+sum_c, &psi2); /* psi2 = Psi(sum_alpha[q] + sum_c[m]) */ /* val is pq * P(c_m | alpha_q) */ esl_dirichlet_LogProbData(data->c[m], d->alpha[q], d->K, &val); /* derivative respect to the mixture coeficients */ /* term is pq * P(alpha_q | c_m) */ term = exp(val - valsum[m] + log(d->pq[q])); if (nq > 0) dp[q] += term - d->pq[q]; /* derivative respect to the dirichlet parameters */ for (x = 0; x < d->K; x++) { esl_stats_Psi(d->alpha[q][x]+data->c[m][x], &psi3); /* psi3 = Psi(sum_alpha[q][x]+ c[m][x]) */ esl_stats_Psi(d->alpha[q][x], &psi4); /* psi4 = Psi(sum_alpha[q][x]+ c[m][x]) */ dp[nq + q*d->K + x] += term * d->alpha[q][x] * (psi1 - psi2 + psi3 - psi4); } } } /* Return the negative, because we're minimizing the NLP, not maximizing. */ for (q = 0; q < nq; q++) { if (isnan(dp[q])) esl_fatal("dp for pq[%d] is NaN", q); dp[q] *= -1.; } for (q = 0; q < d->N; q++) for (x = 0; x < d->K; x++) { if(isnan(dp[nq + q*d->K + x])) esl_fatal("dp for alpha[%d][%d] is NaN", q, x); dp[nq + q*d->K + x] *= -1.0; } free(valsum); } /* Function: esl_mixdchlet_Fit() * * Purpose: Given a count vector , and an initial guess for * a mixdchlet, find maximum likelihood parameters * by conjugate gradient descent optimization, starting * from and leaving the final optimized solution in * . * * Returns: on success, and contains the fitted * mixdchlet parameters. * * Throws: on allocation error, and is left in * in its initial state. */ int esl_mixdchlet_Fit(double **c, int nc, ESL_MIXDCHLET *d, int be_verbose) { struct mixdchlet_data data; double *p = NULL; double *u = NULL; double *wrk = NULL; double tol; double fx; int np; /* number of parameters to optimize */ int nq; /* number the mixture components to optimize */ int i; int status; /* nothing to optimize for a dirichlet of K = 1 (alphabet size = 1)*/ if (d->K == 1) return eslOK; tol = 1e-6; /* Allocate parameters */ nq = (d->N > 1)? d->N : 0; np = nq + d->N*d->K; ESL_ALLOC(p, sizeof(double) * np); ESL_ALLOC(u, sizeof(double) * np); ESL_ALLOC(wrk, sizeof(double) * np * 4); /* Copy shared info into the "data" structure */ data.d = d; data.c = c; data.nc = nc; /* From d, create the parameter vector. */ mixdchlet_pack_paramvector(p, np, d); /* Define the step size vector u. */ for (i = 0; i < np; i++) u[i] = 0.1; /* Feed it all to the mighty optimizer. */ status = esl_min_ConjugateGradientDescent(p, u, np, &mixdchlet_complete_func, &mixdchlet_complete_gradient, (void *) (&data), tol, wrk, &fx); if (status != eslOK && status != eslENOHALT) // eslENOHALT? Then take what we've got - it's probably pretty good goto ERROR; /* Convert the final parameter vector back to a mixdchlet */ mixdchlet_unpack_paramvector(p, np, d); free(p); free(u); free(wrk); return eslOK; ERROR: if (p != NULL) free(p); if (u != NULL) free(u); if (wrk != NULL) free(wrk); return status; } /* Function: esl_mixdchlet_Fit_Multipass() * * Purpose: Given a set of count vectors , find maximum * likelihood mixdchlet parameters. A number * of initial guesses for a mixdchlet are used, * with conjugate gradient descent performed for * each guess. The mixdchlet returned is the one * among these multiple local searches with * best likelihood. This is a convenience * function, which simply wraps * for multiple start points. * * Args: r - pointer to random generator * c - set of count vectors, [0..M-1][0..N-1] * nc - number of count samples * reps - number of random starting points * best_md - an initialized mixdchlet, which will * contain the correct q and alpha values * at completion * verbose - if >0, output is verbose * * Returns: on success, and contains the fitted * mixdchlet parameters with best likelihood. * * Throws: on allocation error, and the state of * is undefined. */ int esl_mixdchlet_Fit_Multipass(ESL_RANDOMNESS *rng, double **c, int nc, int reps, ESL_MIXDCHLET *best_md, int verbose) { ESL_MIXDCHLET *md = esl_mixdchlet_Create(best_md->N, best_md->K); double best_lk = -eslINFINITY; int err_cnt = 0; int i, q, k; double lk; int status; for (i = 0; i < reps; i++) { /* for each pass, establish a new random starting point */ if (( status = esl_dirichlet_DSampleUniform(rng, md->N, md->pq)) != eslOK) goto ERROR; for (q = 0; q < md->N; q++) for (k = 0; k < md->K; k++) md->alpha[q][k] = 10.0 * esl_rnd_UniformPositive(rng); /* then use Fit to do local search */ status = esl_mixdchlet_Fit(c, nc, md, 0); if (status != eslOK) { err_cnt++; if (err_cnt==2*reps) { goto ERROR; } else { i--; /* try another starting point */ continue; } } esl_dirichlet_LogProbDataSet_Mixture (nc, c, md, &lk); if (verbose) { fprintf(stderr, "Repetition # %d\n------------\n", i); esl_mixdchlet_Dump(stderr, md); fprintf(stderr, "llk = %.3f (vs best = %.3f)\n", lk, best_lk); } if (lk > best_lk) { if (verbose) fprintf(stderr, "... so copy md -> best_md\n"); best_lk = lk; esl_mixdchlet_Copy(md, best_md); } } if (verbose) { fprintf(stdout, "\n\n----------------\nbest mixture:\n"); esl_mixdchlet_Dump(stdout, best_md); fprintf(stdout, "llk = %.3f", best_lk); } esl_mixdchlet_Destroy(md); return eslOK; ERROR: esl_mixdchlet_Destroy(md); return status; } /*----------- end, Dirichlet Maximum likelihood fit from counts ---------------*/ /***************************************************************** *# 3. Sampling from Dirichlets: requires *****************************************************************/ /* Function: esl_dirichlet_DSample() * * Purpose: Given a Dirichlet density parameterized by $\alpha[0..K-1]$, * sample a probability vector $p[0..K-1]$ from * $P(p \mid \alpha)$. * * Args: r - random number generation object * alpha - parameters of Dirichlet density [0..K-1] * K - vector size * p - RETURN: sampled probability vector * (caller allocates 0..K-1). * * Returns: , and

will contain the sampled vector. */ int esl_dirichlet_DSample(ESL_RANDOMNESS *r, double *alpha, int K, double *p) { int x; for (x = 0; x < K; x++) p[x] = esl_rnd_Gamma(r, alpha[x]); esl_vec_DNorm(p, K); return eslOK; } /* Function: esl_dirichlet_FSample() * * Purpose: Same as , except it * works in single-precision floats, not doubles. */ int esl_dirichlet_FSample(ESL_RANDOMNESS *r, float *alpha, int K, float *p) { int x; for (x = 0; x < K; x++) p[x] = (float) esl_rnd_Gamma(r, (double) alpha[x]); esl_vec_FNorm(p, K); return eslOK; } /* Function: esl_dirichlet_DSampleUniform() * * Purpose: Sample a probability vector $p[0..K-1]$ uniformly, by * sampling from a Dirichlet of $\alpha_i = 1.0 \forall i$. * * Args: r - source of random numbers * K - vector size * p - RETURN: sampled prob vector, caller alloc'ed 0..K-1 * * Returns: , and

will contain the sampled vector. * * Throws: (no abnormal error conditions) */ int esl_dirichlet_DSampleUniform(ESL_RANDOMNESS *r, int K, double *p) { int x; for (x = 0; x < K; x++) p[x] = esl_rnd_Gamma(r, 1.0); esl_vec_DNorm(p, K); return eslOK; } /* Function: esl_dirichlet_FSampleUniform() * * Purpose: Same as , except it * works in single-precision floats, not doubles. */ int esl_dirichlet_FSampleUniform(ESL_RANDOMNESS *r, int K, float *p) { int x; for (x = 0; x < K; x++) p[x] = (float) esl_rnd_Gamma(r, 1.0); esl_vec_FNorm(p, K); return eslOK; } /* Function: esl_dirichlet_SampleBeta() * * Purpose: Samples from a Beta(theta1, theta2) density, leaves answer * in . (Special case of sampling Dirichlet.) * * Returns: . */ int esl_dirichlet_SampleBeta(ESL_RANDOMNESS *r, double theta1, double theta2, double *ret_answer) { double p, q; p = esl_rnd_Gamma(r, theta1); q = esl_rnd_Gamma(r, theta2); *ret_answer = p / (p+q); return eslOK; } /*---------------- end, Dirichlet sampling ----------------------*/ /***************************************************************** *# 4. Reading mixture Dirichlets from files [requires esl_fileparser] *****************************************************************/ /* Function: esl_mixdchlet_Read() * * Purpose: Reads a mixture Dirichlet from an open stream , using the * token-based parser. * * The first two tokens are , the length of the Dirichlet parameter * vector(s), and , the number of mixture components. Then for * each of the mixture components , it reads a mixture coefficient * followed by Dirichlet parameters . * * This function may be called more than once on the same open file, * to read multiple different mixture Dirichlets from it (transitions, * match emissions, insert emissions, for example). * * Note: One reason this function takes an ESL_FILEPARSER instead of * a filename or an open FILE pointer is that file format errors * in Easel are non-fatal "normal" errors, and we want to record * an informative error message. The ESL_FILEPARSER has an error * buffer for this purpose. * * Returns: on success, and contains a new object * that the caller is responsible for free'ing. * * on 'normal' parse failure, in which case errbuf> * contains an informative diagnostic message, and linenumber> * contains the linenumber at which the parse failed. */ int esl_mixdchlet_Read(ESL_FILEPARSER *efp, ESL_MIXDCHLET **ret_pri) { ESL_MIXDCHLET *pri; int K; /* Dirichlet param vector size */ int N; /* number of mixture components */ char *tok; /* ptr to a whitespace-delim, noncomment token */ int toklen; /* length of a parsed token */ int status; /* return status of an Easel call */ int q; /* counter over mixture components (0..N-1) */ int i; /* counter over params (0..K-1) */ *ret_pri = pri = NULL; if ((status = esl_fileparser_GetToken(efp, &tok, &toklen)) != eslOK) goto ERROR; K = atoi(tok); if (K < 1) { sprintf(efp->errbuf, "Bad vector size %.32s", tok); goto ERROR; } if ((status = esl_fileparser_GetToken(efp, &tok, &toklen)) != eslOK) goto ERROR; N = atoi(tok); if (N < 1) { sprintf(efp->errbuf, "Bad mixture number %.32s", tok); goto ERROR; } pri = esl_mixdchlet_Create(N, K); if (pri == NULL) { sprintf(efp->errbuf, "mxdchlet alloc failed"); goto ERROR; } for (q = 0; q < N; q++) { if ((status = esl_fileparser_GetToken(efp, &tok, &toklen)) != eslOK) goto ERROR; pri->pq[q] = atof(tok); if (pri->pq[q] < 0.0 || pri->pq[q] > 1.0) { sprintf(efp->errbuf, "bad mixture coefficient %.32s", tok); goto ERROR; } for (i = 0; i < K; i++) { if ((status = esl_fileparser_GetToken(efp, &tok, &toklen)) != eslOK) goto ERROR; pri->alpha[q][i] = atof(tok); if (pri->alpha[q][i] <= 0.0) { sprintf(efp->errbuf, "Dirichlet params must be positive, got %.32s", tok); goto ERROR; } } } esl_vec_DNorm(pri->pq, N); *ret_pri = pri; return eslOK; ERROR: esl_mixdchlet_Destroy(pri); return eslEFORMAT; } /* Function: esl_mixdchlet_Write() * Synopsis: Write a mixture Dirichlet to an open output stream. * * Purpose: Write mixture Dirichlet to open output stream . * * Args: fp - open output stream * d - mixture Dirichlet to write * * Returns: on success. * * Throws: on any write error, such as filled disk. */ int esl_mixdchlet_Write(FILE *fp, ESL_MIXDCHLET *d) { int q,i; if (fprintf(fp, "%d %d\n", d->K, d->N) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "mixture dirichlet write failed"); for (q = 0; q < d->N; q++) { if (fprintf(fp, "%.3f ", d->pq[q]) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "mixture dirichlet write failed"); for (i = 0; i < d->K; i++) if (fprintf(fp, "%.3f ", d->alpha[q][i]) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "mixture dirichlet write failed"); if (fprintf(fp, "\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "mixture dirichlet write failed"); } return eslOK; } /*-------------- end, reading mixture Dirichlets ----------------*/ /***************************************************************** * 5. Unit tests *****************************************************************/ #ifdef eslDIRICHLET_TESTDRIVE static void utest_io(ESL_MIXDCHLET *d, double tol) { char *msg = "esl_dirichlet: io unit test failed"; ESL_MIXDCHLET *d2 = NULL; ESL_FILEPARSER *efp = NULL; FILE *fp = NULL; char tmpfile[16] = "esltmpXXXXXX"; /* Create a mixture Dirichlet file, as a named tmpfile. */ if (esl_tmpfile_named(tmpfile, &fp) != eslOK) esl_fatal(msg); if (esl_mixdchlet_Write(fp, d) != eslOK) esl_fatal(msg); fclose(fp); /* Read it back in */ if ((fp = fopen(tmpfile, "r")) == NULL) esl_fatal(msg); if ((efp = esl_fileparser_Create(fp)) == NULL) esl_fatal(msg); if (esl_mixdchlet_Read(efp, &d2) != eslOK) esl_fatal(msg); esl_fileparser_Destroy(efp); fclose(fp); if (esl_mixdchlet_Compare(d, d2, tol) != eslOK) esl_fatal(msg); esl_mixdchlet_Destroy(d2); remove(tmpfile); return; } static void utest_bild() { char *msg = "esl_dirichlet: BILD unit test failed"; ESL_MIXDCHLET *d = NULL; int K = 4; int N = 2; double *counts; double *mix; double *bg; double sc; /* Create a mixture Dirichlet */ if ((d = esl_mixdchlet_Create(N, K)) == NULL) esl_fatal(msg); //esl_vec_DSet(d->pq, N, 1.0/N); d->pq[0] = 0.4; d->pq[1] = 0.6; d->alpha[0][0] = 0.1; d->alpha[0][1] = 0.2; d->alpha[0][2] = 0.3; d->alpha[0][3] = 0.4; esl_vec_DSet(d->alpha[1], K, 1.0); //simulate count vector counts = malloc(K*sizeof(double)); counts[0] = 3.0; //2.2; counts[1] = 1.0; //0.9; counts[2] = 0.0; //4.5; counts[3] = 0.0; //3.0; //simulate background probabilities bg = malloc(K*sizeof(double)); esl_vec_DSet(bg, K, 1.0/K); //allocate working space mix = malloc(K*sizeof(double)); esl_mixdchlet_BILD_score(counts, K, N, d, mix, bg, &sc); if (esl_DCompare(sc, 0.701, 0.001) != eslOK) esl_fatal(msg); // fprintf(stderr, "Score is %.3f\n", sc); esl_mixdchlet_Destroy(d); free(bg); free(counts); free(mix); return; } /* * For any given sampling effort, there is always a possibility that the resulting * count vector will have a higher likelihood under the wrong component than under the * correct component. This unit test runs multiple inferences and only fail if * more of the inferences fail than is expected */ static void utest_inference(ESL_RANDOMNESS *r, ESL_MIXDCHLET *d, int ntrials, int ncounts, int be_verbose) { char *msg = "esl_dirichlet: inference unit test failed"; double *counts = malloc(sizeof(double) * d->K); double *probs = malloc(sizeof(double) * d->K); double *iq = malloc(sizeof(double) * d->N); double *ip = malloc(sizeof(double) * d->K); int qused, qguess; int c, i, q, j; double maxdeviation; int fail_cnt_1 = 0; int fail_cnt_2 = 0; int fail_cnt_3 = 0; for (j=0; jpq, d->N); //printf("qused=%1d\n", qused); esl_dirichlet_DSample(r, d->alpha[qused], d->K, probs); esl_vec_DSet(counts, d->K, 0.); for (c = 0; c < ncounts; c++) { i = esl_rnd_DChoose(r, probs, d->K); counts[i] += 1.; } /* First inference test: * classify by posterior inference on the sampled probability vector. */ for (q = 0; q < d->N; q++) { esl_dirichlet_LogProbProbs(probs, d->alpha[q], d->K, &(iq[q])); iq[q] += log(d->pq[q]); } qguess = esl_vec_DArgMax(iq, d->N); /* the MP guess from the probs */ //printf("qguess: %1d\n", qguess); if (qused != qguess) { fail_cnt_1++; } /* Second inference test: * classify by posterior inference on the sampled count vector; * then attempt to estimate the probability vector. */ esl_mixdchlet_MPParameters(counts, d->K, d, iq, ip); qguess = esl_vec_DArgMax(iq, d->N); /* the MP guess from the counts */ //printf("%1d\n", qguess); if (qused != qguess) { fail_cnt_2++; } for (i = 0; i < d->K; i++) ip[i] = fabs(ip[i] - probs[i]); /* ip[] is now the differences rel to probs */ maxdeviation = esl_vec_DMax(ip, d->K); // printf("maxdev=%.3f\n", maxdeviation); if (maxdeviation > 0.05) { fail_cnt_3++; } } if (fail_cnt_1 > 2 || fail_cnt_2 > 2 || fail_cnt_3 > 0) { char m1[100], m2[100], m3[100], m4[100], final_msg[500]; sprintf(m1, "Out of %d total trials:", ntrials); sprintf(m2, "* classification sampled probability vector, failed %d times", fail_cnt_1); sprintf(m3, "* classification sampled count vector, failed %d times", fail_cnt_2); sprintf(m4, "* gross error in posterior probs estimated from counts, %d times", fail_cnt_3); sprintf(final_msg, "%s\n%s\n%s\n%s\n%s\n", m1, m2, m3, m4, msg ); esl_fatal(final_msg); } free(counts); free(probs); free(iq); free(ip); return; } /* * Performs two tests: * (1) Check to see if the inferred mixdchlt is similar to true one; * (2) Check if the likelihood under the inferred mixdchlt is at least as good as under the true mixdchlt. * * Also, now calls the Fit routine multiple times (via esl_mixdchlet_Fit_Multipass), * since any single random starting point might lead to a terrible locally optimal mixdchlet */ static void utest_fit(ESL_RANDOMNESS *r, ESL_MIXDCHLET *d, int ntrials, int ncounts, double tol, int reps, int be_verbose) { char *msg ; // = "esl_dirichlet: fit unit test failed"; ESL_MIXDCHLET *id = NULL; double **counts; double *probs = malloc(sizeof(double) * d->K); int qused; int m; int c; int i; /* counter over params (0..K-1) */ counts = malloc(sizeof(double *) * ntrials); for (m = 0; m < ntrials; m ++) counts[m] = malloc(sizeof(double) * d->K); for (m = 0; m < ntrials; m ++) { /* Sample component, p vector, c vector from mixture Dirichlet */ qused = esl_rnd_DChoose(r, d->pq, d->N); esl_dirichlet_DSample(r, d->alpha[qused], d->K, probs); esl_vec_DSet(counts[m], d->K, 0.); for (c = 0; c < ncounts; c++) { i = esl_rnd_DChoose(r, probs, d->K); counts[m][i] += 1.; } #ifdef eslDIRICHLET_TESTDRIVE_PRINTCOUNTS printf ("%d ", m); for (i=0; iK; i++) printf ("%.2f ", counts[m][i]); printf("\n"); #endif /*eslDIRICHLET_TESTDRIVE_PRINTCOUNTS*/ } /* Start with a random id, use the counts to infer d by * maximum likelihood gradient descent. * Generate a random starting point, alphas range from 0..10. */ id = esl_mixdchlet_Create(d->N, d->K); /* optimize id */ // esl_mixdchlet_Fit(counts, ntrials, id, be_verbose); esl_mixdchlet_Fit_Multipass(r, counts, ntrials, reps, id, 0); double lp_true; esl_dirichlet_LogProbDataSet_Mixture (ntrials, counts, d, &lp_true); double lp_inf; esl_dirichlet_LogProbDataSet_Mixture (ntrials, counts, id, &lp_inf); //Test if the likelihood under the inferred model is at least as good as the //likelihood under the true model int lk_ok = eslOK; if (lp_true > lp_inf +.00001) lk_ok = eslFAIL; //Test if the inferred q and alpha values are close // (note: "close" is relative - under the default conditions, they're all // within 35% of the true value) int alphas_ok = esl_mixdchlet_Compare(d, id, tol); if (lk_ok== eslFAIL || alphas_ok==eslFAIL) { fprintf(stderr, "\nGiven dirichlet\n"); esl_mixdchlet_Dump(stderr, d); fprintf (stderr, "logP = %.5f\n\n", lp_true); fprintf(stderr, "\nInferred dirichlet\n"); esl_mixdchlet_Dump(stderr, id); fprintf (stderr, "logP = %.5f\n\n", lp_inf); if (lk_ok==eslFAIL) msg = "esl_dirichlet: fit unit test failed (likelihood)"; else msg = "esl_dirichlet: fit unit test failed (similarity tolerance exceeded)"; esl_fatal(msg); } for (m = 0; m < ntrials; m ++) free(counts[m]); free(counts); free(probs); esl_mixdchlet_Destroy(id); return; } #endif /*eslDIRICHLET_TESTDRIVE*/ /*--------------------- end, unit tests -------------------------*/ /***************************************************************** * 6. Test driver *****************************************************************/ #ifdef eslDIRICHLET_TESTDRIVE /* * gcc -g -Wall -I. -L. -o esl_dirichlet_utest -DeslDIRICHLET_TESTDRIVE esl_dirichlet.c -leasel -lm * ./esl_dirichlet_utest */ #include "easel.h" #include "esl_fileparser.h" #include "esl_getopts.h" #include "esl_random.h" #include "esl_dirichlet.h" /* Note that the RNG seed of 10 is carefully chosen to make the stochastic * tests work reproducibly. Other choices will tend to fail. */ static ESL_OPTIONS options[] = { /* name type default env range toggles reqs incomp help docgroup*/ { "-h", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "show brief help on version and usage", 0 }, { "-s", eslARG_INT, "10", NULL, NULL, NULL, NULL, NULL, "set random number seed to ", 0 }, { "-t", eslARG_REAL, ".35", NULL, NULL, NULL, NULL, NULL, "tolerance for real-value equality comparisons", 0 }, { "-C", eslARG_INT, "2", NULL, NULL, NULL, NULL, NULL, "number of components in test mixture D'chlets", 0 }, { "-K", eslARG_INT, "6", NULL, NULL, NULL, NULL, NULL, "alphabet size in test mixture D'chlets", 0 }, { "-N", eslARG_INT, "1000", NULL, NULL, NULL, NULL, NULL, "number of sample counts in mixture D'chlet tests", 0 }, { "-T", eslARG_INT, "100", NULL, NULL, NULL, NULL, NULL, "number of trials of mixture D'chlet tests", 0 }, { "-R", eslARG_INT, "5", NULL, NULL, NULL, NULL, NULL, "number of repetitions of the D'chlet fitting procedure", 0 }, { "-v", eslARG_NONE, NULL, NULL, NULL, NULL, NULL, NULL, "show verbose output", 0 }, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, }; static char usage[] = "[-options]"; static char banner[] = "test driver for dirichlet module"; int main(int argc, char **argv) { char *msg = "esl_dirichlet unit test failed"; ESL_GETOPTS *go = esl_getopts_CreateDefaultApp(options, 0, argc, argv, banner, usage); ESL_RANDOMNESS *r = esl_randomness_Create(esl_opt_GetInteger(go, "-s")); ESL_MIXDCHLET *d = NULL; int K = esl_opt_GetInteger(go, "-K"); int ncounts = esl_opt_GetInteger(go, "-N"); int ntrials = esl_opt_GetInteger(go, "-T"); int nfit_reps = esl_opt_GetInteger(go, "-R"); double tol = esl_opt_GetReal (go, "-t"); int be_verbose = esl_opt_GetBoolean(go, "-v"); if (be_verbose) printf("rng seed = %" PRIu32 "\n", esl_randomness_GetSeed(r)); /* Create a two-component mixture Dirichlet for testing */ if ((d = esl_mixdchlet_Create(2, K)) == NULL) esl_fatal(msg); esl_vec_DSet(d->pq, 2, 0.5); esl_vec_DSet(d->alpha[0], K, 1.0); esl_vec_DSet(d->alpha[1], K, 0.1); utest_io(d, tol); utest_fit(r, d, ntrials, ncounts, tol, nfit_reps, be_verbose); utest_inference(r, d, ntrials, ncounts, be_verbose); utest_bild(); esl_randomness_Destroy(r); esl_mixdchlet_Destroy(d); esl_getopts_Destroy(go); return 0; } #endif /*eslDIRICHLET_TESTDRIVE*/ /*--------------------- end, test driver ------------------------*/ /***************************************************************** * 7. Example *****************************************************************/ #ifdef eslDIRICHLET_EXAMPLE /*::cexcerpt::dirichlet_example::begin::*/ #include #include #include "easel.h" #include "esl_random.h" #include "esl_fileparser.h" #include "esl_vectorops.h" #include "esl_dirichlet.h" int main(int argc, char **argv) { FILE *fp; ESL_FILEPARSER *efp; ESL_RANDOMNESS *r; ESL_MIXDCHLET *pri; int c,i,q,qused; double *counts, *probs, *iq, *ip; /* Read in a mixture Dirichlet from a file. */ fp = fopen(argv[1], "r"); efp = esl_fileparser_Create(fp); if (esl_mixdchlet_Read(efp, &pri) != eslOK) { fprintf(stderr, "%s;\ndirichlet file %s parse failed at line %d\n", efp->errbuf, argv[1], efp->linenumber); exit(1); } esl_fileparser_Destroy(efp); fclose(fp); /* Allocate some working spaces */ probs = malloc(sizeof(double) * pri->K); counts = malloc(sizeof(double) * pri->K); iq = malloc(sizeof(double) * pri->N); ip = malloc(sizeof(double) * pri->K); /* Sample a probability vector from it. */ r = esl_randomness_Create(0); /* init the random generator */ qused = esl_rnd_DChoose(r, pri->pq, pri->N); /* sample a component */ esl_dirichlet_DSample(r, pri->alpha[qused], pri->K, probs); printf("Component %2d: p[] = ", qused); for (i = 0; i < pri->K; i++) printf("%.3f ", probs[i]); printf("\n"); /* Sample a count vector from that prob vector. */ esl_vec_DSet(counts, pri->K, 0.); for (c = 0; c < 20; c++) counts[esl_rnd_DChoose(r, probs, pri->K)] += 1.; printf(" c[] = "); for (i = 0; i < pri->K; i++) printf("%5.0f ", counts[i]); printf("\n"); /* Estimate a probability vector (ip) from those counts, and * also get back the posterior prob P(q|c) of each component (iq). */ esl_mixdchlet_MPParameters(counts, pri->K, pri, iq, ip); printf(" reestimated p[] = "); for (i = 0; i < pri->K; i++) printf("%.3f ", ip[i]); printf("\n"); q = esl_vec_DArgMax(iq, pri->N); printf("probably generated by component %d; P(q%d | c) = %.3f\n", q, q, iq[q]); esl_mixdchlet_Destroy(pri); esl_randomness_Destroy(r); free(probs); free(counts); free(iq); free(ip); return 0; } /*::cexcerpt::dirichlet_example::end::*/ #endif /*eslDIRICHLET_EXAMPLE*/