/*********************************************************************** * * * MEME * * Copyright 1994-2017, The Regents of the University of California * * Author: Timothy L. Bailey * * * ***********************************************************************/ /* 9-30-99 tlb; add sequence to SAMPLE */ /* 6-29-99 tlb; add resic */ /* 9-22-08 pm; add posprior */ /* Supports FASTA and SWISSPROT formats. Note: Assumes SWISSPROT format if "ID" appears at beginning of first line. Ignores duplicate (same ) sequences. All formats: := + The *unique* name of the sequence. May *not* contain whitespace! NOTE: if is "WEIGHTS", the field is parsed as a string of sequence weights and any field (if any) is ignored. All weights are assigned in order to the sequences in the file. If there are more sequences than weights, the remainder are given weight one. Weights must be greater than zero and less than or equal to one. Weights may be specified by more than one "WEIGHTS" entry which may appear anywhere in the file. := + A description of the sequence; may contain whitespace. := + The DNA or protein sequence data; may contain whitespace. := * Text which is ignored. Pearson FASTA format: INPUT DATA FILE FORMAT: [

+ ]*

:= ">"" " Everything up to the first space is assumed to be the unique name of the sequence. SWISSPROT Format [ + + // ]* := "ID "" " := "DE " := "SQ " */ #include "meme.h" #include "read_sequence.h" #include "ushuffle.h" #include "seq.h" static void parse_sequence_weights( DATASET *dataset, char *sample_name, char *sample_de, char *sequence ); void set_database_freq_and_entropy( DATASET *dataset, bool use_comp /* use complementary strands, too */ ); static inline int compare_res( const uint8_t *res1, int len1, const uint8_t *res2, int len2 ); extern inline int compare_samples_by_res ( const void *elem1, const void *elem2 ); static SAMPLE *create_sample( ALPH_T *alph, /* the alphabet */ long length, /* length of sequence */ char *name, /* name of sample */ char *sequence, /* the sequence */ char *descript, // Description of the sample bool revcomp // make space for reverse complement ); /* local variables */ // JIJ: this was 100 000 but I changed it to 100 003 to make it prime because // apparently prime numbers are better for avoiding hash collisions. #define DATA_HASH_SIZE 100003 // The hash table will be appended to if read_seq_file is called more // than once. This is so that positive sequences the negative sequence file // will be ignored. static HASH_TABLE ht_seq_names = NULL; /* hash of dataset seq names */ /**********************************************************************/ /* get_sample_by_name Check if sample name is defined in ht_seq_names hash table */ /**********************************************************************/ SAMPLE *get_sample_by_name( char *sample_name ) { HASH_TABLE_ENTRY * hash_entry = hash_lookup_str(sample_name, ht_seq_names); return(hash_entry != NULL ? (SAMPLE *) hash_get_entry_value(hash_entry) : NULL); } // get_sample_by_name DATASET *create_meme_dataset_from_momo( ARRAYLST_T* seq_array, /* name of file to open */ ALPH_T *alph, /* alphabet used in sequences */ int width, bool eliminate_repeats) { int i, j; char *sample_name; /* name of sample read */ char *sample_de; /* descriptor text for sample */ char *sequence; /* sequence read */ long length = (long) width; /* length of sequence */ bool error=false; /* none yet */ /* create a dataset */ DATASET *dataset = (DATASET *) mm_malloc(sizeof(DATASET)); dataset->alph = alph_hold(alph); dataset->psp_w = 0; // indicates no PSP was read dataset->log_psp_w = 0; // so log_psp will get initialized dataset->samples = NULL; /* no samples */ dataset->n_samples = 0; /* no samples yet */ dataset->n_group[0] = dataset->n_group[1] = dataset->n_group[2] = 0; dataset->seq_weights = NULL; // user-provideed sequence weights dataset->n_wgts = 0; /* initialize maximum length of sequences */ dataset->max_slength = 0; dataset->min_slength = 10000000; printf("here1\n"); for (i = 0; i < arraylst_size(seq_array); ++i) { printf("here\n"); SEQ_T* seqobject = eliminate_repeats ? hash_get_entry_value(arraylst_get(i, seq_array)) : arraylst_get(i, seq_array); sample_name = get_seq_name(seqobject); sample_de = get_seq_description(seqobject); sequence = get_raw_sequence(seqobject); // HASH_TABLE_ENTRY *hash_entry; // needed to add pointer to sample /* create the sample */ SAMPLE *sample = create_sample(alph, length, sample_name, sequence, sample_de, false); if (sample == NULL) {error = true; continue;} // Record the original index of the sample so we can // access it after the dataset is shuffled. sample->orig_index = dataset->n_samples; // All samples are in Group 0 initially. sample->group = 0; /* record maximum length of actual sequences */ dataset->max_slength = length; dataset->min_slength = length; printf("min_slength: %lu\n", dataset->min_slength); /* put the sample in the array of samples */ if ((dataset->n_samples % RCHUNK) == 0) { Resize(dataset->samples, dataset->n_samples + RCHUNK, SAMPLE *); } dataset->samples[dataset->n_samples++] = sample; /* cleanup sequence (create_sample copies it) */ myfree(sequence); } /* sequences */ if (length < 0) error = true; /* read_sequence error */ /* check that datafile contained at least one sample */ if (!error) { if (dataset->n_samples == 0) { fprintf(stderr, "No sequences found in files. Check file format.\n"); error = true; } } /* exit if there was an error */ if (error) exit(1); /* resize the array of samples and intialize the size of Group 0*/ if (dataset->n_samples) Resize(dataset->samples, dataset->n_samples, SAMPLE*); dataset->n_group[0] = dataset->n_samples; // Initially these are equal. dataset->input_order = dataset->samples; // Set the frequencies and entropies of the database. set_database_freq_and_entropy(dataset, false); return dataset; } // create_dataset_from_momo /**********************************************************************/ /* read_seq_file Open a sequence file and read in the sequences. Returns a dataset-> setup_hash_alphabet must have been called prior to first call to this. */ /**********************************************************************/ DATASET *read_seq_file( char *file_name, /* name of file to open */ ALPH_T *alph, /* alphabet used in sequences */ bool use_comp, /* use complementary strands, too */ bool ignore_dup // allow duplicate names ) { int i, j; char *sample_name; /* name of sample read */ char *sample_de; /* descriptor text for sample */ char *sequence; /* sequence read */ long length; /* length of sequence */ bool error=false; /* none yet */ /* create a hash table of sequence names */ if (!ht_seq_names) ht_seq_names = hash_create(DATA_HASH_SIZE, NULL); /* create a dataset */ DATASET *dataset = (DATASET *) mm_malloc(sizeof(DATASET)); dataset->alph = alph_hold(alph); dataset->invcomp = use_comp; dataset->seed = 0; dataset->psp_w = 0; // indicates no PSP was read dataset->log_psp_w = 0; // so log_psp will get initialized dataset->samples = NULL; /* no samples */ dataset->input_order = NULL; // no samples yet dataset->n_samples = 0; /* no samples yet */ dataset->n_group[0] = dataset->n_group[1] = dataset->n_group[2] = 0; dataset->seq_weights = NULL; // user-proved sequence weights dataset->n_wgts = 0; /* initialize maximum length of sequences */ dataset->max_slength = 0; dataset->min_slength = 10000000; /* open data file */ FILE *data_file; /* file pointer */ if (file_name == NULL) { fprintf(stderr, "You must specify a data file or `stdin'.\n"); exit(1); } else if (strcmp(file_name, "stdin")) { data_file = fopen(file_name, "r"); if (data_file == NULL) { fprintf(stderr, "Cannot open file `%s'.\n", file_name); exit(1); } } else { data_file = stdin; } while (read_sequence(alph, data_file, &sample_name, &sample_de, &sequence, &length)) { /* skip sequence if an error occurred */ if (length < 0) continue; /* parse weights if given; make (more than enough) room in array */ if (strcmp(sample_name, "WEIGHTS")==0) { parse_sequence_weights(dataset, sample_name, sample_de, sequence); continue; } /* ignore duplicate (same sample name) sequences */ if (hash_lookup_str(sample_name, ht_seq_names) != NULL) { if (! ignore_dup) { fprintf(stderr, "Duplicate sequence name. Skipping '%s'.\n", sample_name); myfree(sample_name); myfree(sample_de); myfree(sequence); continue; } } else { // Don't free sample_name since the hash table doesn't copy it. hash_insert_str(sample_name, ht_seq_names); /* put name in hash table */ } HASH_TABLE_ENTRY *hash_entry; // needed to add pointer to sample /* create the sample */ SAMPLE *sample = create_sample(alph, length, sample_name, sequence, sample_de, use_comp); if (sample == NULL) {error = true; continue;} // Record the original index of the sample so we can // access it after the dataset is shuffled. sample->orig_index = dataset->n_samples; // All samples are in Group 0 initially. sample->group = 0; /* record maximum length of actual sequences */ dataset->max_slength = MAX(sample->length, dataset->max_slength); dataset->min_slength = MIN(sample->length, dataset->min_slength); /* put the sample in the array of samples */ if ((dataset->n_samples % RCHUNK) == 0) { Resize(dataset->samples, dataset->n_samples + RCHUNK, SAMPLE *); } dataset->samples[dataset->n_samples++] = sample; hash_entry = hash_lookup_str(sample_name, ht_seq_names); if (!hash_entry) { fprintf(stderr, "hash error: added sample ID then failed to find it in read_seq_file ()\n"); error = true; break; } hash_set_entry_value(sample, hash_entry); /* cleanup sequence (create_sample copies it) */ myfree(sequence); myfree(sample_name); myfree(sample_de); } /* sequences */ if (length < 0) error = true; /* read_sequence error */ /* check that datafile contained at least one sample */ if (!error) { if (dataset->n_samples == 0) { fprintf(stderr, "No sequences found in file `%s'. Check file format.\n", file_name); error = true; } } /* exit if there was an error */ if (error) exit(1); /* resize the array of samples and intialize the size of Group 0*/ if (dataset->n_samples) Resize(dataset->samples, dataset->n_samples, SAMPLE*); dataset->n_group[0] = dataset->n_samples; // set the left flank region to be the entire sequence dataset->n_region[0] = dataset->max_slength; dataset->n_group[1] = dataset->n_group[2] = 0; dataset->region_last_pos[0] = dataset->region_last_pos[1] = dataset->region_last_pos[2] = dataset->max_slength - 1; dataset->input_order = dataset->samples; // Set the frequencies and entropies of the database. set_database_freq_and_entropy(dataset, use_comp); return dataset; } /* read_seq_file */ /**********************************************************************/ /* shuffle_dataset_order Shuffle the order of the sequences in a dataset after first ordering them alphabetically by integer-encoding. (This makes sure the shuffled order is independent of strand and input order.) */ /**********************************************************************/ void shuffle_dataset_order( DATASET *dataset // dataset to shuffle ) { int i, j; int n = dataset->n_samples; SAMPLE **samples = dataset->samples; // Save an array containing pointers to the samples in their // input order. It initially is just a pointer to dataset->samples. if (dataset->input_order && dataset->input_order != dataset->samples) free(dataset->input_order); dataset->input_order = NULL; // Create new array. Resize(dataset->input_order, n, SAMPLE*); memcpy(dataset->input_order, samples, n*sizeof(SAMPLE*)); // With shuffling turned on, Classic sometimes gives different results // than <= v4.10.2 due to its previous (unintended) dependency on sequence order. if (1) { // Sort the samples alphabetically by their integer-encoding, // accounting for the possibility of two strands. qsort(samples, dataset->n_samples, sizeof(SAMPLE*), compare_samples_by_res); // Shuffle the order of the sorted SAMPLE pointers using Fisher-Yates. for (i=n-1; i>0; i--) { j = drand_mt() * (i + 1); SAMPLE *tmp = samples[i]; samples[i] = samples[j]; samples[j] = tmp; // swap } } else { fprintf(stderr, "Shuffling of sequence order is is SUPPRESSED. Turn it back on!!!\n"); } } // shuffle_dataset_order /**********************************************************************/ /* add_control_samples Add hsfrac fraction of the provided control samples to groups 1 and 2, split evenly. Do not free the control dataset after calling this. */ /**********************************************************************/ extern void add_control_samples ( DATASET *dataset, // primary dataset DATASET *control, // control dataset int c1, // size of control group 1 int c2, // size of control group 2 bool use_comp // consider both strands ) { int i; // Resize the samples array to make room. int n_control = c1 + c2; Resize(dataset->samples, dataset->n_samples + n_control, SAMPLE*); // Add each control sample to group 1 or 2. for (i=0; isamples[i]; // Because create_sample doesn't initialize logcumback // we need to do a direct link. That means never free the control dataset. SAMPLE *sample = s; // Add samples; note that size of group 0 doesn't change since // these are from a different (negative) dataset. if (i < c1) { sample->group = 1; dataset->n_group[1]++; } else { sample->group = 2; dataset->n_group[2]++; } /* put the sample in the array of samples */ dataset->samples[dataset->n_samples++] = sample; } int h1 = dataset->n_group[0]; // index of first sample in group 1 int h2 = h1 + c1; // index of first sample in group 2 dataset->group_last_idx[0] = h1-1; // index of last sequence in group 0 dataset->group_last_idx[1] = h2-1; // index of last sequence in group 1 dataset->group_last_idx[2] = h2+c2-1; // index of last sequence in group 2 // Reset the frequencies and entropies of the database. myfree(dataset->res_freq); set_database_freq_and_entropy(dataset, use_comp); } // add_control_samples /**********************************************************************/ /* parse_sequence_weights Read sequence weights from a FASTA header: >WEIGHTS w_1 w_2 ... w_n */ /**********************************************************************/ static void parse_sequence_weights( DATASET *dataset, char *sample_name, char *sample_de, char *sequence ) { double wgt; char *wgt_str = sample_de; Resize(dataset->seq_weights, dataset->n_wgts+(int)strlen(wgt_str), double); while (sscanf(wgt_str, "%lf", &wgt) == 1) { if (wgt <= 0 || wgt > 1) { fprintf(stderr, "Weights must be larger than zero and no greater than 1.\n"); exit(1); } dataset->seq_weights[dataset->n_wgts++] = wgt; /* save weight */ wgt_str += strspn(wgt_str, " "); /* skip white */ wgt_str += strcspn(wgt_str, " "); /* skip token */ } myfree(sample_name); myfree(sample_de); myfree(sequence); } // parse_sequence_weights /**********************************************************************/ /* set_database_freq_and_entropy Calculate the prior residue frequencies and entropies and |D|, size of the dataset tlb; 5/9/97 wgt_total_res and weighted res_freq tlb; 12-Jan-2017 make wgt_total_res non-ambiguous only */ /**********************************************************************/ void set_database_freq_and_entropy( DATASET *dataset, bool use_comp /* use complementary strands, too */ ) { int i, j; double *seq_weights = dataset->seq_weights; int n_wgts = dataset->n_wgts; ALPH_T *alph = dataset->alph; // alphabet // count residues dataset->res_freq = NULL; Resize(dataset->res_freq, alph_size_core(alph), double); for (i=0; i < alph_size_core(alph); i++) { dataset->res_freq[i] = 0; } dataset->total_res = 0; dataset->wgt_total_res = 0; for (i=0; in_samples; i++) { /* sequence */ long slen = dataset->samples[i]->length; double sw = dataset->samples[i]->sw = (n_wgts > i ? seq_weights[i] : 1); dataset->total_res += slen; //dataset->wgt_total_res += slen*sw; double core_count = 0; // number of non-ambigs in sequence for (j=0; j < alph_size_core(alph); j++) { core_count += dataset->samples[i]->counts[j]; if (use_comp) { /* using complementary strand as well */ dataset->res_freq[j] += sw * dataset->samples[i]->counts[j]/2.0; dataset->res_freq[alph_complement(alph, j)] += sw * dataset->samples[i]->counts[j]/2.0; } else { /* not using complementary strands */ dataset->res_freq[j] += sw * dataset->samples[i]->counts[j]; } } dataset->wgt_total_res += sw * core_count; // weighted non-ambigs in dataset } /* convert counts to frequencies */ for (i=0; i < alph_size_core(alph); i++) { dataset->res_freq[i] /= dataset->wgt_total_res; } //fprintf(stderr, "wgt_total_res %f\n", dataset->wgt_total_res); exit(1); } // set_database_freq_and_entropy /**********************************************************************/ /* shuffle_dataset_letters Shuffle the letters in each sequence in a dataset, preserving the k-mer counts in each sequence. Renames the sequences appending "_shuf_copy#". Modifies the dataset in place. */ /**********************************************************************/ void shuffle_dataset_letters( DATASET *dataset, // dataset to have sequences shuffled and replaced int kmer, // preserve k-mer counts in each sequence int revcomp // include reverse complements of sequences ) { int i,j,k; int n_samples = dataset->n_samples; SAMPLE **samples = dataset->samples; int n_copies = 1; int pad = log(n_copies)/log(10) + 1; int i_copy = 1; if (kmer > 0) { for (i=0; ilength; // Replace the sequence with its reverse complement if it is smaller. // Check this using the integer-encoded sequence due to letter aliases. if (revcomp && compare_res(s->resic, s->length, s->res, s->length) < 0) { char *rc = NULL; Resize(rc, length+1, char); for (j=0, k=length-1; jalph, s->seq[k]); rc[length] = '\0'; SAMPLE *new = create_sample(dataset->alph, s->length, s->sample_name, rc, s->descript, revcomp); myfree(rc); // free local storage free_sample(s); // free larger sample s = new; // replace the sample } // Create the new sample name. char *name = NULL; Resize(name, strlen(s->sample_name) + pad + 14, char); sprintf(name, "%s_shuf_%d", s->sample_name, i_copy); // Create a shuffled copy of the sequence. char *shuf = NULL; Resize(shuf, length+1, char); shuf[length] = '\0'; //FIXME replace with simple call to ushuffle() ushuffle1(s->seq, length, kmer); ushuffle2(shuf); // Create a new sample. SAMPLE *new = create_sample(dataset->alph, length, name, shuf, s->descript, revcomp); // Replace sample with a shuffled version. samples[i] = new; myfree(name); // free local storage myfree(shuf); // free local storage free_sample(s); // free old sample } } } // shuffle_dataset_letters /**********************************************************************/ /* compare_res Compare integer-encoded sequences. */ /**********************************************************************/ static inline int compare_res( const uint8_t *res1, int len1, const uint8_t *res2, int len2 ) { int i; int len = MIN(len1, len2); for (i=0; i res2[i]) { return(+1); } } // res1 == res2 if (len1 < len2) { return(-1); } else if (len1 > len2) { return(+1); } else { return(res1 < res2); // Break ties using address. } } // compare_res /**********************************************************************/ /* compare_samples_by_res Compare two samples based on their integer-coded sequences. */ /**********************************************************************/ inline int compare_samples_by_res ( const void *elem1, const void *elem2 ) { const uint8_t *res1 = (*(SAMPLE **)elem1)->res; const uint8_t *resic1 = (*(SAMPLE **)elem1)->resic; const int len1 = (*(SAMPLE **)elem1)->length; const uint8_t *res2 = (*(SAMPLE **)elem2)->res; const uint8_t *resic2 = (*(SAMPLE **)elem2)->resic; const int len2 = (*(SAMPLE **)elem2)->length; // Replace res with MIN(res, resic) to make // comparison insensitive to strand given. if (resic1 && compare_res(resic1, len1, res1, len1) < 0) res1 = resic1; if (resic2 && compare_res(resic2, len2, res2, len2) < 0) res2 = resic2; // Now compare the integer-encoded sequences. return (compare_res(res1, len1, res2, len2)); } // compare_samples_by_res /**********************************************************************/ /* create_sample Create a sample. Returns the sample or NULL on error. */ /**********************************************************************/ static SAMPLE *create_sample( ALPH_T *alph, /* the alphabet */ long length, /* length of sequence */ char *name, /* name of sample (is duplicated here) */ char *sequence, /* the sequence (is duplicated here) */ char *descript, // Description of the sample (is duplicated here) bool revcomp // make space for reverse complement ) { long i, j; SAMPLE *new1; /* disallow zero length samples */ if (length == 0) { fprintf(stderr, "\nZero length sequences not allowed. (Sequence `%s').\n", name); return NULL; } /* create the record to hold the sample and its associated data */ new1 = (SAMPLE *) mm_malloc(sizeof(SAMPLE)); /* assign the name and sequence data */ new1->sample_name = strdup(name); new1->descript = strdup(descript); new1->seq = strdup(sequence); new1->length = length; new1->sw = 1; /* set up encoded version of sequence and weights */ new1->res = (uint8_t *) mm_malloc(length * sizeof(char)); if (revcomp) { new1->resic = (uint8_t *) mm_malloc(length * sizeof(char)); } else { new1->resic = NULL; } new1->weights = (WEIGHTS_T *) mm_malloc(length * sizeof(WEIGHTS_T)); for (i=0; ires[i] = e; // FIXME: make resic obsolete if (new1->resic) new1->resic[length-i-1] = alph_complement(alph, e); new1->weights[i] = 1.0; } /* set up erasing arrays */ new1->not_o = (double *) mm_malloc(length * sizeof(double)); new1->log_not_o = (int *) mm_malloc(length * sizeof(int)); /* set up arrays to hold posterior probabilities */ create_2array(new1->pY, int, 3, length); new1->pYic = (char *) mm_malloc(length * sizeof(char)); /* Set up Z array and log_psp arrays */ if (revcomp) { // Offset pointer -lseq so that z[j], j in [-lseq...+lseq] new1->z_buf = mm_malloc(sizeof(Z_T) * (2*length+1)); new1->z = (new1->z_buf)+length; } else { new1->z_buf = mm_malloc(sizeof(Z_T) * (length+1)); new1->z = new1->z_buf; } new1->log_psp_buf = new1->log_psp = NULL; new1->psp_original_buf = new1->psp_original = NULL; new1->max_log_psp = LOG(0); /* set up array to hold character counts and get character counts */ new1->counts = (double *) calloc(alph_size_core(alph), sizeof(double)); for (i=0; ires[i]; if (e != alph_wild(alph)) { // normal letter new1->counts[e]++; } } new1->logcumback = (LCB_T *) mm_malloc((length+1) * sizeof(LCB_T)); new1->nsites = 0; new1->sites = NULL; new1->minpv = 1; new1->group = 0; return new1; } /* create_sample */ void free_sample(SAMPLE *sample) { myfree(sample->sample_name); myfree(sample->descript); myfree(sample->seq); myfree(sample->res); myfree(sample->resic); myfree(sample->pYic); myfree(sample->weights); myfree(sample->not_o); myfree(sample->log_not_o); myfree(sample->logcumback); myfree(sample->sites); free_2array(sample->pY, 3); myfree(sample->counts); myfree(sample->z_buf); // Note: don't ever free ->z myfree(sample->psp_original_buf); // Note: don't ever free ->psp_original myfree(sample->log_psp_buf); // Note: don't ever free ->log_psp myfree(sample->logcumback); free(sample); } // free_sample