#!@WHICHPERL@ # AUTHOR: Philip Machanick # CREATE DATE: 20 October 2009 # # centre sequences selected and trim the excess if they are # wider than a given threshold. If string to be printed # contains only ambiguous characters, nothing is printed. use warnings; use strict; use lib qw(@PERLLIBDIR@); use Fcntl qw(O_CREAT O_TRUNC O_WRONLY); use Getopt::Long; use Pod::Usage; use Alphabet qw(dna protein rna); use ReadFastaFile; =head1 NAME fasta-center - Extracts the center of sequences. =head1 SYNOPSIS fasta-center [options] Options: -dna the sequences use the DNA alphabet -protein the sequences use the protein alphabet -rna the sequences use the RNA alphabet -alph file with the alphabet definition -len length of sequences to output; default: 100 -flank output flanking sequences to -reject output rejected sequences to -h print this help message and exit Reads sequences in FASTA format. For each sequence, it outputs the length portion of the sequence centred on the original sequence. If any sequence is less than long, it is output in its entirety. Flanking sequences, if output, each have a FASTA name starting with the name of the original sequence, with "-L" appended for the left flanking sequence and "-R" for the right flanking sequence. When an alphabet is specified the sequences are validated and sequences consisting of nothing but ambiguous symbols are rejected and optionally written to the reject file. Reads from standard input. Writes to standard output. =cut # Set option defaults my $help = 0; # FALSE my $opt_dna = 0; # FALSE my $opt_protein = 0; # FALSE my $opt_rna = 0; # FALSE my $opt_alph_file = undef; my $center_size = 100; my $opt_flank_file = undef; my $opt_reject_file = undef; # read options GetOptions( "help|?" => \$help, "dna" => \$opt_dna, "protein" => \$opt_protein, "rna" => \$opt_rna, "alph=s" => \$opt_alph_file, "len=i" => \$center_size, "flank=s" => \$opt_flank_file, "reject=s" => \$opt_reject_file, ) or pod2usage(2); # test for help message request pod2usage(1) if $help; # test that the alphabet is only set once if (($opt_dna and $opt_protein) or ($opt_dna and defined($opt_alph_file)) or ($opt_protein and defined($opt_alph_file))) { pod2usage("Options -dna, -protein or -alph are mutually exclusive"); } # test that the alphabet file is defined if (defined($opt_alph_file)) { unless (-s $opt_alph_file) { pod2usage("Option -alph requires an existing alphabet definition file"); } } # test that the length is positive if ($center_size <= 0) { pod2usage("Option -len only allows positive lengths"); } # load the alphabet my $alph = undef; if ($opt_dna) { $alph = dna(); } elsif ($opt_protein) { $alph = protein(); } elsif ($opt_rna) { $alph = rna(); } elsif (defined($opt_alph_file)) { $alph = new Alphabet($opt_alph_file); } # open the flank file my $flank_fh = undef; if (defined($opt_flank_file)) { sysopen($flank_fh, $opt_flank_file, O_CREAT | O_TRUNC | O_WRONLY) or die("Failed to open flank file \"$opt_flank_file\" for writing: $!"); } # open the reject file my $reject_fh = undef; if (defined($opt_reject_file)) { sysopen($reject_fh, $opt_reject_file, O_CREAT | O_TRUNC | O_WRONLY) or die("Failed to open reject file \"$opt_reject_file\" for writing: $!"); } # read each sequence my $line; my $header = undef; my $name = undef; my $seq = undef; my $warned = 0; #FALSE while ($line = ) { chomp $line; if ($line =~ m/^>(\S+)/) { # sequence start &process_sequence($header, $name, $seq) if defined $name; $header = $line; $name = $1; $seq = ''; } elsif (defined($seq)) { # sequence content $line =~ s/\s//g; # remove space from sequence # test sequence is valid in the alphabet if (defined($alph)) { my ($pos, $sym) = $alph->find_unknown($line); die("Sequence $name contains unknown symbol $sym\n") if (defined($pos)); } $seq .= $line unless $line eq ""; } else { # don't know what this is... warn("Content preceeds the sequences!\n") unless $warned; $warned = 1; #TRUE } } &process_sequence($header, $name, $seq) if defined $name; # close files close($flank_fh) if defined $flank_fh; close($reject_fh) if defined $reject_fh; # process each sequence and output the central part sub process_sequence { my ($header, $name, $seq) = @_; # get the center of the sequence my $seq_len = length($seq); if ($seq_len <= $center_size) { # sequence is smaller than the central size # check sequence contains some useful content and is not just ambiguity characters unless (defined($alph) && $alph->is_ambig_seq($seq)) { print $header, "\n", $seq, "\n"; } elsif (defined($reject_fh)) { print $reject_fh $header, "\n", $seq, "\n"; } } else { # sequence is larger than the central size # calculate the offset of the central portion and extract it my $offset = int(($seq_len - $center_size) / 2); my $seq_center = substr($seq, $offset, $center_size); # check central sequence contains some useful content and is not just ambiguity characters unless (defined($alph) && $alph->is_ambig_seq($seq_center)) { print $header, "\n", $seq_center, "\n"; if (defined($flank_fh)) { # print the flanking regions print $flank_fh ">", $name, "-L\n", substr($seq, 0, $offset), "\n"; print $flank_fh ">", $name, "-R\n", substr($seq, $offset + $center_size), "\n"; } } elsif (defined($reject_fh)) { print $reject_fh $header, "\n", $seq, "\n"; } } } 1;