#!/usr/local/bin/perl # $Id: caqc.pl,v 1.31 2009/05/15 17:57:26 skoren Exp $ # # This program reads a Celera .asm file and produces aggregate information # about the assembly # # Written by Mihai Pop, Martin Shumway # # #use warnings; use strict; use Getopt::Long; use IO::File; use File::Basename; use Statistics::Descriptive; use File::Copy; use Math::BigFloat; use FindBin qw($Bin); my $MY_VERSION = "caqc Version 2.13 (Build " . (qw/$Revision: 1.31 $/)[1] . ")"; # Constants my $MINQUAL = 20; my $MINCONTIG = 10000; my @help_lines = undef; my $help_file = $Bin . '/caqc_help.ini'; my $MY_HELPTEXT = qq~ Generate quality statistics from the specified Celera assembly .asm file. caqc [options] caqc reads the .asm, which is the output file of the Celera Assembler. caqc requires that the file be in the current directory. options: -minqual Minimum quality value threshhold to report as bad quality (default $MINQUAL) -mincontig Minimum contig size to report as a big contig (default $MINCONTIG) -d Specify the record delimiter tagvalue -s Specify the list delimiter el[el ...] -h Display help information and exit -h Display help on output qc term. Use 'all' for all qc terms. -html Output qc help as html -silent Do not emit output to the stdout -euid Print EUIDs -g Genome size used in the calculation of N50 numbers (default: TotalBasesInContigs for contigs and TotalBasesInScaffolds for scaffolds) -debug Set the debug . -l Specify the logfile. Only created if debug level set. -t Specify the number of top scaffolds and contigs to use. (default: 5) -frg Computes frag related statistics: BasesCount and ClearRangeLengthFRG. This option requires that a .frg file exist in the current directory. -metrics Option to output a .qc.metrics file which is inputted to ametrics. This option requires that a .frg file exist in the current directory. [Note: -frg is a subset of -metrics] output files: .qc Described below. .qc.metrics (optional) Input file for ametrics caqc produces output on stdout as well as a .qc file in the current directory. The file output is in a tag-value format organized in a hierarchical manner compatible with the INI style for config files. This file can be read using TIGR::ConfigFile (run perldoc TIGR::ConfigFile). Delimiters can be controlled by using the tag-value separator specified in the -d option, and the minor delimiter separating list values by using the -s option. By default both are set to whitespace for human readability in the stdout output, and to = and comma respectively in the .qc file. The optional .qc.metrics file is a copy of the .qc file with additional metrics information for ametrics, which uploads it into the ASDB database. NOTE: caqc has backwards compatibility to support usage of: caqc .asm [options] See also: http://intranet.tigr.org/software_docs/CeleraAssembler.shtml TIGR::ConfigFile ~; # Global delimiters my $DEFAULT_TAG_DELIM = '='; my $DEFAULT_FIELD_DELIM = ','; my $DEFAULT_TOP_COUNT = 5; my $d = $DEFAULT_TAG_DELIM; # tag-value delimiter my $s = $DEFAULT_FIELD_DELIM; # list separator my $silent = 0; my $euid = 0; ############################################## =over =item B Reads from stdin the text between "extreme" { and } . for example: {A {B } } Returns the whole: {A{B}} =cut sub getCARecord { my $file = shift; my $level = 0; my $block = ""; while (<$file>) { if (/^\s*\{/) { $level++; } if (/^\s*\}/) { $level--; } $block .= $_; if ( $level == 0 ) { last; } } if ( $level != 0 ) { die("end of file reached before end of block\n"); } if ( $block ne "" ) { return $block; } else { return undef; } } # getCARecord ######################################################3 =item B Parses a record and returns a triplet consisting of - record type - hash of fields and values - array of sub-records =cut sub parseCARecord { my $record = shift; my @lines = split( '\n', $record ); my $type; my %fields; my @recs; # get record type $lines[0] =~ /\{(\w+)/; if ( !defined $1 ) { die("Wierd start of record: $record\n"); } $type = $1; if ( $lines[$#lines] !~ /^\s*\}/ ) { die("Wierd end of record: $record\n"); } my $level = 0; my $fieldname; for ( my $i = 1 ; $i < $#lines ; $i++ ) { if ( $lines[$i] =~ /^(\w+):(\S+)$/ ) { # simple field $fields{$1} = $2; } # simple field if ( $lines[$i] =~ /^(\w+):$/ ) { # complex field $fieldname = $1; $fields{$fieldname} = ""; $i++; while ( $i < $#lines && ( $lines[$i] !~ /^\.$/ ) ) { $fields{$fieldname} .= "$lines[$i]\n"; $i++; } } # complex field if ( $lines[$i] =~ /^\s*\{/ ) { # subrecord my $level = 1; my $thisrec = ++$#recs; $recs[$thisrec] .= "$lines[$i]\n"; $i++; while ( $level > 0 && $i < $#lines ) { if ( $lines[$i] =~ /^\s*\{/ ) { $level++; } if ( $lines[$i] =~ /^\s*\}/ ) { $level--; } $recs[$thisrec] .= "$lines[$i]\n"; if ( $level == 0 ) { last; } else { $i++; } } if ( $level != 0 ) { die("Error parsing sub_record in: $record\n"); } } # subrecord } # for $i... return ( $type, \%fields, \@recs ); } # parseCARecord ################################################ =item B Obtains both IDs from a "paired" id, that is, converts (10, 1000) into 10,1000. If the Id is not a pair in parantheses, it returns the input. Thus, getBothCAIds('(10, 1000)') returns 10,1000 while getBothCAIds("abba") returns "abba". =cut sub getBothCAIds { my $string = shift; if ( $string =~ /$(\S+),(\S+)$/ ) { return ( $1, $2 ); } else { return $string; # just in case we have a real ID } } # getBothCAIds ################################################ =item B Obtains the ID from a "paired" id, that is, converts (10, 1000) into 10. If the Id is not a pair in parantheses, it returns the input. Thus, getCAId('(10, 1000)') returns 10 while getCAId("abba") returns "abba". =cut sub getCAId { my $string = shift; if ( $string =~ /$(\S+),(\S+)$/ ) { return $1; } else { return $string; # just in case we have a real ID } } # getCAId ################################################ # calculate the gc content of contigs in scaffolds by getting the # the percentage of g and c bases in the contig consensus for all the # scaffold contigs sub calculateGC($$$) { my $scaffcontigs_ref = shift; my $gcs_ref = shift; my $sizes_ref = shift; my %scaffcontigs = %$scaffcontigs_ref; my %gcs = %$gcs_ref; my %sizes = %$sizes_ref; my @contigids = keys(%scaffcontigs); my $gc_tot = 0; my $cons_tot = 0; foreach my $contig_id (@contigids) { my $gc_num = $gcs{$contig_id}; my $contig_size = $sizes{$contig_id}; $gc_tot += $gc_num; $cons_tot += $contig_size; } my $gc_perc = 0; if ( $cons_tot != 0 ) { $gc_perc = ( $gc_tot / $cons_tot ); } return $gc_perc; } # Emit the results to the terminal, and if prefix specified, to the # prefix.qc file, as a config-readable output. # sub printl($$$) { my $tag = shift; my $rh_value = shift; my $rfh = shift; my $s1 = sprintf( '%-32s', $tag ); print( STDERR sprintf( "%s%-10s", $s1, $$rh_value{$tag} ) . "\n" ) if ( !$silent ); $rfh->print( "$tag=" . $$rh_value{$tag} . "\n" ); } # Emit the results to the terminal, and if prefix specified, to the # prefix.qc file, as a config-readable output. Integer form. # sub printld($$$) { my $tag = shift; my $rh_value = shift; my $rfh = shift; my $s1 = sprintf( "%-32s", $tag ); print( STDERR sprintf( "%s%-d", $s1, Math::BigFloat->new( $$rh_value{$tag} )->ffround(0) ) . "\n" ) if ( !$silent ); $rfh->print( "$tag=" . sprintf( "%d", Math::BigFloat->new( $$rh_value{$tag} )->ffround(0) ) . "\n" ); } # Emit the results to the terminal, and if prefix specified, to the # prefix.qc file, as a config-readable output. Float form. # sub printlf($$$) { my $tag = shift; my $rh_value = shift; my $rfh = shift; my $s1 = sprintf( "%-32s", $tag ); print( STDERR sprintf( "%s%-0.2f", $s1, $$rh_value{$tag} ) . "\n" ) if ( !$silent ); $rfh->print( "$tag=" . sprintf( "%0.2f", $$rh_value{$tag} ) . "\n" ); } # output percentage of total reads sub printp($$$$) { my ( $tag, $rh_value, $rfh, $tot ) = @_; my $val = sprintf "%s(%.2f%%)", $$rh_value{$tag}, $$rh_value{$tag} / $tot * 100; printf( STDERR "%-32s%-10s\n", $tag, $val ) if ( !$silent ); $rfh->print("$tag=$val\n"); } sub max($$) { my ( $a, $b ) = @_; return ( $a > $b ) ? $a : $b; } sub min($$) { my ( $a, $b ) = @_; return $a if ( $a != 0 && $b == 0 ); return $b if ( $a == 0 && $b != 0 ); return ( $a < $b ) ? $a : $b; } sub calcNStats($$$$$) { my ( $type, $gsz, $lens, $sortContig, $Results ) = @_; my $prevSz = 0; my $prevIncr = 0; my $incrSize = $gsz / 10; my @N = ( .25, .5, .75 ); my $sum = 0; if ( $gsz < 10000000 ) { # 10M $incrSize = 1000000; } elsif ( $gsz < 100000000 ) { # 100M $incrSize = 10000000; } elsif ( $gsz < 1000000000 ) { # 1G $incrSize = 100000000; } elsif ( $gsz < 10000000000 ) { # 10G $incrSize = 1000000000; } foreach my $cc (@$sortContig) { my $len = $lens->{$cc}; $sum += $len; # use while loops instead of if's since big contig/scaffold can span divisions while ( @N && $sum > $gsz * $N[0] ) { push @{ $Results->{"N${type}Bases"} }, [ shift(@N), $len ]; } while ( int( $sum / $incrSize ) > $prevIncr ) { $prevIncr++; if ( $len != $prevSz ) { push @{ $Results->{"Incr${type}Bases"} }, [ $prevIncr * $incrSize, $len ]; $prevSz = $len; } } } } my %bad_mates = (); my @MDI_Rec = (); sub trackBadMates($$$$$) { my ( $type, $uid, $size, $mid, $sfs ) = @_; my $orient = '+'; my ( $l, $r ) = split ',', $$sfs{'pos'}; if ( $l > $r ) { ( $l, $r ) = ( $r, $l ); $orient = '-'; } my $status = $bad_mates{$mid}[0]; my $libIID = $bad_mates{$mid}[1]; my $mateExtent; if ( @{ $bad_mates{$mid} } > 2 ) { $status = $bad_mates{$mid}[-2]; $mateExtent = $bad_mates{$mid}[-1]; } else { my $stddev = $MDI_Rec[$libIID]{std}; my $mean = $MDI_Rec[$libIID]{mea}; if ( !$stddev ) { print STDERR "undef stddev for lib $libIID, frag $mid\n"; } # calculate max range withing 5 stddevs of mean if ( $orient eq '+' ) { $mateExtent = int( $l + $mean + 5 * $stddev ); } else { $mateExtent = int( $r - $mean - 5 * $stddev ); } } $bad_mates{$mid} = [ "$type:$uid", $size, $l, $r, $orient, $status, $mateExtent ]; } my @LibIIDRange = (); sub getLibIID($) { my ($frgIID) = @_; # ranges should be ordered, so first end > fragIID should give correct lib for ( my $i = 1 ; $i < @LibIIDRange ; $i++ ) { return $i if $frgIID <= $LibIIDRange[$i]; } return -1; } sub outputHelp($) { my $helpflag = shift; if ( lc($helpflag) eq 'all' ) { foreach (@help_lines) { print; } } elsif ( $helpflag eq '' ) { print STDERR $MY_HELPTEXT; } else { my %help_text_headers = (); my %help_text_headers_lc = (); my %help_text_map = (); my %help_text_map_lc = (); my $current_header = undef; my @current_array = undef; foreach my $line (@help_lines) { if ( $line =~ /^\[(.*)\]/ ) { #Section if ( defined $current_header ) { $help_text_headers{$current_header} = [@current_array]; my @fields = @{ $help_text_headers{$current_header} }; @current_array = (); } $current_header = $1; } elsif ( $line =~ /^(\S+)\s*-\s*(.*)$/ ) { $help_text_map{$1} = $2; if ( defined $current_header ) { push @current_array, $1; } } } foreach my $key ( keys %help_text_map ) { $help_text_map_lc{ lc $key } = $key; } foreach my $key ( keys %help_text_headers ) { $help_text_headers_lc{ lc $key } = $key; } if ( exists $help_text_headers_lc{ lc $helpflag } ) { $helpflag = $help_text_headers_lc{ lc $helpflag }; print STDOUT "[$helpflag]\n"; my @fields = @{ $help_text_headers{$helpflag} }; foreach my $field (@fields) { my $value = undef; $value = $help_text_map{$field} if ( defined $field ); print STDOUT "$field - $value\n" if ( defined $value ); } } elsif ( exists $help_text_map_lc{ lc $helpflag } ) { $helpflag = $help_text_map_lc{ lc $helpflag }; my $help_text = $help_text_map{$helpflag}; print STDOUT "$helpflag - $help_text\n"; } else { print STDOUT "Term '$helpflag' is not a valid help search. To see all terms type use '-help all'.\n"; } } } sub outputHtml() { my $HTML_HEAD = qq~ OUTPUT GUIDE

CAQC

A description of the outputs of qc output of the Celera Assembler.

~; my $HTML_TAIL = qq~ ~; print STDOUT $HTML_HEAD; foreach my $line (@help_lines) { if ( $line =~ /^\[.*\]/ ) { #Section print STDOUT "

$line

"; } elsif ( $line =~ /^(\S+)\s*-\s*(.*)$/ ) { print STDOUT "

$1 - $2

"; } } print STDOUT $HTML_TAIL; } sub readHelpFile() { my $help_fh = new IO::File("< $help_file") or die "Unable to read $help_file. Error: '$!'"; @help_lines = <$help_fh>; } MAIN: { my $infile; my $delim = undef; my $sep = undef; my $genomesize; my $topCount = undef; my $frg = undef; my $metrics = undef; my $helpflag = undef; my $version = undef; my $html = undef; Getopt::Long::Configure('no_ignore_case'); my $err = GetOptions( 'i=s' => \$infile, 'minqual=i' => \$MINQUAL, 'mincontig=i' => \$MINCONTIG, 'd=s' => \$delim, 's=s' => \$sep, 'silent!' => \$silent, 'euid!' => \$euid, 'g=i' => \$genomesize, 't=i' => \$topCount, 'metrics' => \$metrics, 'frg' => \$frg, 'help:s' => \$helpflag, 'V|v' => \$version, 'html' => \$html, ); if ( $err == 0 ) { die "Try -h for help.\n"; } if ($version) { print STDERR $MY_VERSION . "\n"; exit(1); } if ( defined $helpflag ) { readHelpFile(); outputHelp($helpflag); exit(1); } if ($html) { readHelpFile(); outputHtml(); exit(1); } $d = ( defined $delim ) ? $delim : $d; $s = ( defined $sep ) ? $sep : $s; $topCount = $DEFAULT_TOP_COUNT if ( !defined $topCount ); if ( !defined $infile ) { if ( $#ARGV < 0 ) { print STDERR $MY_HELPTEXT; die("\nMust specify an input file name.\n"); } else { $infile = $ARGV[0]; } } my $mateLinkRangeFile = 'mateLinkIIDRanges.txt'; if ( -s $mateLinkRangeFile ) { open( MLRF, "<$mateLinkRangeFile" ) || die "Read of $mateLinkRangeFile failed."; # Lines look like: Distance 1 has IID range 1 to 5675 # as created from mateLinkIIDRanges.rb while () { chop; my ( undef, $libNum, undef, undef, undef, $b, undef, $e ) = split; $LibIIDRange[$libNum] = $e; } close MLRF; } #If input is just the prefix i.e. "gbaf" it will append the ".asm", #otherwise, if input already has the ".asm" extension, it will not be touched if ( $infile !~ /\.asm$/ ) { $infile .= '.asm'; } open( IN, $infile ) or die("Cannot open $infile: ($!)"); my ( $prefix, $path, $suffix ) = fileparse( $infile, '.asm' ); my $frgfile = "$path/$prefix.frg"; my $catmapfile = "$path/$prefix.catmap"; my $frgOpen = undef; my $catOpen = undef; if ( defined $frg or defined $metrics ) { $frgOpen = open( FRG, $frgfile ) or die("Cannot open $frgfile"); $catOpen = open( CAT, $catmapfile ) or warn("Cannot open $catmapfile"); } my $record; my %lens; # contig lengths wihtout gaps my %sizes; # contig sizes with gaps my %gcs; # g,c content of the contig consensus my %seqs; # contig number of reads my %scaflen; # sum of contig sizes in scaffold my %scafcontig; # number of contigs in scaffold with key scaffold_id my %gc_scaffcontigs; # contigs in scaffolds with key contigid my %inscaff; # contigs in scaffolds my %adjscaflen; # length of scaffold including inner-scaffold gaps my %readlens; # sum of lengths of reads in contig my %varctg; # number of var records in a contig my %utgFrags; # frags in surrogate unitigs my %libs_initial; # the libraries for a project before assembly my %libs_final; # the libraries for a project after assembly my $totalCLRReadLengthASM = 0; # Aggregate length of CLR of input reads used in assembly from ASM File my $totalSeqs = 0; # Total number of reads used in assembly my $ncontigs = 0; my $nunitigs = 0; my $uunitigs = 0; #Daniela Nov 16th 2005 my $numSingletons = 0; #VI my $lenPlacedSurrogates = 0; #VI my $numPlacedSurrogates = 0; #VI my $numPlacedSurrogateFrags = 0; #VI my $lenPlacedSurrogateFrags = 0; #VI my $utgSeqs = 0; my $uutgSeqs = 0; #Daniela Nov 16th 2005 my $surrReadLen = 0; # reads in surrogates my $contigReadLen = 0; # reads in contigs my $degenReadLen = 0; # reads in degenerates my $singReadLen = 0; # reads in singletons my %utglens = (); my $utglensTotal = 0; my %uutglens = (); #Daniela Nov 16th 2005 my $utgs = Statistics::Descriptive::Sparse->new(); my $uutgs = Statistics::Descriptive::Sparse->new(); #Daniela Nov 16th 2005 my %Results = (); $Results{ReadsWithChaffMate} = 0; $Results{ReadsWithBadShortMate} = 0; $Results{ReadsWithBothDegenMate} = 0; $Results{ReadsWithDegenMate} = 0; $Results{ReadsWithDiffScafMate} = 0; $Results{ReadsWithGoodMate} = 0; $Results{ReadsWithBothChaffMate} = 0; $Results{ReadsWithBadLongMate} = 0; $Results{ReadsWithNoMate} = 0; $Results{ReadsWithOuttieMate} = 0; $Results{ReadsWithSurrogateMate} = 0; $Results{ReadsWithSameOrientMate} = 0; $Results{ReadsWithBothSurrMate} = 0; $Results{ReadsWithUnassignedMate} = 0; my %readLen; my $scaffLinkWeights = 0; # sum of links connecting scaffolds my $Kb_scaff_count = 0; my $Kb_scaff_tot_span = 0; my %cats = (); if ($catOpen) { while ( defined( my $cat_line = ) ) { chomp($cat_line); my @maps = split( / /, $cat_line ); $cats{ $maps[0] } = $maps[1]; } } my $totalScaffolds = 0; my $totalContigsInScaffolds = 0; my $intraScaffoldGaps = 0; my $minContigsPerScaffold = 0; my $maxContigsPerScaffold = 0; my $totalSpanOfScaffolds = 0; my $minScaffoldSpan = 0; my $maxScaffoldSpan = 0; my $totalBasesInScaffolds = 0; my $minBasesInScaffolds = 0; my $maxBasesInScaffolds = 0; while ( $record = getCARecord( \*IN ) ) { my $type; if ( $record =~ m/^\{?(\w+)/ ) { $type = $1; } if ( $type eq 'CCO' ) { my ( $type, $fields, $recs ) = parseCARecord($record); $ncontigs++; my $contiglen = $$fields{'len'}; my $nreads = $$fields{'npc'}; my $qual = $$fields{'qlt'}; my ( $contig_euid, $contigid ) = getBothCAIds( $$fields{'acc'} ); $sizes{$contig_euid} = $contiglen; $lens{$contig_euid} = $contiglen; my $ngaps = $$fields{'cns'} =~ tr/-/-/; my $n_gbases = $$fields{'cns'} =~ tr/G/G/; my $n_cbases = $$fields{'cns'} =~ tr/C/C/; $lens{$contig_euid} -= $ngaps; $seqs{$contig_euid} = $nreads; $gcs{$contig_euid} = $n_gbases + $n_cbases; foreach my $rec (@$recs) { my ( $sid, $sfs, $srecs ) = parseCARecord($rec); if ( $sid eq 'MPS' ) { my $mid = $$sfs{'mid'}; my $rlen = $readLen{$mid}; #abs($r - $l); $readlens{$contig_euid} += $rlen; if ( exists $utgFrags{$mid} ) { $numPlacedSurrogateFrags++; $lenPlacedSurrogateFrags += $rlen; } if ( exists $bad_mates{$mid} ) { trackBadMates( 'CCO', $contig_euid, $contiglen, $mid, $sfs ); } } elsif ( $sid eq 'UPS' ) { my $typ = $$sfs{'typ'}; if ( $typ eq 'S' ) { my ( $ub, $ue ) = split ',', $$sfs{'pos'}; $lenPlacedSurrogates += abs( $ue - $ub ); $numPlacedSurrogates++; } } elsif ( $sid eq 'VAR' ) { $varctg{$contig_euid}++; } } } # if $type = CCO elsif ( $type eq 'UTG' ) { my ( $type, $fields, $recs ) = parseCARecord($record); #Daniela Nov 16th 2005 if ( $$fields{'sta'} eq 'U' ) # Daniela Jan 24th 2006 { $uunitigs++; $uutgs->add_data( $$fields{'len'} ); $uutgSeqs += $$fields{'nfr'}; $uutglens{ getCAId( $$fields{'acc'} ) } = $$fields{'len'}; } elsif ( $$fields{'sta'} ne 'N' ) { $nunitigs++; my $utgLen = $$fields{'len'}; $utgs->add_data($utgLen); $utgSeqs += $$fields{'nfr'}; $utglens{ getCAId( $$fields{'acc'} ) } = $utgLen; $utglensTotal += $utgLen; for ( my $rd = 0 ; $rd <= $#$recs ; $rd++ ) { my ( $sid, $sfs, $srecs ) = parseCARecord( $$recs[$rd] ); if ( $sid eq 'MPS' ) { my $mid = $$sfs{'mid'}; $surrReadLen += $readLen{$mid}; $utgFrags{$mid} = 1; if ( exists $bad_mates{$mid} ) { my ( $uuid, $uiid ) = getBothCAIds( $$fields{'acc'} ); trackBadMates( 'UTG', $uuid, $utgLen, $mid, $sfs ); } } } } } elsif ( $type eq 'SCF' ) { my ( $type, $fields, $recs ) = parseCARecord($record); my ( $scf_euid, $scfid ) = getBothCAIds( $$fields{'acc'} ); my %scaffcontigs = (); my $adj = 0; for ( my $i = 0 ; $i <= $#$recs ; $i++ ) { my ( $lrec, $lfield, $lrecs ) = parseCARecord( $$recs[$i] ); $scaffcontigs{ $$lfield{'ct1'} }++; $scaffcontigs{ $$lfield{'ct2'} }++; $gc_scaffcontigs{ $$lfield{'ct1'} }++; $gc_scaffcontigs{ $$lfield{'ct2'} }++; $inscaff{ $$lfield{'ct1'} } = 1; $inscaff{ $$lfield{'ct2'} } = 1; $adj += $$lfield{'mea'}; } my $contigsInScaffCount = 0; my $scaflenTmp = 0; while ( my ( $ct, $freq ) = each %scaffcontigs ) { $contigsInScaffCount++; $scaflenTmp += $lens{$ct}; } $scafcontig{$scf_euid} = $contigsInScaffCount; $scaflen{$scf_euid} = $scaflenTmp; $adjscaflen{$scf_euid} = $scaflenTmp + int($adj); if ( ( $adjscaflen{$scf_euid} ) >= 2000 ) { $Kb_scaff_count++; $Kb_scaff_tot_span = $Kb_scaff_tot_span + $adjscaflen{$scf_euid}; } $totalScaffolds++; $totalContigsInScaffolds += $contigsInScaffCount; $intraScaffoldGaps += $contigsInScaffCount - 1; $minContigsPerScaffold = min( $minContigsPerScaffold, $contigsInScaffCount ); $maxContigsPerScaffold = max( $maxContigsPerScaffold, $contigsInScaffCount ); $totalSpanOfScaffolds += $adjscaflen{$scf_euid}; $minScaffoldSpan = min( $minScaffoldSpan, $adjscaflen{$scf_euid} ); $maxScaffoldSpan = max( $maxScaffoldSpan, $adjscaflen{$scf_euid} ); $totalBasesInScaffolds += $scaflenTmp; $minBasesInScaffolds = min( $minBasesInScaffolds, $scaflenTmp ); $maxBasesInScaffolds = max( $maxBasesInScaffolds, $scaflenTmp ); } # if $type = SCF elsif ( $type eq 'SLK' ) { # scaffold link my ( $type, $fields, $recs ) = parseCARecord($record); $Results{TotalScaffoldLinks}++; $scaffLinkWeights += $$fields{num}; } elsif ( $type eq 'AFG' ) { my ( $type, $fields, $recs ) = parseCARecord($record); my ( $frag_id, $fragIID ) = getBothCAIds( $$fields{'acc'} ); my $clrs = $$fields{'clr'}; my ( $clrl, $clrr ) = split /,/, $clrs; my $length = $clrr - $clrl; $totalCLRReadLengthASM += $length; $readLen{$frag_id} = $length; $totalSeqs++; my $mstField = $$fields{'mst'}; $bad_mates{$frag_id} = [ $mstField, getLibIID($fragIID) ]; if ( $mstField eq 'A' ) { $Results{ReadsWithChaffMate}++; } elsif ( $mstField eq 'C' ) { $Results{ReadsWithBadShortMate}++; } elsif ( $mstField eq 'D' ) { $Results{ReadsWithBothDegenMate}++; } elsif ( $mstField eq 'E' ) { $Results{ReadsWithDegenMate}++; } elsif ( $mstField eq 'F' ) { $Results{ReadsWithDiffScafMate}++; } elsif ( $mstField eq 'G' ) { $Results{ReadsWithGoodMate}++; delete $bad_mates{$frag_id}; } elsif ( $mstField eq 'H' ) { $Results{ReadsWithBothChaffMate}++; } elsif ( $mstField eq 'L' ) { $Results{ReadsWithBadLongMate}++; } elsif ( $mstField eq 'N' ) { $Results{ReadsWithNoMate}++; delete $bad_mates{$frag_id}; } elsif ( $mstField eq 'O' ) { $Results{ReadsWithOuttieMate}++; } elsif ( $mstField eq 'R' ) { $Results{ReadsWithSurrogateMate}++; } elsif ( $mstField eq 'S' ) { $Results{ReadsWithSameOrientMate}++; } elsif ( $mstField eq 'U' ) { $Results{ReadsWithBothSurrMate}++; } elsif ( $mstField eq 'Z' ) { $Results{ReadsWithUnassignedMate}++; } else { $Results{ReadsWithUnassignedMate}++; } $numSingletons++ if $$fields{'cha'} == 1; } elsif ( $type eq 'MDI' ) { my ( $type, $fields, $recs ) = parseCARecord($record); my ( $id, $iid ) = getBothCAIds( $$fields{'ref'} ); my $mea = $$fields{'mea'}; my $std = $$fields{'std'}; my $min = $$fields{'min'}; my $max = $$fields{'max'}; my $buc = $$fields{'buc'}; my $hist = $$fields{'his'}; $hist =~ s/\s*$//g; $hist =~ s/\n/,/g; $MDI_Rec[$iid] = { 'mea' => $mea, 'std' => $std }; if ($metrics) { my $lib_id = undef; if ( defined $cats{$id} ) { $lib_id = $cats{$id}; } elsif ( !-e ($catmapfile) ) { $lib_id = $id; } if ( defined $lib_id ) { $libs_final{$lib_id}->{'mea'} = $mea; $libs_final{$lib_id}->{'std'} = $std; $libs_final{$lib_id}->{'min'} = $min; $libs_final{$lib_id}->{'max'} = $max; $libs_final{$lib_id}->{'buc'} = $buc; $libs_final{$lib_id}->{'hist'} = $hist; } } } } # while $record my $totalBases = 'NA'; my $totalCLRReadLengthFRG = 'NA' ; # Aggregate length of CLR of input reads used in assembly from FRG File my $totalReadsInFRG = 'NA'; # Aggregate number of reads used in assembly from FRG File if ( $frgOpen or $metrics ) { $totalBases = 0; $totalCLRReadLengthFRG = 0; $totalReadsInFRG = 0; while ( $record = getCARecord( \*FRG ) ) { my $type; if ( $record =~ m/^\{?(\w+)/ ) { $type = $1; } if ( $type eq 'DST' and $metrics ) { my ( $type, $fields, $recs ) = parseCARecord($record); my $id = $$fields{'acc'}; my $lib_id = undef; if ( defined $cats{$id} ) { $lib_id = $cats{$id}; } elsif ( !-e ($catmapfile) ) { $lib_id = $id; } if ( defined $lib_id ) { my $mea = $$fields{'mea'}; my $std = $$fields{'std'}; $libs_initial{$lib_id}->{'mea'} = $mea; $libs_initial{$lib_id}->{'std'} = $std; } } elsif ( $type eq 'FRG' ) { my ( $type, $fields, $recs ) = parseCARecord($record); my $seq = $$fields{'seq'}; if (!defined $seq) { print STDERR "Seq not defined for record:$record\n"; next; } my $seqLen = $seq =~ tr/a-zA-Z//; $totalBases += $seqLen; my $clrStr = $$fields{'clr'}; if (!defined $clrStr) { print STDERR "clr not defined for record:$record\n"; next; } if ( $clrStr =~ /^(\d+),(\d+)$/ ) { my $clearRangeLen = $2 - $1 + 1; $totalCLRReadLengthFRG += $clearRangeLen; } $totalReadsInFRG++; } } } # Initialize the Results hash so that we never get blanks for values $Results{BasesCount} = $totalBases; $Results{ClearRangeLengthFRG} = $totalCLRReadLengthFRG; $Results{ClearRangeLengthASM} = $totalCLRReadLengthASM; $Results{SurrogateBaseLength} = $surrReadLen; $Results{ContigBaseLength} = 0; $Results{DegenBaseLength} = 0; $Results{SingletonBaseLength} = 0; $Results{Contig_SurrBaseLength} = 0; $Results{TotalScaffolds} = 0; $Results{TotalContigsInScaffolds} = 0; $Results{MeanContigsPerScaffold} = 0.0; $Results{MinContigsPerScaffold} = 0; $Results{MaxContigsPerScaffold} = 0; $Results{TotalBasesInScaffolds} = 0; $Results{MeanBasesInScaffolds} = 0.0; $Results{MinBasesInScaffolds} = 0; $Results{MaxBasesInScaffolds} = 0; $Results{TotalSpanOfScaffolds} = 0; $Results{MeanSpanOfScaffolds} = 0.0; $Results{MinScaffoldSpan} = 0; $Results{MaxScaffoldSpan} = 0; $Results{IntraScaffoldGaps} = 0; $Results{MeanSequenceGapLength} = 0.0; $Results{'2KbScaffolds'} = 0; $Results{'2KbScaffoldsSpan'} = 0; $Results{MeanContigLength} = 0.0; $Results{MinContigLength} = 0; $Results{MaxContigLength} = 0; $Results{TotalBigContigs} = 0; $Results{BigContigLength} = 0; $Results{MeanBigContigLength} = 0.0; $Results{MinBigContig} = 0; $Results{MaxBigContig} = 0; $Results{BigContigsPercentBases} = 0.0; $Results{TotalSmallContigs} = 0; $Results{SmallContigLength} = 0; $Results{MeanSmallContigLength} = 0.0; $Results{MinSmallContig} = 0; $Results{MaxSmallContig} = 0; $Results{SmallContigsPercentBases} = 0.0; $Results{TotalDegenContigs} = 0; $Results{DegenContigLength} = 0; $Results{MeanDegenContigLength} = 0.0; $Results{MinDegenContig} = 0; $Results{MaxDegenContig} = 0; $Results{DegenPercentBases} = 0.0; $Results{TotalUsableReads} = 0; $Results{TotalReadsInput} = $totalReadsInFRG; $Results{AvgClearRange} = ( $totalSeqs > 0 ) ? $totalCLRReadLengthASM / $totalSeqs : 0; $Results{ContigReads} = 0; $Results{BigContigReads} = 0; $Results{SmallContigReads} = 0; $Results{DegenContigReads} = 0; $Results{SurrogateReads} = 0; $Results{SingletonReads} = 0; $Results{SingletonPercentBases} = 0.0; $Results{TotalVarRecords} = 0; $Results{DegenVarRecords} = 0; $Results{ChaffReads} = $numSingletons; $Results{AllReads} = 0.0; $Results{ContigsOnly} = 0.0; $Results{Contigs_Surrogates} = 0.0; $Results{Contigs_Degens_Surrogates} = 0.0; $Results{TotalSurrogates} = $nunitigs; $Results{SurrogatesPercentBases} = 0.0; $Results{SurrogateInstances} = $numPlacedSurrogates; $Results{SurrogateLength} = $utgs->sum(); $Results{SurrogateInstanceLength} = $lenPlacedSurrogates; $Results{UnPlacedSurrReadLen} = $surrReadLen - $lenPlacedSurrogateFrags; $Results{PlacedSurrReadLen} = $lenPlacedSurrogateFrags; $Results{PlacedSurrogateReads} = $numPlacedSurrogateFrags; $Results{MeanSurrogateLength} = $utgs->mean(); $Results{MinSurrogateLength} = $utgs->min(); $Results{MaxSurrogateLength} = $utgs->max(); $Results{SDSurrogateLength} = $utgs->standard_deviation(); $Results{TotalUUnitigs} = $uunitigs; $Results{MeanUUnitigLength} = $uutgs->mean(); $Results{MinUUnitigLength} = $uutgs->min(); $Results{MaxUUnitigLength} = $uutgs->max(); $Results{SDUUnitigLength} = $uutgs->standard_deviation(); if ( !exists $Results{ReadsWithNoMate} ) { $Results{ReadsWithNoMate} = 0; } if ( !exists $Results{ReadsWithBadMate} ) { $Results{ReadsWithBadMate} = 0; } if ( !exists $Results{ReadsWithGoodMate} ) { $Results{ReadsWithGoodMate} = 0; } if ( !exists $Results{TotalScaffoldLinks} ) { $Results{TotalScaffoldLinks} = 0; $Results{MeanScaffoldLinkWeight} = 0; } else { $Results{MeanScaffoldLinkWeight} = $scaffLinkWeights / $Results{TotalScaffoldLinks}; } #Begin Marwan 10-17-06: #Statistics::Descriptive::Sparse->new() functions return undef if no value exists if ( !defined $Results{MeanSurrogateLength} ) { $Results{MeanSurrogateLength} = 0; } if ( !defined $Results{MinSurrogateLength} ) { $Results{MinSurrogateLength} = 0; } if ( !defined $Results{MaxSurrogateLength} ) { $Results{MaxSurrogateLength} = 0; } if ( !defined $Results{SDSurrogateLength} ) { $Results{SDSurrogateLength} = 0; } if ( !defined $Results{MeanUUnitigLength} ) { $Results{MeanUUnitigLength} = 0; } if ( !defined $Results{MinUUnitigLength} ) { $Results{MinSurrogateLength} = 0; } if ( !defined $Results{MaxUUnitigLength} ) { $Results{MaxSurrogateLength} = 0; } if ( !defined $Results{SDUUnitigLength} ) { $Results{SDSurrogateLength} = 0; } #End Marwan 10-17-06 my $totlen = 0; my $totseqs = 0; my $biglen = 0; my $bigseqs = 0; my $nbigs = 0; my $maxcontig = 0; for ( my $i = 0 ; $i < $ncontigs ; $i++ ) { my ( $contig_id, $len ) = each(%lens); my $number; my $min; my $max; my $totlen; my $nseq; my $vars; if ( exists $inscaff{$contig_id} ) { # the good guys if ( $len >= $MINCONTIG ) { # the good big guys $number = 'TotalBigContigs'; $min = 'MinBigContig'; $max = 'MaxBigContig'; $totlen = 'BigContigLength'; $nseq = 'BigContigReads'; } else { # the small good guys $number = 'TotalSmallContigs'; $min = 'MinSmallContig'; $max = 'MaxSmallContig'; $totlen = 'SmallContigLength'; $nseq = 'SmallContigReads'; } $vars = 'TotalVarRecords'; $contigReadLen += $readlens{$contig_id}; } else { # the chaff $number = 'TotalDegenContigs'; $min = 'MinDegenContig'; $max = 'MaxDegenContig'; $totlen = 'DegenContigLength'; $nseq = 'DegenContigReads'; $vars = 'DegenVarRecords'; $degenReadLen += $readlens{$contig_id}; } $Results{$max} = max( $len, $Results{$max} ); $Results{$min} = min( $len, $Results{$min} ); $Results{$number}++; $Results{$totlen} += $len; $Results{$nseq} += $seqs{$contig_id}; $Results{$vars} += $varctg{$contig_id}; } # for $i < $ncontigs #This is the same thing as TotalContigsInScaffolds $Results{'TotalContigs'} = 0 + $Results{'TotalBigContigs'} + $Results{'TotalSmallContigs'}; #This is the same thing as TotalBasesInScaffolds $Results{'TotalBasesInContigs'} = 0 + $Results{'BigContigLength'} + $Results{'SmallContigLength'}; $Results{'MeanContigLength'} = ( $Results{'TotalContigs'} > 0 ) ? $Results{'TotalBasesInContigs'} * 1.0 / $Results{'TotalContigs'} : 0; $Results{'ContigReads'} = 0 + $Results{'BigContigReads'} + $Results{'SmallContigReads'}; #How could this ever be MinBigContig? $Results{'MinContigLength'} = min( $Results{'MinSmallContig'}, $Results{'MinBigContig'} ); #How could this ever be MaxBigContig? $Results{'MaxContigLength'} = max( $Results{'MaxBigContig'}, $Results{'MaxSmallContig'} ); $Results{'MeanBigContigLength'} = ( $Results{'TotalBigContigs'} > 0 ) ? $Results{'BigContigLength'} * 1.0 / $Results{'TotalBigContigs'} : 0; $Results{'MeanSmallContigLength'} = ( $Results{'TotalSmallContigs'} ) ? $Results{'SmallContigLength'} * 1.0 / $Results{'TotalSmallContigs'} : 0; $Results{'BigContigsPercentBases'} = ( $Results{'TotalBasesInContigs'} > 0 ) ? $Results{'BigContigLength'} * 100.0 / $Results{'TotalBasesInContigs'} : 0; $Results{'SmallContigsPercentBases'} = ( $Results{'TotalBasesInContigs'} > 0 ) ? $Results{'SmallContigLength'} * 100.0 / $Results{'TotalBasesInContigs'} : 0; $Results{'MeanDegenContigLength'} = ( $Results{'TotalDegenContigs'} ) ? $Results{'DegenContigLength'} * 1.0 / $Results{'TotalDegenContigs'} : 0; $Results{'DegenPercentBases'} = ( $Results{'TotalBasesInContigs'} > 0 ) ? $Results{'DegenContigLength'} * 100.0 / $Results{'TotalBasesInContigs'} : 0; # compute N50 values and top $topCount guys (contigs) my @sortContig = sort { $lens{$b} <=> $lens{$a} } ( keys %inscaff ); my $top5contig; my $sum = 0; my $reads_tot = 0; my $bases_tot = 0; for ( my $cc = 0 ; $cc <= $#sortContig && $cc < $topCount ; $cc++ ) { $top5contig .= "$cc$d$seqs{$sortContig[$cc]}$s$lens{$sortContig[$cc]}"; $top5contig .= "$s$sortContig[$cc]" if ($euid); $top5contig .= "\n"; $reads_tot += $seqs{ $sortContig[$cc] }; $bases_tot += $lens{ $sortContig[$cc] }; } $top5contig .= "total$d$reads_tot$s$bases_tot\n"; my $gsz; if ( !defined $genomesize ) { $gsz = $Results{'TotalBasesInContigs'}; } else { $gsz = $genomesize; } calcNStats( 'Contig', $gsz, \%lens, \@sortContig, \%Results ); $Results{'TotalScaffolds'} = $totalScaffolds; $Results{'TotalContigsInScaffolds'} = $totalContigsInScaffolds; $Results{'IntraScaffoldGaps'} = $intraScaffoldGaps; $Results{'MinContigsPerScaffold'} = $minContigsPerScaffold; $Results{'MaxContigsPerScaffold'} = $maxContigsPerScaffold; $Results{'TotalSpanOfScaffolds'} = $totalSpanOfScaffolds; $Results{'MinScaffoldSpan'} = $minScaffoldSpan; $Results{'MaxScaffoldSpan'} = $maxScaffoldSpan; $Results{'TotalBasesInScaffolds'} = $totalBasesInScaffolds; $Results{'MinBasesInScaffolds'} = $minBasesInScaffolds; $Results{'MaxBasesInScaffolds'} = $maxBasesInScaffolds; $Results{'2KbScaffolds'} = $Kb_scaff_count; $Results{'2KbScaffoldSpan'} = $Kb_scaff_tot_span; $Results{'MeanContigsPerScaffold'} = ( $Results{'TotalScaffolds'} > 0 ) ? $Results{'TotalContigsInScaffolds'} * 1.0 / $Results{'TotalScaffolds'} : 0.0; $Results{'MeanBasesInScaffolds'} = ( $Results{'TotalScaffolds'} > 0 ) ? $Results{'TotalBasesInScaffolds'} * 1.0 / $Results{'TotalScaffolds'} : 0.0; $Results{'MeanSpanOfScaffolds'} = ( $Results{'TotalScaffolds'} > 0 ) ? $Results{'TotalSpanOfScaffolds'} * 1.0 / $Results{'TotalScaffolds'} : 0.0; $Results{'MeanSequenceGapLength'} = ( $Results{'IntraScaffoldGaps'} > 0 ) ? ( $Results{'TotalSpanOfScaffolds'} - $Results{'TotalBasesInScaffolds'} ) * 1.0 / $Results{'IntraScaffoldGaps'} : 0.0; # compute N50 values and top $topCount guys (scaffolds) my @sortScaff = sort { $scaflen{$b} <=> $scaflen{$a} } ( keys %scaflen ); my $top5scaff; $sum = 0; my $contigs_tot = 0; my $gaps_tot = 0; my $size_tot = 0; my $span_tot = 0; my $avgContig_tot = 0; my $avgGap_tot = 0; for ( my $ss = 0 ; $ss <= $#sortScaff && $ss < $topCount ; $ss++ ) { my $scf = $sortScaff[$ss]; $top5scaff .= "$ss$d$scafcontig{$scf}$s$scaflen{$scf}$s$adjscaflen{$scf}$s" . sprintf( '%d', Math::BigFloat->new( $scaflen{$scf} * 1.0 / $scafcontig{$scf} ) ->ffround(0) ) . $s . sprintf( '%d', ( $scafcontig{$scf} - 1 > 0 ) ? ( Math::BigFloat->new( ( $adjscaflen{$scf} - $scaflen{$scf} ) * 1.0 / ( $scafcontig{$scf} - 1 ) )->ffround(0) ) : 0 ); $top5scaff .= "$s$scf" if ($euid); $top5scaff .= "\n"; $contigs_tot += $scafcontig{$scf}; $gaps_tot += $scafcontig{$scf} - 1; $size_tot += $scaflen{$scf}; $span_tot += $adjscaflen{$scf}; my $avgContig_size = sprintf( '%.2f', $scaflen{$scf} * 1.0 / $scafcontig{$scf} ); my $avgGap_size = sprintf( '%.2f', ( $scafcontig{$scf} - 1 > 0 ) ? ( ( $adjscaflen{$scf} - $scaflen{$scf} ) * 1.0 / ( $scafcontig{$scf} - 1 ) ) : 0.0 ); $avgContig_tot += ( $avgContig_size * $scafcontig{$scf} ); $avgGap_tot += ($avgGap_size) * ( $scafcontig{$scf} - 1 ); } if ( $contigs_tot != 0 ) { $avgContig_tot = $avgContig_tot / $contigs_tot; } else { $avgContig_tot = 0; } if ( $gaps_tot != 0 ) { $avgGap_tot = $avgGap_tot / $gaps_tot; } else { $avgGap_tot = 0; } my $top5scaff_place_holder = "total$d$contigs_tot$s$size_tot$s$span_tot$s" . sprintf( '%d', Math::BigFloat->new("$avgContig_tot")->ffround(0) ) . $s . sprintf( '%d', Math::BigFloat->new("$avgGap_tot")->ffround(0) ) . "\n"; $top5scaff .= $top5scaff_place_holder; if ( !defined $genomesize ) { $gsz = $Results{'TotalBasesInScaffolds'}; } else { $gsz = $genomesize; } calcNStats( 'Scaffold', $gsz, \%scaflen, \@sortScaff, \%Results ); $Results{'AllReads'} = ( $gsz > 0 ) ? sprintf( '%0.2f', $totalCLRReadLengthASM / $gsz ) : 0.0; #Coverage: $Results{'ContigsOnly'} = ( $Results{'TotalBasesInScaffolds'} > 0 ) ? ( ($contigReadLen) / $Results{'TotalBasesInScaffolds'} ) : 0.0; #$base->logLocal( "surrReadLen = $surrReadLen\ncontigReadLen = $contigReadLen\ndegenReadLen = $degenReadLen\ntotalReadLen = $totalCLRReadLengthASM", 2 ); # Bases $Results{ContigBaseLength} = $contigReadLen; $Results{DegenBaseLength} = $degenReadLen; $Results{Contig_SurrBaseLength} = $Results{'UnPlacedSurrReadLen'} + $contigReadLen; $Results{SingletonBaseLength} = $totalCLRReadLengthASM - $degenReadLen - $contigReadLen - $Results{'UnPlacedSurrReadLen'}; $Results{'Contigs_Degens_Surrogates'} = ( $Results{'TotalBasesInScaffolds'} + $Results{'DegenContigLength'} > 0 ) ? ( ( $Results{'UnPlacedSurrReadLen'} + $contigReadLen + $degenReadLen ) / ( $Results{'TotalBasesInScaffolds'} + $Results{'DegenContigLength'} ) ) : 0.0; $Results{'Contigs_Surrogates'} = ( $Results{'TotalBasesInScaffolds'} > 0 ) ? ( $Results{'Contig_SurrBaseLength'} / $Results{'TotalBasesInScaffolds'} ) : 0.0; # Surrogates $Results{'SurrogateReads'} = $utgSeqs; # Coverage $Results{'TotalUsableReads'} = $totalSeqs; $Results{'SingletonReads'} = $Results{'TotalUsableReads'} - $Results{'BigContigReads'} - $Results{'SmallContigReads'} - $Results{'DegenContigReads'} - $Results{'SurrogateReads'} + $Results{'PlacedSurrogateReads'}; # Should be in contigs # Parameter settings $Results{'MinBigContigSizeParm'} = $MINCONTIG; $Results{'MinQualityParm'} = $MINQUAL; # gc Content of contigs in scaffolds(placed contigs) # Daniela's change Nov 15th 2005 $Results{'Content'} = calculateGC( \%gc_scaffcontigs, \%gcs, \%sizes ) * 100; # Emit the results my $fh = new IO::File("> $prefix.qc") or die("Could not open $prefix.qc ($!)"); print STDERR "[Scaffolds]\n" if ( !$silent ); $fh->print("[Scaffolds]\n"); printl( 'TotalScaffolds', \%Results, $fh ); printl( 'TotalContigsInScaffolds', \%Results, $fh ); printlf( 'MeanContigsPerScaffold', \%Results, $fh ); printl( 'MinContigsPerScaffold', \%Results, $fh ); printl( 'MaxContigsPerScaffold', \%Results, $fh ); print STDERR "\n" if ( !$silent ); $fh->print("\n"); printl( 'TotalBasesInScaffolds', \%Results, $fh ); printld( 'MeanBasesInScaffolds', \%Results, $fh ); printl( 'MinBasesInScaffolds', \%Results, $fh ); printl( 'MaxBasesInScaffolds', \%Results, $fh ); for my $val ( @{ $Results{'NScaffoldBases'} } ) { printf $fh "N%2d%s=%d\n", $val->[0] * 100, 'ScaffoldBases', $val->[1]; printf STDERR "N%2d%-29s%d\n", $val->[0] * 100, 'ScaffoldBases', $val->[1]; } for my $val ( @{ $Results{'IncrScaffoldBases'} } ) { print $fh "ScaffoldAt$val->[0]=$val->[1]\n"; printf STDERR "ScaffoldAt%-22s%d\n", $val->[0], $val->[1]; } print STDERR "\n" if ( !$silent ); $fh->print("\n"); printl( 'TotalSpanOfScaffolds', \%Results, $fh ); printld( 'MeanSpanOfScaffolds', \%Results, $fh ); printl( 'MinScaffoldSpan', \%Results, $fh ); printl( 'MaxScaffoldSpan', \%Results, $fh ); printl( 'IntraScaffoldGaps', \%Results, $fh ); printl( '2KbScaffolds', \%Results, $fh ); printl( '2KbScaffoldSpan', \%Results, $fh ); printld( 'MeanSequenceGapLength', \%Results, $fh ); print STDERR "\n" if ( !$silent ); $fh->print("\n"); my $top5scaff_header = "[Top${topCount}Scaffolds${d}contigs${s}size${s}span${s}avgContig${s}avgGap"; if ( !$silent ) { if ($euid) { $top5scaff_header .= "${s}EUID]\n"; } else { $top5scaff_header .= "]\n"; } print STDERR $top5scaff_header; } $fh->print($top5scaff_header); if ( !$silent ) { print STDERR $top5scaff; } $fh->print($top5scaff); print STDERR "\n" if ( !$silent ); $fh->print("\n"); print STDERR "[Contigs]\n" if ( !$silent ); $fh->print("[Contigs]\n"); printl( 'TotalContigsInScaffolds', \%Results, $fh ); printl( 'TotalBasesInScaffolds', \%Results, $fh ); printl( 'TotalVarRecords', \%Results, $fh ); printld( 'MeanContigLength', \%Results, $fh ); printl( 'MinContigLength', \%Results, $fh ); printl( 'MaxContigLength', \%Results, $fh ); for my $val ( @{ $Results{'NContigBases'} } ) { printf $fh "N%2d%s=%d\n", $val->[0] * 100, 'ContigBases', $val->[1]; printf STDERR "N%2d%-29s%d\n", $val->[0] * 100, 'ContigBases', $val->[1]; } for my $val ( @{ $Results{'IncrContigBases'} } ) { print $fh "ContigAt$val->[0]=$val->[1]\n"; printf STDERR "ContigAt%-24s%d\n", $val->[0], $val->[1]; } print STDERR "\n" if ( !$silent ); $fh->print("\n"); print STDERR "[BigContigs_greater_$MINCONTIG]\n" if ( !$silent ); $fh->print("[BigContigs_greater_$MINCONTIG]\n"); printl( 'TotalBigContigs', \%Results, $fh ); printl( 'BigContigLength', \%Results, $fh ); printld( 'MeanBigContigLength', \%Results, $fh ); printl( 'MinBigContig', \%Results, $fh ); printl( 'MaxBigContig', \%Results, $fh ); printlf( 'BigContigsPercentBases', \%Results, $fh ); print STDERR "\n" if ( !$silent ); $fh->print("\n"); print STDERR "[SmallContigs]\n" if ( !$silent ); $fh->print("[SmallContigs]\n"); printl( 'TotalSmallContigs', \%Results, $fh ); printl( 'SmallContigLength', \%Results, $fh ); printld( 'MeanSmallContigLength', \%Results, $fh ); printl( 'MinSmallContig', \%Results, $fh ); printl( 'MaxSmallContig', \%Results, $fh ); printlf( 'SmallContigsPercentBases', \%Results, $fh ); print STDERR "\n" if ( !$silent ); $fh->print("\n"); print STDERR "[DegenContigs]\n" if ( !$silent ); $fh->print("[DegenContigs]\n"); printl( 'TotalDegenContigs', \%Results, $fh ); printl( 'DegenContigLength', \%Results, $fh ); printl( 'DegenVarRecords', \%Results, $fh ); printld( 'MeanDegenContigLength', \%Results, $fh ); printl( 'MinDegenContig', \%Results, $fh ); printl( 'MaxDegenContig', \%Results, $fh ); printlf( 'DegenPercentBases', \%Results, $fh ); print STDERR "\n" if ( !$silent ); $fh->print("\n"); my $top5contig_header = "[Top${topCount}Contigs${d}reads${s}bases"; if ( !$silent ) { if ($euid) { $top5contig_header .= "${s}EUID]\n"; } else { $top5contig_header .= "]\n"; } print STDERR $top5contig_header; } $fh->print($top5contig_header); if ( !$silent ) { print STDERR $top5contig; } $fh->print($top5contig); print STDERR "\n" if ( !$silent ); $fh->print("\n"); print STDERR "[UniqueUnitigs]\n" if ( !$silent ); $fh->print("[UniqueUnitigs]\n"); printl( 'TotalUUnitigs', \%Results, $fh ); printl( 'MinUUnitigLength', \%Results, $fh ); printl( 'MaxUUnitigLength', \%Results, $fh ); printld( 'MeanUUnitigLength', \%Results, $fh ); printld( 'SDUUnitigLength', \%Results, $fh ); print STDERR "\n" if ( !$silent ); $fh->print("\n"); print STDERR "[Surrogates]\n" if ( !$silent ); $fh->print("[Surrogates]\n"); printl( 'TotalSurrogates', \%Results, $fh ); printl( 'SurrogateInstances', \%Results, $fh ); printl( 'SurrogateLength', \%Results, $fh ); printl( 'SurrogateInstanceLength', \%Results, $fh ); printl( 'UnPlacedSurrReadLen', \%Results, $fh ); printl( 'PlacedSurrReadLen', \%Results, $fh ); printl( 'MinSurrogateLength', \%Results, $fh ); printl( 'MaxSurrogateLength', \%Results, $fh ); printld( 'MeanSurrogateLength', \%Results, $fh ); printld( 'SDSurrogateLength', \%Results, $fh ); print STDERR "\n" if ( !$silent ); $fh->print("\n"); print STDERR "[Mates]\n" if ( !$silent ); $fh->print("[Mates]\n"); printp( 'ReadsWithNoMate', \%Results, $fh, $totalSeqs ); printp( 'ReadsWithGoodMate', \%Results, $fh, $totalSeqs ); printp( 'ReadsWithBadShortMate', \%Results, $fh, $totalSeqs ); printp( 'ReadsWithBadLongMate', \%Results, $fh, $totalSeqs ); printp( 'ReadsWithSameOrientMate', \%Results, $fh, $totalSeqs ); printp( 'ReadsWithOuttieMate', \%Results, $fh, $totalSeqs ); printp( 'ReadsWithBothChaffMate', \%Results, $fh, $totalSeqs ); printp( 'ReadsWithChaffMate', \%Results, $fh, $totalSeqs ); printp( 'ReadsWithBothDegenMate', \%Results, $fh, $totalSeqs ); printp( 'ReadsWithDegenMate', \%Results, $fh, $totalSeqs ); printp( 'ReadsWithBothSurrMate', \%Results, $fh, $totalSeqs ); printp( 'ReadsWithSurrogateMate', \%Results, $fh, $totalSeqs ); printp( 'ReadsWithDiffScafMate', \%Results, $fh, $totalSeqs ); printp( 'ReadsWithUnassignedMate', \%Results, $fh, $totalSeqs ); printl( 'TotalScaffoldLinks', \%Results, $fh ); printlf( 'MeanScaffoldLinkWeight', \%Results, $fh ); print STDERR "\n" if ( !$silent ); $fh->print("\n"); print STDERR "[Reads]\n" if ( !$silent ); $fh->print("[Reads]\n"); printl( 'TotalReadsInput', \%Results, $fh ); printl( 'TotalUsableReads', \%Results, $fh ); printld( 'AvgClearRange', \%Results, $fh ); printp( 'ContigReads', \%Results, $fh, $totalSeqs ); printp( 'BigContigReads', \%Results, $fh, $totalSeqs ); printp( 'SmallContigReads', \%Results, $fh, $totalSeqs ); printp( 'DegenContigReads', \%Results, $fh, $totalSeqs ); printp( 'SurrogateReads', \%Results, $fh, $totalSeqs ); printp( 'PlacedSurrogateReads', \%Results, $fh, $totalSeqs ); printp( 'SingletonReads', \%Results, $fh, $totalSeqs ); printp( 'ChaffReads', \%Results, $fh, $totalSeqs ); print STDERR "\n" if ( !$silent ); $fh->print("\n"); print STDERR "[Coverage]\n" if ( !$silent ); $fh->print("[Coverage]\n"); printlf( 'ContigsOnly', \%Results, $fh ); printlf( 'Contigs_Surrogates', \%Results, $fh ); printlf( 'Contigs_Degens_Surrogates', \%Results, $fh ); printlf( 'AllReads', \%Results, $fh ); print STDERR "\n" if ( !$silent ); $fh->print("\n"); print STDERR "[TotalBaseCounts]\n" if ( !$silent ); $fh->print("[TotalBaseCounts]\n"); printl( 'BasesCount', \%Results, $fh ); printl( 'ClearRangeLengthFRG', \%Results, $fh ); printl( 'ClearRangeLengthASM', \%Results, $fh ); printl( 'SurrogateBaseLength', \%Results, $fh ); printl( 'ContigBaseLength', \%Results, $fh ); printl( 'DegenBaseLength', \%Results, $fh ); printl( 'SingletonBaseLength', \%Results, $fh ); printl( 'Contig_SurrBaseLength', \%Results, $fh ); print STDERR "\n" if ( !$silent ); $fh->print("\n"); print STDERR "[gcContent]\n" if ( !$silent ); $fh->print("[gcContent]\n"); printlf( 'Content', \%Results, $fh ); print STDERR "\n" if ( !$silent ); $fh->print("\n"); $fh->close() or die("Could not close $prefix.qc ($!)"); if ($metrics) { die("Could not copy $prefix.qc to $prefix.qc.metrics") if ( !copy( "$prefix.qc", "$prefix.qc.metrics" ) ); my $fm = new IO::File(">>$prefix.qc.metrics") or die("Could not open $prefix.qc.metrics ($!)"); my @lib_ids = keys %libs_initial; foreach my $lib_id (@lib_ids) { $fm->print("[library_$lib_id\_initial]\n"); my $mea = $libs_initial{$lib_id}->{'mea'}; $fm->print("mean=$mea\n"); my $std = $libs_initial{$lib_id}->{'std'}; $fm->print("sd=$std\n\n"); $fm->print("[library_$lib_id\_final]\n"); $mea = $libs_final{$lib_id}->{'mea'}; $fm->print("mean=$mea\n"); $std = $libs_final{$lib_id}->{'std'}; $fm->print("sd=$std\n"); my $min = $libs_final{$lib_id}->{'min'}; $fm->print("min=$min\n"); my $max = $libs_final{$lib_id}->{'max'}; $fm->print("max=$max\n"); my $buc = $libs_final{$lib_id}->{'buc'}; $fm->print("buc=$buc\n"); my $his = $libs_final{$lib_id}->{'hist'}; $fm->print("hist=$his\n\n"); } $fm->print("[ContigHistogram]\n"); my @len_arr = values(%sizes); my @sorted_len_arr = sort { $a <=> $b } @len_arr; my $len_list = join( ',', @sorted_len_arr ); $fm->print("hist=$len_list\n"); $fm->close() or die("Could not close $prefix.qc.metrics ($!)"); } my $badMateFile = "$prefix.badMateFragmentIDs"; # sort on UID of UTG, or CCO, then begin position, then end # UTGs should only be unplaced surrogates open( BADMATES, "| sort -k2.5,2 -k4,4n -k5,5n > $badMateFile" ) || die "Couldn't write to $badMateFile."; local $\ = "\n"; local $, = "\t"; while ( my ( $badId, $coord ) = each %bad_mates ) { if ($coord) { print BADMATES $badId, @$coord; } else { print BADMATES $badId, 'Chaff'; } } close BADMATES; exit(0); }