find_count.pl

#!perl -w use strict; # Searching for motifs, Counting ACGT and errors # documentation written in pod, perldoc perlpod =head1 NAME find_count.pl =head1 DESCRIPTION Simple example of motif searches and counting nucleotides in dna. =head1 SYNOPSIS find_count.pl [man|help] find_count.pl [counts] [motif=<regular_expresion>] ... [filename=<filename>] ... =head1 EXAMPLES perl find_count.pl --filename="smallfile.txt" --counts perl find_count.pl --f='NM_021964fragment.pep' --mo='A[DS]V' --mo='KND*E{2,}' perl find_count.pl --f='NM_021964fragment.pep' --mo='EE.*EE' --mo='A{7,14}' --mo='(CAT){3,}' =head1 OPTIONS All options must have a single or double hyphen in front of the name. Only the minimum name that is unique is required. =over 4 =item --counts Count dna nucleotides. =item --eg Show the EXAMPLES section from documentation. =item --filename Read in a filename or a list of filenames separated by commas or specified from the command line. --filename=one --filename=two --filename="three,four,five" =item --help or ? Return synopsis and usage information on options. =item --man Return the complete documentation. =item --motif Specify motifs to look for within the sequence. This may be several --motif entries on the command line. --motif='A[DS]V' --motif='KND*E{2,}' --motif='EE.*EE' --motif='A{7,14}' --motif='(CAT){3,}' =item --test Test all functions. =item --verbose Show original filenames before executing any functions. Show what is missing during sequence searching. =back =head1 NOTES It doesn't make sense to search protein sequences and count dna nucleotides. =head1 AUTHOR count( ) and motif( ) from Beginning Perl for Bioinformatics by James Tisdall, extended by David Scott to tease out a super simple library. =cut package main; # default namespace # Perl library entries use English; # Use longer names for readability # $PROGRAM_NAME: use SelfLoader; # Autoload for test routines use Pod::Usage; # documentation reader library, pod2usage() use Getopt::Long; # option reader library, GetOptions() pod2usage(0) if $#ARGV == -1; # no arguments if @ARGV has no entries # general options my $counts = ""; # count nucleotides, false by default my $help = ""; # no help by default my $man = ""; # no man page by default my $quiet = ""; # opposite of verbose my $showeg = ""; # no examples by default my $test = ""; # test routine my $verbose = ""; # false by default # application options my @motif = (); # empty list of motifs my @filename = (); # empty list of filenames GetOptions( "verbose!" => \$verbose, quiet => sub { $verbose = 0; }, 'help|?' => \$help, man => \$man, eg => \$showeg, test => \$test, "motif:s" => \@motif, "filename:s" => \@filename, counts => \$counts, ); pod2usage(1) if $help; pod2usage(-message => 'Spurious argument(s) left: ' . "@ARGV", -verbose => 1) if $#ARGV != -1; pod2usage(-exitstatus => 0, -verbose => 2) if $man; show_eg(`podselect -s "EXAMPLES" $PROGRAM_NAME`) if $showeg; test() if $test; # Clear out quoting from the command line my @clearance = (); foreach my $motif (@motif) { $motif =~ s/\"//g; $motif =~ s/\'//g; push @clearance, $motif; } @motif = (); @motif = @clearance; # Enable comma separated filenames @filename = split(/,/,join(',',@filename)); # Clear out filename quoting from the command line my @clearname = (); foreach my $filename (@filename) { $filename =~ s/\"$//; $filename =~ s/^\"//; $filename =~ s/\'$//; $filename =~ s/^\'//; push @clearname, $filename; } @filename = (); @filename = @clearname; # The DNA # Show the filenames if verbose if ( $verbose ) { print "Filenames:\n"; foreach my $filename (@filename) { print "$filename\n"; } } while (my $filename = shift @filename) { my $sequence = slurp($filename); # Remove whitespace $$sequence =~ s/\s//g; print count($filename, $sequence) if $counts; my @motifs = @motif; while (my $motif = shift @motifs) { print motif($motif, $sequence, $filename); } } # Successful exit exit(0); # Example 5 in functions # my $results = count($filename,\$sequence); # return a string of counts including errors, and the filename sub count { my ($filename, $dna) = @_; # Initialize the counts. # Notice that we can use scalar variables to hold numbers. my ($a, $c, $g, $t, $e) = (0, 0, 0, 0, 0); # Use a regular expression "trick", and five while loops, # to find the counts of the four bases plus errors while($$dna =~ /a/ig){++$a;} while($$dna =~ /c/ig){++$c;} while($$dna =~ /g/ig){++$g;} while($$dna =~ /t/ig){++$t;} while($$dna =~ /[^acgt\s]/ig){++$e;} return "$filename:\tA=$a C=$c G=$g T=$t errors=$e\n"; } # my $text = motif($re, $sequence, $filename); sub motif { my ($motif, $sequence, $filename ) = @_; if ( $$sequence =~ /$motif/i ) { return "$motif found: $filename\n"; } else { return "$motif missing: $filename\n" } } # Show only the examples sub show_eg { my @lines = @_; # remove 1st and last lines pop @lines; shift @lines; print @lines; exit (0); } # \$data = slurp($filename); # slurp in all data into a single variable # return a reference to avoid copying data # Recommend: File::Slurp library for this # This function was benchmarked with reading an array and reading lines # On a slow PC, it quickly out paced arrays, and did better than lines. sub slurp { my ($filename) = @_; my $inf; local $/; open($inf, "< $filename") or die("Unable to open $filename: $!"); my $buf = <$inf>; close $inf; return \$buf } __DATA__ # test functions # Tests have to be SelfLoaded because the test sets up its run # during the loading of the library: Test::More # Test the stable part of your program, the functions that reveal # the science. A program is often the greatest point of changes. # Not all functions should be tested ... many are seen immediately. # Some functions need rewritten to test ... I return a string from # count() instead of having it untestable as a function. # sub test { use Test::More tests => 2; # set up data my $sequence = "throwawayslurp.data"; my $data = ' AGCTAGCTCATCATCATCATTAGBAGDADTATGAGCGCAAAAAAAA BADBADGGGGCCCCATATATATATATATATAAAAAAAAAAAAAAAA '; open(FILE, "> $sequence") or die("Unable to write: $sequence: $!"); print FILE $data; close(FILE); # run tests my $slurp = slurp($sequence); is($$slurp,$data,"Test data written with data read by slurp"); my $text = count($sequence,$slurp); # note the use of tab as \t and newline as \n # These are hard to spot in testing # I let the test tell me the answers: # is($text,"tell me", "Test of counts and output"); is($text,"throwawayslurp.data:\tA=45 C=12 G=11 T=17 errors=7\n", "Test of counts and output"); # Clean up any test files unlink($sequence) or die("Unable to delete $sequence: $!"); # successful exit exit (0); }