#!perl -w
use strict;
# Searching for motifs, Counting ACGT and errors
# documentation written in pod, perldoc perlpod
=head1 NAME
find_count.pl
=head1 DESCRIPTION
Simple example of motif searches and counting nucleotides
in dna.
=head1 SYNOPSIS
find_count.pl [man|help]
find_count.pl [counts] [motif=] ... [filename=] ...
=head1 EXAMPLES
perl find_count.pl --filename="smallfile.txt" --counts
perl find_count.pl --f='NM_021964fragment.pep' --mo='A[DS]V' --mo='KND*E{2,}'
perl find_count.pl --f='NM_021964fragment.pep' --mo='EE.*EE' --mo='A{7,14}' --mo='(CAT){3,}'
=head1 OPTIONS
All options must have a single or double hyphen in front of the name. Only
the minimum name that is unique is required.
=over 4
=item --counts
Count dna nucleotides.
=item --eg
Show the EXAMPLES section from documentation.
=item --filename
Read in a filename or a list of filenames separated by commas
or specified from the command line.
--filename=one --filename=two --filename="three,four,five"
=item --help or ?
Return synopsis and usage information on options.
=item --man
Return the complete documentation.
=item --motif
Specify motifs to look for within the sequence. This may be
several --motif entries on the command line.
--motif='A[DS]V' --motif='KND*E{2,}' --motif='EE.*EE'
--motif='A{7,14}' --motif='(CAT){3,}'
=item --test
Test all functions.
=item --verbose
Show original filenames before executing any functions.
Show what is missing during sequence searching.
=back
=head1 NOTES
It doesn't make sense to search protein sequences and
count dna nucleotides.
=head1 AUTHOR
count( ) and motif( ) from Beginning Perl for Bioinformatics by James Tisdall,
extended by David Scott to tease out a super simple library.
=cut
package main; # default namespace
# Perl library entries
use English; # Use longer names for readability
# $PROGRAM_NAME:
use SelfLoader; # Autoload for test routines
use Pod::Usage; # documentation reader library, pod2usage()
use Getopt::Long; # option reader library, GetOptions()
pod2usage(0) if $#ARGV == -1; # no arguments if @ARGV has no entries
# general options
my $counts = ""; # count nucleotides, false by default
my $help = ""; # no help by default
my $man = ""; # no man page by default
my $quiet = ""; # opposite of verbose
my $showeg = ""; # no examples by default
my $test = ""; # test routine
my $verbose = ""; # false by default
# application options
my @motif = (); # empty list of motifs
my @filename = (); # empty list of filenames
GetOptions( "verbose!" => \$verbose, quiet => sub { $verbose = 0; },
'help|?' => \$help, man => \$man, eg => \$showeg,
test => \$test,
"motif:s" => \@motif, "filename:s" => \@filename,
counts => \$counts,
);
pod2usage(1) if $help;
pod2usage(-message => 'Spurious argument(s) left: ' . "@ARGV",
-verbose => 1) if $#ARGV != -1;
pod2usage(-exitstatus => 0, -verbose => 2) if $man;
show_eg(`podselect -s "EXAMPLES" $PROGRAM_NAME`) if $showeg;
test() if $test;
# Clear out quoting from the command line
my @clearance = ();
foreach my $motif (@motif) {
$motif =~ s/\"//g;
$motif =~ s/\'//g;
push @clearance, $motif;
}
@motif = ();
@motif = @clearance;
# Enable comma separated filenames
@filename = split(/,/,join(',',@filename));
# Clear out filename quoting from the command line
my @clearname = ();
foreach my $filename (@filename) {
$filename =~ s/\"$//;
$filename =~ s/^\"//;
$filename =~ s/\'$//;
$filename =~ s/^\'//;
push @clearname, $filename;
}
@filename = ();
@filename = @clearname;
# The DNA
# Show the filenames if verbose
if ( $verbose ) {
print "Filenames:\n";
foreach my $filename (@filename) {
print "$filename\n";
}
}
while (my $filename = shift @filename) {
my $sequence = slurp($filename);
# Remove whitespace
$$sequence =~ s/\s//g;
print count($filename, $sequence) if $counts;
my @motifs = @motif;
while (my $motif = shift @motifs) {
print motif($motif, $sequence, $filename);
}
}
# Successful exit
exit(0);
# Example 5 in functions
# my $results = count($filename,\$sequence);
# return a string of counts including errors, and the filename
sub count {
my ($filename, $dna) = @_;
# Initialize the counts.
# Notice that we can use scalar variables to hold numbers.
my ($a, $c, $g, $t, $e) = (0, 0, 0, 0, 0);
# Use a regular expression "trick", and five while loops,
# to find the counts of the four bases plus errors
while($$dna =~ /a/ig){++$a;}
while($$dna =~ /c/ig){++$c;}
while($$dna =~ /g/ig){++$g;}
while($$dna =~ /t/ig){++$t;}
while($$dna =~ /[^acgt\s]/ig){++$e;}
return "$filename:\tA=$a C=$c G=$g T=$t errors=$e\n";
}
# my $text = motif($re, $sequence, $filename);
sub motif {
my ($motif, $sequence, $filename ) = @_;
if ( $$sequence =~ /$motif/i ) {
return "$motif found: $filename\n";
} else {
return "$motif missing: $filename\n"
}
}
# Show only the examples
sub show_eg {
my @lines = @_;
# remove 1st and last lines
pop @lines;
shift @lines;
print @lines;
exit (0);
}
# \$data = slurp($filename);
# slurp in all data into a single variable
# return a reference to avoid copying data
# Recommend: File::Slurp library for this
# This function was benchmarked with reading an array and reading lines
# On a slow PC, it quickly out paced arrays, and did better than lines.
sub slurp {
my ($filename) = @_;
my $inf;
local $/;
open($inf, "< $filename") or die("Unable to open $filename: $!");
my $buf = <$inf>;
close $inf;
return \$buf
}
__DATA__
# test functions
# Tests have to be SelfLoaded because the test sets up its run
# during the loading of the library: Test::More
# Test the stable part of your program, the functions that reveal
# the science. A program is often the greatest point of changes.
# Not all functions should be tested ... many are seen immediately.
# Some functions need rewritten to test ... I return a string from
# count() instead of having it untestable as a function.
#
sub test {
use Test::More tests => 2;
# set up data
my $sequence = "throwawayslurp.data";
my $data = '
AGCTAGCTCATCATCATCATTAGBAGDADTATGAGCGCAAAAAAAA
BADBADGGGGCCCCATATATATATATATATAAAAAAAAAAAAAAAA
';
open(FILE, "> $sequence") or die("Unable to write: $sequence: $!");
print FILE $data;
close(FILE);
# run tests
my $slurp = slurp($sequence);
is($$slurp,$data,"Test data written with data read by slurp");
my $text = count($sequence,$slurp);
# note the use of tab as \t and newline as \n
# These are hard to spot in testing
# I let the test tell me the answers:
# is($text,"tell me", "Test of counts and output");
is($text,"throwawayslurp.data:\tA=45 C=12 G=11 T=17 errors=7\n",
"Test of counts and output");
# Clean up any test files
unlink($sequence) or die("Unable to delete $sequence: $!");
# successful exit
exit (0);
}