#!perl
# Revised to add Base Count (Jeen Bae, 5-28-2012)
# Homemade Genbank report parser using regular expressions.
# Once desired data is captured, it can be printed in any format.
use strict;
use warnings;
# ---------------------------------------------------------------------------
# Main script: read a GenBank flat-file report line by line and print the
# fields of interest (locus, GI, definition, organism, gene length, CDS
# range, protein translation, base counts) to STDOUT.
#
# Only simple "start..end" locations are recognised for gene and CDS
# features; complement(), join() and partial (< / >) locations are skipped.
# ---------------------------------------------------------------------------

# Path of the report to parse (first command-line argument).
my $gb_report = shift @ARGV;
die "USAGE: $0 <genbank_report>\n"
    unless defined $gb_report && length $gb_report;

# Three-argument open with a lexical handle, so the file name can never be
# interpreted as an open mode or a piped command.
open( my $gb_fh, '<', $gb_report )
    or die "cannot open $gb_report for reading: $!";

my $definition;    # DEFINITION text being accumulated; undef when none pending
my $protein;       # translation text being accumulated
my $trans = 0;     # flag for multiline translation; 1 means "in progress"

LINE:
while ( my $line = <$gb_fh> ) {

    # Drop the line ending (LF or CRLF) and any trailing blanks.
    $line =~ s/\s+\z//;

    # A DEFINITION may wrap over several indented lines; it is complete at
    # the first line that is not an indented continuation.
    if ( defined $definition ) {
        if ( $line =~ /^\s+(\S.*)/ ) {
            $definition .= " $1";
            next LINE;
        }
        print_definition($definition);
        undef $definition;

        # Fall through: the current line still has to be examined.
    }

    if ($trans) {

        # Inside a multi-line /translation qualifier. This is tested before
        # any keyword pattern so that sequence text is never mistaken for a
        # keyword line.
        if ( $line =~ /^([^"]*)"/ ) {

            # terminal quote; end of translation
            $protein .= $1;
            print_protein($protein);
            $trans = 0;
        }
        else {

            # no terminal quote; translation continues
            $protein .= $line;
        }
    }
    elsif ( $line =~ /^LOCUS\s+(\S+)/ ) {
        print "Locus: $1\n";
    }
    elsif ( $line =~ /^VERSION\b.*GI:(\d+)/ ) {
        print "GI: $1\n";
    }
    elsif ( $line =~ /^DEFINITION\s+(.*)/ ) {

        # Printed later, once all continuation lines have been collected.
        $definition = $1;
    }
    elsif ( $line =~ /^\s+ORGANISM\s+(.*)/ ) {
        print "Organism: $1\n";
    }
    elsif ( $line =~ /^\s+gene\s+(\d+)\.\.(\d+)/ ) {

        # ex: gene 1..2881 -- both end points belong to the gene, hence +1.
        my $gene_length = $2 - $1 + 1;
        print "Gene length: $gene_length\n";
    }
    elsif ( $line =~ /^\s+CDS\s+(\d+)\.\.(\d+)/ ) {

        # ex: CDS 357..1541
        my ( $cds_start, $cds_end ) = ( $1, $2 );
        print "CDS: $cds_start - $cds_end\n";
    }
    elsif ( $line =~ m{/translation="([^"]*)("?)} ) {

        # protein product begins
        print "Translation: ";
        $protein = $1;
        if ( length $2 ) {

            # Opening and closing quote on the same line: already complete.
            print_protein($protein);
        }
        else {
            $trans = 1;
        }
    }
    elsif ( $line =~ /^(BASE COUNT)\s*(.*)/ ) {

        # extract base counts
        my ( $label, $counts ) = ( $1, $2 );
        print "$label: ", format_base_counts($counts), "\n";
    }

    # Every other line is skipped.
}

# The report ended while a field was still being collected.
print_definition($definition) if defined $definition;
if ($trans) {

    # The leading newline terminates the "Translation: " line printed earlier.
    print "\nProblems: end of translation product not found.\n";
}

close($gb_fh) or die "cannot close $gb_report: $!";
exit(0);

# Print the sequence name taken from a (possibly multi-line) DEFINITION,
# without the period that terminates the definition text.
sub print_definition {
    my ($text) = @_;
    $text =~ s/\.\z//;
    print "Sequence name: $text\n";
    return;
}

# Print a completed protein translation with all layout whitespace removed.
sub print_protein {
    my ($sequence) = @_;
    $sequence =~ s/\s+//g;
    print "$sequence\n";
    return;
}

# Turn the payload of a BASE COUNT line ("1165 a 1281 c ...") into
# "a - 1165, c - 1281, ...". Returns an empty string when no
# "<number> <name>" pair is present.
sub format_base_counts {
    my ($counts) = @_;
    my @pairs;
    while ( $counts =~ /(\d+)\s+([a-z]+)/gi ) {
        push @pairs, "$2 - $1";
    }
    return join ', ', @pairs;
}