parse-genbank.pl

#!perl
# Revised to add Base Count (Jeen Bae, 5-28-2012)
# Homemade GenBank report parser using regular expressions.
# Once desired data is captured, it can be printed in any format.
#
# Usage:  perl parse-genbank.pl <GenBank file>
#
# Reads the GenBank flat file named by the first command-line argument
# line by line and prints, to STDOUT, one line for each item recognised:
#
#   Locus:          first word after LOCUS
#   GI:             digits following "GI:" on the VERSION line
#   Sequence name:  text of the DEFINITION line (first line only; a
#                   trailing period is dropped)
#   Organism:       text after ORGANISM
#   Gene length:    end - start + 1 of a simple "gene start..end" feature
#   CDS:            start and end of a simple "CDS start..end" feature
#   Translation:    the /translation="..." qualifier, whitespace removed
#   BASE COUNT:     "<base> - <count>" pairs from the BASE COUNT line
#
# Locations such as complement(...) or join(...) are not recognised.
# Every other line is skipped.

use strict;
use warnings;

# Test definedness/length rather than truth so a file named "0" works.
my $gb_report = $ARGV[0];
die "USAGE: $0 <GenBank file>\n"
    unless defined $gb_report && length $gb_report;

# Three-argument open with a lexical handle: the file name can never be
# interpreted as an open mode or a pipe.
open( my $gb_fh, '<', $gb_report )
    or die "cannot open $gb_report for reading: $!";

# Flag for multiline translation; 1 means translation "in progress".
my $in_translation = 0;
my $protein        = '';

while ( my $line = <$gb_fh> ) {

    # A translation in progress is handled first so that a sequence
    # continuation line can never be claimed by one of the keyword
    # patterns below.
    if ($in_translation) {
        if ( $line =~ /^([^"]*)"/ ) {

            # terminal quote; end of translation
            $protein .= $1;
            print_translation($protein);
            $in_translation = 0;
        }
        else {

            # no terminal quote; translation continues
            $protein .= $line;
        }
    }
    elsif ( $line =~ /^\s*LOCUS\s+(\w+)/ ) {
        print "Locus: $1\n";
    }
    elsif ( $line =~ /^\s*VERSION.*GI:(\d+)/ ) {
        print "GI: $1\n";
    }
    elsif ( $line =~ /^\s*DEFINITION\s+(.*?)\.?\s*$/ ) {

        # Only the first line of a multi-line definition is reported;
        # continuation lines are not joined.
        print "Sequence name: $1\n";
    }
    elsif ( $line =~ /^\s*ORGANISM\s+(.*)/ ) {
        print "Organism: $1\n";
    }
    elsif ( $line =~ /\bgene\s+(\d+)\.\.(\d+)/ ) {

        # ex: gene            1..1500
        # Coordinates are inclusive, hence the +1.
        my $gene_length = $2 - $1 + 1;
        print "Gene length: $gene_length\n";
    }
    elsif ( $line =~ /\bCDS\s+(\d+)\.\.(\d+)/ ) {

        # ex: CDS             357..1541
        my ( $cds_start, $cds_end ) = ( $1, $2 );
        print "CDS: $cds_start - $cds_end\n";
    }
    elsif ( $line =~ m{/translation="([^"]*)("?)} ) {

        # protein product begins
        $protein = $1;
        if ( length $2 ) {

            # Opening and closing quote on the same line: short product.
            print_translation($protein);
        }
        else {
            $in_translation = 1;
        }
    }
    elsif ( $line =~ /(BASE COUNT)\s*(.*)$/ ) {

        # extract base counts
        # ex: BASE COUNT      360 a    269 c    262 g    342 t
        my $label  = $1;
        my @tokens = split ' ', $2;

        # Each token containing a letter is a base name; the token just
        # before it is its count.  Start at index 1 so that a leading
        # name never pairs with the last element via index -1.
        my @pairs;
        for my $i ( 1 .. $#tokens ) {
            next unless $tokens[$i] =~ /[a-z]/i;
            push @pairs, "$tokens[$i] - $tokens[ $i - 1 ]";
        }
        print "$label: ", join( ', ', @pairs ), "\n";
    }

    # Anything else: skip this data.
}

# Reaching end of file inside a translation means the closing quote was
# never seen.
if ($in_translation) {
    print "Problems: end of translation product not found.\n";
}

close($gb_fh)
    or die "cannot close $gb_report: $!";

exit(0);

# Print a completed translation on one line with all whitespace removed.
sub print_translation {
    my ($sequence) = @_;
    $sequence =~ s/\s+//g;
    print "Translation: $sequence\n";
    return;
}