GBlite.pm

################################################################################ # GBlite ################################################################################ package GBlite; use strict; use DataBrowser; ################### # Package globals # ################### # Month - for converting GenBank date format to numeric format my %Month = ( JAN=>'01', FEB=>'02', MAR=>'03', APR=>'04', MAY=>'05', JUN=>'06', JUL=>'07', AUG=>'08', SEP=>'09', OCT=>'10', NOV=>'11', DEC=>'12' ); sub new { my ($class, $fh) = @_; if (ref $fh !~ /GLOB/) {die "GBlite error: new expects a GLOB reference not $fh\n"} my $this = bless {}; $this->{FH} = $fh; $this->{LASTLINE} = ""; $this->{DONE} = 0; return $this; } sub nextEntry { my ($this) = @_; $this->_fastForward or return 0; my $FH = $this->{FH}; # These are the fields that will be kept my ($locus, $mol_type, $division, $date, $definition, $accession, $version, $gi, $keywords, $organism, $features, $sequence); # get LOCUS, MOL_TYPE, DIVISION, DATE from LOCUS line my $locus_line = $this->{LASTLINE}; my @field = split(/\s+/, $locus_line); $date = $field[@field -1]; my ($day, $month, $year) = split(/\-/, $date); $locus = $field[1]; $date = "$year-$Month{$month}-$day"; $mol_type = $field[4]; $division = $field[@field -2]; # get DEFINITION, which may span several lines my @def_line; while(<$FH>) { if (/^ACCESSION/) { $this->{LASTLINE} = $_; last; } else { push @def_line, $_; } } $definition = join("", @def_line); $definition =~ s/\s+/ /g; $definition = substr($definition, 11); # get ACCESSION, VERSION, and GI from the VERSION line while(<$FH>) {last if /^VERSION/} my $versionline = $_; ($accession, $version, $gi) = $versionline =~ /^\S+\s+(\w+)\.(\d+)\s+GI:(\d+)/; if (not defined $gi) {die ">>> $versionline"} if (not defined $accession) {die "acc>> $versionline"} if (not defined $version) {die "ver>> $versionline"} # parse the KEYWORDS, which may span several lines my %keyword; while(<$FH>) { if (/^SOURCE/) { $this->{LASTLINE} = $_; last; } else { $_ =~ s/[\.;]//g; # remove punctuation my @words = split; foreach my $word (@words) { $keyword{$word}++; } } } delete $keyword{KEYWORDS}; $keywords = [keys %keyword]; # parse the ORGANISM while(<$FH>) {last if /^\s+ORGANISM/} my $orgline = $_; ($organism) = $orgline =~ /ORGANISM\s+(.+)/; my @tax_line; while(<$FH>) { if (/^REFERENCE/) { $this->{LASTLINE} = $_; last; } else { push @tax_line, $_; } } my $taxonomy = join("", @tax_line); $taxonomy =~ s/\s+/ /g; # parse the FEATURES while(<$FH>) {last if /^FEATURES/} # skip ahead my @lines; $features = []; while(<$FH>) { chomp; last if /^BASE COUNT/; if (substr($_, 5, 1) ne ' ' and @lines) { push @$features, GBlite::Feature::new(\@lines); @lines = ($_); } else { push @lines, $_; } } push @$features, GBlite::Feature::new(\@lines); if (@$features == 0) {die "unexpected fatal parsing error\n"} # parse the SEQUENCE <$FH>; # throw away origin line my @seq; while(<$FH>) { last if /^\/\//; $_ =~ s/\d+//g; $_ =~ s/\s+//g; push @seq, $_; } $sequence = join("", @seq); $sequence = uc $sequence; $this->{LASTLINE} = $_; my $entry = GBlite::Entry::new($locus, $mol_type, $division, $date, $definition, $accession, $version, $gi, $keywords, $organism, $features, $sequence, $taxonomy); return $entry; } sub _fastForward { my ($this) = @_; return 0 if $this->{DONE} or not defined $this->{LASTLINE}; return 1 if $this->{LASTLINE} =~ /^LOCUS/; my $FH = $this->{FH}; while(<$FH>) { if ($_ =~ /^LOCUS/) { $this->{LASTLINE} = $_; return 1; } } return 0 if not defined $_; warn "Possible parse error in _fastForward in GBlite.pm\n", $_; } ################################################################################ # GBlite::Entry ################################################################################ package GBlite::Entry; use strict; use DataBrowser; # Field - these are the fields that will be parsed for every GenBank entry my @Field = qw( LOCUS MOL_TYPE DIVISION DATE DEFINITION ACCESSION VERSION GI KEYWORDS ORGANISM FEATURES SEQUENCE TAXONOMY ); sub new { my $entry = bless {}; ($entry->{LOCUS}, $entry->{MOL_TYPE}, $entry->{DIVISION}, $entry->{DATE}, $entry->{DEFINITION}, $entry->{ACCESSION}, $entry->{VERSION}, $entry->{GI}, $entry->{KEYWORDS}, $entry->{ORGANISM}, $entry->{FEATURES}, $entry->{SEQUENCE}, $entry->{TAXONOMY}) = @_; my $CONSTRUCTOR_ERROR = 0; foreach my $name (@Field) { if (not defined $entry->{$name}) { $CONSTRUCTOR_ERROR++; warn "GBlite::Entry constructor error, $name undefined\n"; } } if ($CONSTRUCTOR_ERROR) {browse($entry); exit(1)} return $entry; } sub locus {shift->{LOCUS}} sub mol_type {shift->{MOL_TYPE}} sub division {shift->{DIVISION}} sub date {shift->{DATE}} sub definition {shift->{DEFINITION}} sub accession {shift->{ACCESSION}} sub version {shift->{VERSION}} sub gi {shift->{GI}} sub keywords {shift->{KEYWORDS}} sub organism {shift->{ORGANISM}} sub features {shift->{FEATURES}} sub sequence {shift->{SEQUENCE}} sub length {length(shift->{SEQUENCE})} sub taxonomy {shift->{TAXONOMY}} ################################################################################ # GBlite::Feature ################################################################################ package GBlite::Feature; use strict; use DataBrowser; sub new { my $feature = bless {}; my ($lines) = @_; my $string = join("", @$lines); # join all lines my @part = split(/\s{10,30}\//, $string); # split qualifiers from key/location $string =~ s/\s+/ /g; # trim multiple spaces to one space my $key_loc = shift @part; my ($key, $location) = $key_loc =~ /^\s+(\S+)\s+(.+)/; $location =~ s/\s+//g; my $qualifiers; foreach my $qual (@part) { if ($qual =~ /\S=\S/) { my ($key, $value) = $qual =~ /^(\S+)=(.+)/; $value =~ s/"//g; $value =~ s/\s+$//g; $value =~ s/\s+/ /g; if (not defined $key) { print "$key --> $qual\n"; print "@$lines\n"; die "GBlite::Feature constructor error\n"; } if ($key eq 'translation') {$value =~ s/\s+//g} $qualifiers->{$key} .= "$value "; } else { $qualifiers->{$key} = ""; } } $feature->{KEY} = $key; $feature->{LOCATION} = $location; $feature->{QUALIFIERS} = $qualifiers; return $feature; } sub key {shift->{KEY}} sub location {shift->{LOCATION}} sub qualifiers {shift->{QUALIFIERS}} 1; __END__ =head1 NAME GBlite.pm =head1 SYNOPSIS use GBlite; my $genbank = new GBlite(\*STDIN); while(my $entry = $genbank->nextEntry) { $entry->locus; $entry->mol_type; $entry->division; $entry->date; # yyyy-mm-dd $entry->definition; $entry->accession; $entry->version; $entry->gi; $entry->keywords; # reference to ARRAY $entry->organism; $entry->features; # reference to ARRAY $entry->sequence; foreach my $feature (@{$entry->features}) { $feature->key; $feature->location; $feature->qualifiers; # reference to HASH } } =head1 DESCRIPTION GBlite is a package for parsing concatenated GenBank flat files. The GenBank format is a common format for bioinformatics. Its specification is complicated, and anyone using this module should at least skim the GenBank release.notes and the DDJB/EMBL/GenBank feature table specification. These documents are available from the NCBI. =head1 AUTHOR Ian Korf (ikorf@sapiens.wustl.edu, http://sapiens.wustl.edu/~ikorf) =head1 ACKNOWLEDGEMENTS This software was developed at Washington Univeristy, St. Louis, MO. =head1 COPYRIGHT Copyright (C) 2000 Ian Korf. All Rights Reserved. =head1 DISCLAIMER This software is provided "as is" without warranty of any kind. =cut