tregex.pl

#!/usr/bin/perl -w # Copyright (C) 1999-2001 Ian Korf. All Rights Reserved. use strict; use FAlite; my $usage = " TREGEX - translated regular expressions. Reads FASTA database of nucleotide sequences (eg. ESTs) and searches for patterns using a regular expression. usage: tregex.pl <fasta file> <regex> "; die $usage unless @ARGV == 2; my ($FASTA, $REGEX) = @ARGV; # complete translation table including ambiguities. All caps, so one must first # convert their sequence to uppercase to use my %AA = ( 'AAA' => 'K', 'AAC' => 'N', 'AAG' => 'K', 'AAT' => 'N', 'AAR' => 'K', 'AAY' => 'N', 'ACA' => 'T', 'ACC' => 'T', 'ACG' => 'T', 'ACT' => 'T', 'ACR' => 'T', 'ACY' => 'T', 'ACK' => 'T', 'ACM' => 'T', 'ACW' => 'T', 'ACS' => 'T', 'ACB' => 'T', 'ACD' => 'T', 'ACH' => 'T', 'ACV' => 'T', 'ACN' => 'T', 'AGA' => 'R', 'AGC' => 'S', 'AGG' => 'R', 'AGT' => 'S', 'AGR' => 'R', 'AGY' => 'S', 'ATA' => 'I', 'ATC' => 'I', 'ATG' => 'M', 'ATT' => 'I', 'ATY' => 'I', 'ATM' => 'I', 'ATW' => 'I', 'ATH' => 'I', 'CAA' => 'Q', 'CAC' => 'H', 'CAG' => 'Q', 'CAT' => 'H', 'CAR' => 'Q', 'CAY' => 'H', 'CCA' => 'P', 'CCC' => 'P', 'CCG' => 'P', 'CCT' => 'P', 'CCR' => 'P', 'CCY' => 'P', 'CCK' => 'P', 'CCM' => 'P', 'CCW' => 'P', 'CCS' => 'P', 'CCB' => 'P', 'CCD' => 'P', 'CCH' => 'P', 'CCV' => 'P', 'CCN' => 'P', 'CGA' => 'R', 'CGC' => 'R', 'CGG' => 'R', 'CGT' => 'R', 'CGR' => 'R', 'CGY' => 'R', 'CGK' => 'R', 'CGM' => 'R', 'CGW' => 'R', 'CGS' => 'R', 'CGB' => 'R', 'CGD' => 'R', 'CGH' => 'R', 'CGV' => 'R', 'CGN' => 'R', 'CTA' => 'L', 'CTC' => 'L', 'CTG' => 'L', 'CTT' => 'L', 'CTR' => 'L', 'CTY' => 'L', 'CTK' => 'L', 'CTM' => 'L', 'CTW' => 'L', 'CTS' => 'L', 'CTB' => 'L', 'CTD' => 'L', 'CTH' => 'L', 'CTV' => 'L', 'CTN' => 'L', 'GAA' => 'E', 'GAC' => 'D', 'GAG' => 'E', 'GAT' => 'D', 'GAR' => 'E', 'GAY' => 'D', 'GCA' => 'A', 'GCC' => 'A', 'GCG' => 'A', 'GCT' => 'A', 'GCR' => 'A', 'GCY' => 'A', 'GCK' => 'A', 'GCM' => 'A', 'GCW' => 'A', 'GCS' => 'A', 'GCB' => 'A', 'GCD' => 'A', 'GCH' => 'A', 'GCV' => 'A', 'GCN' => 'A', 'GGA' => 'G', 'GGC' => 'G', 'GGG' => 'G', 'GGT' => 'G', 'GGR' => 'G', 'GGY' => 'G', 'GGK' => 'G', 'GGM' => 'G', 'GGW' => 'G', 'GGS' => 'G', 'GGB' => 'G', 'GGD' => 'G', 'GGH' => 'G', 'GGV' => 'G', 'GGN' => 'G', 'GTA' => 'V', 'GTC' => 'V', 'GTG' => 'V', 'GTT' => 'V', 'GTR' => 'V', 'GTY' => 'V', 'GTK' => 'V', 'GTM' => 'V', 'GTW' => 'V', 'GTS' => 'V', 'GTB' => 'V', 'GTD' => 'V', 'GTH' => 'V', 'GTV' => 'V', 'GTN' => 'V', 'TAA' => '*', 'TAC' => 'Y', 'TAG' => '*', 'TAT' => 'Y', 'TAR' => '*', 'TAY' => 'Y', 'TCA' => 'S', 'TCC' => 'S', 'TCG' => 'S', 'TCT' => 'S', 'TCR' => 'S', 'TCY' => 'S', 'TCK' => 'S', 'TCM' => 'S', 'TCW' => 'S', 'TCS' => 'S', 'TCB' => 'S', 'TCD' => 'S', 'TCH' => 'S', 'TCV' => 'S', 'TCN' => 'S', 'TGA' => '*', 'TGC' => 'C', 'TGG' => 'W', 'TGT' => 'C', 'TGY' => 'C', 'TTA' => 'L', 'TTC' => 'F', 'TTG' => 'L', 'TTT' => 'F', 'TTR' => 'L', 'TTY' => 'F', 'TRA' => '*', 'YTA' => 'L', 'YTG' => 'L', 'YTR' => 'L', 'MGA' => 'R', 'MGG' => 'R', 'MGR' => 'R', ); # canonical FAlite usage open(FASTA, $FASTA) or die "could not open the FASTA database $FASTA\n"; my $fasta = new FAlite(\*FASTA); while(my $entry = $fasta->nextEntry) { # grab a sequence my $seq = uc($entry->seq); # uppercase the sequence my @pep = translate6frames($seq); # translate in 6 frames my $match = 0; foreach my $pep (@pep) { if ($pep =~ /($REGEX)/) { $match = $1; # keep the actual sequence found last; # no need to continue } } # output results if there are any if ($match) { print "$REGEX matches ($match) in ", $entry->def, "\n"; } } exit(0); ################################################################################ # subroutines from here down ################################################################################ sub translate6frames { my ($seq) = @_; my $rc = reversecomplement($seq); my @peptide = ( translate($seq), # frame 1 translate(substr($seq, 1)), # frame 2 translate(substr($seq, 2)), # frame 3 translate($rc), # frame -1 translate(substr($rc, 1)), # frame -2 translate(substr($rc, 2)) # frame -3 ); } sub reversecomplement { my ($seq) = @_; $seq = reverse $seq; # works on strings or arrays $seq =~ tr[ACGTRYKMSWBDHV] [TGCAYRMKWSVHDB]; # translate operator return $seq; } sub translate { my ($seq) = @_; my @aa; # translation kept as a list my $translate_length = length($seq) - (length($seq) % 3); for(my $i=0;$i<$translate_length;$i+=3) { my $codon = substr($seq, $i, 3); if (defined $AA{$codon}) {push @aa, $AA{$codon}} # found amino acid else {push @aa, 'X'} # ambiguous amino acid } return join("", @aa); # return translation as string instead of array } __END__ =head1 NAME tregex.pl - translate nucleotide sequences and search for protein patterns =head1 SYNOPSIS tregex.pl fasta_query regular_expression =head1 DESCRIPTION tregex.pl is useful for looking for protein patterns in nucleotide sequences such as EST databases. The regular expression should be in perl syntax. It may be more natural to use prosite syntax, and if there are enough requests, I'll make this an option. =head1 AUTHOR Ian Korf (ikorf@sapiens.wustl.edu, http://sapiens.wustl.edu/~ikorf) =head1 ACKNOWLEDGEMENTS This software was developed in the Department of Computer Science at Washington Univeristy, St. Louis, MO. =head1 COPYRIGHT Copyright (C) 2001 Ian Korf. All Rights Reserved. =head1 DISCLAIMER This software is provided "as is" without warranty of any kind. =cut