#!/usr/bin/perl -w
# Copyright (C) 1999-2001 Ian Korf. All Rights Reserved.
use strict;
use FAlite;
my $usage = "
TREGEX - translated regular expressions. Reads FASTA database of nucleotide
sequences (eg. ESTs) and searches for patterns using a regular expression.
usage: tregex.pl
";
die $usage unless @ARGV == 2;
my ($FASTA, $REGEX) = @ARGV;
# complete translation table including ambiguities. All caps, so one must first
# convert their sequence to uppercase to use
my %AA = (
'AAA' => 'K', 'AAC' => 'N', 'AAG' => 'K', 'AAT' => 'N',
'AAR' => 'K', 'AAY' => 'N', 'ACA' => 'T', 'ACC' => 'T',
'ACG' => 'T', 'ACT' => 'T', 'ACR' => 'T', 'ACY' => 'T',
'ACK' => 'T', 'ACM' => 'T', 'ACW' => 'T', 'ACS' => 'T',
'ACB' => 'T', 'ACD' => 'T', 'ACH' => 'T', 'ACV' => 'T',
'ACN' => 'T', 'AGA' => 'R', 'AGC' => 'S', 'AGG' => 'R',
'AGT' => 'S', 'AGR' => 'R', 'AGY' => 'S', 'ATA' => 'I',
'ATC' => 'I', 'ATG' => 'M', 'ATT' => 'I', 'ATY' => 'I',
'ATM' => 'I', 'ATW' => 'I', 'ATH' => 'I', 'CAA' => 'Q',
'CAC' => 'H', 'CAG' => 'Q', 'CAT' => 'H', 'CAR' => 'Q',
'CAY' => 'H', 'CCA' => 'P', 'CCC' => 'P', 'CCG' => 'P',
'CCT' => 'P', 'CCR' => 'P', 'CCY' => 'P', 'CCK' => 'P',
'CCM' => 'P', 'CCW' => 'P', 'CCS' => 'P', 'CCB' => 'P',
'CCD' => 'P', 'CCH' => 'P', 'CCV' => 'P', 'CCN' => 'P',
'CGA' => 'R', 'CGC' => 'R', 'CGG' => 'R', 'CGT' => 'R',
'CGR' => 'R', 'CGY' => 'R', 'CGK' => 'R', 'CGM' => 'R',
'CGW' => 'R', 'CGS' => 'R', 'CGB' => 'R', 'CGD' => 'R',
'CGH' => 'R', 'CGV' => 'R', 'CGN' => 'R', 'CTA' => 'L',
'CTC' => 'L', 'CTG' => 'L', 'CTT' => 'L', 'CTR' => 'L',
'CTY' => 'L', 'CTK' => 'L', 'CTM' => 'L', 'CTW' => 'L',
'CTS' => 'L', 'CTB' => 'L', 'CTD' => 'L', 'CTH' => 'L',
'CTV' => 'L', 'CTN' => 'L', 'GAA' => 'E', 'GAC' => 'D',
'GAG' => 'E', 'GAT' => 'D', 'GAR' => 'E', 'GAY' => 'D',
'GCA' => 'A', 'GCC' => 'A', 'GCG' => 'A', 'GCT' => 'A',
'GCR' => 'A', 'GCY' => 'A', 'GCK' => 'A', 'GCM' => 'A',
'GCW' => 'A', 'GCS' => 'A', 'GCB' => 'A', 'GCD' => 'A',
'GCH' => 'A', 'GCV' => 'A', 'GCN' => 'A', 'GGA' => 'G',
'GGC' => 'G', 'GGG' => 'G', 'GGT' => 'G', 'GGR' => 'G',
'GGY' => 'G', 'GGK' => 'G', 'GGM' => 'G', 'GGW' => 'G',
'GGS' => 'G', 'GGB' => 'G', 'GGD' => 'G', 'GGH' => 'G',
'GGV' => 'G', 'GGN' => 'G', 'GTA' => 'V', 'GTC' => 'V',
'GTG' => 'V', 'GTT' => 'V', 'GTR' => 'V', 'GTY' => 'V',
'GTK' => 'V', 'GTM' => 'V', 'GTW' => 'V', 'GTS' => 'V',
'GTB' => 'V', 'GTD' => 'V', 'GTH' => 'V', 'GTV' => 'V',
'GTN' => 'V', 'TAA' => '*', 'TAC' => 'Y', 'TAG' => '*',
'TAT' => 'Y', 'TAR' => '*', 'TAY' => 'Y', 'TCA' => 'S',
'TCC' => 'S', 'TCG' => 'S', 'TCT' => 'S', 'TCR' => 'S',
'TCY' => 'S', 'TCK' => 'S', 'TCM' => 'S', 'TCW' => 'S',
'TCS' => 'S', 'TCB' => 'S', 'TCD' => 'S', 'TCH' => 'S',
'TCV' => 'S', 'TCN' => 'S', 'TGA' => '*', 'TGC' => 'C',
'TGG' => 'W', 'TGT' => 'C', 'TGY' => 'C', 'TTA' => 'L',
'TTC' => 'F', 'TTG' => 'L', 'TTT' => 'F', 'TTR' => 'L',
'TTY' => 'F', 'TRA' => '*', 'YTA' => 'L', 'YTG' => 'L',
'YTR' => 'L', 'MGA' => 'R', 'MGG' => 'R', 'MGR' => 'R',
);
# canonical FAlite usage
open(FASTA, $FASTA) or die "could not open the FASTA database $FASTA\n";
my $fasta = new FAlite(\*FASTA);
while(my $entry = $fasta->nextEntry) { # grab a sequence
my $seq = uc($entry->seq); # uppercase the sequence
my @pep = translate6frames($seq); # translate in 6 frames
my $match = 0;
foreach my $pep (@pep) {
if ($pep =~ /($REGEX)/) {
$match = $1; # keep the actual sequence found
last; # no need to continue
}
}
# output results if there are any
if ($match) {
print "$REGEX matches ($match) in ", $entry->def, "\n";
}
}
exit(0);
################################################################################
# subroutines from here down
################################################################################
sub translate6frames {
my ($seq) = @_;
my $rc = reversecomplement($seq);
my @peptide = (
translate($seq), # frame 1
translate(substr($seq, 1)), # frame 2
translate(substr($seq, 2)), # frame 3
translate($rc), # frame -1
translate(substr($rc, 1)), # frame -2
translate(substr($rc, 2)) # frame -3
);
}
sub reversecomplement {
my ($seq) = @_;
$seq = reverse $seq; # works on strings or arrays
$seq =~ tr[ACGTRYKMSWBDHV]
[TGCAYRMKWSVHDB]; # translate operator
return $seq;
}
sub translate {
my ($seq) = @_;
my @aa; # translation kept as a list
my $translate_length = length($seq) - (length($seq) % 3);
for(my $i=0;$i<$translate_length;$i+=3) {
my $codon = substr($seq, $i, 3);
if (defined $AA{$codon}) {push @aa, $AA{$codon}} # found amino acid
else {push @aa, 'X'} # ambiguous amino acid
}
return join("", @aa); # return translation as string instead of array
}
__END__
=head1 NAME
tregex.pl - translate nucleotide sequences and search for protein patterns
=head1 SYNOPSIS
tregex.pl fasta_query regular_expression
=head1 DESCRIPTION
tregex.pl is useful for looking for protein patterns in nucleotide sequences
such as EST databases. The regular expression should be in perl syntax. It may
be more natural to use prosite syntax, and if there are enough requests, I'll
make this an option.
=head1 AUTHOR
Ian Korf (ikorf@sapiens.wustl.edu, http://sapiens.wustl.edu/~ikorf)
=head1 ACKNOWLEDGEMENTS
This software was developed in the Department of Computer Science at Washington
Univeristy, St. Louis, MO.
=head1 COPYRIGHT
Copyright (C) 2001 Ian Korf. All Rights Reserved.
=head1 DISCLAIMER
This software is provided "as is" without warranty of any kind.
=cut