#!/usr/bin/perl -w use strict; use BPlite; use FAlite; use Getopt::Std; use vars qw( $opt_c $opt_b $opt_B $opt_s $opt_S $opt_p $opt_P $opt_i $opt_I $opt_r ); ############ # defaults # ############ my $MASK = ''; my $MIN_BITS = 1; my $MAX_BITS = 1e200; my $MIN_SCORE = 1; my $MAX_SCORE = 1e200; my $MIN_P = 0; my $MAX_P = 1; my $MIN_IDENTITY = 1; my $MAX_IDENTITY = 100; ######### # usage # ######### my $usage = " usage: xblast.pl [options] <query file> < <blast_report> options: [default] -c <masking character> [$MASK] -b <minimum bit score> [$MIN_BITS] -B <Maximum bit score> [$MAX_BITS] -s <minimum raw score> [$MIN_SCORE] -S <maximum raw score> [$MAX_SCORE] -p <minimum P-value> [$MIN_P] -P <maximum P-value> [$MAX_P] -i <mimimum identity> [$MIN_IDENTITY] -I <maximum identity> [$MAX_IDENTITY] -r switch to reverse-mask the sequence "; ########################### # command line processing # ########################### getopts('c:rb:b:s:S:p:P:v:'); die $usage unless @ARGV == 1; $MASK = $opt_c if $opt_c; $MIN_BITS = $opt_b if $opt_b; $MAX_BITS = $opt_B if $opt_B; $MIN_SCORE = $opt_s if $opt_s; $MAX_SCORE = $opt_S if $opt_S; $MIN_P = $opt_p if $opt_p; $MAX_P = $opt_P if $opt_P; $MIN_IDENTITY = $opt_i if $opt_i; $MAX_IDENTITY = $opt_I if $opt_I; my $REVERSE = $opt_r; my ($query) = @ARGV; ############################# # BPlite and FAlite objects # ############################# open(FASTA, $query) or die; my $BPlite = new BPlite::Multi(\*STDIN); my $FAlite = new FAlite(\*FASTA); ########################################## # iterate through both BPlite and FAlite # ########################################## while(my $blast = $BPlite->nextReport, my $fasta = $FAlite->nextEntry) { my $seq = $fasta->seq; ############### # parse BLAST # ############### while (my $sbjct = $blast->nextSbjct) { while (my $hsp = $sbjct->nextHSP) { ########### # filters # ########### next if $hsp->bits < $MIN_BITS; next if $hsp->bits > $MAX_BITS; next if $hsp->score < $MIN_SCORE; next if $hsp->score > $MAX_SCORE; next if $hsp->P < $MIN_P; next if $hsp->P > $MAX_P; next if $hsp->percent < $MIN_IDENTITY; next if $hsp->percent > $MAX_IDENTITY; ########### # masking # ########### my ($begin, $end) = ($hsp->queryBegin, $hsp->queryEnd); ($begin, $end) = ($end, $begin) if $begin > $end; my $len = $end - $begin + 1; substr($seq, $begin-1, $len) = lc(substr($seq, $begin-1, $len)); } } ####################### # output FASTA format # ####################### if ($REVERSE) {$seq =~ tr/a-zA-Z/A-Za-z/} if ($MASK) {$seq =~ s/[a-z]/$MASK/g} print $fasta->def, "\n"; for(my $i=0;$i<length($seq);$i+=50) { print substr($seq, $i, 50), "\n"; } } 1; __END__ =head1 NAME xblast.pl - read blast output and mask query =head1 SYNOPSIS xblast.pl fasta_query < blast_report =head1 DESCRIPTION xblast.pl is useful for masking out hits from a "junk" library before searching a more interesting library. xblast.pl is intentionally similar to Jean-Michel Claverie's xblast (see http://blast.wustl.edu), but has several important differences. The most important of these =head1 OPTIONS =head2 masking You have a choice for the mask character. The default is lowercase letters, but this can be overridden with the -c option, which is usually set to 'X' for protein sequences and 'N' for nucleotide sequence. You may set it to whatever you wish, but modern BLAST programs can skip lowercase letters when seeding alignments. =head2 filtering You can filter individual HSPs, for example, to select only the reasonably good ones for masking. The filters may be based on bit score (-b, -B) raw score (-s, -S), P-value (-p, -P), and percent identity (-i, -I). In each case, the lowercase letter is for the minimum value to keep an HSP and the uppercase letter is for the maximum value to keep. See examples below. =head2 reverse masking You can reverse mask the sequence with the -r option. =head1 EXAMPLES Here's how you screen your query against an Alu repeat library before searching dbEST (assuming you have appropriate libraries). blastn alu query | xblast.pl query | blastn dbEST query Now let's suppose that you only want to filter off the hits with at least a score of 20 bits. I've made the command line briefer in the following examples. ... | xblast.pl -b 20 query | ... You could also use P-value to mask with only those HSPs whose P-value < 1e-5. Note the capital P below, which is a cutoff for maximum P value to keep the HSP, and hence, mask with it. ... | xblast.pl -P 1e-5 | ... Another option would be to mask using percent identity. ... | xbblast.pl -i 80 | ... Generally, you'll want to use -b, -s, -P, and -i. But if for some reason you want to mask only the low-scoring hits, you can set -B, -S, -p, and -I. =head1 DIFFERENCES FROM XBLAST =head2 lowercase masking As mentioned above, xblast.pl uses lowercase masking by default. =head2 no deletions The original xblast supported deleting instead of masking by setting the mask character to "". This is not supported in xblast.pl, nor do I think it is a good idea because you could end up with a sequence with no letters in it. =head2 FASTA database queries Modern versions of BLAST support using a fasta database as a query. xblast.pl does too. FASTA database goes in... masked FASTA database comes out. =head2 filters The original xblast did not allow you to filter your blast report, you had to rely on the parameters of the blast search. xblast.pl is a bit more extensible, for example, allowing the use of percent identity. You can add whatever fliters you like by modifiying the code. For example, you may with to filter based on the number of HSPs, or the length of the alignment, or the coverage, etc. =head1 AUTHOR Ian Korf (ikorf@sapiens.wustl.edu, http://sapiens.wustl.edu/~ikorf) =head1 ACKNOWLEDGEMENTS This software was developed in the Department of Computer Science at Washington Univeristy, St. Louis, MO. =head1 COPYRIGHT Copyright (C) 2000 Ian Korf. All Rights Reserved. =head1 DISCLAIMER This software is provided "as is" without warranty of any kind. =cut