#!perl -w use strict; use feature qw/state/; package main; my $verbose = 1; # expecting three input parameters from command line: # enzyme_file, DNA_file, and output_file. # If files are missing, exit. if( $#ARGV != 2) { print "\nMissing command line arguments\n\nUsage: perl $0 <test enzyme file> <test dna file> <output file>\n"; exit(1); } my ($enzyme_file, $dna_file, $out_file) = @ARGV; # read enzyme definition # specify enzymes, in format EnzymeName/Recognition Sequence(cut_Site')// # print "\nEnter the file containing the enzymes: \n\n"; # chomp(my $enzyme_file = <STDIN>); open (ENZYMEFILE, $enzyme_file) or die("\nCannot open the enzyme file \"$enzyme_file\"\n"); my %enzyme_seq = (); my %enzyme_position = (); while ( my $line = <ENZYMEFILE> ) { chomp($line); # split enzyme entry into @ar[0]:enzyme and ar[1]:sequence my @ar = split /\//, $line; # enzyme name as hash key # find cut position and clean up sequence if ($ar[1] =~ /\'/) { $enzyme_position{$ar[0]} = $-[0]; } $ar[1] =~ s/\'//; # translate IUB code $enzyme_seq{$ar[0]} = IUB_to_regexp($ar[1]); print "$ar[0]\t$enzyme_seq{$ar[0]}\t$enzyme_position{$ar[0]}\n" if $verbose; } close(ENZYMEFILE); # read DNA data # multiple sequences as simple text, one in each line open (DNAFILE, $dna_file) or die("\nCannot open the DNA file \"$dna_file\"\n"); # assign header to output file open (OUTFILE, ">$out_file") or die("\nCannot write file \"$out_file\"\n"); print OUTFILE "DNA\tEnzyme\tCut_Postion\n"; while (my $line = <DNAFILE>) { chomp($line); # count # of RE sites found in DNA my $cut_count = 0; for my $key (keys %enzyme_seq) { while($line =~ /$enzyme_seq{$key}/ig ) { $cut_count++; print OUTFILE "$line\t$key\t", $-[0]+$enzyme_position{$key}, "\n"; } } # no sites found if ($cut_count==0) { print OUTFILE "$line\t\tNo match found\n"; } } close(DNAFILE); close(OUTFILE); print "\nRE Sites are written to file $out_file", "\n\n"; exit(0); # # subroutine to translate IUB ambiguity codes to regular expressions sub IUB_to_regexp { my($iub) = @_; my $regular_expression = ''; print "iub ", $iub, "\t rgx ", $regular_expression, "\n" if $verbose; state %iub2character_class; state $loaded = 0; if ( ! $loaded ) { %iub2character_class = ( A => 'A', C => 'C', G => 'G', T => 'T', R => '[GA]', Y => '[CT]', M => '[AC]', K => '[GT]', S => '[GC]', W => '[AT]', B => '[CGT]', D => '[AGT]', H => '[ACT]', V => '[ACG]', N => '[ACGT]',); $loaded = 1; } # Translate each character in the iub sequence for ( my $i = 0 ; $i < length($iub) ; ++$i ) { $regular_expression .= $iub2character_class{substr($iub, $i, 1)}; } return $regular_expression; } __END__ =head1 NAME perl test_restriction_enzyme.pl =head1 DESCRIPTION Read a list of enzymes (like enzyme_list.txt) and print any matches in a file of dna. =head1 SYNOPSIS perl test_restriction_enzyme.pl <enzyme file> <dna file> <output file> =head1 EXAMPLE perl test_restriction_enzyme.pl enzyme_list.txt test.dna myout.txt =head1 NOTES ( A => 'A', C => 'C', G => 'G', T => 'T', B stands for C, G, or T D stands for A, G, or T H stands for A, C, or T K stands for G or T M stands for A or C N stands for A, C, G, or T R stands for G or A S stands for G or C V stands for A, C, or G W stands for A or T Y stands for C or T ,); =cut