#!/usr/local/bin/perl # Author: Zemin Zhang # Genentech Inc. # Purpose: Assign a score, e-value, and cleavage site for each protein sequences # in a FASTA file # Options: -l length of N-terminus to be used, with default set to full length # Usage: sigpfam.pl -l 50 FastaFile OutputFileName SignalModelName # Note: This require that the hmmpfam program in the HMMER package has been installed use Getopt::Std; getopt("l"); $fafile = $ARGV[0]; $outfile = $ARGV[1]; $pfam = $ARGV[2]; open (IN, "<$fafile"); open (OUT, ">$outfile"); while ($line = ) { if (eof || $line =~ /^>(\S+)/) { $newname = $1; if ($line =~ /^>\S+\s+\d+aa\s+(\d+)\s+/) { $newloc = $1; } if (++$num == 1) { $name = $newname; $loc = $newloc; next; } $justseq = $seq; $justseq =~ s/\W//g; if ($opt_l) { $seq = substr($justseq, 0, $opt_l); } ($score, $eval, $predloc) = checkthis($name, $seq, $pfam); print OUT "$name\t$score\t$eval\t$predloc\t$loc\n"; $seq = ""; $name = $newname; $loc = $newloc; } else { $seq .= $line; } } sub checkthis { my $name = shift; my $seq = shift; my $model = shift; my $seqfile = "p1.$name"; my $linecount = 0; my $start = 0; open (FA, ">$seqfile"); print FA ">$name\n$seq"; close(FA); my $cmd = "hmmpfam $model $seqfile"; my ($loc, $score, $eval, @temp); open(PF, "$cmd | "); while ($line = ) { $linecount++; if ($line =~ /^Model\s+Domain\s+/) { $linecount = 0; $start = 1; } elsif ($start && $linecount == 2) { @temp = split(/\s+/,$line); ($loc, $score, $eval) = @temp[3, $#temp - 1, $#temp]; last; } } close(PF); unlink($seqfile); return ($score, $eval, $loc); }