#!/usr/local/bin/perl # Author: Zemin Zhang # Genentech Inc. # Purpose: process file that contains signal peptide information (following the format # below) and align them according to h-, c- and n-domains assignment # Modified on 6/6/06 # # Input file format: # Seq_Name Length Signal_Sequence # Example: # FCN3_HUMAN 23 MDLLWILPSLWLLLLGGPACLKT # EFA1_HUMAN 18 MEFLWAPLLGLCCSLAAA # NP_057276.2 40 MVLWESPRQCSSWTLCEGFCWLLLLPVMLLIVARPVKLAA # SPT2_HUMAN 27 MAQLCGLRRSRAFLALLGSLLLSGVLA # NP_077300.1 24 MRLPRRAALGLLPLLLLLPPAPEA # NP_057663.1 35 MSGGWMAQVGAWRTGALGLALLLLLGLGLGLEAAA # # Output Example: # FCN3_HUMAN MD----------------------LLWILPSLWLLLL----GGPACLKT # EFA1_HUMAN ME----------------------------FLWAPLL---GLCCSLAAA # SPT2_HUMAN MAQLCGLRRSR---------------AFLALLGSLLL-------SGVLA # NP_077300.1 MRLPRR-------------------AALGLLPLLLLL------PPAPEA # NP_057663.1 MSGGWMAQVGAWRTG------------ALGLALLLLL--GLGLGLEAAA # # Please report any bugs to zemin@gene.com if ($#ARGV<0) { print "Usage: pepalign.pl signal_file\n"; print "\nsignal_file format:\nSeq_Name Length Signal_Sequence\n\n"; print "Example: FCN3_HUMAN 23 MDLLWILPSLWLLLLGGPACLKT EFA1_HUMAN 18 MEFLWAPLLGLCCSLAAA NP_057276.2 40 MVLWESPRQCSSWTLCEGFCWLLLLPVMLLIVARPVKLAA SPT2_HUMAN 27 MAQLCGLRRSRAFLALLGSLLLSGVLA NP_077300.1 24 MRLPRRAALGLLPLLLLLPPAPEA NP_057663.1 35 MSGGWMAQVGAWRTGALGLALLLLLGLGLGLEAAA\n\n"; exit(0); } $sigfa = $ARGV[0]; $atypical = 0; $seqnotaligned = 0; open (IN, "<$sigfa") || die "Cannot open file $sigfa : $!"; format OUT = @<<<<<<<<<<<<< @<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< $sig[0], $alignedseq . open (OUT, ">$sigfa.align"); open (ERR, ">$sigfa.err"); ## file to capture errors while () { @seq = split(/\s+/); subdomain(@seq); } print ERR "$atypical atypical sequence(s) ignored.\n"; print ERR "$seqnotaligned sequences discarded because they are not well subdivided.\n"; ##########sub subdomain######################################## sub subdomain { my @sig = @_; my $seqname = $sig[0]; my $totallength = $sig[1]; my $thisseq = $sig[2]; my @sigseq; my $cfound = 0; my $hfound = 0; for (my $i=0; $i<$totallength; ++$i) { $sigseq[$i] = substr($thisseq, $i, 1); } # set the start of c-region 3aa upstream of the cleavage site my $cstart = $totallength -3; # move the pointer toward N-terminus until first occurence of >=2 hydrophobic aa # if two hydro aa not found, or it is too close to Met, put in atypical catagory for ($i=$cstart-1; $i>6; --$i) { if (ishydrophobic($sigseq[$i]) && ishydrophobic($sigseq[$i-1])) { $cstart = $i+1; $cfound = 1; # this is used as a flag for atypical sequences last; } } # set the start of the h-region 6aa upstream of $cstart my $hstart = $cstart - 6; # set $hstart at the 1st occurrence of either a charged aa or >3 consecutive # nonhydrophobic aa for ($i=$hstart-1; $i>0; --$i) { if (ischarged($sigseq[$i]) || (nh($sigseq[$i])&&nh($sigseq[$i-1])&&nh($sigseq[$i-2]))) { $hstart = $i+1; $hfound = 1; # this is used as as flag for atypical sequences last; } } # If the boundary is not found, then set $hstart at 1 (2nd aa) unless ($hfound) { $hstart =1 ; $hfound = 1; } # now, move $hstart pointer backwards until a hydrophobic aa is found for ($i=$hstart; $i<$cstart-5; ++$i) { if (ishydrophobic($sigseq[$i])) { $hstart = $i; last; } } unless ($hfound && $cfound) { $atypical++; print ERR "$seqname appears to be atypical and is ignored.\n"; } # start to select those sequences that passes length selection critiria # and format those in the multiple sequence alignment form # maximum n-region length: 17 # maximum h-region length: 20 # maximum c-region length: 12 my $alignedseq = ""; format OUT = @<<<<<<<<<<<<< @<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< $sig[0], $alignedseq . if(($hstart <=17) && (($cstart-$hstart) <= 20) && (($totallength-$cstart) <= 12)) { for ($i=0; $i<$totallength; ++$i) { if ($i==$hstart) { for (my $j=0;$j<37-$cstart; ++$j) {$alignedseq .= "-"; } } if ($i==$cstart) { for ($j=0; $j<12-$totallength+$cstart; ++$j) {$alignedseq .= "-"; } } $alignedseq .= $sigseq[$i]; } write (OUT); } else { $seqnotaligned++; print ERR "$seqname cannot be subdivided and is discarded.\n"; } } ####### simple aa evaluation methods ##################################### sub ishydrophobic { my $thisaa = $_[0]; if ($thisaa =~ /[AILMFVW]/i) { return 1; } else { return 0; } } sub ischarged { my $thisaa = $_[0]; if ($thisaa =~ /[RKDE]/i) { return 1; } else { return 0; } } sub nh { # subroutine for determine if non-hydrophobic or not my $thisaa = $_[0]; if ($thisaa =~ /[CDEGHKNPQRSTY]/i) { return 1; } else { return 0; } }