#!/usr/bin/perl use strict; #手工输入信息? my $protein = "MLILISPAKTLDYQSPLTTTRYTLPELLDNSQQLIHEARKLTPPQISTLMRISDKLAGINAARFHDWQPDFTPANARQAILAFKGDVYTGLQAETFSEDDFDFAQQHLRMLSGLYGVLRPLDLMQPYRLEMGIRLENARGKDLYQFWGDIITNKLNEALAAQGDNVVINLASDEYFKSVKPKKLNAEIIKPVFLDEKNGKFKIISFYAKKARGLMSRFIIENRLTKPEQLTGFNSEGYFFDEDSSSNGELVFKRYEQR"; my $threshold = 0.020; my $sampleName = "E.Coli_K12"; my @targetNames = ("A.Aeolicus", "B.Melitensis"); open (SCORE_OUT, ">myscore.out"); my $result = IsConservedProtein($protein, $threshold, $sampleName, @targetNames); close SCORE_OUT; print "found conserved protein = $result \n"; ################################################################################# ## This sub-routine takes in a protein sequence, a threshold value, ## a sample specie name, and an array of target species names. ## ## The threshold value is a number between 0 and 1, such as 0.8. It means the ## comparison score must exceed 80% of the perfect score so that the protein ## is qualified for a conserved protein. ## ## Its output is written to a score.out file in the current directory. ## The output contains nothing if the protein is not a conserved protein; ## the output contains the sequence, the target name, the score, the protein id ## if the protein is a conserved protein. ## ## See the Design Document about how to decide whether a protein is a ## conserved protein or not. ################################################################################# sub IsConservedProtein { my ($protein, $threshold, $sampleName, @targetNames) = @_; my $foundConserved = 1; ## by default, it's true. ## For debug purpose print "protein: $protein\n"; print "threshold: $threshold\n"; print "sampleName: $sampleName\n"; print "targetNames: @targetNames\n"; ## End of debug ## 1, get the perfect score my $perfectScore = -1; my @result = GetScore($protein, $sampleName); ## For debug purpose print "$result[0]\n"; print "$result[1]\n"; print "$result[2]\n"; print "$result[3]\n"; ## End of debug $perfectScore = $result[3]; ## For debug purpose print "perfect score: $perfectScore\n"; ## End of debug my $thresholdScore = $perfectScore * $threshold; ## 2, search each target ## Loop through the targets my $target; ## target specie name my $targetAA; ## target specie faa file name my @results = (); ## a 2D array, contains targetName, protein_id, product, and score. $results[0][0] = $result[0]; $results[0][1] = $result[1]; $results[0][2] = $result[2]; $results[0][3] = $result[3]; ## $results[0] = \@result; use Data::Dumper; print Dumper \@results; ## print "Total target number is $#targetNames\n"; for (my $i=0; $i<=$#targetNames; $i++) ## foreach $target (@targetNames) { $target = $targetNames[$i]; ## For Debug Purpose print "Processing $target...\n"; ## End @result = GetScore($protein, $target); ## For debug purpose print "$result[0]\n"; print "$result[1]\n"; print "$result[2]\n"; print "$result[3]\n"; ## End of debug my $score = $result[3]; if($score >= $thresholdScore) { $results[$i+1][0] = $result[0]; $results[$i+1][1] = $result[1]; $results[$i+1][2] = $result[2]; $results[$i+1][3] = $result[3]; ## $results[$i+1] = \@result; ## push @results, @result; use Data::Dumper; print Dumper \@results; } else { $foundConserved = 0; last; } } if($foundConserved) { WriteToFiles($protein, @results); } return $foundConserved; } ########################################################################### ## Calculates score and other results. ## Input: protein sequence and target specie's name ## Output: an array contains targetName, protein_id, product, and score ########################################################################### sub GetScore { my ($protein, $target) = @_; my $targetAA = $target . ".faa"; if(!-e $targetAA) { die("\n$targetAA file does not exist!! You need to download it from NCBI FTP site and format it before run this program.\n"); } my @testout = `echo $protein | blastall -p blastp -d $targetAA`; ## print @testout; ## print "\n"; my $score = -1; my $protein_id; my $protein_product; for(my $i = 0; $i<= $#testout; $i++) { my $line = $testout[$i]; ## print "line[$i]: $line\n"; my $match = $line =~ m/^Sequences producing significant alignments:/; if($match) { print "match found\n"; # get the score my $score_line = $testout[$i+2]; print "score line is $score_line \n"; my @fields = split( / ( )*/, $score_line); $score = $fields[2]; my @subfields = split(/\|/, $fields[0]); $protein_id = $subfields[1]; $protein_product = $subfields[2]; # break the loop last; } } my @result = ($target, $protein_id, $protein_product, $score); return @result; } ########################################################################### ## Output results into the score.out file. ## Input: a 2D array contains conserved proteins' data and the protein. ## Output: no. ########################################################################### sub WriteToFiles { my ($protein, @results) = @_; ## use Data::Dumper; ## print Dumper \@results; ## global SCORE_OUT output file handle print SCORE_OUT "product=$results[0][2]\n\n"; print SCORE_OUT "protein_id=$results[0][1]\n"; print SCORE_OUT "translation=$protein\n\n"; for (my $i=1; $i<=$#results; $i++) { print SCORE_OUT "target=$results[$i][0]\t\tscore=$results[$i][3]\n"; } print SCORE_OUT "\n\n"; ## close SCORE_OUT; }