#!/usr/bin/perl use strict; ############################################################################ ######## Customize the following entries for your search ################## ############################################################################ ## $threshold is a real number between 0 and 1. The higher the threshold, the ## more similar of the two sequence. my $threshold = 0.55; ## $sampleName is for the benchmark specie. If you change the value, you need ## to make sure the [Name].faa, [Name].fna, and [Name].gbk exist. my $sampleName = "E.Coli_K12"; ## @targetNames contain all target species used for comparison. If you modify ## the value, you need to make sure the [Name].faa, [Name].fna, and [Name].gbk exist. my @targetNames = ("B.Halodurans", "C.Muridarum_60", "C.Perfringens_16", "C.T.TLS", "H.Pylori_26695", "T.Maritima", "Aquifex.A", "Bacillus.H", "Bacillus.S", "Clostridium.A"); ############### End of Customization ###################################### ############################################################################# my $DEBUG = 0; # set to true for debug prints my $conservedProteinCounter = 0; open (SCORE_OUT, ">score_55p.out"); parse_gbk($threshold, $sampleName, @targetNames); close SCORE_OUT; ################################################################################# ## This sub-routine takes in a protein sequence, a threshold value, ## a sample specie name, and an array of target species names. ## ## The threshold value is a number between 0 and 1, such as 0.8. It means the ## comparison score must exceed 80% of the perfect score so that the protein ## is qualified for a conserved protein. ## ## Its output is written to a score.out file in the current directory. ## The output contains nothing if the protein is not a conserved protein; ## the output contains the sequence, the target name, the score, the protein id ## if the protein is a conserved protein. ## ## See the Design Document about how to decide whether a protein is a ## conserved protein or not. ################################################################################# sub IsConservedProtein { my ($protein, $threshold, $sampleName, @targetNames) = @_; my $foundConserved = 1; ## by default, it's true. ## For debug purpose print "protein: $protein\n" if $DEBUG; print "threshold: $threshold\n" if $DEBUG; print "sampleName: $sampleName\n" if $DEBUG; print "targetNames: @targetNames\n" if $DEBUG; ## End of debug ## First, get the perfect score my $perfectScore = -1; my @result = GetScore($protein, $sampleName); ## For debug purpose print "$result[0]\n" if $DEBUG; print "$result[1]\n" if $DEBUG; print "$result[2]\n" if $DEBUG; print "$result[3]\n" if $DEBUG; ## End of debug $perfectScore = $result[3]; ## For debug purpose print "perfect score: $perfectScore\n" if $DEBUG; ## End of debug my $thresholdScore = $perfectScore * $threshold; ## Second, search each target ## Loop through the targets my $target; ## target specie name my $targetAA; ## target specie faa file name my @results = (); ## a 2D array, contains targetName, protein_id, product, and score. $results[0][0] = $result[0]; $results[0][1] = $result[1]; $results[0][2] = $result[2]; $results[0][3] = $result[3]; ## $results[0] = \@result; ## use Data::Dumper; ## print Dumper \@results; ## print "Total target number is $#targetNames\n"; for (my $i=0; $i<=$#targetNames; $i++) ## foreach $target (@targetNames) { $target = $targetNames[$i]; ## For Debug Purpose print "Processing $target...\n" if $DEBUG; ## End @result = GetScore($protein, $target); ## For debug purpose print "$result[0]\n" if $DEBUG; print "$result[1]\n" if $DEBUG; print "$result[2]\n" if $DEBUG; print "$result[3]\n" if $DEBUG; ## End of debug my $score = $result[3]; if($score >= $thresholdScore) { $results[$i+1][0] = $result[0]; $results[$i+1][1] = $result[1]; $results[$i+1][2] = $result[2]; $results[$i+1][3] = $result[3]; ## $results[$i+1] = \@result; ## push @results, @result; ## use Data::Dumper; ## print Dumper \@results; } else { $foundConserved = 0; last; } } if($foundConserved) { WriteToFiles($perfectScore, $protein, @results); } return $foundConserved; } ########################################################################### ## Calculates score and other results. ## Input: protein sequence and target specie's name ## Output: an array contains targetName, protein_id, product, and score ########################################################################### sub GetScore { my ($protein, $target) = @_; my $targetAA = $target . ".faa"; print "target AA is $targetAA.\n" if $DEBUG; if(!-e $targetAA) { die("\n$targetAA file does not exist!! You need to download it from NCBI FTP site and format it before run this program.\n"); } my @testout = `echo $protein | blastall -p blastp -d $targetAA`; ## print @testout if $DEBUG; ## print "\n"; my $score = -1; my $protein_id; my $protein_product; @testout = split /\n/, $testout[0]; for(my $i = 0; $i<= $#testout; $i++) { my $line = $testout[$i]; ## print "line[$i]: $line\n"; my $match = $line =~ m/^Sequences producing significant alignments:/; if($match) { print "match found\n" if $DEBUG; # get the score my $score_line = $testout[$i+2]; print "score line is $score_line \n" if $DEBUG; my @fields = split( / ( )*/, $score_line); $score = $fields[2]; my @subfields = split(/\|/, $fields[0]); $protein_id = $subfields[1]; $protein_product = $subfields[2]; # break the loop last; } } my @result = ($target, $protein_id, $protein_product, $score); return @result; } ########################################################################### ## Output results into the score.out file. ## Input: a 2D array contains conserved proteins' data and the protein. ## Output: no. ########################################################################### sub WriteToFiles { my ($perfectScore, $protein, @results) = @_; ## use Data::Dumper; ## print Dumper \@results; ## global SCORE_OUT output file handle $conservedProteinCounter++; print SCORE_OUT "$conservedProteinCounter\n\n"; print SCORE_OUT "product=$results[0][2]\n\n"; print SCORE_OUT "protein_id=$results[0][1]\n\n"; print SCORE_OUT "translation=$protein\n\n"; print SCORE_OUT "perfect_score=$perfectScore\n\n"; for (my $i=1; $i<=$#results; $i++) { print SCORE_OUT "target=$results[$i][0]\t\tprotein_id=$results[$i][1]\t\tscore=$results[$i][3]\n"; } print SCORE_OUT "\n\n"; ## close SCORE_OUT; } # This subroutine depends on the existence of a processed gbk file for the specified species # name., e.g., "U00096.gbk.processed". This processed file can be geenrated using the # program "preprocess_gbk.pl" # # This subroutine handles 2 cases # 1) if only the first argument is supplied, it parses the gbk file for that species # and calls the sub run_blast($protein_id, $start_pos, $end_pos, $seq) for every protein # encountered in the gbk file. # 2) If 4 args are supplied, it searches for the $search_protein_id in the gbk file, # collects the position info for that protein, # and calls sub parse_fna($species_name, $start_pos, $end_pos, $x, $y) ########################################################################################### sub parse_gbk { my ($threshold, $species_name, @targetNames) = @_; my $gbk_file = join '.', $species_name, 'gbk', "processed"; my $sep2 = '='x80; my $sep = '*'x80; $/ = $sep; open (GBK, $gbk_file) or die "Can't open $gbk_file: $!\n"; my $skip_count = 0; while () { my ($start_pos, $end_pos, $protein_id, $seq) = (); unless (/\/translation\=/) { warn "+++ No translation label...skipping...\n$sep2\n$_\n$sep2\n"; $skip_count++; next; } /\s+CDS\s+\D*\({0,1}(\d+)\.\.(\d+)[\S\s]*\/protein_id\=\"(\S+?)\"[\S\s]*\/translation\=\"([\S\s]+?)\"/ ## /\s+CDS\s+\D*\({0,1}(\d+)\.\.(\d+)[\S\s]*\/protein_id\=\"(\S+)\"[\S\s]*\/translation\=\"([\S\s]+)\"/ && do { $start_pos = $1; $end_pos = $2; $protein_id = $3; $seq = $4; }; unless ($seq && $protein_id && $start_pos && $end_pos) { # should never really get here warn "+++ No translation label...skipping...\n$sep2\n$_$sep2\n"; next; } $seq =~ s/\s+//g; print "\n$sep2\ncalling IsConservedProtein($seq,$threshold, $sampleName, @targetNames)\n" if $DEBUG; ## run_blast($protein_id, $start_pos, $end_pos, $seq); IsConservedProtein($seq, $threshold, $sampleName, @targetNames); } close GBK; warn "Total skips: $skip_count\n"; }