#!/usr/local/bin/perl -w use strict; my $DEBUG = 1; # set to true for debug prints #x上游y下游，手动输入。 my $x = 50; my $y = 40; my $sep = '*'x80; my $sep2 = '='x80; #寻找核苷酸序列的依据文件。 my $score_file = "score_55p.out"; open (SCORE, $score_file) or die "Can't open $score_file: $!\n"; undef $/; my @proteins = split(/\n\d+\n/, ); #print "Count: " , scalar(@proteins), "\n"; #print "@proteins\n"; my %output_hash = (); my %labels = (); my %min_upstream = (); my %protein_id_count = (); my $dir = '.\all'; opendir(DIR, $dir) or die "Can't open $dir: $!\n"; while (defined(my $file = readdir(DIR))) { next unless $file =~ /\.gbk\.processed$/; my $processed_gbk_file = join '\\', $dir, $file; print "$processed_gbk_file\n"; find_min_upstream($processed_gbk_file); } #foreach (sort keys %min_upstream) { print "$_: $min_upstream{$_}\n"; } #foreach (sort keys %protein_id_count) { print "$_: $protein_id_count{$_}\n" if $protein_id_count{$_} > 1; } my $ecoli_protein_count = 0; foreach my $protein (@proteins) { $ecoli_protein_count++; my @data = split /\n\n/, $protein; #print "XXX: @data\n"; my ($ecoli_protein_id, $search_protein_id, $translation, @targets) = (); foreach (@data) { #print "XXX:\n$_\n"; s/^\s//g; s/\s$//g; next unless $_; next if /^product/; next if /^perfect/; #print "$_\n\n"; /protein_id\=(\S+)/ && do { $ecoli_protein_id = $1; }; /translation\=(\S+)/ && do { $translation = $1; }; /target\=(\S+)\s+/ && do { push @targets, split /\n/, $_ }; } foreach my $target (@targets) { #print "$target\n"; $target =~ /target\=(\S+?)\s+protein_id\=(\S+?)\s+/; my $species_name = $1; my $sp_name = $1; my $search_protein_id = $2; #next unless $species_name =~ /muridarum/i; $species_name = "data\\$species_name"; print "Calling... locate_x_y($species_name, $x, $y, $search_protein_id)\n"; my @results = locate_x_y($species_name, $x, $y, $search_protein_id); #print "\nResults of locate_x_y: ", scalar(@results), "\n"; #foreach (@results) { print "*** $_\n"; } populate_output_hash(@results, $ecoli_protein_count, $sp_name) if @results == 3; print "$sep\n"; } } print_output_files(); ########################################################################################### # This subroutine populates the output hash ########################################################################################### sub populate_output_hash { my $this_func = (caller(0))[3]; die "$this_func: supplied wrong number of args: ", scalar(@_), "\n" unless (@_ == 5); my $gene = shift; my $upstream_x = shift; my $downstream_y = shift; my $ecoli_protein_count = shift; my $species_name = shift; my $label = "#".$species_name; $label = substr($label, 0, 10); $labels{$label}++; # the gene itself $output_hash{GENE}{$label} .= "$gene\n"; # need newline for this file # upstream_x $output_hash{UP}{$label} .= "$upstream_x"; # downstream_y $output_hash{DOWN}{$label} .= "$downstream_y"; # upstream_x + downstream_y $output_hash{UP_DOWN}{$label} .= "$upstream_x$downstream_y"; # upstream_x + gene + downstream_y $output_hash{UP_GENE_DOWN}{$label} .= "$upstream_x$gene$downstream_y\n"; # need newline for this file } ########################################################################################### # This subroutine prints the 5 output files ########################################################################################### sub print_output_files { my $mode = ">"; # truncate and open my $file_0 = "${mode}output\\gene.txt"; my $file_1 = "${mode}output\\up.txt"; my $file_2 = "${mode}output\\down.txt"; my $file_3 = "${mode}output\\up_down.txt"; my $file_4 = "${mode}output\\up_gene_down.txt"; # the gene itself print "printing $file_0...\n"; open (FH, $file_0) or die "Can't open $file_0: $!\n"; foreach my $label (sort keys %labels) { print FH "\n$label\n"; print FH "$output_hash{GENE}{$label}\n"; } close FH; # upstream_x print "printing $file_1...\n"; open (FH, $file_1) or die "Can't open $file_1: $!\n"; foreach my $label (sort keys %labels) { print FH "\n$label\n"; print FH "$output_hash{UP}{$label}\n"; } close FH; # downstream_y print "printing $file_2...\n"; open (FH, $file_2) or die "Can't open $file_2: $!\n"; foreach my $label (sort keys %labels) { print FH "\n$label\n"; print FH "$output_hash{DOWN}{$label}\n"; } close FH; # upstream_x + downstream_y print "printing $file_3...\n"; open (FH, $file_3) or die "Can't open $file_3: $!\n"; foreach my $label (sort keys %labels) { print FH "\n$label\n"; print FH "$output_hash{UP_DOWN}{$label}\n"; } close FH; # upstream_x + gene + downstream_y print "printing $file_4...\n"; open (FH, $file_4) or die "Can't open $file_4: $!\n"; foreach my $label (sort keys %labels) { print FH "\n$label\n"; print FH "$output_hash{UP_GENE_DOWN}{$label}\n"; } close FH; } ########################################################################################### # This subroutine prints out 5 output files per e.coli protein in the score file ########################################################################################### sub old_print_output_files { my $this_func = (caller(0))[3]; die "$this_func: supplied wrong number of args: ", scalar(@_), "\n" unless (@_ == 5); my $gene = shift; my $upstream_x = shift; my $downstream_y = shift; my $ecoli_protein_count = shift; my $target = shift; #$ecoli_protein_count = 3; my $mode = ">"; # truncate and open if ($ecoli_protein_count > 1) { $mode = ">>"; #open for appending; } my $file_0 = "${mode}output\\0_gene_${target}.txt"; my $file_1 = "${mode}output\\1_up_${target}.txt"; my $file_2 = "${mode}output\\2_down_${target}.txt"; my $file_3 = "${mode}output\\3_up_down_${target}.txt"; my $file_4 = "${mode}output\\4_up_gene_down_${target}.txt"; #print "XXXX: $file_0\n"; #print "XXXX: $file_1\n"; #print "XXXX: $file_2\n"; #print "XXXX: $file_3\n"; #print "XXXX: $file_4\n"; # the gene itself print "printing $file_0...\n"; open (FH, $file_0) or die "Can't open $file_0: $!\n"; print FH $gene; close FH; # upstream_x print "printing $file_1...\n"; open (FH, $file_1) or die "Can't open $file_1: $!\n"; print FH $upstream_x; close FH; # downstream_y print "printing $file_2...\n"; open (FH, $file_2) or die "Can't open $file_2: $!\n"; print FH $downstream_y; close FH; # upstream_x + downstream_y print "printing $file_3...\n"; open (FH, $file_3) or die "Can't open $file_3: $!\n"; print FH $upstream_x; print FH $downstream_y; close FH; # upstream_x + gene + downstream_y print "printing $file_4...\n"; open (FH, $file_4) or die "Can't open $file_4: $!\n"; print FH $upstream_x; print FH $gene; print FH $downstream_y; close FH; } ########################################################################################### # This subroutine depends on the existence of a processed gbk file for the specified species # name., e.g., "U00096.gbk.processed". This processed file can be generated using the # program "preprocess_gbk.pl" # # This subroutine loops thru the gbk file and populates the min_upstream hash for # each gene of each species ########################################################################################### sub find_min_upstream { my $gbk_file = shift; $gbk_file =~ /(.*)\.gbk/; my $species_name = $1; my $prev_end_pos = 0; $/ = $sep; open (GBK, $gbk_file) or die "Can't open $gbk_file: $!\n"; while () { my ($start_pos, $end_pos, $protein_id) = (); my $upstream_length = 0; unless (/\/translation\=/) { #warn "+++ No translation label...skipping...\n$sep2\n$_\n$sep2\n"; #next; } /\s+CDS\s+\D*\({0,1}(\d+)\.\.(\d+)[\S\s]*\/protein_id\=\"(\S+?)\"/ && do { $start_pos = $1; $end_pos = $2; $protein_id = $3; $upstream_length = $start_pos - $prev_end_pos; $upstream_length = $x if $upstream_length < 0; $protein_id_count{$protein_id}++; $min_upstream{$protein_id} = 10e10 unless $min_upstream{$protein_id}; if ($upstream_length < $min_upstream{$protein_id}) { $min_upstream{$protein_id} = $upstream_length; } $prev_end_pos = $end_pos; }; } close GBK; } ########################################################################################### # This subroutine depends on the existence of a processed gbk file for the specified species # name., e.g., "U00096.gbk.processed". This processed file can be generated using the # program "preprocess_gbk.pl" # # This subroutine searches for the $search_protein_id in the gbk file, # collects the position info for that protein, and calls # sub parse_fna($species_name, $start_pos, $end_pos, $x, $y) ########################################################################################### sub locate_x_y { my $species_name = shift; my $x = shift || -1; my $y = shift || -1; my $search_protein_id = shift || ""; my @results; my $gbk_file = join '.', $species_name, 'gbk', "processed"; my $skip_count = 0; $/ = $sep; open (GBK, $gbk_file) or die "Can't open $gbk_file: $!\n"; LOOP: while () { my ($start_pos, $end_pos, $protein_id, $seq) = (); unless (/\/translation\=/) { #warn "+++ No translation label...skipping...\n$sep2\n$_\n$sep2\n"; $skip_count++; next; } /\s+CDS\s+\D*\({0,1}(\d+)\.\.(\d+)[\S\s]*\/protein_id\=\"(\S+?)\"[\S\s]*\/translation\=\"([\S\s]+?)\"/ && do { $start_pos = $1; $end_pos = $2; $protein_id = $3; $seq = $4; next LOOP unless ($protein_id eq $search_protein_id); }; if ($seq && $protein_id && $start_pos && $end_pos) { if ($x > $min_upstream{$protein_id}) { warn "resetting $x -> $min_upstream{$protein_id}\n"; $x = $min_upstream{$protein_id}; } $seq =~ s/\s+//g; print "\n$sep2\nprotein: $protein_id: calling parse_fna($species_name, $start_pos, $end_pos, $x, $y)\n" if $DEBUG; @results = parse_fna($species_name, $start_pos, $end_pos, $x, $y); last; } } close GBK; warn "Total skips for $gbk_file: $skip_count\n"; return @results; } ########################################################################################### # This subroutine depends on the existence of a processed fna file for the specified species # name., e.g., "U00096.fna.processed". This processed file can be geenrated using the # program "preprocess_fna.pl" # # This subroutine extracts and returns the upstream_x, downstream_y, and the string itself ########################################################################################### sub parse_fna { my $species_name = shift; my $start_pos = shift; my $end_pos = shift; my $x = shift; my $y = shift; my ($string, $upstream_x, $downstream_y, $bytes_read, $offset, $length) = (); my $fna_file = join '.', $species_name, 'fna', "processed"; open (FNA, $fna_file) or die "Can't open $fna_file: $!\n"; # get X bytes upstream $offset = $start_pos - $x; $length = $x; seek FNA, $offset, 0; $bytes_read = read(FNA, $upstream_x, $length); # get Y bytes upstream $offset = $end_pos; $length = $y; seek FNA, $offset, 0; $bytes_read = read(FNA, $downstream_y, $length); # get the string itself $offset = $start_pos; $length = $end_pos - $start_pos; seek FNA, $offset, 0; $bytes_read = read(FNA, $string, $length); #print "parse_fna() returning...\nstring:\t\t$string\nupstream_x:\t$upstream_x\ndownstream_y:\t$downstream_y\n"; return ($string, $upstream_x, $downstream_y); }