#!/usr/ucb/perl -w ################################################################## # FindORF.pl # PERL Script by Sven Rahmann, (c) 2002 # # Usage: # FindORF.pl [options] FASTA-File(s) # # Options: -r Also look in reversed complement # -w Warn on wildcard characters # -l len Minimum ORF length # -s file File with codon scores # -f Only output ORFs that begin at position 1 # -p Only output ORFs with positive score # -b bbb Only output the bbb best ORFs # -c file FASTA file with ORFs to compare to # ################################################################## use strict; use strict 'refs'; use Getopt::Std; ################################################################# my $prg; my $reverse=0; my $warn=0; my $minlength=0; my $positive=0; my $beg=0; my $best=0; my $DEFAULTMINLENGTH=123; my %score = (); my %geneid = (); my $cmpfile = ""; my $totalseq=0; ################################################################# sub info { my $text=shift; print STDERR "$prg: Error: $text\n"; } sub usage { print STDERR <=$minlength && (!$positive || $scr>0) && (!$beg || $start==0)) { if ($ll>3) {$scr/=( ($ll/3)-1 );} else {$scr=0;} $ostart=($start+1)*$eins; $ostop =($stop+1)*$eins; if ($cmpfile) { $orf = substr($s,$start,$ll); $id = $geneid{$orf} || "---"; } #print("$ostart $ll $scr $id\n"); push(@ORFS, [$ostart, $ll, $scr, $id]); } } } return @ORFS; } ##################################################################### ############################################ # sub readscore($filename) # Assigns %score to scores from file $filename or zeros ############################################ sub readscore { my $scorefile=shift; my @ACGT = ('A','C','G','T'); my ($one,$two,$three); my $codon; if (!$scorefile) { foreach $one (@ACGT) { foreach $two (@ACGT) { foreach $three(@ACGT) { $codon=$one.$two.$three; $score{$codon}=(); } } } } else { open(SF,"<$scorefile") || die; while () { if (/(\S+)(\s+)([\d\-\.]+)/) { $score{$1}=$3; } else { info("Wrong format in $scorefile."); die; } } close(SF); } } ##################################################################### ############################################## # sub readcmp($filename) # Read FASTA file of true genes for comparison ############################################### sub readcmp { my $filename=shift; my $header = "NIX"; my $line; my $seq = ""; my $h; open FF,"<$filename" || die; while (! ($header =~ /^>/) ) { $header = ; } chomp($header); while() { chomp; $line=$_; if ($line =~ /^>/) { # Process $seq if ($header =~ /^>(\S+)/) {$h=$1;} else {$h="NIX";} $geneid{$seq}=$h; $header = $line; $seq=""; } else { $seq .= $line; } } if ($header =~ /^>(\S+)/) {$h=$1;} else {$h="NIX";} $geneid{$seq}=$h; } ##################################################################### ############################## # sub process ($header, $seq) ############################## sub process { my $h=shift; my $s=shift; my $id=""; my $sr; my $ww; my $l; my @flist=(); my @blist=(); $s=uc($s); ($ww=$s) =~ s/[ACGT]//g; if(($l=length($ww)) && $warn) { print STDERR "Warning: $l non-ACGT characters found.\n"; } if ($h =~ /^>(\S+)/) {$id=$1;} else {$id="NOP";} @flist=getorflist($s,1,$id); if($reverse) { $sr = reverse($s); $sr =~ tr/ACGT/TGCA/; @blist=getorflist($sr,-1,$id.'-RC'); push(@flist,@blist); } $totalseq++; return @flist; } ################################################################# sub printorflist { my $listref=shift; my $bbb=shift; my @list=@$listref; my $orfref; my @orf; my $i; if (!$bbb) { $bbb=scalar(@list); } for($i=0; $i<$bbb; $i++) { @orf=@{$list[$i]}; printf "%8d %6d %8.5f",$orf[0],$orf[1],$orf[2]; if ($beg || $cmpfile) { print " ",$orf[3]; } print "\n"; } } ################################################################# sub byscore { ($b->[2]) <=> ($a->[2]) }; ################################################################# my %options; $prg = $0; if ($prg =~ /^(.*)\/(.*)\.pl/) { $prg=$2; } elsif ($prg =~ /(.*)\.pl/) { $prg=$1; } getopts("rwfl:s:b:pc:", \%options); while (($ARGV[0]) && ($ARGV[0] =~ /^-/)) { shift; } $warn = $options{w}; $reverse = $options{r}; $positive = $options{p}; $beg = $options{f}; $minlength = $options{l} || $DEFAULTMINLENGTH; my $scorefile = $options{s}; readscore($scorefile); $best = $options{b}; $cmpfile = $options{c}; if ($cmpfile) { readcmp($cmpfile); } # init %geneid my @orflist = (); # ORF list my @slist = (); # sorted ORF list my $orfref; my @orf; # Processing loop my $header = "NIX"; my $line; my $seq = ""; while (! ($header =~ /^>/) ) { $header = <>; } chomp($header); while(<>) { chomp; $line=$_; if ($line =~ /^>/) { # Process $seq push(@orflist,process($header,$seq)); $header = $line; $seq=""; } else { $seq .= $line; } } push(@orflist,process($header, $seq)); @slist = sort byscore @orflist; #printorflist(\@orflist,$best); print "\n"; printorflist(\@slist,$best); # Show statistics if ($warn) { print STDERR "Done. Processed $totalseq input sequences.\n"; } #################################################################