# $Author: mm238 $ $Revision: 1.1 $ $Date: 2003/09/05 10:23:02 $ # # Copyright (C) 2002 MRC and Martin Madera # # This program may be freely redistributed and/or modified # under the terms of the GNU General Public License as published # by the Free Software Foundation; either version 2 of the # License, or (at your option) any later version. # # -------------------------------------------------------------- # # READING FILES # ============= # # $model = HiddenMarkovModel->readSAM("file.mod"); # $model = HiddenMarkovModel->readHMMER("file.hmm"); # $model = HiddenMarkovModel->readPSIBLAST("file.psiblast"); # # or # # $model = HiddenMarkovModel->clever_read("file.ext"); # # which decides by itself which routine to call. # # # WRITING FILES # ============= # # $model->writeSAM("file.mod"); # $model->writeHMMER("file.hmm"); # $model->writePSIBLAST("file.psiblast"); # # or # # $model->clever_write("file.ext"); # # which will work it out by itself. # # # NB writePSIBLAST needs to know the seed sequence; you set this by # doing # # $model->{seed} = "AASFASDFASDFASDF"; # # or via # # $model->read_seedseq("file.fasta"); # # before calling writePSIBLAST. If you don't do that, writePSIBLAST # will figure out the highest-scoring sequence and will use that # instead, and will save the sequence to file.psiblast.fa. Keep the # file, you will need it when using the profile. # # WARNING: the PSI-BLAST binary format only works on x86 and alphas # and architectures that store C floats in the same way # # package HiddenMarkovModel; use vars qw($DD $MD $ID $DM $MM $IM $DI $MI $II); use strict; ## ## order of transition probabilities in the @{trans} array ## ($DD, $MD, $ID, $DM, $MM, $IM, $DI, $MI, $II) = (0..8); ## ## allocation etc. ## sub new { my ($pkg) = @_; my $obj = &clear({}); bless $obj, $pkg; }; sub clear { my ($obj) = @_; $obj->{trans} = []; $obj->{matches} = []; $obj->{inserts} = []; $obj->{fims} = []; $obj->{calibration} = ""; $obj->{M} = 0; return $obj; }; ## ## *** SAM *** ## sub readBlock { local *HANDLE = shift; # file handle to read from my $filename = shift; # name of the corresponding file my $cols = shift; # number of columns to read my $rows = shift; # number of rows to read my $array = []; # array to return my ($i, @a); for $i (1 .. $rows) { @a = split /\s+/, ; die "Parsing error on line $. of $filename!\n..." unless @a == $cols; push @{$array}, @a; }; return $array; }; sub writeBlock { local *HANDLE = shift; # file handle to write to my $array = shift; # the array to write my $cols = shift; # ... number of columns my $rows = shift; # ... number of rows my ($i, $j); for $i (0 .. $rows-1) { for $j (0 .. $cols-1) { printf HANDLE "%8.6f ", $array->[$i*$cols+$j]; }; print HANDLE "\n"; }; }; sub pad { my $len = shift; # pad $text so that the final length is $len my $text = shift; # the text to pad return " "x($len-length $text) . $text; }; sub readSAM { # two ways of using this method: # # (a) $obj->readSAM("filename"); # # (b) $obj = PackageXYZ->readSAM("filename"); # my ($obj, $filename) = @_; my $m = -2; # current position in the model my $last = 0; # is this the last position? my $fim=""; # the fim line my ($array)=[]; my (@a, $i); if( ref($obj) ) { # $obj is a reference => (a) above $obj->clear(); } else { # $obj not a reference => (b) above # (assume $obj is a package name) $obj = new $obj; }; open( SAM, "$filename" ) or die "Cannot open $filename: $!"; while() { $m=-1 if /^Begin/; if( /^TYPE/ ) { $fim = $_; next; }; next unless $m >=-1; $m++; if( $m >0 ) { if ( /^End/ ) { $last=1; } elsif( /^\s*(\S+)/ ) { die "Parsing error on line $. of $filename!\n..." unless $1 == $m; } else { die "Parsing error on line $. of $filename!\n..."; }; }; push @{$obj->{fims}}, $fim; $fim = ""; push @{$obj->{trans}}, &readBlock( *SAM, $filename, 3, 3 ); push @{$obj->{matches}}, &readBlock( *SAM, $filename, 5, 4 ); push @{$obj->{inserts}}, &readBlock( *SAM, $filename, 5, 4 ); last if $last; }; close SAM; $obj->{M} = $m-1; return $obj; }; sub writeSAM { my ($obj, $filename) = @_; my $m; # current position in the model open( SAM, ">$filename" ) or die "Cannot open $filename: $!\n..."; print SAM "MODEL\nalphabet protein\n"; for $m (0 .. $obj->{M}+1) { print SAM ${$obj->{fims}}[$m] unless ${$obj->{fims}}[$m] eq ""; if( $m == 0 ) { print SAM "Begin\n"; } elsif ( $m == $obj->{M}+1 ) { print SAM "End\n"; } else { print SAM pad(4,$m)."\n"; }; &writeBlock(*SAM, $obj->{trans}[$m], 3, 3); &writeBlock(*SAM, $obj->{matches}[$m], 5, 4); &writeBlock(*SAM, $obj->{inserts}[$m], 5, 4); }; print SAM "\nENDMODEL\n"; close SAM; }; ## ## *** HMMER *** ## my $header1 = "ALPH Amino\n". "DATE Today\n". "COM Converted from Sam model\n". "XT -8455 -4 -1000 -1000 -8455 -4 -8455 -4 \n". "NULT -4 -8455\n". "NULE 595 -1558 85 338 -294 453 -1158 197 249 902 -1085 -142 -21 -313 45 531 201 384 -1998 -644 \n"; my $header2 = "HMM A C D E F G H I K L M N P Q R S T V W Y \n". " m->m m->i m->d i->m i->i d->m d->d b->m m->e\n"; my @NULE = ( 595, -1558, 85, 338, -294, 453, -1158, 197, 249, 902, -1085, -142, -21, -313, 45, 531, 201, 384, -1998, -644 ); sub score { my $prob = shift; # convert this probability to a score my $nullProb = shift; # given this null probability return $prob != 0 ? int( 0.5+1000*log($prob/$nullProb)/log(2.0) ) : "*"; }; sub prob { my $score = shift; # convert this score to a probability my $nullProb = shift; # given this null probability return $score=~ /\*/ ? 0 : $nullProb*exp(log(2)*$score/1000); }; sub readLine { local *HANDLE = shift; # file handle to read from my $filename = shift; # name of the file my @numbers = @_; # possible numbers of data items expected # on the line my @array; $_ = ; @array = split /\s+/; shift @array; die "Parsing error on line $. of $filename\n..." unless grep( scalar(@array) == $_, @numbers ); return @array; }; sub writeLine { local *HANDLE = shift; # file handle to print to print HANDLE &pad(6, shift); while( @_ ) { print HANDLE &pad(7, shift); }; print HANDLE "\n"; }; sub readHMMER { # two ways of using this method: # # (a) $obj->readHMMER("filename"); # # (b) $obj = PackageXYZ->readHMMER("filename"); # my ($obj, $filename, $global) = @_; my $m; # current position in the model # transition probabilities: $xy = P(X->Y) my ($dd, $dm, $md, $mm, $mi, $im, $ii, $bm, $me, $bd ); my ($i, $total, @a); if( ref($obj) ) { # $obj is a reference => (a) above $obj->clear(); } else { # $obj not a reference => (b) above # (assume $obj is a package name) $obj = new $obj; }; open( HMMER, "$filename" ) or die "Cannot open $filename: $!\n..."; ## ## header ## while() { if( /^LENG\s\s(\d+)/ ) { $obj->{M}=$1; # "allocate" @trans for $i (0 .. $obj->{M}+1) { push @{$obj->{trans}}, [0,0,0, 0,0,0, 0,0,0]; }; # the "Begin" position values push @{$obj->{matches}}, [ 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0 ]; push @{$obj->{inserts}}, [ 0.077850, 0.016640, 0.057325, 0.065451, 0.039844, 0.062177, 0.024314, 0.060479, 0.064418, 0.082109, 0.024547, 0.047302, 0.040276, 0.041960, 0.048771, 0.067948, 0.059895, 0.072042, 0.011740, 0.034912 ]; } elsif( /^HMM\s/ ) { ; last; } elsif( /^EVD/ ) { $obj->{calibration} = $_; }; }; ## ## "3-line" ## @a = readLine( *HMMER, $filename, 3 ); $bd = prob( $a[2], 1 ); # Similar FS and LS configs ## ## data ## for $m (1 .. $obj->{M}) { ## ## match emissions ## @a = readLine( *HMMER, $filename, 21, 22 ); pop @a if scalar(@a)==22; # what's this last number ??! shift @a; for $i (0 .. 19) { $a[$i] = prob( $a[$i], prob($NULE[$i],1/20) ); }; push @{$obj->{matches}}, [@a]; ## ## insert emissions ## @a = readLine( *HMMER, $filename, 21 ); shift @a; for $i (0 .. 19) { $a[$i] = prob( $a[$i], prob($NULE[$i],1/20) ); }; push @{$obj->{inserts}}, [@a]; ## ## state transitions ## @a = readLine( *HMMER, $filename, 10 ); shift @a; for $i (0 .. 8) { $a[$i] = prob( $a[$i], 1 ); }; ($mm, $mi, $md, $im, $ii, $dm, $dd, $bm, $me) = @a; if( $m == $obj->{M} ) # Very end { $dd=0; $md=0; $dm=1; $mm=1; $im=1; $mi=0; $ii=0; } elsif ( $m == 1 ) # First node { # Here Madera & Gough's script is failing, or at least not correct, I think.. # HMMer LS config allow b->m1 and b->d1 transitions which is similar to Sam arch. # b->d1 is stored in "3-line" only, while b->m1 is stored in this line. # HMMer FS config allows b->mX and b->d1 transitions. This is done by # distributing half of the b->m1 probs. to the other b->mX transitions # # Sam actually corresponds better to the LS version as this is the one closest to # the underlying alignment. # What is happening to the mX->e transition!? Madera & Gough just leave them here although they have # to deal with them in Sam -> HMMer conversion if ($global==0) {# Here we're giving back the probs HMMer gives to b->mX to get LS config style. $bm = 2*$bm;} # This is not done by Madera & Gough. $total = $bm + $bd; $bm /= $total; $bd /= $total; $obj->{trans}[1][$MM] = $bm; $obj->{trans}[1][$MD] = $bd; $obj->{trans}[1][$IM] = 1; # No HMMer counterpart }; # renormalisation # We are including (giving back) the m->e probs (Madera & Gough aren't) # However, special case at the very last node due to HMMer having P(m->e)=1 here if ($global != 0 && $m != $obj->{M}) { $mm += $me;} $total = $mi + $md + $mm; if( $total ) { $mi /= $total; $md /= $total; $mm /= $total; }; $total = $dd + $dm; if( $total ) { $dd /= $total; $dm /= $total; }; $total = $ii + $im; if( $total ) { $ii /= $total; $im /= $total; }; # save the probabilities $obj->{trans}[$m+1][$MM] = $mm; $obj->{trans}[$m ][$MI] = $mi; $obj->{trans}[$m+1][$MD] = $md; $obj->{trans}[$m+1][$IM] = $im; $obj->{trans}[$m ][$II] = $ii; $obj->{trans}[$m+1][$DM] = $dm; $obj->{trans}[$m+1][$DD] = $dd; }; close HMMER; ## ## the "End" + last position values ## push @{$obj->{matches}}, [ 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0 ]; pop @{$obj->{inserts}}; push @{$obj->{inserts}}, [ 0.077850, 0.016640, 0.057325, 0.065451, 0.039844, 0.062177, 0.024314, 0.060479, 0.064418, 0.082109, 0.024547, 0.047302, 0.040276, 0.041960, 0.048771, 0.067948, 0.059895, 0.072042, 0.011740, 0.034912 ]; push @{$obj->{inserts}}, [ 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0 ]; return $obj; }; sub writeHMMER { my ($obj, $filename, $global) = @_; # transition probabilities: $xy = P(X->Y) my ($dd, $dm, $md, $mm, $mi, $im, $ii, $id, $bm, $me, $bd); my ($denom, $total, $i, $j, @a); open( HMMER, ">$filename" ) or die "Cannot open $filename: $!\n..."; ## ## header ## print HMMER "HMMER2.0\n"; print HMMER "NAME $filename\n"; print HMMER "LENG $obj->{M}\n"; print HMMER $header1; print HMMER $obj->{calibration}; print HMMER $header2; ## ## B->D1 line ## # Convert to Fs or Ls HMMer mode? # # The Sam mode is LS-ish (as I see it) $bd = $obj->{trans}[1][$MD]; $denom = $bd + $obj->{trans}[1][$MM]; writeLine( *HMMER, "", score(1-$bd/$denom,1), "*", score($bd/$denom,1) ); for $i (1 ... $obj->{M}) { ## ## match emissions ## @a=(); for $j (0 .. 19) { push @a, score( $obj->{matches}[$i][$j], prob($NULE[$j],1/20) ); }; writeLine( *HMMER, $i, @a, $i ); ## ## insert emissions ## @a=(); for $j (0 .. 19) { push @a, $i==$obj->{M} ? "*" : score( $obj->{inserts}[$i][$j], prob($NULE[$j],1/20) ); }; writeLine( *HMMER, "-", @a ); ## ## state transitions ## if( $i == $obj->{M} ) { $mm=$mi=$md=$im=$ii=$dm=$dd=0; $me=1; } else { $mm = $obj->{trans}[$i+1][$MM]; $mi = $obj->{trans}[ $i ][$MI]; $md = $obj->{trans}[$i+1][$MD]; $im = $obj->{trans}[$i+1][$IM]; $ii = $obj->{trans}[ $i ][$II]; $dm = $obj->{trans}[$i+1][$DM]; $dd = $obj->{trans}[$i+1][$DD]; $me = ($global==1) ? 0 : 1/(2*($obj->{M}-1)-$i); # Exit state # renormalization $total = $dd + $dm; $dd /= $total; $dm /= $total; $total = $ii + $im; $id /= $total; $im /= $total; $total = $me + $mi + $md + $mm; $me = ($global == 1) ? "*" : $me / $total; $mi /= $total; $md /= $total; $mm /= $total; }; # this seems to be the b->m transition into this match state... if ($global == 1) { $bm = ($i==1) ? 1-$bd/$denom : "*"; } else { # Converting to HMMer FS-config. As Sam is LS-ish it is having a b->m1 prob. 2 times # larger than we want to see in FS-config. We compensate by division 1/2. $bm = ($i==1) ? ($obj->{trans}[1][$MM])/2 : ($obj->{trans}[1][$MM]/($obj->{M}-1))/2; $bm = $bm / $denom; } writeLine( *HMMER, "-", score($mm, 1), score($mi, 1), score($md, 1), score($im, 1), score($ii, 1), score($dm, 1), score($dd, 1), score($bm, 1), score($me, 1) ); }; print HMMER "//\n"; close HMMER; warn "NB HMMER models need to be calibrated for best performance!\n"; }; ## ## *** PSI-BLAST *** ## # the order in which PSI-BLAST saves its 20 emission probabilities # apears to be ARNDCQEGHILKMFPSTWYV my %order = ( 0 => 0, 4 => 1, 3 => 2, 6 => 3, 13 => 4, 7 => 5, 8 => 6, 9 => 7, 11 => 8, 10 => 9, 12 => 10, 2 => 11, 14 => 12, 5 => 13, 1 => 14, 15 => 15, 16 => 16, 19 => 17, 17 => 18, 18 => 19 ); sub readPSIBLAST { # two ways of using this method: # # (a) $obj->readPSIBLAST("filename"); # # (b) $obj = PackageXYZ->readPSIBLAST("filename"); # my ($obj, $filename) = @_; if( ref($obj) ) { # $obj is a reference => (a) above $obj->clear(); } else { # $obj not a reference => (b) above # (assume $obj is a package name) $obj = new $obj; }; die "File $filename doesn't exist!\n... " unless -e $filename; die "Can't read $filename!\n... " unless -r $filename; my ($binM, $M, $seed, $m, $i, $binP, @inserts, $sum); open PSI, $filename or die "Error reading parsed $filename: $!\n... "; binmode PSI; # number of nodes, int read PSI, $binM, 4; $M = unpack "I", $binM; $obj->{M} = $M; # the seed sequence read PSI, $seed, $M; $obj->{seed} = $seed; # now the probabilities for $m (1 .. $obj->{M}) { for $i (0 .. 19) { read PSI, $binP, 8; $obj->{matches}[$m][$order{$i}] = unpack "d", $binP; }; }; close PSI; # calculate inserts: arithmetic average of match state emmissions for $i (0..19) { for $m (1..$M) { $inserts[$i] += $obj->{matches}[$m][$i]; }; $inserts[$i] /= $M; $sum += $inserts[$i]; }; for $i (0..19) { $inserts[$i] /= $sum; }; # now sort out the rest of the model for $m (1..$M) { $obj->{ trans }[$m] = [ # DD MD ID DM MM IM 0.850, 0.020, 0.000 , 0.150, 0.965, 0.350, # DI MI II 0.000, 0.015, 0.650 ]; $obj->{inserts}[$m] = [@inserts]; }; return $obj; }; sub writePSIBLAST { my ($obj, $filename) = @_; my ($seed, $m, $i, $max, $aa); # sort out what to do about the seed sequence if( defined $obj->{seed} ) { # splendid, we have the seed # (get this e.g. by running read_seedseq above) # # but need to check it's OK ! # die "Seed sequence and model length do not match!\n... " unless length($seed) == $obj->{M}; } else { # uh-oh, no seed sequence # better create one warn "No seed sequence given for $filename!\n"; warn "Using most likely sequence, dumping it into $filename.fa ...\n"; $obj->{seed} = $obj->most_likely_seq; open F, ">$filename.fa" or die "Cannot open $filename.fa for writing: $!\n... "; print F ">$filename seed sequence\n"; print F "$obj->{seed}\n"; close F; }; # write the PSI-BLAST profile open PSI, ">$filename" or die "Cannot open $filename: $!\n..."; binmode PSI; # header syswrite PSI, pack "I", $obj->{M}, 4; syswrite PSI, $obj->{seed}, $obj->{M}; # print the match emission probabilities for $m (1 .. $obj->{M}) { for $i (0 .. 19) { syswrite PSI, pack "d", $obj->{matches}[$m][$order{$i}], 8; }; }; close PSI; }; # rudimentary hack to read a PSI-BLAST seed sequence from a FASTA file sub read_seedseq { my ($obj, $filename) = @_; my ($seed); open SEED, "$filename" or die "Cannot open $filename: $!\n... "; while() { if( /^>/ ) { defined($seed) ? last : next; }; next unless /^(\S+)/; $seed .= $1; }; close SEED; $obj->{seed} = $seed; }; # generate the most likely sequence from a model # WARNING: ignores transition probabilities, only looks at match emmissions my @num2let = ( 'A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y' ); sub most_likely_seq { my ($obj) = @_; my ($m, $i, $best_i, $max, $seq); for $m (1 .. $obj->{M}) { $max = 0; for $i (0 .. 19) { next unless $obj->{matches}[$m][$i] > $max; $max = $obj->{matches}[$m][$i]; $best_i = $i; }; $seq .= $num2let[$best_i]; }; return $seq; }; ## ## *** CLEVER READ & WRITE *** ## sub clever_read { # two ways of using this method: # # (a) $obj->clever_read("filename"); # # (b) $obj = PackageXYZ->clever_read("filename"); # my ($obj, $filename, $global) = @_; if( ref($obj) ) { # $obj is a reference => (a) above $obj->clear(); } else { # $obj not a reference => (b) above # (assume $obj is a package name) $obj = new $obj; }; if( $filename =~ /\.mod$/ ) { die "File $filename doesn't exist!\n... " unless -e $filename; die "Can't read $filename!\n... " unless -r $filename; if( $filename !~ /\.asc\.mod/ and `grep '^BINARY' $filename` ) { # hmm, looks like a binary file => get the ASCII version my $f = $filename; $f =~ s/\.mod$//; print `hmmconvert $f.asc -model_file $f.mod -binary_output 0`; wait; $filename = "$f.asc.mod"; }; $obj->readSAM($filename); } elsif( $filename =~ /\.hmm$/ ) { $obj->readHMMER($filename, $global); } elsif( $filename =~ /\.pbla$/ or $filename =~ /\.profile$/ or $filename =~ /\.psiblast$/ or $filename =~ /\.psi$/ ) { $obj->readPSIBLAST($filename); } else { print STDERR "Uknown file extension for file $filename!\n\n"; print STDERR "Can only read the following:\n\n"; print STDERR "\t.mod or .asc.mod (SAM 3.0 ASCII)\n"; print STDERR "\t.hmm (HMMER 2.x)\n"; print STDERR "\t.profile, .pbla or .psiblast (PSI-BLAST)\n\n"; die "... "; }; return $obj; }; sub clever_write { my ($obj, $filename, $global) = @_; die "File $filename exists, remove it first!\n... " if -e $filename; if( $filename =~ /\.mod$/ ) { $obj->writeSAM($filename); } elsif( $filename =~ /\.hmm$/ ) { $obj->writeHMMER($filename, $global); } elsif( $filename =~ /\.pbla$/ or $filename =~ /\.profile$/ or $filename =~ /\.psiblast$/ or $filename =~ /\.psi$/ ) { $obj->writePSIBLAST($filename); } else { print STDERR "Uknown file extension for file $filename!\n\n"; print STDERR "Can only read the following:\n\n"; print STDERR "\t.mod or .asc.mod (SAM ASCII)\n"; print STDERR "\t.hmm (HMMER 2.x)\n"; print STDERR "\t.profile, .pbla or .psiblast (PSI-BLAST)\n\n"; die "... "; }; return $obj; }; return 1;