#!/usr/bin/perl -w use strict; use warnings; sub usage{ print < 1, H => 10, Cu => 1, O => 9, }; # Cautionary counterexample: # Beware: the "5" multiplies the "2+" here: :; chem-parse 'Fe(NCS)·5H2O(2+)' Fe(NCS)·5H2O(2+) = { S => 1, H => 10, O => 5, N => 1, C => 1, Fe => 1, + => 10, }; # Much more probably what you wanted: # The "^" is the recommened way to indicate charge. # Note that "^" and "·" have the same operator precedence. :; chem-parse 'Fe(NCS)·5H2O^2+' Fe(NCS)·5H2O^2+ = { S => 1, H => 10, O => 5, N => 1, C => 1, Fe => 1, + => 2, }; # Another valid way of expressing the same thing: :; chem-parse 'Fe(NCS)[H2O]5(2+)' Fe(NCS)[H2O]5(2+) = { H => 10, S => 1, O => 5, N => 1, C => 1, Fe => 1, + => 2, }; # Charge can be expressed either way, as 2+ or +2: :; chem-parse 'Fe(NCS)[H2O]5+2' Fe(NCS)[H2O]5+2 = { H => 10, S => 1, O => 5, N => 1, C => 1, Fe => 1, + => 2, }; :; chem-parse '[Cr(N2H4CO)6]4 [Cr(CN)6]3' [Cr(N2H4CO)6]4 [Cr(CN)6]3 = { H => 96, O => 24, C => 42, N => 66, Cr => 7, }; EoF } # Principles of operation: See # http://en.wikipedia.org/wiki/Shunting-yard_algorithm # Note: I am aware of the perl library Chemistry::Formula # As of March 2012, this code is considerably more feature-rich. my $num1 = '\d+\.\d+|\d+'; my $sym1 = '[-+]|[[:upper:]][[:lower:]]?'; my $numpat = "^$num1\$"; my $sympat = "^$sym1\$"; my %precedence_table = ( ''=>9, # empty list '_neg_notused'=>7, # unary minus '_sfx'=>4, '_div_notused'=>4, # tight multiplication '_tadd'=>3, '_sub_notused'=>3, # tight addition '_premul'=>2, # loose multiplication '_ladd'=>1, # loose addition '('=>0, '['=>0 # grouping ); my %right = ('_neg' => 1); # Returns > 0 for any operator. # Returns 0 for left-paren or left-square-bracket # Returns -1 for any noun. sub getprec { my ($op) = @_; return exists $precedence_table{$op} ? $precedence_table{$op} : -1; } main: { my $rpn_mode = 0; my $list_mode = 0; my $formula = ''; my @args = @ARGV; argx: while (0+@args){ my $arg = shift @args; if ($arg =~ m'^-[-]h') { usage(); exit (0); } if ($arg eq '-rpn'){ $rpn_mode++; next argx; } if ($arg eq '-list'){ $list_mode++; next argx; } $formula .= $arg; } if ($formula eq '') { usage(); exit(0); } if (!$list_mode && !$rpn_mode){ $list_mode++; } my $pcode = chem_parse($formula); if ($rpn_mode) { print join(' ', @$pcode), "\n"; # Print out the RPN pseudocode. } if ($list_mode) { my $foo = chem_eval($pcode); # Evaluate it. print " $formula = {\n"; foreach my $key(keys %$foo) { # Print the result. my $str = sprintf("%-2s", $key); printf "%8s => %s,\n", $str, $foo->{$key}; } print " };\n"; } } sub chem_parse{ my ($expr) = @_; $expr =~ s"[ \t]+"·"g; # This is weird, because the regex nominally defines the splits, # (plus some captured tokens) # but any bogus stuff /between/ legit captured tokens also shows # up in the @tokens list. my @tokens = split("[ \t]*([\\(\\)\\[\\]^]|$num1|$sym1)[ \t]*", $expr); my @shunt; my $pcode = []; # an anonymous listref my $depth = 0; my $nounpat = "$sympat|$numpat"; my $last = ""; my $lookahead = ''; scanner: while (0+@tokens || $lookahead ne '') { my $token; if ($lookahead ne '') { $token = $lookahead; $lookahead = ''; } else { $token = shift @tokens; } if ($token eq ''){ next scanner; } if ($token eq ':' || $token eq '·' || $token eq '^') { $token = '_ladd'; } my $lpr = getprec($last); my $tokpr = getprec($token); # Special check for unary minus ## (no such thing within a formula) ## if ($lpr>=0 && $token eq '-') {$token = '_neg';} # Do a whole bunch of work to check for /implied/ operators: # An appositive such as NaCl: if ($lpr<0 && ($token =~ $sympat || $tokpr == 0)){ $lookahead = $token; $token = '_tadd'; } # A suffix as in O2: if ($lpr< 0 && $token =~ $numpat){ # it's a number ## print STDERR "got suffix number '$token' following '$last' lpr '$lpr'\n"; $lookahead = $token; $token = '_sfx'; } # A prefactor as in 3H: elsif (($last eq '' || $last eq '_ladd' || $lpr == 0) && $token =~ $numpat) { $lookahead = '_premul'; } # Here after we have finished munging the token: $tokpr = getprec($token); # good time to recompute this if ($token =~ $numpat && $last =~ '_premul') { die "Should not have prefactor in front of number '$token'.\n"; } # It's a noun: if ($token =~ $nounpat) { if ($last =~ $nounpat || $last eq ")" || $last eq ']') { die "Saw noun '$token' following '$last'.\n" . " Should have been separated by an operator.\n "; } push(@$pcode, $token); $depth++; } # It's the start of a group: elsif ($tokpr == 0) { unshift(@shunt, $token); } # It's the end of a group: elsif ($token eq ')') { unparen: for (;;) { if (0+@shunt == 0) { die "Saw ')' with no matching '('\n"; } my $thing = shift @shunt; if ($thing eq '(') {last unparen} push(@$pcode, $thing); $depth--; } } elsif ($token eq ']') { unparen: for (;;) { if (0+@shunt == 0) { die "Saw ']' with no matching '['\n"; } my $thing = shift @shunt; if ($thing eq '[') {last unparen} push(@$pcode, $thing); $depth--; } } # It's an operator: elsif ($tokpr > 0) { my $flag = exists $right{$token}; while(0+@shunt > 0 && $tokpr < $flag + getprec($shunt[0])) { push(@$pcode, shift(@shunt)); $depth--; } unshift(@shunt, $token); } else { die "Unknown token: '$token'\n"; } $last = $token; } # end of loop over input tokens. # Unstack any remaining operators: if (1+@shunt > $depth){ die "Too many operators: depth: $depth ops: ", 0+@shunt, "\n"; } if (1+@shunt < $depth){ die "Too few operators: depth: $depth ops: ", 0+@shunt, "\n"; } while(0+@shunt > 0) { push(@$pcode, shift(@shunt)); } return $pcode; } # evaluate some RPN pseducode # return a hashref of the form # {H => 4, N => 1, '+' => 1} # NH4+ my @evalstack = (); sub chem_eval{ my ($pcode) = @_; my @todo = @$pcode; while (@todo) { my $token = shift @todo; if (getprec($token) > 0) { #print STDERR "Calling '$token'\n"; my $subref = \&$token; ## one step of indirection ... &$subref; ## to get around "strict refs" } elsif ($token =~ $numpat) { push @evalstack, $token; } else { if ($token eq '-') { push @evalstack, {'+' => -1}; } else { push @evalstack, {$token => 1}; ## push an anonymous hash } } } return pop @evalstack; } # multiply with tight precedence sub _sfx { my $num = pop @evalstack; my $A = pop @evalstack; foreach my $key(keys %$A) { $A->{$key} *= $num; } push @evalstack, $A; } # multiply with loose precedence sub _premul { my $A = pop @evalstack; my $num = pop @evalstack; foreach my $key(keys %$A) { $A->{$key} *= $num; } push @evalstack, $A; } # add with tight precedence: sub _tadd { my $A = pop @evalstack; my $B = pop @evalstack; my %both = (%$A, %$B); foreach my $key(keys %both) { $both{$key} = ($A->{$key}||0) + ($B->{$key}||0); } push @evalstack, \%both; } # add with loose precedence: sub _ladd { _tadd(); }