#!/usr/bin/perl -w

use strict;
use warnings;

sub usage{
        print <<EoF;
Program to parse chemical formulas.

Usage: chem-parse [options] formula

Options include:
 -list          Print the list of elements, and the amount of each.
 -rpn           Print the parse of the formula
                in RPN (i.e. Reverse Polish Notation)

If no options are specified, the default is an implicit "-list".

Note: As you can see in the examples, (...) and [...] can be
 used for grouping.

Note: The middot symbol '·' can be generated on some keyboards
 by Alt-7 ... or you can use ':' which is synonymous on input.


Examples:
:; chem-parse '[Cu·4H2O]SO4·H2O'
  [Cu·4H2O]SO4·H2O = {
       S => 1,
       H => 10,
      Cu => 1,
       O => 9,
  };

# Cautionary counterexample:
# Beware: the "5" multiplies the "2+" here:
:; chem-parse 'Fe(NCS)·5H2O(2+)'
  Fe(NCS)·5H2O(2+) = {
       S => 1,
       H => 10,
       O => 5,
       N => 1,
       C => 1,
      Fe => 1,
       + => 10,
  };

# Much more probably what you wanted:
# The "^" is the recommened way to indicate charge.
# Note that "^" and "·" have the same operator precedence.
:; chem-parse 'Fe(NCS)·5H2O^2+'
  Fe(NCS)·5H2O^2+ = {
       S => 1,
       H => 10,
       O => 5,
       N => 1,
       C => 1,
      Fe => 1,
       + => 2,
  };

# Another valid way of expressing the same thing:
:; chem-parse 'Fe(NCS)[H2O]5(2+)'
  Fe(NCS)[H2O]5(2+) = {
       H => 10,
       S => 1,
       O => 5,
       N => 1,
       C => 1,
      Fe => 1,
       + => 2,
  };

# Charge can be expressed either way, as 2+ or +2:
:; chem-parse 'Fe(NCS)[H2O]5+2'
  Fe(NCS)[H2O]5+2 = {
      H  => 10,
      S  => 1,
      O  => 5,
      N  => 1,
      C  => 1,
      Fe => 1,
      +  => 2,
  };

:; chem-parse '[Cr(N2H4CO)6]4 [Cr(CN)6]3'
  [Cr(N2H4CO)6]4 [Cr(CN)6]3 = {
      H  => 96,
      O  => 24,
      C  => 42,
      N  => 66,
      Cr => 7,
  };
EoF
}

# Principles of operation:  See
#   http://en.wikipedia.org/wiki/Shunting-yard_algorithm

# Note:  I am aware of the perl library Chemistry::Formula
# As of March 2012, this code is considerably more feature-rich.

my $num1 = '\d+\.\d+|\d+';
my $sym1 = '[-+]|[[:upper:]][[:lower:]]?';
my $numpat = "^$num1\$";
my $sympat = "^$sym1\$";

my %precedence_table = (
        ''=>9,                                  # empty list
        '_neg_notused'=>7,                      # unary minus
        '_sfx'=>4, '_div_notused'=>4,           # tight multiplication
        '_tadd'=>3, '_sub_notused'=>3,          # tight addition
        '_premul'=>2,                           # loose multiplication
        '_ladd'=>1,                             # loose addition
        '('=>0, '['=>0                          # grouping
);

my %right = ('_neg' => 1);

# Returns > 0 for any operator.
# Returns 0 for left-paren or left-square-bracket
# Returns -1 for any noun.
sub getprec {
  my ($op) = @_;
  return exists $precedence_table{$op} ? $precedence_table{$op} : -1;
}

main: {
  my $rpn_mode = 0;
  my $list_mode = 0;
  my $formula = '';
  my @args = @ARGV;
  argx: while (0+@args){
    my $arg = shift @args;
    if ($arg =~ m'^-[-]h') {
      usage();
      exit (0);
    }
    if ($arg eq '-rpn'){
      $rpn_mode++;
      next argx;
    }
    if ($arg eq '-list'){
      $list_mode++;
      next argx;
    }
    $formula .= $arg;
  }
  if ($formula eq '') {
    usage();
    exit(0);
  }
  if (!$list_mode && !$rpn_mode){
    $list_mode++;
  }
  my $pcode = chem_parse($formula);

  if ($rpn_mode) {
    print join(' ', @$pcode), "\n";       # Print out the RPN pseudocode.
  }

  if ($list_mode) {
    my $foo = chem_eval($pcode);          # Evaluate it.
    print "  $formula = {\n";
    foreach my $key(keys %$foo) {         # Print the result.
      my $str = sprintf("%-2s", $key);
      printf "%8s => %s,\n", $str, $foo->{$key};
    }
    print "  };\n";
  }
}

sub chem_parse{
  my ($expr) = @_;

  $expr =~ s"[ \t]+"·"g;

# This is weird, because the regex nominally defines the splits,
# (plus some captured tokens)
# but any bogus stuff /between/ legit captured tokens also shows
# up in the @tokens list.
  my @tokens = split("[ \t]*([\\(\\)\\[\\]^]|$num1|$sym1)[ \t]*", $expr);

  my @shunt;
  my $pcode = [];               # an anonymous listref
  my $depth = 0;

  my $nounpat = "$sympat|$numpat";
  my $last = "";
  my $lookahead = '';
  scanner: while (0+@tokens || $lookahead ne '') {
    my $token;
    if ($lookahead ne '') {
      $token = $lookahead;
      $lookahead = '';
    } else {
      $token = shift @tokens;
    }

    if ($token eq ''){
      next scanner;
    }
    if ($token eq ':' || $token eq '·' || $token eq '^') {
      $token = '_ladd';
    }

    my $lpr = getprec($last);
    my $tokpr = getprec($token);

# Special check for unary minus
## (no such thing within a formula)
##    if ($lpr>=0 && $token eq '-') {$token = '_neg';}

# Do a whole bunch of work to check for /implied/ operators:

# An appositive such as NaCl:
    if ($lpr<0 && ($token =~ $sympat || $tokpr == 0)){
      $lookahead = $token;
      $token = '_tadd';
    }

# A suffix as in O2:
    if ($lpr< 0 && $token =~ $numpat){          # it's a number
      ## print STDERR "got suffix number '$token'  following '$last'  lpr '$lpr'\n";
      $lookahead = $token;
      $token = '_sfx';
    }

# A prefactor as in 3H:
    elsif (($last eq '' || $last eq '_ladd' || $lpr == 0)
    && $token =~ $numpat) {
      $lookahead = '_premul';
    }

# Here after we have finished munging the token:
    $tokpr = getprec($token);           # good time to recompute this
    if ($token =~ $numpat && $last =~ '_premul') {
      die "Should not have prefactor in front of number '$token'.\n";
    }

# It's a noun:
    if ($token =~ $nounpat) {
      if ($last =~ $nounpat || $last eq ")" || $last eq ']') {
        die "Saw noun '$token' following '$last'.\n"
          . "  Should have been separated by an operator.\n ";
      }
      push(@$pcode, $token);
      $depth++;
    }

# It's the start of a group:
    elsif ($tokpr == 0) {
      unshift(@shunt, $token);
    }

# It's the end of a group:
    elsif ($token eq ')') {
      unparen: for (;;) {
        if (0+@shunt == 0) {
          die "Saw ')' with no matching '('\n";
        }
        my $thing = shift @shunt;
        if ($thing eq '(') {last unparen}
        push(@$pcode, $thing);
        $depth--;
      }
    }
    elsif ($token eq ']') {
      unparen: for (;;) {
        if (0+@shunt == 0) {
          die "Saw ']' with no matching '['\n";
        }
        my $thing = shift @shunt;
        if ($thing eq '[') {last unparen}
        push(@$pcode, $thing);
        $depth--;
      }
    }

# It's an operator:
    elsif ($tokpr > 0) {
      my $flag = exists $right{$token};
      while(0+@shunt > 0 && $tokpr < $flag + getprec($shunt[0])) {
        push(@$pcode, shift(@shunt));
        $depth--;
      }
      unshift(@shunt, $token);
    } else {
      die "Unknown token: '$token'\n";
    }
    $last = $token;
  } # end of loop over input tokens.

# Unstack any remaining operators:
  if (1+@shunt > $depth){
    die "Too many operators: depth: $depth  ops: ", 0+@shunt, "\n";
  }
  if (1+@shunt < $depth){
    die "Too few operators: depth: $depth  ops: ", 0+@shunt, "\n";
  }
  while(0+@shunt > 0) {
    push(@$pcode, shift(@shunt));
  }
  return $pcode;
}

# evaluate some RPN pseducode
# return a hashref of the form
# {H => 4, N  => 1, '+' => 1}   # NH4+

my @evalstack = ();

sub chem_eval{
  my ($pcode) = @_;
  my @todo = @$pcode;
  while (@todo) {
    my $token = shift @todo;
    if (getprec($token) > 0) {
      #print STDERR "Calling '$token'\n";
      my $subref = \&$token;            ## one step of indirection ...
      &$subref;                         ## to get around "strict refs"
    } elsif ($token =~ $numpat) {
      push @evalstack, $token;
    }
    else {
      if ($token eq '-') {
        push @evalstack, {'+' => -1};
      } else {
        push @evalstack, {$token => 1};   ## push an anonymous hash
      }
    }
  }
  return pop @evalstack;
}

# multiply with tight precedence
sub _sfx {
  my $num = pop @evalstack;
  my $A = pop @evalstack;
  foreach my $key(keys %$A) {
    $A->{$key} *= $num;
  }
  push @evalstack, $A;
}

# multiply with loose precedence
sub _premul {
  my $A = pop @evalstack;
  my $num = pop @evalstack;
  foreach my $key(keys %$A) {
    $A->{$key} *= $num;
  }
  push @evalstack, $A;
}

# add with tight precedence:
sub _tadd {
  my $A = pop @evalstack;
  my $B = pop @evalstack;
  my %both = (%$A, %$B);
  foreach my $key(keys %both) {
    $both{$key} = ($A->{$key}||0) + ($B->{$key}||0);
  }
  push @evalstack, \%both;
}

# add with loose precedence:
sub _ladd {
  _tadd();
}