#!/usr/bin/perl
#
# Your computer science professor doesn't want you to know how easy
# it is to "parse" postfix. Here's a hint: The entire grammar for a
# Turing-complete postfix computer language is:
#
#   Program ::= Expression
#   Expression ::= Token | Expression UnaryOp | Expression Expression BinaryOp
#   Token ::= Constant | Variable
#
# (plus lists of about 10 UnaryOps and 10 BinaryOps, which is enough
# to make the language useful for practical work)

# Define the hash (a.k.a. "dictionary") of RIES symbol names
%name = (
 # constants
  '0' => '0', '1' => '1', '2' => '2', '3' => '3', '4' => '4',
  '5' => '5', '6' => '6', '7' => '7', '8' => '8', '9' => '9',
  'e' => 'e', 'f' => 'phi', 'p' => 'pi', 'x' => 'x',
 # single-argument functions
  'C' => 'cos', 'E' => 'exp', 'l' => 'ln', 'n' => 'neg', 'q' => 'sqrt',
  's' => 'square', 'r' => 'recip', 'S' => 'sin', 'T' => 'tan',
  'W' => 'lambert',
 # two-argument functions
  '+' => 'add', '-' => 'sub', '*' => 'mult', '/' => 'div', '^' => 'pow',
  'A' => 'atan2', 'L' => 'logba', 'v' => 'root',
);

$symchars = '[';
foreach $k (sort (keys %name)) {
  if (($k eq '-') || ($k eq '^')) {
    $symchars .= "\\$k";
  } else {
    $symchars .= "$k";
  }
  $standard_symbols .= " $k";
}
$symchars .= ']+';
# symchars is now: [*+\-/0123456789ACELSTW\^eflnpqrsvx]+

$help = qq`
NAME

   pf2if.pl   -- Postfix to Internal Function conversion for RIES

DESCRIPTION

This program takes input like:

  x1A = p5+r                         for x = T + 1.54028e-05 {76}

and converts it to a nested function format:

  atan2(x,1) = recip(add(pi,5))      for x = T + 1.54028e-05 {76}

where each function e.g. 'recip', 'add', etc. represents one of the
operations RIES knows about (the functions built-in or "internal" to
RIES, contrasted with a hypothetical user-defined function). This
format is quite different from "infix", which would be:

  arctan(1/x) = 1/(pi+5)             for x = T + 1.54028e-05 {76}

The need for a program like this is greater now than in the 1990s or
earlier, because computer science classes switched from systems
programming languages (like C) to multi-modular languages (Java,
Python). They stopped teaching data structures and parsing because
these were intended to be provided by imported modules or libraries.

You can supply a single argument, which should be a filename
containing output of RIES that was generated with the -F0 option. If
you don't give a filename, it reads the standard input.

BUGS

If the input is from RIES but using a set of symbols other than these
standard ones:

 $standard_symbols

then the parsing won't work. In this case, as well as with input from
any other source (not RIES), the output will usually be the same as
the input, but with occasional infix expressions where previously
there were none.

EXAMPLE

  ries -F0 0.1234321 | ./pf2if.pl

`;

$ignored_block_comment = qq`

REVISION HISTORY:
 20230731 First version
 20230803 Better built-in help

TO-DO

Something about higher-level education reform?

NOTES

An overview of computer language translation is at:

  en.wikipedia.org/wiki/Parsing

It is useful to think of language translation as comprising four
steps, which generally occur in this sequence though might be combined
in some way:

  * Tokenize (Lexical Analysis): Go through the input in sequence (start
    to end) one character at a time, identifying characters or groups of
    characters as being e.g. variable names, keywords like IF or WHILE,
    constants like 3.45, etc.

  * Parsing (Semantic Analysis): Go through the tokens in sequence, keeping
    track of how they fit into grammar rules (e.g. the proper ordering(s) of
    the parts of an assignment statement); and build data structures to
    keep track of all the information about ordering and structure

  * Assemble output: Use the data structures created during parsing to
    generate a new structure (component parts, sub-parts, ordering thereof,
    etc.) describing everything that must be given as output.

  * De-tokenize: Generate exact output text for each token in the output.

This program counts on RIES to do the first step; the other three are
performed by the &pf2if() function. It essentially does thr 3rd and
4th steps at the same time by generating stringified versions of the
(infix) syntax trees, and using the stack to implicitly perform the
2nd step and reorder the strings (partial expressions) as needed.

`;

# 'stack effect', as in RIES this is 'a' for constants, 'b' for
# one-argument functions and 'c' for two-argument functions
%seft = (
  '0' => 'a', '1' => 'a', '2' => 'a', '3' => 'a', '4' => 'a',
  '5' => 'a', '6' => 'a', '7' => 'a', '8' => 'a', '9' => 'a',
  'e' => 'a', 'f' => 'a', 'p' => 'a', 'x' => 'a',
  'C' => 'b', 'E' => 'b', 'l' => 'b', 'n' => 'b', 'q' => 'b',
  's' => 'b', 'r' => 'b', 'S' => 'b', 'T' => 'b', 'W' => 'b',
  '+' => 'c', '-' => 'c', '*' => 'c', '/' => 'c', '^' => 'c',
  'A' => 'c', 'L' => 'c', 'v' => 'c',
);

# Some languages define a "log of A to base B" function instead of
# "log to base B of A", If you want that, the arguments of 'L' need
# to be reversed.
$reverse{'L'} = 1; # 1=true, 0=false
# Others you might reverse: A=atan2(a,b)=arctan(b/a), v=root(a,b)=b^(1/a)

# Postfix to functional conversion. It's hard to even call this a "parser"
# because postfix parses itself. We do however need a stack of syntax
# trees, but these can be represented by their formatted output. Thus,
# instead of holding numeric results (as it would if this were a
# postfix interpreter), the stack holds unevaluated subexpressions each
# of which is simply a string of symbols with parentheses and commas.
sub pf2if
{
  my($postfix_expr) = @_;
  my(@stack); my($sp, $c, $sft, $nam, $arg1, $arg2);

  # We start the stack pointer at 2 to allow underflow, this is to prevent
  # a 'negative subscript' error from Perl in the event this function is
  # given a string that is incomplete or otherwise unparsable.
  $sp = 2;
  foreach $c (split('', $postfix_expr)) {
    $sft = $seft{$c}; $nam = $name{$c};
    if ($sft eq 'a') {
      # Push a constant onto the stack
      $stack[$sp++] = $nam;
    } elsif ($sft eq 'b') {
      # Get one argument from the stack
      $arg1 = $stack[--$sp];
      # Push function(arg1) onto the stack
      $stack[$sp++] = "$nam($arg1)";
    } elsif ($sft eq 'c') {
      # Get two arguments from the stack
      $arg2 = $stack[--$sp]; $arg1 = $stack[--$sp];
      # Push function(arg1,arg2) onto the stack
      if ($reverse{$c}) {
        # No, make that function(arg2,arg1) instead
        $stack[$sp++] = "$nam($arg2,$arg1)";
      } else {
        $stack[$sp++] = "$nam($arg1,$arg2)";
      }
    } else {
      # Unknown symbol
      return $postfix_expr;
    }
    if ($sp <= 2) {
      # It looks like we might get an array indexing error soon, avoid the
      # humiliation by exiting voluntarily!
      return $postfix_expr;
    }
  }
  # If the input was complete and well-formed, the stack will end up with
  # exactly 1 item on it.
  if ($sp == 3) {
    return $stack[--$sp];
  } # else, just return the original input unmodified
  return $postfix_expr;
} # End of pf.2if

# Example:
#   print (&pf2if("9p/5vs") . "\n"); exit(0);
# should print "square(root(div(9,pi),5))"

while ($arg = shift) {
  if ($arg =~ m/^-[-]?h(elp)?$/) {
    print $help; exit(0);
  } elsif (-f $arg) {
    close STDIN;
    open(STDIN, $arg);
  } else {
    die "Unknown argument or no such file: $arg\n";
  }
}

# A result from "ries -F0 1.2345" contains two of these 'symchars' strings
# with an equals sign between them and some extra text at the end
$result_line_regex = "^( +)($symchars) = ($symchars)( .+)\$";
# If your language uses the original POSIX regex API, then you need to
# do a "regex compile" now (such as re.compile() in Python)

$contains_brace_regex = "[{]"; # Or use something like mystring.contains("{")

while ($input_line = <>) {
  if (!($input_line =~ m/$contains_brace_regex/)) {
    # This is not a result item because it lacks the "{123}" score at the end
    print $input_line;
  } elsif ($input_line =~ m|$result_line_regex|) {
    # matched something like " xlC = 221nA*r  for x = T - 3.27774e-08 {108}"
    $pre = $1; $expr1 = $2; $expr2 = $3; $post = $4;
    $expr1 = &pf2if($expr1); $expr2 = &pf2if($expr2);
    print "$pre$expr1 = $expr2$post\n";
  } else {
    # Also not a result item
    print $input_line;
  }
}