#!/usr/bin/awk -f
###############################################################################
#
# FILE:        y2l
# DESCRIPTION: Yacc-to-LaTeX grammar processor
#
# Copyright (c) 1994-2001 by Kris Van Hees, Belgium. All rights reserved.
# See the "Artistic License" included in this package (file: "Artistic") for
# terms and conditions.
#
# $Id: y2l,v 1.1 2003/08/10 15:18:49 bgiroud Exp $
#
###############################################################################

#
# Reserved variables:
#       _debug          Debug level
#       _exit           Exiting due to an error
#       _line           Current line being processed
#       _spclen         Length of string holding padding spaces
#       _spcstr         String to hold padding spaces
#

###############################################################################
# Support functions
###############################################################################
#
|
|
# NAME: SPACES()
|
|
# DESCRIPTION: return a given number of spaces
|
|
#
|
|
function SPACES(snum) {
|
|
if (!_spclen) { # uninitialized (first use)
|
|
_spcstr = " ";
|
|
_spclen = 1;
|
|
}
|
|
|
|
while (_spclen < snum) { # extend the string of spaces
|
|
_spcstr = _spcstr _spcstr; # double the spaces
|
|
_spclen *= 2; # update the string length
|
|
}
|
|
|
|
return substr(_spcstr, 1, snum); # requested number of spaces
|
|
}
|
|
|
|
#
|
|
# NAME: DBG()
|
|
# DESCRIPTION: write debug output to standard error (if debugging is enabled)
|
|
#
|
|
function DBG(dlvl, dmsg) {
|
|
if (!_debug) # debugging is disabled
|
|
return;
|
|
|
|
print "DBG (" SPACES(5 - length(_line)) _line ")" SPACES(dlvl * 2 + 1) \
|
|
dmsg >"/dev/stderr";
|
|
}
|
|
|
|
#
|
|
# NAME: error()
|
|
# DESCRIPTION: write an error message to standard error
|
|
#
|
|
function error(emsg) {
|
|
print "ERROR: " emsg >"/dev/stderr";
|
|
|
|
if (_line)
|
|
print " Line is (on or about) " _line >"/dev/stderr";
|
|
|
|
_exit = 1; # mark program as exiting
|
|
exit;
|
|
}
|
|
|
|
#
|
|
# NAME: exiting()
|
|
# DESCRIPTION: return whether the program is exiting
|
|
#
|
|
function exiting() {
|
|
return _exit; # exit status
|
|
}
|
|
|
|
#
|
|
# NAME: exists()
|
|
# DESCRIPTION: return whether a file exists
|
|
#
|
|
function exists(fname) {
|
|
return !system("ls " fname " >/dev/null 2>&1");
|
|
}
|
|
|
|
###############################################################################
|
|
# Yacc-to-LaTeX initialization
|
|
###############################################################################
|
|
|
|
#
|
|
# NAME: init()
|
|
# DESCRIPTION: initialization
|
|
#
|
|
function init() {
|
|
PERC_PERC = 257;
|
|
PERC_LACC = 258;
|
|
PERC_RACC = 259;
|
|
PERC_LEFT = 260;
|
|
PERC_ASSC = 261;
|
|
PERC_RGHT = 262;
|
|
PERC_STRT = 263;
|
|
PERC_TOKN = 264;
|
|
PERC_TYPE = 265;
|
|
PERC_UNIO = 266;
|
|
PERC_PREC = 267;
|
|
|
|
CHAR = 268;
|
|
IDENT = 269;
|
|
STRING = 270;
|
|
TYPE = 271;
|
|
|
|
EOF = -1;
|
|
BAD = -2;
|
|
|
|
reserved["%%"] = PERC_PERC;
|
|
reserved["%{"] = PERC_LACC;
|
|
reserved["%}"] = PERC_RACC;
|
|
reserved["%left"] = PERC_LEFT;
|
|
reserved["%nonassoc"] = PERC_ASSC;
|
|
reserved["%prec"] = PERC_PREC;
|
|
reserved["%right"] = PERC_RGHT;
|
|
reserved["%start"] = PERC_STRT;
|
|
reserved["%token"] = PERC_TOKN;
|
|
reserved["%type"] = PERC_TYPE;
|
|
reserved["%union"] = PERC_UNIO;
|
|
}
|
|
|
|
###############################################################################
|
|
# Yacc-to-LaTeX processor functions
|
|
###############################################################################
|
|
|
|
#
|
|
# NAME: token()
|
|
# DESCRIPTION: verify whether something is a valid token
|
|
#
|
|
function token(tokname) {
|
|
return tokname ~ /^[_\.A-Za-z][_\.A-Za-z0-9]*$/;
|
|
}
|
|
|
|
#
|
|
# NAME: read()
|
|
# DESCRIPTION: read another (non-blank) line from the input file
|
|
#
|
|
function read() {
|
|
while (getline < grammar_file == 1) {
|
|
_line++;
|
|
|
|
if ($1 != "")
|
|
return;
|
|
}
|
|
|
|
return grammar_eof = 1;
|
|
}
|
|
|
|
#
|
|
# NAME: skip_ws()
|
|
# DESCRIPTION: skip a continuous block of whitespace
|
|
#
|
|
function skip_ws() {
|
|
in_comment = 0;
|
|
|
|
while (!grammar_eof) {
|
|
sub(/^[ \t]+/, ""); # remove leading whitespace
|
|
|
|
if ($1 != "") {
|
|
if (/^\/\//) { # C++ comment
|
|
$0 = "";
|
|
} else if (in_comment) { # in a comment
|
|
if (match($0, /\*\//)) { # end of a comment
|
|
$0 = substr($0, RSTART + RLENGTH);
|
|
in_comment = 0;
|
|
} else
|
|
$0 = "";
|
|
} else if (/^\/\*"[^"]+"\*\//) { # special marker
|
|
sub(/\/\*"/, "\"");
|
|
sub(/"\*\//, "\"");
|
|
|
|
return;
|
|
} else if (/^\/\*/) { # regular comment
|
|
$0 = substr($0, 3);
|
|
in_comment = 1;
|
|
} else
|
|
return;
|
|
|
|
sub(/[ \t]+$/, ""); # remove trailing whitespace
|
|
}
|
|
|
|
if ($1 == "")
|
|
read();
|
|
}
|
|
}
|
|
|
|
#
|
|
# NAME: lex()
|
|
# DESCRIPTION: read the next token from the input
|
|
#
|
|
function lex() {
|
|
if (tok_prev) {
|
|
tok = tok_prev;
|
|
tok_str = tok_pstr;
|
|
tok_prev = 0;
|
|
|
|
return tok;
|
|
}
|
|
|
|
skip_ws();
|
|
|
|
if (grammar_eof)
|
|
return EOF;
|
|
|
|
if (/^%/)
|
|
if (match($0, /^%(%|{|}|left|nonassoc|prec|right|start|token|type|union)/)) {
|
|
tok_str = substr($0, 1, RLENGTH);
|
|
$0 = substr($0, RLENGTH + 1);
|
|
|
|
return reserved[tok_str];
|
|
} else if (match($0, /^%[_A-Za-z][_A-Za-z0-9]*/)) {
|
|
tok_str = substr($0, 1, RLENGTH);
|
|
$0 = substr($0, RLENGTH + 1);
|
|
|
|
return BAD;
|
|
}
|
|
|
|
if (match($0, /^[_\.A-Za-z][_\.A-zA-z0-9]*/)) {
|
|
tok_str = substr($0, 1, RLENGTH);
|
|
$0 = substr($0, RLENGTH + 1);
|
|
|
|
return IDENT;
|
|
}
|
|
|
|
if (match($0, /^<[_\.A-Za-z][_\.A-zA-z0-9]*>/)) {
|
|
tok_str = substr($0, 1, RLENGTH);
|
|
$0 = substr($0, RLENGTH + 1);
|
|
|
|
return TYPE;
|
|
}
|
|
|
|
if (/^'/)
|
|
if (match($0, /^'(.|\\([tnarfbv\\'"]|[0-7][0-7]*))'/)) {
|
|
tok_str = "@C" (consts++);
|
|
transtab[tok_str] = "\"" substr($0, 2, RLENGTH - 2) "\"";
|
|
$0 = substr($0, RLENGTH + 1);
|
|
|
|
return CHAR;
|
|
} else
|
|
error("Excuse me, but this character constant is a bit weird.");
|
|
|
|
if (/^"/)
|
|
if (match($0, /([^\\]|[ \t]+)(\\\\)*"/)) {
|
|
tok_str = "@S" (strings++);
|
|
transtab[tok_str] = substr($0, 1, RSTART + RLENGTH - 1);
|
|
$0 = substr($0, RSTART + RLENGTH);
|
|
|
|
return STRING;
|
|
} else
|
|
error("Newlines in strings are interesting, but not allowed.");
|
|
|
|
tok_str = substr($0, 1, 1);
|
|
$0 = substr($0, 2);
|
|
|
|
return tok_str;
|
|
}
|
|
|
|
#
|
|
# NAME: unlex()
|
|
# DESCRIPTION: return a token to the input stream
|
|
#
|
|
function unlex(tok) {
|
|
tok_prev = tok;
|
|
tok_pstr = tok_str;
|
|
}
|
|
|
|
#
|
|
# NAME: skip_definition()
|
|
# DESCRIPTION: skip the contents of a %{ ... %} block
|
|
#
|
|
function skip_definition() {
|
|
do {
|
|
skip = lex();
|
|
} while (skip != PERC_RACC && skip != EOF);
|
|
}
|
|
|
|
#
|
|
# NAME: decl_token()
|
|
# DESCRIPTION: read a token declaration
|
|
#
|
|
function decl_token() {
|
|
first = 1;
|
|
|
|
do {
|
|
tok = lex();
|
|
|
|
if (tok == ",") {
|
|
symbol = 0;
|
|
} else if (tok == CHAR) {
|
|
DBG(1, transtab[tok_str] ": No need to remember this token.");
|
|
} else if (tok == IDENT) {
|
|
if (_tkpat && tok_str !~ _tkpat) {
|
|
if (transtab[tok_str])
|
|
DBG(2, "WARNING: Redefining '" tok_str "'.");
|
|
|
|
transtab[tok_str] = "\"" tolower(tok_str) "\"";
|
|
DBG(1, tok_str ": Defined as " transtab[tok_str] ".");
|
|
}
|
|
|
|
symbol = tok_str;
|
|
} else if (tok == NUMBER) {
|
|
if (!symbol)
|
|
error("How about giving a token first?");
|
|
|
|
symbol = 0;
|
|
} else if (tok == STRING) {
|
|
if (!symbol)
|
|
error("How about giving a token first?");
|
|
|
|
str = transtab[tok_str];
|
|
transtab[symbol] = "\"" substr(str, 2, length(str) - 2) "\"";
|
|
DBG(1, SPACES(length(symbol) + 2) \
|
|
"Defined as " transtab[symbol] ".");
|
|
|
|
symbol = 0;
|
|
} else if (tok == TYPE) {
|
|
if (!first)
|
|
error("This is no place for a type name.");
|
|
} else {
|
|
unlex(tok);
|
|
return;
|
|
}
|
|
|
|
first = 0;
|
|
} while (tok != EOF);
|
|
}
|
|
|
|
#
|
|
# NAME: decl_start()
|
|
# DESCRIPTION: read a start rule declaration
|
|
#
|
|
function decl_start() {
|
|
if (grammar_start)
|
|
error("Hm, you want the grammar to start with two rules?");
|
|
else if (lex() == IDENT) {
|
|
grammar_start = tok_str;
|
|
DBG(2, "The grammar start rule is '" grammar_start "'.");
|
|
} else
|
|
error("How about a nice juicy identifier?");
|
|
}
|
|
|
|
#
|
|
# NAME: decl_type()
|
|
# DESCRIPTION: read a type declaration
|
|
#
|
|
function decl_type() {
|
|
if (lex() != TYPE)
|
|
error("So where is the typename?");
|
|
|
|
do {
|
|
tok = lex();
|
|
|
|
if (tok == ",") {
|
|
symbol = 0;
|
|
} else if (tok == CHAR) {
|
|
error("Bison etc may accept literals in a %type declaration, " \
|
|
"but the Unix 7th\n Ed manual clearly indicates " \
|
|
"that it is NOT legal. And I think that the\n Bell " \
|
|
"Labs guys know what they are talking about; but anyway, " \
|
|
"do you\n really spend the time reading this long " \
|
|
"error message?");
|
|
} else if (tok == IDENT) {
|
|
if (_tkpat && tok_str !~ _tkpat) {
|
|
if (transtab[tok_str])
|
|
DBG(2, "WARNING: Redefining '" tok_str "'.");
|
|
|
|
transtab[tok_str] = "\"" tolower(tok_str) "\"";
|
|
DBG(1, tok_str ": Defined as " transtab[tok_str] ".");
|
|
}
|
|
|
|
symbol = tok_str;
|
|
} else if (tok == NUMBER) {
|
|
if (!symbol)
|
|
error("How about giving a token first?");
|
|
|
|
symbol = 0;
|
|
} else if (tok == STRING) {
|
|
if (!symbol)
|
|
error("How about giving a token first?");
|
|
|
|
str = transtab[tok_str];
|
|
transtab[symbol] = "\"" substr(str, 2, length(str) - 2) "\"";
|
|
DBG(1, SPACES(length(symbol) + 2) \
|
|
"Defined as " transtab[symbol] ".");
|
|
|
|
symbol = 0;
|
|
} else {
|
|
unlex(tok);
|
|
return;
|
|
}
|
|
} while (tok != EOF);
|
|
}
|
|
|
|
#
|
|
# NAME: decl_union()
|
|
# DESCRIPTION: read a union declaration
|
|
#
|
|
function decl_union() {
|
|
if (grammar_union)
|
|
error("How about sticking to one single union declaration?");
|
|
|
|
grammar_union = 1;
|
|
DBG(2, "Extended types have been registered with a union declaration.");
|
|
|
|
do {
|
|
tok = lex();
|
|
|
|
if (tok == "{")
|
|
block++;
|
|
else if (tok == "}") {
|
|
if (!block)
|
|
error("Why close an unopened block?");
|
|
if (--block <= 0)
|
|
return;
|
|
} else if (tok == EOF)
|
|
error("The file ends before the union is finished. How rude!");
|
|
} while (1);
|
|
}
|
|
|
|
#
|
|
# NAME: read_declarations()
|
|
# DESCRIPTION: read the yacc declarations
|
|
#
|
|
function read_declarations() {
|
|
do {
|
|
tok = lex(); # next token
|
|
|
|
if (tok == PERC_PERC || tok == EOF) # end of the declarations
|
|
return;
|
|
if (tok == PERC_LACC) { # definition block
|
|
skip_definition();
|
|
} else if (tok == PERC_LEFT) { # left associative declaration
|
|
decl_token();
|
|
} else if (tok == PERC_ASSC) { # non-associative declaration
|
|
decl_token();
|
|
} else if (tok == PERC_RGHT) { # right associative declaration
|
|
decl_token();
|
|
} else if (tok == PERC_STRT) { # start rule declaration
|
|
decl_start();
|
|
} else if (tok == PERC_TOKN) { # token declaration(s)
|
|
decl_token();
|
|
} else if (tok == PERC_TYPE) { # type declaration
|
|
decl_type();
|
|
} else if (tok == PERC_UNIO) { # union declaration
|
|
decl_union();
|
|
} else
|
|
DBG(2, "WARNING: Ignoring the unknown token '" tok_str "'.");
|
|
} while (1);
|
|
}
|
|
|
|
#
|
|
# NAME: skip_action()
|
|
# DESCRIPTION: skip the contents of an action block
|
|
#
|
|
function skip_action() {
|
|
block = 1;
|
|
|
|
do {
|
|
tok = lex();
|
|
|
|
if (tok == "{")
|
|
block++;
|
|
else if (tok == "}") {
|
|
if (!block)
|
|
error("Why close an unopened block?");
|
|
if (--block <= 0)
|
|
return;
|
|
} else if (tok == EOF)
|
|
error("The file ends before the action is finished. How rude!");
|
|
} while (tok != EOF);
|
|
}
|
|
|
|
#
|
|
# NAME: read_grammar()
|
|
# DESCRIPTION: read the yacc grammar
|
|
#
|
|
function read_grammar() {
|
|
tok = lex();
|
|
|
|
do {
|
|
if (tok == PERC_PERC || tok == EOF) # end of the grammar
|
|
return;
|
|
|
|
if (tok == IDENT) { # rule begins here
|
|
if (!(rule_idx = rule_ref[tok_str])) {
|
|
rule_idx = ++rule_cnt; # new rule
|
|
rule_lhs[rule_idx] = tok_str;
|
|
rule_ref[tok_str] = rule_idx;
|
|
rule_sub = ++rule_len[rule_idx];
|
|
|
|
if (!grammar_start)
|
|
grammar_start = tok_str;
|
|
}
|
|
|
|
if (lex() != ":")
|
|
error("The LHS and RHS would like to be separated by a colon.");
|
|
} else if (tok == "|") { # alternative RHS for a rule
|
|
if (!rule_cnt)
|
|
error("The grammar shouldn't start with a |.");
|
|
|
|
rule_sub = ++rule_len[rule_idx];
|
|
} else
|
|
error("Why are you giving me '" tok "'?");
|
|
|
|
rule_rhs[rule_cnt] = ""; # empty RHS
|
|
|
|
do { # read the RHS
|
|
tok = lex();
|
|
|
|
if (tok == PERC_PREC) { # %prec
|
|
tok = lex();
|
|
|
|
if (tok != IDENT && tok != CHAR)
|
|
error("What precedence are you talking about?");
|
|
|
|
tok = lex(); # continue with the rest
|
|
}
|
|
|
|
if (tok == IDENT) { # might be a new rule
|
|
old_str = tok_str;
|
|
nxt = lex(); # look ahead
|
|
unlex(nxt);
|
|
tok_str = old_str;
|
|
|
|
if (nxt == ":") # yup, a new rule started
|
|
break;
|
|
|
|
rule_rhs[rule_idx, rule_sub] = rule_rhs[rule_idx, rule_sub] \
|
|
" " tok_str;
|
|
} else if (tok == CHAR) {
|
|
if (tok_str ~ /^@/)
|
|
tok_str = transtab[tok_str];
|
|
|
|
rule_rhs[rule_idx, rule_sub] = rule_rhs[rule_idx, rule_sub] \
|
|
" " tok_str;
|
|
} else if (tok == "{") { # an action block
|
|
skip_action();
|
|
} else # can't be part of a rule
|
|
break;
|
|
} while (1);
|
|
|
|
sub(/^ /, "", rule_rhs[rule_idx, rule_sub]);
|
|
|
|
if (rule_rhs[rule_idx, rule_sub] ~ \
|
|
/(^|[^_\.A-Za-z0-9])error($|[^_\.A-Za-z0-9])/) {
|
|
DBG(1, rule_lhs[rule_idx] ": " rule_rhs[rule_idx, rule_sub] \
|
|
" [IGNORED]");
|
|
|
|
rule_rhs[rule_idx, rule_sub] = "";
|
|
rule_len[rule_idx]--;
|
|
} else
|
|
DBG(1, rule_lhs[rule_idx] ": " rule_rhs[rule_idx, rule_sub]);
|
|
|
|
if (tok == ";")
|
|
tok = lex();
|
|
} while (1);
|
|
}
|
|
|
|
#
|
|
# NAME: is_optional()
|
|
# DESCRIPTION: check whether the given non-terminal is optional
|
|
#
|
|
function is_optional(idx) {
|
|
if ((len = rule_len[idx]) <= 1)
|
|
return 0;
|
|
|
|
#
|
|
# One or more empty rules for a non-terminal indicate that the other rules
|
|
# are in fact optional. There shouldn't be multiple empty rules, but it is
|
|
# is technically possible.
|
|
#
|
|
for (rule_sub = 1; rule_sub <= len; rule_sub++)
|
|
if (rule_rhs[idx, rule_sub] == "") {
|
|
while (++rule_sub <= len)
|
|
rule_rhs[idx, rule_sub - 1] = rule_rhs[idx, rule_sub];
|
|
|
|
rule_len[idx]--;
|
|
}
|
|
|
|
return rule_len[idx] < len;
|
|
}
|
|
|
|
#
|
|
# NAME: get_prefix()
|
|
# DESCRIPTION: check whether the given non-terminal has a common prefix
|
|
#
|
|
function get_prefix(idx) {
|
|
if ((len = rule_len[idx]) <= 1)
|
|
return 0;
|
|
|
|
#
|
|
# Split up the first rule into tokens. These will be compared with all the
|
|
# other rules for this non-terminal.
|
|
#
|
|
gp_last = split(rule_rhs[idx, 1], arr);
|
|
gp_pref = gp_last;
|
|
|
|
#
|
|
# Look for the longest common prefix.
|
|
#
|
|
for (rule_sub = 2; rule_sub <= len; rule_sub++) {
|
|
$0 = rule_rhs[idx, rule_sub];
|
|
gp_tokc = NF;
|
|
|
|
if (gp_tokc < gp_pref)
|
|
gp_pref = gp_tokc;
|
|
|
|
for (j = 1; j <= gp_pref; j++)
|
|
if (arr[j] != $j) {
|
|
if (!(gp_pref = j - 1))
|
|
return 0;
|
|
|
|
break;
|
|
}
|
|
}
|
|
|
|
#
|
|
# Construct the prefix string.
|
|
#
|
|
gp_pstr = arr[1];
|
|
for (j = 2; j <= gp_pref; j++)
|
|
gp_pstr = gp_pstr " " arr[j];
|
|
|
|
#
|
|
# Remove the common prefix from all rules for this non-terminal.
|
|
#
|
|
for (rule_sub = 1; rule_sub <= len; rule_sub++) {
|
|
$0 = rule_rhs[idx, rule_sub];
|
|
|
|
for (j = 1; j <= gp_pref; j++)
|
|
$j = "";
|
|
|
|
sub(/^ +/, "");
|
|
rule_rhs[idx, rule_sub] = $0;
|
|
}
|
|
|
|
return gp_pstr;
|
|
}
|
|
|
|
#
|
|
# NAME: get_suffix()
|
|
# DESCRIPTION: check whether the given non-terminal has a common suffix
|
|
#
|
|
function get_suffix(idx) {
|
|
if ((len = rule_len[idx]) <= 1)
|
|
return 0;
|
|
|
|
#
|
|
# Split up the first rule into tokens. These will be compared with all the
|
|
# other rules for this non-terminal.
|
|
#
|
|
gs_last = split(rule_rhs[idx, 1], arr);
|
|
gs_suff = gs_last;
|
|
|
|
#
|
|
# Look for the longest common suffix.
|
|
#
|
|
for (rule_sub = 2; rule_sub <= len; rule_sub++) {
|
|
$0 = rule_rhs[idx, rule_sub];
|
|
gs_tokc = NF;
|
|
|
|
if (gs_tokc < gs_suff)
|
|
gs_suff = gs_tokc;
|
|
|
|
for (j = 0; j < gs_suff; j++)
|
|
if (arr[gs_last - j] != $(gs_tokc - j)) {
|
|
if (!(gs_suff = j))
|
|
return 0;
|
|
|
|
break;
|
|
}
|
|
}
|
|
|
|
#
|
|
# Construct the suffix string.
|
|
#
|
|
gs_sstr = arr[gs_last];
|
|
for (j = 1; j < gs_suff; j++)
|
|
gs_sstr = arr[gs_last - j] " " gs_sstr;
|
|
|
|
#
|
|
# Remove the common suffix from all rules for this non-terminal.
|
|
#
|
|
for (rule_sub = 1; rule_sub <= len; rule_sub++) {
|
|
$0 = rule_rhs[idx, rule_sub];
|
|
|
|
for (j = 0; j < gs_suff; j++)
|
|
$(NF - j) = "";
|
|
|
|
sub(/ +$/, "");
|
|
rule_rhs[idx, rule_sub] = $0;
|
|
}
|
|
|
|
return gs_sstr;
|
|
}
|
|
|
|
#
|
|
# NAME: optimize1()
|
|
# DESCRIPTION: first pass of the optimizer
|
|
#
|
|
function optimize1() {
|
|
DBG(0, "Optimization pass 1...");
|
|
|
|
for (rule_idx = 1; rule_idx <= rule_cnt; rule_idx++) {
|
|
#
|
|
# Non-terminals with only a single rule can't be optimized here.
|
|
#
|
|
if ((len = rule_len[rule_idx]) <= 1)
|
|
continue;
|
|
|
|
#
|
|
# First record whether the entire non-terminal might be optional.
|
|
#
|
|
rule_opt[rule_idx] = is_optional(rule_idx);
|
|
|
|
#
|
|
# The actual optimization takes place in this endless loop. It will in
|
|
# fact end when an iteration does not yield an optional ruleset. This
|
|
# is a proper and correct stop criteria, since a non-optional ruleset
|
|
# cannot have any common prefix or suffix. If it did, those would have
|
|
# been added to the already present prefix or suffix.
|
|
#
|
|
pref = "";
|
|
suff = "";
|
|
do {
|
|
if (pstr = get_prefix(rule_idx))
|
|
pref = pref " " pstr;
|
|
else if (sstr = get_suffix(rule_idx))
|
|
suff = sstr " " suff;
|
|
|
|
if (is_optional(rule_idx)) {
|
|
pref = pref " [";
|
|
suff = "] " suff;
|
|
} else {
|
|
if (pstr || sstr) {
|
|
pref = pref " (";
|
|
suff = ") " suff;
|
|
}
|
|
|
|
break;
|
|
}
|
|
} while (1);
|
|
|
|
#
|
|
# Compose the single composite rule for this non-terminal, if a common
|
|
# prefix or suffix was found.
|
|
#
|
|
if (pref != "" || suff != "") {
|
|
sub(/^ /, "", pref);
|
|
sub(/ $/, "", suff);
|
|
|
|
DBG(2, "Rules for '" rule_lhs[rule_idx] "' have:");
|
|
|
|
if (pref != "")
|
|
DBG(3, "Prefix '" pref "'");
|
|
if (suff != "")
|
|
DBG(3, "Suffix '" suff "'");
|
|
|
|
len = rule_len[rule_idx];
|
|
pref = pref " " rule_rhs[rule_idx, 1];
|
|
|
|
for (rule_sub = 2; rule_sub <= len; rule_sub++)
|
|
pref = pref " | " rule_rhs[rule_idx, rule_sub];
|
|
|
|
rule_rhs[rule_idx, 1] = pref " " suff;
|
|
rule_len[rule_idx] = 1;
|
|
|
|
DBG(3, "Combined rule '" rule_rhs[rule_idx, 1] "'");
|
|
|
|
if (rule_opt[rule_idx])
|
|
DBG(3, "(The non-terminal is optional.)");
|
|
}
|
|
}
|
|
}
|
|
|
|
#
|
|
# NAME: optimize2()
|
|
# DESCRIPTION: second pass of the optimizer
|
|
#
|
|
function optimize2() {
|
|
DBG(0, "Optimization pass 2...");
|
|
|
|
for (rule_idx = 1; rule_idx <= rule_cnt; rule_idx++) {
|
|
$0 = rule_rhs[rule_idx, 1];
|
|
|
|
if ((len = rule_len[rule_idx]) > 1)
|
|
for (rule_sub = 2; rule_sub <= len; rule_sub++)
|
|
$0 = $0 " | " rule_rhs[rule_idx, rule_sub];
|
|
|
|
if ($1 != "[" && rule_opt[rule_idx]) {
|
|
$0 = "[ " $0 " ]";
|
|
rule_opt[rule_idx] = 0;
|
|
}
|
|
|
|
if ($1 == "[")
|
|
if ($2 == rule_lhs[rule_idx]) {
|
|
for (i = NF; i > 0; i--)
|
|
if ($i == "]")
|
|
break;
|
|
|
|
if ((NF - i) < 3) {
|
|
$1 = "";
|
|
subst = "{";
|
|
$i = "}";
|
|
|
|
while (++i <= NF)
|
|
subst = subst " " $i;
|
|
|
|
$2 = subst;
|
|
|
|
sub(/^ +/, "");
|
|
}
|
|
}
|
|
|
|
if ($NF == "]")
|
|
if ($(NF - 1) == rule_lhs[rule_idx]) {
|
|
for (i = 1; i <= NF; i++)
|
|
if ($i == "[")
|
|
break;
|
|
|
|
if (i < 3) {
|
|
$i = "{";
|
|
subst = "}";
|
|
$NF = "";
|
|
|
|
while (--i > 0)
|
|
subst = $i " " subst;
|
|
|
|
$(NF - 1) = subst;
|
|
|
|
sub(/ +$/, "");
|
|
}
|
|
}
|
|
|
|
if (rule_opt[rule_idx]) {
|
|
$0 = "[ " $0 " ]";
|
|
rule_opt[rule_idx] = 0;
|
|
}
|
|
|
|
rule_rhs[rule_idx, 1] = $0;
|
|
rule_len[rule_idx] = 1;
|
|
}
|
|
}
|
|
|
|
#
|
|
# NAME: latexify()
|
|
# DESCRIPTION: make substitutions to ensure that the string is LaTeX ok
|
|
#
|
|
function latexify(txt) {
|
|
gsub(/[{&%}]/, "$\\\\&$", txt);
|
|
gsub(/[!|[\]=+\-*\/<>]/, "$&$", txt);
|
|
gsub(/\$\$/, "\\!", txt);
|
|
gsub(/"[^"]+"/, "``{\\bf &}''", txt);
|
|
gsub(/_/, "\\_", txt);
|
|
gsub(/\^/, "\\verb+^+", txt);
|
|
gsub(/"/, "", txt);
|
|
|
|
return txt;
|
|
}
|
|
|
|
#
|
|
# NAME: break_string()
|
|
# DESCRIPTION: possibly break up a string in multiple lines
|
|
#
|
|
function break_string(str) {
|
|
match(str, /^([_A-Za-z][_A-Za-z0-9]* +| +)(::=|\|)/);
|
|
bs_len = RLENGTH;
|
|
bs_str = substr(str, 1, RLENGTH);
|
|
bs_pln = RLENGTH;
|
|
bs_pre = SPACES(RLENGTH);
|
|
|
|
$0 = substr(str, RLENGTH + 2);
|
|
for (i = 1; i <= NF; i++) {
|
|
if (bs_len + 1 + length($i) > 79) {
|
|
bs_str = bs_str "\n" bs_pre;
|
|
bs_len = bs_pln;
|
|
}
|
|
|
|
bs_str = bs_str " " $i;
|
|
bs_len += 1 + length($i);
|
|
}
|
|
|
|
return bs_str;
|
|
}
|
|
|
|
#
|
|
# NAME: output()
|
|
# DESCRIPTION: write out the rewritten rules
|
|
#
|
|
function output(LaTeX) {
|
|
rule_use[grammar_start] = 1;
|
|
|
|
for (rule_idx = 1; rule_idx <= rule_cnt; rule_idx++) {
|
|
len = rule_len[rule_idx];
|
|
|
|
if (rule_opt[rule_idx]) {
|
|
rule_rhs[rule_idx, 1] = "[ " rule_rhs[rule_idx, 1];
|
|
rule_rhs[rule_idx, len] = rule_rhs[rule_idx, len] " ]";
|
|
}
|
|
|
|
str = LaTeX ? latexify(rule_lhs[rule_idx]) \
|
|
: rule_lhs[rule_idx];
|
|
|
|
if (length(str) > rule_max) {
|
|
rule_max = length(str);
|
|
rule_mls = str;
|
|
}
|
|
|
|
for (rule_sub = 1; rule_sub <= len; rule_sub++) {
|
|
$0 = rule_rhs[rule_idx, rule_sub];
|
|
|
|
for (j = 1; j <= NF; j++) {
|
|
#
|
|
# A couple of notes here... Non-terminals that merely have a
|
|
# single terminal as definition are substituted anywhere they
|
|
# are used. And some special casing is needed for situations
|
|
# where someone actually defines the non-terminals as tokens.
|
|
# Those should definitely not be quotated.
|
|
#
|
|
if ((idx = rule_ref[$j]) &&
|
|
rule_len[$j] == 1 && rule_rhs[idx, 1] ~ /^[^ ]+$/)
|
|
$j = rule_rhs[idx, 1];
|
|
else if ((str = transtab[$j]) && !rule_ref[$j])
|
|
$j = str;
|
|
|
|
rule_use[$j] = 1;
|
|
}
|
|
|
|
rule_rhs[rule_idx, rule_sub] = $0;
|
|
}
|
|
}
|
|
|
|
if (LaTeX)
|
|
#
|
|
# Some LaTeX magic... We calculate the available size for the RHS of
|
|
# the rules. This will be provided as argument to the minipages used
|
|
# for each independent rule.
|
|
#
|
|
# The formula is fairly easy:
|
|
# textwidth - width(LHS) - width("::=") - 6 * tabcolsep
|
|
#
|
|
print "\\newlength\\rulelhs\n" \
|
|
"\\newlength\\rulemid\n" \
|
|
"\\newlength\\rulerhs\n" \
|
|
"\\settowidth\\rulelhs{" rule_mls "}\n" \
|
|
"\\settowidth\\rulemid{::=}\n" \
|
|
"\\setlength\\rulerhs{\\textwidth}\n" \
|
|
"\\addtolength\\rulerhs{-\\rulelhs}\n" \
|
|
"\\addtolength\\rulerhs{-\\rulemid}\n" \
|
|
"\\addtolength\\rulerhs{-6\\tabcolsep}\n\n" \
|
|
"\\begin{longtable}{lrl}";
|
|
|
|
for (rule_idx = 1; rule_idx <= rule_cnt; rule_idx++) {
|
|
if (!rule_use[rule_lhs[rule_idx]])
|
|
continue;
|
|
|
|
len = rule_len[rule_idx];
|
|
|
|
for (rule_sub = 1; rule_sub <= len; rule_sub++) {
|
|
if (LaTeX) {
|
|
str = latexify(rule_lhs[rule_idx]);
|
|
out = str SPACES(rule_max - length(str)) " & ::= &\n" \
|
|
" \\begin{minipage}[t]{\\rulerhs}\n" \
|
|
" \\raggedright\n ";
|
|
} else {
|
|
str = rule_lhs[rule_idx];
|
|
out = str SPACES(rule_max - length(str)) " ::= ";
|
|
}
|
|
|
|
rhs = rule_rhs[rule_idx, rule_sub];
|
|
lvl = 0;
|
|
|
|
while (match(rhs, /(^[\(\{\[] | [\(\{\[\|\]\}\)] | [\]\}\)]$)/)) {
|
|
o_beg = RSTART;
|
|
o_len = RLENGTH;
|
|
|
|
if (substr(rhs, o_beg) ~ /(^[\(\{\[] | [\(\{\[] )/) {
|
|
lvl++;
|
|
|
|
str = LaTeX ? latexify(substr(rhs, 1, o_beg + o_len - 1)) \
|
|
: substr(rhs, 1, o_beg + o_len - 1);
|
|
|
|
out = out str;
|
|
rhs = substr(rhs, o_beg + o_len);
|
|
} else if (substr(rhs, o_beg) ~ /( [\]\}\)] | [\]\}\)]$)/) {
|
|
lvl--;
|
|
|
|
str = LaTeX ? latexify(substr(rhs, 1, o_beg + o_len - 1)) \
|
|
: substr(rhs, 1, o_beg + o_len - 1);
|
|
|
|
out = out str;
|
|
rhs = substr(rhs, o_beg + o_len);
|
|
} else if (substr(rhs, o_beg + 1, 1) == "|") {
|
|
if (lvl) {
|
|
out = out substr(rhs, 1, o_beg + o_len - 1);
|
|
rhs = substr(rhs, o_beg + o_len);
|
|
} else {
|
|
if (LaTeX) {
|
|
str = latexify(substr(rhs, 1, o_beg - 1));
|
|
|
|
print out str \
|
|
SPACES(64 - rule_max - length(str)) "\n" \
|
|
" \\end{minipage}" SPACES(60) " \\\\";
|
|
|
|
out = SPACES(rule_max) " & $|$ &\n" \
|
|
" \\begin{minipage}[t]{\\rulerhs}\n" \
|
|
" \\raggedright\n ";
|
|
} else {
|
|
print break_string(out substr(rhs, 1, o_beg - 1));
|
|
|
|
out = SPACES(rule_max + 1) " | ";
|
|
}
|
|
|
|
rhs = substr(rhs, o_beg + o_len);
|
|
}
|
|
}
|
|
}
|
|
|
|
if (LaTeX) {
|
|
str = latexify(rhs);
|
|
|
|
print out str SPACES(76 - length(out) - length(str)) "\n" \
|
|
" \\end{minipage}" SPACES(60) " \\\\";
|
|
} else
|
|
print break_string(out rhs);
|
|
if (!Latex)
|
|
print "\t\t\t.";
|
|
|
|
}
|
|
}
|
|
|
|
if (LaTeX)
|
|
print "\\end{longtable}";
|
|
}
|
|
|
|
#
|
|
# NAME: process()
|
|
# DESCRIPTION: process a given yacc grammar definition file
|
|
#
|
|
function process(grammar_file) {
|
|
if (!exists(grammar_file))
|
|
error("So where exactly is this file?");
|
|
|
|
DBG(0, "Reading grammar from '" grammar_file "'.");
|
|
|
|
read_declarations();
|
|
read_grammar();
|
|
|
|
if (_optim >= 1)
|
|
optimize1();
|
|
if (_optim >= 2)
|
|
optimize2();
|
|
|
|
output(!_plain);
|
|
}
|
|
|
|
#
|
|
# NAME: load_tokens()
|
|
# DESCRIPTION: load a list of literal tokens from a file
|
|
#
|
|
function load_tokens(file) {
|
|
while (getline < file == 1)
|
|
transtab[$1] = "\"" $2 "\"";
|
|
|
|
close(file);
|
|
}
|
|
|
|
#
|
|
# NAME: give_help()
|
|
# DISCRIPTION: offer some help to the poor user
|
|
#
|
|
function give_help() {
|
|
print "Usage: y2l -- [options] yacc-file\n" \
|
|
"Options:\n" \
|
|
" -d\n" \
|
|
"\tWrite debugging output to the standard error stream. One can\n"\
|
|
"\tsupply a numeric argument to this option to set an indentation\n"\
|
|
"\tlevel for the messages.\n" \
|
|
" -h\n" \
|
|
"\tDisplay this help information.\n" \
|
|
" -O, -O[012]\n"u \
|
|
"\tOptimize the grammar to get more typical EBNF. If no argument\n"\
|
|
"\tis provided, the highest optimization level is selected.\n" \
|
|
" -p\n" \
|
|
"\tGive basic ASCII output rather than LaTeX output.\n" \
|
|
" -t/regexp/, -tfile\n" \
|
|
"\tIn the first form, the regular expression is used to decide\n" \
|
|
"\twhether a terminal in the grammar is a real token or rather a\n" \
|
|
"\tliteral. The second variant specifies a file which contains\n" \
|
|
"\tlines listing a token and the literal it represents.\n" \
|
|
" -v\n" \
|
|
"\tPrint out version information.";
|
|
exit;
|
|
}
|
|
|
|
#
|
|
# All processing is done from the BEGIN block. The one and only mandatory
|
|
# argument to the program is passed on to the grammar processor.
|
|
#
|
|
BEGIN {
|
|
_debug = 0; # no debugging
|
|
_optim = 0; # no optimization
|
|
_plain = 0; # no plain output
|
|
_tkpat = 0; # no token pattern
|
|
|
|
_bannr = "Yacc-to-LaTeX grammar processor v1.2";
|
|
|
|
for (i = 1; i < ARGC; i++) # loop over all arguments
|
|
if (ARGV[i] ~ /^-d$/) # debugging
|
|
_debug = 1;
|
|
else if (ARGV[i] ~ /^-h$/) # help the user
|
|
give_help();
|
|
else if (ARGV[i] ~ /^-O([012]|$)/) # optimization level
|
|
if (length(ARGV[i]) > 2)
|
|
_optim = substr(ARGV[i], 3, 1);
|
|
else
|
|
_optim = 1; # default optimization level
|
|
else if (ARGV[i] ~ /^-p$/) # non-LaTeX output
|
|
_plain = 1;
|
|
else if (ARGV[i] ~ /^-t/) # regexp or file for tokens
|
|
if (ARGV[i] ~ /^-t(\/\^|\/)[_\.A-Za-z][_\.A-zA-z0-9]*(\/|\$\/)$/) {
|
|
l = length(ARGV[i]);
|
|
_tkpat = substr(ARGV[i], 4, l - 4);
|
|
} else if (exists(file = substr(ARGV[i], 3)))
|
|
load_tokens(file);
|
|
else
|
|
error("So where is the token regexp pattern or file?");
|
|
else if (ARGV[i] ~ /^-v$/) # version
|
|
print _bannr >"/dev/stderr";
|
|
else if (ARGV[i] ~ /^-/) # unknown option
|
|
error("Am I supposed to do something with '" ARGV[i] "'?");
|
|
else if (grammar_file) # more than one grammar
|
|
error("How about just processing one grammar at a time?");
|
|
else
|
|
grammar_file = ARGV[i]; # name of the grammar file
|
|
|
|
if (!grammar_file) # no grammar file provided
|
|
error("Any particular grammar you have in mind?");
|
|
|
|
init(); # initialization
|
|
|
|
process(grammar_file) # process the given grammar
|
|
|
|
exit;
|
|
}
|