#!/usr/bin/awk -f
###############################################################################
#
# FILE:        y2l
# DESCRIPTION: Yacc-to-LaTeX grammar processor
#
# Copyright (c) 1994-2001 by Kris Van Hees, Belgium. All rights reserved.
# See the "Artistic License" included in this package (file: "Artistic") for
# terms and conditions.
#
# $Id: y2l,v 1.1 2003/08/10 15:18:49 bgiroud Exp $
#
###############################################################################

#
# Reserved variables:
#       _debug          Debug level
#       _exit           Exiting due to an error
#       _line           Current line being processed
#       _spclen         Length of string holding padding spaces
#       _spcstr         String to hold padding spaces
#

###############################################################################
# Support functions
###############################################################################
#
|
|
# NAME: SPACES()
|
|
# DESCRIPTION: return a given number of spaces
|
|
#
|
|
function SPACES(snum) {
|
|
if (!_spclen) { # uninitialized (first use)
|
|
_spcstr = " ";
|
|
_spclen = 1;
|
|
}
|
|
|
|
while (_spclen < snum) { # extend the string of spaces
|
|
_spcstr = _spcstr _spcstr; # double the spaces
|
|
_spclen *= 2; # update the string length
|
|
}
|
|
|
|
return substr(_spcstr, 1, snum); # requested number of spaces
|
|
}
|
|
|
|
#
|
|
# NAME: DBG()
|
|
# DESCRIPTION: write debug output to standard error (if debugging is enabled)
|
|
#
|
|
function DBG(dlvl, dmsg) {
|
|
if (!_debug) # debugging is disabled
|
|
return;
|
|
|
|
print "DBG (" SPACES(5 - length(_line)) _line ")" SPACES(dlvl * 2 + 1) \
|
|
dmsg >"/dev/stderr";
|
|
}
|
|
|
|
#
|
|
# NAME: error()
|
|
# DESCRIPTION: write an error message to standard error
|
|
#
|
|
function error(emsg) {
|
|
print "ERROR: " emsg >"/dev/stderr";
|
|
|
|
if (_line)
|
|
print " Line is (on or about) " _line >"/dev/stderr";
|
|
|
|
_exit = 1; # mark program as exiting
|
|
exit;
|
|
}
|
|
|
|
#
|
|
# NAME: exiting()
|
|
# DESCRIPTION: return whether the program is exiting
|
|
#
|
|
function exiting() {
|
|
return _exit; # exit status
|
|
}
|
|
|
|
#
|
|
# NAME: exists()
|
|
# DESCRIPTION: return whether a file exists
|
|
#
|
|
function exists(fname) {
|
|
return !system("ls " fname " >/dev/null 2>&1");
|
|
}
|
|
|
|
###############################################################################
|
|
# Yacc-to-LaTeX initialization
|
|
###############################################################################
|
|
|
|
#
|
|
# NAME: init()
|
|
# DESCRIPTION: initialization
|
|
#
|
|
function init() {
|
|
PERC_PERC = 257;
|
|
PERC_LACC = 258;
|
|
PERC_RACC = 259;
|
|
PERC_LEFT = 260;
|
|
PERC_ASSC = 261;
|
|
PERC_RGHT = 262;
|
|
PERC_STRT = 263;
|
|
PERC_TOKN = 264;
|
|
PERC_TYPE = 265;
|
|
PERC_UNIO = 266;
|
|
PERC_PREC = 267;
|
|
|
|
CHAR = 268;
|
|
IDENT = 269;
|
|
STRING = 270;
|
|
TYPE = 271;
|
|
|
|
EOF = -1;
|
|
BAD = -2;
|
|
|
|
reserved["%%"] = PERC_PERC;
|
|
reserved["%{"] = PERC_LACC;
|
|
reserved["%}"] = PERC_RACC;
|
|
reserved["%left"] = PERC_LEFT;
|
|
reserved["%nonassoc"] = PERC_ASSC;
|
|
reserved["%prec"] = PERC_PREC;
|
|
reserved["%right"] = PERC_RGHT;
|
|
reserved["%start"] = PERC_STRT;
|
|
reserved["%token"] = PERC_TOKN;
|
|
reserved["%type"] = PERC_TYPE;
|
|
reserved["%union"] = PERC_UNIO;
|
|
}
|
|
|
|
###############################################################################
|
|
# Yacc-to-LaTeX processor functions
|
|
###############################################################################
|
|
|
|
#
|
|
# NAME: token()
|
|
# DESCRIPTION: verify whether something is a valid token
|
|
#
|
|
function token(tokname) {
|
|
return tokname ~ /^[_\.A-Za-z][_\.A-Za-z0-9]*$/;
|
|
}
|
|
|
|
#
|
|
# NAME: read()
|
|
# DESCRIPTION: read another (non-blank) line from the input file
|
|
#
|
|
function read() {
|
|
while (getline < grammar_file == 1) {
|
|
_line++;
|
|
|
|
if ($1 != "")
|
|
return;
|
|
}
|
|
|
|
return grammar_eof = 1;
|
|
}
|
|
|
|
#
|
|
# NAME: skip_ws()
|
|
# DESCRIPTION: skip a continuous block of whitespace
|
|
#
|
|
function skip_ws() {
|
|
in_comment = 0;
|
|
|
|
while (!grammar_eof) {
|
|
sub(/^[ \t]+/, ""); # remove leading whitespace
|
|
|
|
if ($1 != "") {
|
|
if (/^\/\//) { # C++ comment
|
|
$0 = "";
|
|
} else if (in_comment) { # in a comment
|
|
if (match($0, /\*\//)) { # end of a comment
|
|
$0 = substr($0, RSTART + RLENGTH);
|
|
in_comment = 0;
|
|
} else
|
|
$0 = "";
|
|
} else if (/^\/\*"[^"]+"\*\//) { # special marker
|
|
sub(/\/\*"/, "\"");
|
|
sub(/"\*\//, "\"");
|
|
|
|
return;
|
|
} else if (/^\/\*/) { # regular comment
|
|
$0 = substr($0, 3);
|
|
in_comment = 1;
|
|
} else
|
|
return;
|
|
|
|
sub(/[ \t]+$/, ""); # remove trailing whitespace
|
|
}
|
|
|
|
if ($1 == "")
|
|
read();
|
|
}
|
|
}
|
|
|
|
#
|
|
# NAME: lex()
|
|
# DESCRIPTION: read the next token from the input
|
|
#
|
|
function lex() {
|
|
if (tok_prev) {
|
|
tok = tok_prev;
|
|
tok_str = tok_pstr;
|
|
tok_prev = 0;
|
|
|
|
return tok;
|
|
}
|
|
|
|
skip_ws();
|
|
|
|
if (grammar_eof)
|
|
return EOF;
|
|
|
|
if (/^%/)
|
|
if (match($0, /^%(%|{|}|left|nonassoc|prec|right|start|token|type|union)/)) {
|
|
tok_str = substr($0, 1, RLENGTH);
|
|
$0 = substr($0, RLENGTH + 1);
|
|
|
|
return reserved[tok_str];
|
|
} else if (match($0, /^%[_A-Za-z][_A-Za-z0-9]*/)) {
|
|
tok_str = substr($0, 1, RLENGTH);
|
|
$0 = substr($0, RLENGTH + 1);
|
|
|
|
return BAD;
|
|
}
|
|
|
|
if (match($0, /^[_\.A-Za-z][_\.A-zA-z0-9]*/)) {
|
|
tok_str = substr($0, 1, RLENGTH);
|
|
$0 = substr($0, RLENGTH + 1);
|
|
|
|
return IDENT;
|
|
}
|
|
|
|
if (match($0, /^<[_\.A-Za-z][_\.A-zA-z0-9]*>/)) {
|
|
tok_str = substr($0, 1, RLENGTH);
|
|
$0 = substr($0, RLENGTH + 1);
|
|
|
|
return TYPE;
|
|
}
|
|
|
|
if (/^'/)
|
|
if (match($0, /^'(.|\\([tnarfbv\\'"]|[0-7][0-7]*))'/)) {
|
|
tok_str = "@C" (consts++);
|
|
transtab[tok_str] = "\"" substr($0, 2, RLENGTH - 2) "\"";
|
|
$0 = substr($0, RLENGTH + 1);
|
|
|
|
return CHAR;
|
|
} else
|
|
error("Excuse me, but this character constant is a bit weird.");
|
|
|
|
if (/^"/)
|
|
if (match($0, /([^\\]|[ \t]+)(\\\\)*"/)) {
|
|
tok_str = "@S" (strings++);
|
|
transtab[tok_str] = substr($0, 1, RSTART + RLENGTH - 1);
|
|
$0 = substr($0, RSTART + RLENGTH);
|
|
|
|
return STRING;
|
|
} else
|
|
error("Newlines in strings are interesting, but not allowed.");
|
|
|
|
tok_str = substr($0, 1, 1);
|
|
$0 = substr($0, 2);
|
|
|
|
return tok_str;
|
|
}
|
|
|
|
#
|
|
# NAME: unlex()
|
|
# DESCRIPTION: return a token to the input stream
|
|
#
|
|
function unlex(tok) {
|
|
tok_prev = tok;
|
|
tok_pstr = tok_str;
|
|
}
|
|
|
|
#
|
|
# NAME: skip_definition()
|
|
# DESCRIPTION: skip the contents of a %{ ... %} block
|
|
#
|
|
function skip_definition() {
|
|
do {
|
|
skip = lex();
|
|
} while (skip != PERC_RACC && skip != EOF);
|
|
}
|
|
|
|
#
|
|
# NAME: decl_token()
|
|
# DESCRIPTION: read a token declaration
|
|
#
|
|
function decl_token() {
|
|
first = 1;
|
|
|
|
do {
|
|
tok = lex();
|
|
|
|
if (tok == ",") {
|
|
symbol = 0;
|
|
} else if (tok == CHAR) {
|
|
DBG(1, transtab[tok_str] ": No need to remember this token.");
|
|
} else if (tok == IDENT) {
|
|
if (_tkpat && tok_str !~ _tkpat) {
|
|
if (transtab[tok_str])
|
|
DBG(2, "WARNING: Redefining '" tok_str "'.");
|
|
|
|
transtab[tok_str] = "\"" tolower(tok_str) "\"";
|
|
DBG(1, tok_str ": Defined as " transtab[tok_str] ".");
|
|
}
|
|
|
|
symbol = tok_str;
|
|
} else if (tok == NUMBER) {
|
|
if (!symbol)
|
|
error("How about giving a token first?");
|
|
|
|
symbol = 0;
|
|
} else if (tok == STRING) {
|
|
if (!symbol)
|
|
error("How about giving a token first?");
|
|
|
|
str = transtab[tok_str];
|
|
transtab[symbol] = "\"" substr(str, 2, length(str) - 2) "\"";
|
|
DBG(1, SPACES(length(symbol) + 2) \
|
|
"Defined as " transtab[symbol] ".");
|
|
|
|
symbol = 0;
|
|
} else if (tok == TYPE) {
|
|
if (!first)
|
|
error("This is no place for a type name.");
|
|
} else {
|
|
unlex(tok);
|
|
return;
|
|
}
|
|
|
|
first = 0;
|
|
} while (tok != EOF);
|
|
}
|
|
|
|
#
|
|
# NAME: decl_start()
|
|
# DESCRIPTION: read a start rule declaration
|
|
#
|
|
function decl_start() {
|
|
if (grammar_start)
|
|
error("Hm, you want the grammar to start with two rules?");
|
|
else if (lex() == IDENT) {
|
|
grammar_start = tok_str;
|
|
DBG(2, "The grammar start rule is '" grammar_start "'.");
|
|
} else
|
|
error("How about a nice juicy identifier?");
|
|
}
|
|
|
|
#
|
|
# NAME: decl_type()
|
|
# DESCRIPTION: read a type declaration
|
|
#
|
|
function decl_type() {
|
|
if (lex() != TYPE)
|
|
error("So where is the typename?");
|
|
|
|
do {
|
|
tok = lex();
|
|
|
|
if (tok == ",") {
|
|
symbol = 0;
|
|
} else if (tok == CHAR) {
|
|
error("Bison etc may accept literals in a %type declaration, " \
|
|
"but the Unix 7th\n Ed manual clearly indicates " \
|
|
"that it is NOT legal. And I think that the\n Bell " \
|
|
"Labs guys know what they are talking about; but anyway, " \
|
|
"do you\n really spend the time reading this long " \
|
|
"error message?");
|
|
} else if (tok == IDENT) {
|
|
if (_tkpat && tok_str !~ _tkpat) {
|
|
if (transtab[tok_str])
|
|
DBG(2, "WARNING: Redefining '" tok_str "'.");
|
|
|
|
transtab[tok_str] = "\"" tolower(tok_str) "\"";
|
|
DBG(1, tok_str ": Defined as " transtab[tok_str] ".");
|
|
}
|
|
|
|
symbol = tok_str;
|
|
} else if (tok == NUMBER) {
|
|
if (!symbol)
|
|
error("How about giving a token first?");
|
|
|
|
symbol = 0;
|
|
} else if (tok == STRING) {
|
|
if (!symbol)
|
|
error("How about giving a token first?");
|
|
|
|
str = transtab[tok_str];
|
|
transtab[symbol] = "\"" substr(str, 2, length(str) - 2) "\"";
|
|
DBG(1, SPACES(length(symbol) + 2) \
|
|
"Defined as " transtab[symbol] ".");
|
|
|
|
symbol = 0;
|
|
} else {
|
|
unlex(tok);
|
|
return;
|
|
}
|
|
} while (tok != EOF);
|
|
}
|
|
|
|
#
|
|
# NAME: decl_union()
|
|
# DESCRIPTION: read a union declaration
|
|
#
|
|
function decl_union() {
|
|
if (grammar_union)
|
|
error("How about sticking to one single union declaration?");
|
|
|
|
grammar_union = 1;
|
|
DBG(2, "Extended types have been registered with a union declaration.");
|
|
|
|
do {
|
|
tok = lex();
|
|
|
|
if (tok == "{")
|
|
block++;
|
|
else if (tok == "}") {
|
|
if (!block)
|
|
error("Why close an unopened block?");
|
|
if (--block <= 0)
|
|
return;
|
|
} else if (tok == EOF)
|
|
error("The file ends before the union is finished. How rude!");
|
|
} while (1);
|
|
}
|
|
|
|
#
|
|
# NAME: read_declarations()
|
|
# DESCRIPTION: read the yacc declarations
|
|
#
|
|
function read_declarations() {
|
|
do {
|
|
tok = lex(); # next token
|
|
|
|
if (tok == PERC_PERC || tok == EOF) # end of the declarations
|
|
return;
|
|
if (tok == PERC_LACC) { # definition block
|
|
skip_definition();
|
|
} else if (tok == PERC_LEFT) { # left associative declaration
|
|
decl_token();
|
|
} else if (tok == PERC_ASSC) { # non-associative declaration
|
|
decl_token();
|
|
} else if (tok == PERC_RGHT) { # right associative declaration
|
|
decl_token();
|
|
} else if (tok == PERC_STRT) { # start rule declaration
|
|
decl_start();
|
|
} else if (tok == PERC_TOKN) { # token declaration(s)
|
|
decl_token();
|
|
} else if (tok == PERC_TYPE) { # type declaration
|
|
decl_type();
|
|
} else if (tok == PERC_UNIO) { # union declaration
|
|
decl_union();
|
|
} else
|
|
DBG(2, "WARNING: Ignoring the unknown token '" tok_str "'.");
|
|
} while (1);
|
|
}
|
|
|
|
#
|
|
# NAME: skip_action()
|
|
# DESCRIPTION: skip the contents of an action block
|
|
#
|
|
function skip_action() {
|
|
block = 1;
|
|
|
|
do {
|
|
tok = lex();
|
|
|
|
if (tok == "{")
|
|
block++;
|
|
else if (tok == "}") {
|
|
if (!block)
|
|
error("Why close an unopened block?");
|
|
if (--block <= 0)
|
|
return;
|
|
} else if (tok == EOF)
|
|
error("The file ends before the action is finished. How rude!");
|
|
} while (tok != EOF);
|
|
}
|
|
|
|
#
|
|
# NAME: read_grammar()
|
|
# DESCRIPTION: read the yacc grammar
|
|
#
|
|
function read_grammar() {
|
|
tok = lex();
|
|
|
|
do {
|
|
if (tok == PERC_PERC || tok == EOF) # end of the grammar
|
|
return;
|
|
|
|
if (tok == IDENT) { # rule begins here
|
|
if (!(rule_idx = rule_ref[tok_str])) {
|
|
rule_idx = ++rule_cnt; # new rule
|
|
rule_lhs[rule_idx] = tok_str;
|
|
rule_ref[tok_str] = rule_idx;
|
|
rule_sub = ++rule_len[rule_idx];
|
|
|
|
if (!grammar_start)
|
|
grammar_start = tok_str;
|
|
}
|
|
|
|
if (lex() != ":")
|
|
error("The LHS and RHS would like to be separated by a colon.");
|
|
} else if (tok == "|") { # alternative RHS for a rule
|
|
if (!rule_cnt)
|
|
error("The grammar shouldn't start with a |.");
|
|
|
|
rule_sub = ++rule_len[rule_idx];
|
|
} else
|
|
error("Why are you giving me '" tok "'?");
|
|
|
|
rule_rhs[rule_cnt] = ""; # empty RHS
|
|
|
|
do { # read the RHS
|
|
tok = lex();
|
|
|
|
if (tok == PERC_PREC) { # %prec
|
|
tok = lex();
|
|
|
|
if (tok != IDENT && tok != CHAR)
|
|
error("What precedence are you talking about?");
|
|
|
|
tok = lex(); # continue with the rest
|
|
}
|
|
|
|
if (tok == IDENT) { # might be a new rule
|
|
old_str = tok_str;
|
|
nxt = lex(); # look ahead
|
|
unlex(nxt);
|
|
tok_str = old_str;
|
|
|
|
if (nxt == ":") # yup, a new rule started
|
|
break;
|
|
|
|
rule_rhs[rule_idx, rule_sub] = rule_rhs[rule_idx, rule_sub] \
|
|
" " tok_str;
|
|
} else if (tok == CHAR) {
|
|
if (tok_str ~ /^@/)
|
|
tok_str = transtab[tok_str];
|
|
|
|
rule_rhs[rule_idx, rule_sub] = rule_rhs[rule_idx, rule_sub] \
|
|
" " tok_str;
|
|
} else if (tok == "{") { # an action block
|
|
skip_action();
|
|
} else # can't be part of a rule
|
|
break;
|
|
} while (1);
|
|
|
|
sub(/^ /, "", rule_rhs[rule_idx, rule_sub]);
|
|
|
|
if (rule_rhs[rule_idx, rule_sub] ~ \
|
|
/(^|[^_\.A-Za-z0-9])error($|[^_\.A-Za-z0-9])/) {
|
|
DBG(1, rule_lhs[rule_idx] ": " rule_rhs[rule_idx, rule_sub] \
|
|
" [IGNORED]");
|
|
|
|
rule_rhs[rule_idx, rule_sub] = "";
|
|
rule_len[rule_idx]--;
|
|
} else
|
|
DBG(1, rule_lhs[rule_idx] ": " rule_rhs[rule_idx, rule_sub]);
|
|
|
|
if (tok == ";")
|
|
tok = lex();
|
|
} while (1);
|
|
}
|
|
|
|
#
|
|
# NAME: is_optional()
|
|
# DESCRIPTION: check whether the given non-terminal is optional
|
|
#
|
|
function is_optional(idx) {
|
|
if ((len = rule_len[idx]) <= 1)
|
|
return 0;
|
|
|
|
#
|
|
# One or more empty rules for a non-terminal indicate that the other rules
|
|
# are in fact optional. There shouldn't be multiple empty rules, but it is
|
|
# is technically possible.
|
|
#
|
|
for (rule_sub = 1; rule_sub <= len; rule_sub++)
|
|
if (rule_rhs[idx, rule_sub] == "") {
|
|
while (++rule_sub <= len)
|
|
rule_rhs[idx, rule_sub - 1] = rule_rhs[idx, rule_sub];
|
|
|
|
rule_len[idx]--;
|
|
}
|
|
|
|
return rule_len[idx] < len;
|
|
}
|
|
|
|
#
|
|
# NAME: get_prefix()
|
|
# DESCRIPTION: check whether the given non-terminal has a common prefix
|
|
#
|
|
function get_prefix(idx) {
|
|
if ((len = rule_len[idx]) <= 1)
|
|
return 0;
|
|
|
|
#
|
|
# Split up the first rule into tokens. These will be compared with all the
|
|
# other rules for this non-terminal.
|
|
#
|
|
gp_last = split(rule_rhs[idx, 1], arr);
|
|
gp_pref = gp_last;
|
|
|
|
#
|
|
# Look for the longest common prefix.
|
|
#
|
|
for (rule_sub = 2; rule_sub <= len; rule_sub++) {
|
|
$0 = rule_rhs[idx, rule_sub];
|
|
gp_tokc = NF;
|
|
|
|
if (gp_tokc < gp_pref)
|
|
gp_pref = gp_tokc;
|
|
|
|
for (j = 1; j <= gp_pref; j++)
|
|
if (arr[j] != $j) {
|
|
if (!(gp_pref = j - 1))
|
|
return 0;
|
|
|
|
break;
|
|
}
|
|
}
|
|
|
|
#
|
|
# Construct the prefix string.
|
|
#
|
|
gp_pstr = arr[1];
|
|
for (j = 2; j <= gp_pref; j++)
|
|
gp_pstr = gp_pstr " " arr[j];
|
|
|
|
#
|
|
# Remove the common prefix from all rules for this non-terminal.
|
|
#
|
|
for (rule_sub = 1; rule_sub <= len; rule_sub++) {
|
|
$0 = rule_rhs[idx, rule_sub];
|
|
|
|
for (j = 1; j <= gp_pref; j++)
|
|
$j = "";
|
|
|
|
sub(/^ +/, "");
|
|
rule_rhs[idx, rule_sub] = $0;
|
|
}
|
|
|
|
return gp_pstr;
|
|
}
|
|
|
|
#
|
|
# NAME: get_suffix()
|
|
# DESCRIPTION: check whether the given non-terminal has a common suffix
|
|
#
|
|
function get_suffix(idx) {
|
|
if ((len = rule_len[idx]) <= 1)
|
|
return 0;
|
|
|
|
#
|
|
# Split up the first rule into tokens. These will be compared with all the
|
|
# other rules for this non-terminal.
|
|
#
|
|
gs_last = split(rule_rhs[idx, 1], arr);
|
|
gs_suff = gs_last;
|
|
|
|
#
|
|
# Look for the longest common suffix.
|
|
#
|
|
for (rule_sub = 2; rule_sub <= len; rule_sub++) {
|
|
$0 = rule_rhs[idx, rule_sub];
|
|
gs_tokc = NF;
|
|
|
|
if (gs_tokc < gs_suff)
|
|
gs_suff = gs_tokc;
|
|
|
|
for (j = 0; j < gs_suff; j++)
|
|
if (arr[gs_last - j] != $(gs_tokc - j)) {
|
|
if (!(gs_suff = j))
|
|
return 0;
|
|
|
|
break;
|
|
}
|
|
}
|
|
|
|
#
|
|
# Construct the suffix string.
|
|
#
|
|
gs_sstr = arr[gs_last];
|
|
for (j = 1; j < gs_suff; j++)
|
|
gs_sstr = arr[gs_last - j] " " gs_sstr;
|
|
|
|
#
|
|
# Remove the common suffix from all rules for this non-terminal.
|
|
#
|
|
for (rule_sub = 1; rule_sub <= len; rule_sub++) {
|
|
$0 = rule_rhs[idx, rule_sub];
|
|
|
|
for (j = 0; j < gs_suff; j++)
|
|
$(NF - j) = "";
|
|
|
|
sub(/ +$/, "");
|
|
rule_rhs[idx, rule_sub] = $0;
|
|
}
|
|
|
|
return gs_sstr;
|
|
}
|
|
|
|
#
|
|
# NAME: optimize1()
|
|
# DESCRIPTION: first pass of the optimizer
|
|
#
|
|
function optimize1() {
|
|
DBG(0, "Optimization pass 1...");
|
|
|
|
for (rule_idx = 1; rule_idx <= rule_cnt; rule_idx++) {
|
|
#
|
|
# Non-terminals with only a single rule can't be optimized here.
|
|
#
|
|
if ((len = rule_len[rule_idx]) <= 1)
|
|
continue;
|
|
|
|
#
|
|
# First record whether the entire non-terminal might be optional.
|
|
#
|
|
rule_opt[rule_idx] = is_optional(rule_idx);
|
|
|
|
#
|
|
# The actual optimization takes place in this endless loop. It will in
|
|
# fact end when an iteration does not yield an optional ruleset. This
|
|
# is a proper and correct stop criteria, since a non-optional ruleset
|
|
# cannot have any common prefix or suffix. If it did, those would have
|
|
# been added to the already present prefix or suffix.
|
|
#
|
|
pref = "";
|
|
suff = "";
|
|
do {
|
|
if (pstr = get_prefix(rule_idx))
|
|
pref = pref " " pstr;
|
|
else if (sstr = get_suffix(rule_idx))
|
|
suff = sstr " " suff;
|
|
|
|
if (is_optional(rule_idx)) {
|
|
pref = pref " [";
|
|
suff = "] " suff;
|
|
} else {
|
|
if (pstr || sstr) {
|
|
pref = pref " (";
|
|
suff = ") " suff;
|
|
}
|
|
|
|
break;
|
|
}
|
|
} while (1);
|
|
|
|
#
|
|
# Compose the single composite rule for this non-terminal, if a common
|
|
# prefix or suffix was found.
|
|
#
|
|
if (pref != "" || suff != "") {
|
|
sub(/^ /, "", pref);
|
|
sub(/ $/, "", suff);
|
|
|
|
DBG(2, "Rules for '" rule_lhs[rule_idx] "' have:");
|
|
|
|
if (pref != "")
|
|
DBG(3, "Prefix '" pref "'");
|
|
if (suff != "")
|
|
DBG(3, "Suffix '" suff "'");
|
|
|
|
len = rule_len[rule_idx];
|
|
pref = pref " " rule_rhs[rule_idx, 1];
|
|
|
|
for (rule_sub = 2; rule_sub <= len; rule_sub++)
|
|
pref = pref " | " rule_rhs[rule_idx, rule_sub];
|
|
|
|
rule_rhs[rule_idx, 1] = pref " " suff;
|
|
rule_len[rule_idx] = 1;
|
|
|
|
DBG(3, "Combined rule '" rule_rhs[rule_idx, 1] "'");
|
|
|
|
if (rule_opt[rule_idx])
|
|
DBG(3, "(The non-terminal is optional.)");
|
|
}
|
|
}
|
|
}
|
|
|
|
#
|
|
# NAME: optimize2()
|
|
# DESCRIPTION: second pass of the optimizer
|
|
#
|
|
function optimize2() {
|
|
DBG(0, "Optimization pass 2...");
|
|
|
|
for (rule_idx = 1; rule_idx <= rule_cnt; rule_idx++) {
|
|
$0 = rule_rhs[rule_idx, 1];
|
|
|
|
if ((len = rule_len[rule_idx]) > 1)
|
|
for (rule_sub = 2; rule_sub <= len; rule_sub++)
|
|
$0 = $0 " | " rule_rhs[rule_idx, rule_sub];
|
|
|
|
if ($1 != "[" && rule_opt[rule_idx]) {
|
|
$0 = "[ " $0 " ]";
|
|
rule_opt[rule_idx] = 0;
|
|
}
|
|
|
|
if ($1 == "[")
|
|
if ($2 == rule_lhs[rule_idx]) {
|
|
for (i = NF; i > 0; i--)
|
|
if ($i == "]")
|
|
break;
|
|
|
|
if ((NF - i) < 3) {
|
|
$1 = "";
|
|
subst = "{";
|
|
$i = "}";
|
|
|
|
while (++i <= NF)
|
|
subst = subst " " $i;
|
|
|
|
$2 = subst;
|
|
|
|
sub(/^ +/, "");
|
|
}
|
|
}
|
|
|
|
if ($NF == "]")
|
|
if ($(NF - 1) == rule_lhs[rule_idx]) {
|
|
for (i = 1; i <= NF; i++)
|
|
if ($i == "[")
|
|
break;
|
|
|
|
if (i < 3) {
|
|
$i = "{";
|
|
subst = "}";
|
|
$NF = "";
|
|
|
|
while (--i > 0)
|
|
subst = $i " " subst;
|
|
|
|
$(NF - 1) = subst;
|
|
|
|
sub(/ +$/, "");
|
|
}
|
|
}
|
|
|
|
if (rule_opt[rule_idx]) {
|
|
$0 = "[ " $0 " ]";
|
|
rule_opt[rule_idx] = 0;
|
|
}
|
|
|
|
rule_rhs[rule_idx, 1] = $0;
|
|
rule_len[rule_idx] = 1;
|
|
}
|
|
}
|
|
|
|
#
|
|
# NAME: latexify()
|
|
# DESCRIPTION: make substitutions to ensure that the string is LaTeX ok
|
|
#
|
|
function latexify(txt) {
|
|
gsub(/[{&%}]/, "$\\\\&$", txt);
|
|
gsub(/[!|[\]=+\-*\/<>]/, "$&$", txt);
|
|
gsub(/\$\$/, "\\!", txt);
|
|
gsub(/"[^"]+"/, "``{\\bf &}''", txt);
|
|
gsub(/_/, "\\_", txt);
|
|
gsub(/\^/, "\\verb+^+", txt);
|
|
gsub(/"/, "", txt);
|
|
|
|
return txt;
|
|
}
|
|
|
|
#
|
|
# NAME: break_string()
|
|
# DESCRIPTION: possibly break up a string in multiple lines
|
|
#
|
|
function break_string(str) {
|
|
match(str, /^([_A-Za-z][_A-Za-z0-9]* +| +)(::=|\|)/);
|
|
bs_len = RLENGTH;
|
|
bs_str = substr(str, 1, RLENGTH);
|
|
bs_pln = RLENGTH;
|
|
bs_pre = SPACES(RLENGTH);
|
|
|
|
$0 = substr(str, RLENGTH + 2);
|
|
for (i = 1; i <= NF; i++) {
|
|
if (bs_len + 1 + length($i) > 79) {
|
|
bs_str = bs_str "\n" bs_pre;
|
|
bs_len = bs_pln;
|
|
}
|
|
|
|
bs_str = bs_str " " $i;
|
|
bs_len += 1 + length($i);
|
|
}
|
|
|
|
return bs_str;
|
|
}
|
|
|
|
#
|
|
# NAME: output()
|
|
# DESCRIPTION: write out the rewritten rules
|
|
#
|
|
function output(LaTeX) {
|
|
rule_use[grammar_start] = 1;
|
|
|
|
for (rule_idx = 1; rule_idx <= rule_cnt; rule_idx++) {
|
|
len = rule_len[rule_idx];
|
|
|
|
if (rule_opt[rule_idx]) {
|
|
rule_rhs[rule_idx, 1] = "[ " rule_rhs[rule_idx, 1];
|
|
rule_rhs[rule_idx, len] = rule_rhs[rule_idx, len] " ]";
|
|
}
|
|
|
|
str = LaTeX ? latexify(rule_lhs[rule_idx]) \
|
|
: rule_lhs[rule_idx];
|
|
|
|
if (length(str) > rule_max) {
|
|
rule_max = length(str);
|
|
rule_mls = str;
|
|
}
|
|
|
|
for (rule_sub = 1; rule_sub <= len; rule_sub++) {
|
|
$0 = rule_rhs[rule_idx, rule_sub];
|
|
|
|
for (j = 1; j <= NF; j++) {
|
|
#
|
|
# A couple of notes here... Non-terminals that merely have a
|
|
# single terminal as definition are substituted anywhere they
|
|
# are used. And some special casing is needed for situations
|
|
# where someone actually defines the non-terminals as tokens.
|
|
# Those should definitely not be quotated.
|
|
#
|
|
if ((idx = rule_ref[$j]) &&
|
|
rule_len[$j] == 1 && rule_rhs[idx, 1] ~ /^[^ ]+$/)
|
|
$j = rule_rhs[idx, 1];
|
|
else if ((str = transtab[$j]) && !rule_ref[$j])
|
|
$j = str;
|
|
|
|
rule_use[$j] = 1;
|
|
}
|
|
|
|
rule_rhs[rule_idx, rule_sub] = $0;
|
|
}
|
|
}
|
|
|
|
if (LaTeX)
|
|
#
|
|
# Some LaTeX magic... We calculate the available size for the RHS of
|
|
# the rules. This will be provided as argument to the minipages used
|
|
# for each independent rule.
|
|
#
|
|
# The formula is fairly easy:
|
|
# textwidth - width(LHS) - width("::=") - 6 * tabcolsep
|
|
#
|
|
print "\\newlength\\rulelhs\n" \
|
|
"\\newlength\\rulemid\n" \
|
|
"\\newlength\\rulerhs\n" \
|
|
"\\settowidth\\rulelhs{" rule_mls "}\n" \
|
|
"\\settowidth\\rulemid{::=}\n" \
|
|
"\\setlength\\rulerhs{\\textwidth}\n" \
|
|
"\\addtolength\\rulerhs{-\\rulelhs}\n" \
|
|
"\\addtolength\\rulerhs{-\\rulemid}\n" \
|
|
"\\addtolength\\rulerhs{-6\\tabcolsep}\n\n" \
|
|
"\\begin{longtable}{lrl}";
|
|
|
|
for (rule_idx = 1; rule_idx <= rule_cnt; rule_idx++) {
|
|
if (!rule_use[rule_lhs[rule_idx]])
|
|
continue;
|
|
|
|
len = rule_len[rule_idx];
|
|
|
|
for (rule_sub = 1; rule_sub <= len; rule_sub++) {
|
|
if (LaTeX) {
|
|
str = latexify(rule_lhs[rule_idx]);
|
|
out = str SPACES(rule_max - length(str)) " & ::= &\n" \
|
|
" \\begin{minipage}[t]{\\rulerhs}\n" \
|
|
" \\raggedright\n ";
|
|
} else {
|
|
str = rule_lhs[rule_idx];
|
|
out = str SPACES(rule_max - length(str)) " ::= ";
|
|
}
|
|
|
|
rhs = rule_rhs[rule_idx, rule_sub];
|
|
lvl = 0;
|
|
|
|
while (match(rhs, /(^[\(\{\[] | [\(\{\[\|\]\}\)] | [\]\}\)]$)/)) {
|
|
o_beg = RSTART;
|
|
o_len = RLENGTH;
|
|
|
|
if (substr(rhs, o_beg) ~ /(^[\(\{\[] | [\(\{\[] )/) {
|
|
lvl++;
|
|
|
|
str = LaTeX ? latexify(substr(rhs, 1, o_beg + o_len - 1)) \
|
|
: substr(rhs, 1, o_beg + o_len - 1);
|
|
|
|
out = out str;
|
|
rhs = substr(rhs, o_beg + o_len);
|
|
} else if (substr(rhs, o_beg) ~ /( [\]\}\)] | [\]\}\)]$)/) {
|
|
lvl--;
|
|
|
|
str = LaTeX ? latexify(substr(rhs, 1, o_beg + o_len - 1)) \
|
|
: substr(rhs, 1, o_beg + o_len - 1);
|
|
|
|
out = out str;
|
|
rhs = substr(rhs, o_beg + o_len);
|
|
} else if (substr(rhs, o_beg + 1, 1) == "|") {
|
|
if (lvl) {
|
|
out = out substr(rhs, 1, o_beg + o_len - 1);
|
|
rhs = substr(rhs, o_beg + o_len);
|
|
} else {
|
|
if (LaTeX) {
|
|
str = latexify(substr(rhs, 1, o_beg - 1));
|
|
|
|
print out str \
|
|
SPACES(64 - rule_max - length(str)) "\n" \
|
|
" \\end{minipage}" SPACES(60) " \\\\";
|
|
|
|
out = SPACES(rule_max) " & $|$ &\n" \
|
|
" \\begin{minipage}[t]{\\rulerhs}\n" \
|
|
" \\raggedright\n ";
|
|
} else {
|
|
print break_string(out substr(rhs, 1, o_beg - 1));
|
|
|
|
out = SPACES(rule_max + 1) " | ";
|
|
}
|
|
|
|
rhs = substr(rhs, o_beg + o_len);
|
|
}
|
|
}
|
|
}
|
|
|
|
if (LaTeX) {
|
|
str = latexify(rhs);
|
|
|
|
print out str SPACES(76 - length(out) - length(str)) "\n" \
|
|
" \\end{minipage}" SPACES(60) " \\\\";
|
|
} else
|
|
print break_string(out rhs);
|
|
if (!Latex)
|
|
print "\t\t\t.";
|
|
|
|
}
|
|
}
|
|
|
|
if (LaTeX)
|
|
print "\\end{longtable}";
|
|
}
|
|
|
|
#
|
|
# NAME: process()
|
|
# DESCRIPTION: process a given yacc grammar definition file
|
|
#
|
|
function process(grammar_file) {
|
|
if (!exists(grammar_file))
|
|
error("So where exactly is this file?");
|
|
|
|
DBG(0, "Reading grammar from '" grammar_file "'.");
|
|
|
|
read_declarations();
|
|
read_grammar();
|
|
|
|
if (_optim >= 1)
|
|
optimize1();
|
|
if (_optim >= 2)
|
|
optimize2();
|
|
|
|
output(!_plain);
|
|
}
|
|
|
|
#
|
|
# NAME: load_tokens()
|
|
# DESCRIPTION: load a list of literal tokens from a file
|
|
#
|
|
function load_tokens(file) {
|
|
while (getline < file == 1)
|
|
transtab[$1] = "\"" $2 "\"";
|
|
|
|
close(file);
|
|
}
|
|
|
|
#
|
|
# NAME: give_help()
|
|
# DISCRIPTION: offer some help to the poor user
|
|
#
|
|
function give_help() {
|
|
print "Usage: y2l -- [options] yacc-file\n" \
|
|
"Options:\n" \
|
|
" -d\n" \
|
|
"\tWrite debugging output to the standard error stream. One can\n"\
|
|
"\tsupply a numeric argument to this option to set an indentation\n"\
|
|
"\tlevel for the messages.\n" \
|
|
" -h\n" \
|
|
"\tDisplay this help information.\n" \
|
|
" -O, -O[012]\n"u \
|
|
"\tOptimize the grammar to get more typical EBNF. If no argument\n"\
|
|
"\tis provided, the highest optimization level is selected.\n" \
|
|
" -p\n" \
|
|
"\tGive basic ASCII output rather than LaTeX output.\n" \
|
|
" -t/regexp/, -tfile\n" \
|
|
"\tIn the first form, the regular expression is used to decide\n" \
|
|
"\twhether a terminal in the grammar is a real token or rather a\n" \
|
|
"\tliteral. The second variant specifies a file which contains\n" \
|
|
"\tlines listing a token and the literal it represents.\n" \
|
|
" -v\n" \
|
|
"\tPrint out version information.";
|
|
exit;
|
|
}
|
|
|
|
#
|
|
# All processing is done from the BEGIN block. The one and only mandatory
|
|
# argument to the program is passed on to the grammar processor.
|
|
#
|
|
BEGIN {
|
|
_debug = 0; # no debugging
|
|
_optim = 0; # no optimization
|
|
_plain = 0; # no plain output
|
|
_tkpat = 0; # no token pattern
|
|
|
|
_bannr = "Yacc-to-LaTeX grammar processor v1.2";
|
|
|
|
for (i = 1; i < ARGC; i++) # loop over all arguments
|
|
if (ARGV[i] ~ /^-d$/) # debugging
|
|
_debug = 1;
|
|
else if (ARGV[i] ~ /^-h$/) # help the user
|
|
give_help();
|
|
else if (ARGV[i] ~ /^-O([012]|$)/) # optimization level
|
|
if (length(ARGV[i]) > 2)
|
|
_optim = substr(ARGV[i], 3, 1);
|
|
else
|
|
_optim = 1; # default optimization level
|
|
else if (ARGV[i] ~ /^-p$/) # non-LaTeX output
|
|
_plain = 1;
|
|
else if (ARGV[i] ~ /^-t/) # regexp or file for tokens
|
|
if (ARGV[i] ~ /^-t(\/\^|\/)[_\.A-Za-z][_\.A-zA-z0-9]*(\/|\$\/)$/) {
|
|
l = length(ARGV[i]);
|
|
_tkpat = substr(ARGV[i], 4, l - 4);
|
|
} else if (exists(file = substr(ARGV[i], 3)))
|
|
load_tokens(file);
|
|
else
|
|
error("So where is the token regexp pattern or file?");
|
|
else if (ARGV[i] ~ /^-v$/) # version
|
|
print _bannr >"/dev/stderr";
|
|
else if (ARGV[i] ~ /^-/) # unknown option
|
|
error("Am I supposed to do something with '" ARGV[i] "'?");
|
|
else if (grammar_file) # more than one grammar
|
|
error("How about just processing one grammar at a time?");
|
|
else
|
|
grammar_file = ARGV[i]; # name of the grammar file
|
|
|
|
if (!grammar_file) # no grammar file provided
|
|
error("Any particular grammar you have in mind?");
|
|
|
|
init(); # initialization
|
|
|
|
process(grammar_file) # process the given grammar
|
|
|
|
exit;
|
|
}
|