mirror of https://github.com/yasm/yasm.git
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
536 lines
15 KiB
536 lines
15 KiB
.ds re \fBre2c\fP |
|
.ds le \fBlex\fP |
|
.ds rx regular expression |
|
.ds lx \fIl\fP-expression |
|
.TH RE2C 1 "8 April 1994" "Version 0.5" |
|
\"$Log: re2c.1,v $ |
|
\"Revision 1.1 2002/04/07 22:27:06 peter |
|
\"Initial revision |
|
\" |
|
\"Revision 1.2 1994/04/16 15:50:32 peterr |
|
\"Fix bug in simple example. |
|
\" |
|
\"Revision 1.1 1994/04/08 15:39:09 peterr |
|
\"Initial revision |
|
\" |
|
.SH NAME |
|
re2c \- convert regular expressions to C/C++ |
|
|
|
.SH SYNOPSIS |
|
\*(re [\fB-esb\fP] \fIname\fP |
|
|
|
.SH DESCRIPTION |
|
\*(re is a preprocessor that generates C-based recognizers from regular |
|
expressions. |
|
The input to \*(re consists of C/C++ source interleaved with |
|
comments of the form \fC/*!re2c\fP ... \fC*/\fP which contain |
|
scanner specifications. |
|
In the output these comments are replaced with code that, when |
|
executed, will find the next input token and then execute |
|
some user-supplied token-specific code. |
|
|
|
For example, given the following code |
|
|
|
.in +3 |
|
.nf |
|
#define NULL ((char*) 0) |
|
char *scan(char *p){ |
|
char *q; |
|
#define YYCTYPE char |
|
#define YYCURSOR p |
|
#define YYLIMIT p |
|
#define YYMARKER q |
|
#define YYFILL(n) |
|
/*!re2c |
|
[0-9]+ {return YYCURSOR;} |
|
[\\000-\\377] {return NULL;} |
|
*/ |
|
} |
|
.fi |
|
.in -3 |
|
|
|
\*(re will generate |
|
|
|
.in +3 |
|
.nf |
|
/* Generated by re2c on Sat Apr 16 11:40:58 1994 */ |
|
#line 1 "simple.re" |
|
#define NULL ((char*) 0) |
|
char *scan(char *p){ |
|
char *q; |
|
#define YYCTYPE char |
|
#define YYCURSOR p |
|
#define YYLIMIT p |
|
#define YYMARKER q |
|
#define YYFILL(n) |
|
{ |
|
YYCTYPE yych; |
|
unsigned int yyaccept; |
|
goto yy0; |
|
yy1: ++YYCURSOR; |
|
yy0: |
|
if((YYLIMIT - YYCURSOR) < 2) YYFILL(2); |
|
yych = *YYCURSOR; |
|
if(yych <= '/') goto yy4; |
|
if(yych >= ':') goto yy4; |
|
yy2: yych = *++YYCURSOR; |
|
goto yy7; |
|
yy3: |
|
#line 10 |
|
{return YYCURSOR;} |
|
yy4: yych = *++YYCURSOR; |
|
yy5: |
|
#line 11 |
|
{return NULL;} |
|
yy6: ++YYCURSOR; |
|
if(YYLIMIT == YYCURSOR) YYFILL(1); |
|
yych = *YYCURSOR; |
|
yy7: if(yych <= '/') goto yy3; |
|
if(yych <= '9') goto yy6; |
|
goto yy3; |
|
} |
|
#line 12 |
|
|
|
} |
|
.fi |
|
.in -3 |
|
|
|
.SH OPTIONS |
|
\*(re provides the following options: |
|
.TP |
|
\fB-e\fP |
|
Cross-compile from an ASCII platform to an EBCDIC one. |
|
.TP |
|
\fB-s\fP |
|
Generate nested \fCif\fPs for some \fCswitch\fPes. Many compilers need this |
|
assist to generate better code. |
|
.TP |
|
\fB-b\fP |
|
Implies \fB-s\fP. Use bit vectors as well in the attempt to coax better |
|
code out of the compiler. Most useful for specifications with more than a |
|
few keywords (e.g. for most programming languages). |
|
|
|
.SH "INTERFACE CODE" |
|
Unlike other scanner generators, \*(re does not generate complete scanners: |
|
the user must supply some interface code. |
|
In particular, the user must define the following macros: |
|
.TP |
|
\fCYYCHAR\fP |
|
Type used to hold an input symbol. |
|
Usually \fCchar\fP or \fCunsigned char\fP. |
|
.TP |
|
\fCYYCURSOR\fP |
|
\*(lx of type \fC*YYCHAR\fP that points to the current input symbol. |
|
The generated code advances \fCYYCURSOR\fP as symbols are matched. |
|
On entry, \fCYYCURSOR\fP is assumed to point to the first character of the |
|
current token. On exit, \fCYYCURSOR\fP will point to the first character of |
|
the following token. |
|
.TP |
|
\fCYLIMIT\fP |
|
Expression of type \fC*YYCHAR\fP that marks the end of the buffer |
|
(\fCYLIMIT[-1]\fP is the last character in the buffer). |
|
The generated code repeatedly compares \fCYYCURSOR\fP to \fCYLIMIT\fP |
|
to determine when the buffer needs (re)filling. |
|
.TP |
|
\fCYYMARKER\fP |
|
\*(lx of type \fC*YYCHAR\fP. |
|
The generated code saves backtracking information in \fCYYMARKER\fP. |
|
.TP |
|
\fCYYFILL(\fP\fIn\fP\fC)\fP |
|
The generated code "calls" \fCYYFILL\fP when the buffer needs |
|
(re)filling: at least \fIn\fP additional characters should |
|
be provided. \fCYYFILL\fP should adjust \fCYYCURSOR\fP, \fCYYLIMIT\fP and |
|
\fCYYMARKER\fP as needed. Note that for typical programming languages |
|
\fIn\fP will be the length of the longest keyword plus one. |
|
|
|
.SH "SCANNER SPECIFICATIONS" |
|
Each scanner specification consists of a set of \fIrules\fP and name |
|
definitions. |
|
Rules consist of a regular expression along with a block of C/C++ code that |
|
is to be executed when the associated regular expression is matched. |
|
Name definitions are of the form |
|
``\fIname\fP \fC=\fP \fIregular expression\fP\fC;\fP''. |
|
|
|
.SH "SUMMARY OF RE2C REGULAR EXPRESSIONS" |
|
.TP |
|
\fC"foo"\fP |
|
the literal string \fCfoo\fP. |
|
ANSI-C escape sequences can be used. |
|
.TP |
|
\fC[xyz]\fP |
|
a "character class"; in this case, |
|
the \*(rx matches either an '\fCx\fP', a '\fCy\fP', or a '\fCz\fP'. |
|
.TP |
|
\fC[abj-oZ]\fP |
|
a "character class" with a range in it; |
|
matches an '\fCa\fP', a '\fCb\fP', any letter from '\fCj\fP' through '\fCo\fP', |
|
or a '\fCZ\fP'. |
|
.TP |
|
\fIr\fP\fC\e\fP\fIs\fP |
|
match any \fIr\fP which isn't an \fIs\fP. \fIr\fP and \fIs\fP must be regular expressions |
|
which can be expressed as character classes. |
|
.TP |
|
\fIr\fP\fC*\fP |
|
zero or more \fIr\fP's, where \fIr\fP is any regular expression |
|
.TP |
|
\fC\fIr\fP\fC+\fP |
|
one or more \fIr\fP's |
|
.TP |
|
\fC\fIr\fP\fC?\fP |
|
zero or one \fIr\fP's (that is, "an optional \fIr\fP") |
|
.TP |
|
name |
|
the expansion of the "name" definition (see above) |
|
.TP |
|
\fC(\fP\fIr\fP\fC)\fP |
|
an \fIr\fP; parentheses are used to override precedence |
|
(see below) |
|
.TP |
|
\fIrs\fP |
|
an \fIr\fP followed by an \fIs\fP ("concatenation") |
|
.TP |
|
\fIr\fP\fC|\fP\fIs\fP |
|
either an \fIr\fP or an \fIs\fP |
|
.TP |
|
\fIr\fP\fC/\fP\fIs\fP |
|
an \fIr\fP but only if it is followed by an \fIs\fP. The s is not part of |
|
the matched text. This type of \*(rx is called "trailing context". |
|
.LP |
|
The regular expressions listed above are grouped according to |
|
precedence, from highest precedence at the top to lowest at the bottom. |
|
Those grouped together have equal precedence. |
|
|
|
.SH "A LARGER EXAMPLE" |
|
.LP |
|
.in +3 |
|
.nf |
|
#include <stdlib.h> |
|
#include <stdio.h> |
|
#include <fcntl.h> |
|
#include <string.h> |
|
|
|
#define ADDEQ 257 |
|
#define ANDAND 258 |
|
#define ANDEQ 259 |
|
#define ARRAY 260 |
|
#define ASM 261 |
|
#define AUTO 262 |
|
#define BREAK 263 |
|
#define CASE 264 |
|
#define CHAR 265 |
|
#define CONST 266 |
|
#define CONTINUE 267 |
|
#define DECR 268 |
|
#define DEFAULT 269 |
|
#define DEREF 270 |
|
#define DIVEQ 271 |
|
#define DO 272 |
|
#define DOUBLE 273 |
|
#define ELLIPSIS 274 |
|
#define ELSE 275 |
|
#define ENUM 276 |
|
#define EQL 277 |
|
#define EXTERN 278 |
|
#define FCON 279 |
|
#define FLOAT 280 |
|
#define FOR 281 |
|
#define FUNCTION 282 |
|
#define GEQ 283 |
|
#define GOTO 284 |
|
#define ICON 285 |
|
#define ID 286 |
|
#define IF 287 |
|
#define INCR 288 |
|
#define INT 289 |
|
#define LEQ 290 |
|
#define LONG 291 |
|
#define LSHIFT 292 |
|
#define LSHIFTEQ 293 |
|
#define MODEQ 294 |
|
#define MULEQ 295 |
|
#define NEQ 296 |
|
#define OREQ 297 |
|
#define OROR 298 |
|
#define POINTER 299 |
|
#define REGISTER 300 |
|
#define RETURN 301 |
|
#define RSHIFT 302 |
|
#define RSHIFTEQ 303 |
|
#define SCON 304 |
|
#define SHORT 305 |
|
#define SIGNED 306 |
|
#define SIZEOF 307 |
|
#define STATIC 308 |
|
#define STRUCT 309 |
|
#define SUBEQ 310 |
|
#define SWITCH 311 |
|
#define TYPEDEF 312 |
|
#define UNION 313 |
|
#define UNSIGNED 314 |
|
#define VOID 315 |
|
#define VOLATILE 316 |
|
#define WHILE 317 |
|
#define XOREQ 318 |
|
#define EOI 319 |
|
|
|
typedef unsigned int uint; |
|
typedef unsigned char uchar; |
|
|
|
#define BSIZE 8192 |
|
|
|
#define YYCTYPE uchar |
|
#define YYCURSOR cursor |
|
#define YYLIMIT s->lim |
|
#define YYMARKER s->ptr |
|
#define YYFILL(n) {cursor = fill(s, cursor);} |
|
|
|
#define RET(i) {s->cur = cursor; return i;} |
|
|
|
typedef struct Scanner { |
|
int fd; |
|
uchar *bot, *tok, *ptr, *cur, *pos, *lim, *top, *eof; |
|
uint line; |
|
} Scanner; |
|
|
|
uchar *fill(Scanner *s, uchar *cursor){ |
|
if(!s->eof){ |
|
uint cnt = s->tok - s->bot; |
|
if(cnt){ |
|
memcpy(s->bot, s->tok, s->lim - s->tok); |
|
s->tok = s->bot; |
|
s->ptr -= cnt; |
|
cursor -= cnt; |
|
s->pos -= cnt; |
|
s->lim -= cnt; |
|
} |
|
if((s->top - s->lim) < BSIZE){ |
|
uchar *buf = (uchar*) |
|
malloc(((s->lim - s->bot) + BSIZE)*sizeof(uchar)); |
|
memcpy(buf, s->tok, s->lim - s->tok); |
|
s->tok = buf; |
|
s->ptr = &buf[s->ptr - s->bot]; |
|
cursor = &buf[cursor - s->bot]; |
|
s->pos = &buf[s->pos - s->bot]; |
|
s->lim = &buf[s->lim - s->bot]; |
|
s->top = &s->lim[BSIZE]; |
|
free(s->bot); |
|
s->bot = buf; |
|
} |
|
if((cnt = read(s->fd, (char*) s->lim, BSIZE)) != BSIZE){ |
|
s->eof = &s->lim[cnt]; *(s->eof)++ = '\\n'; |
|
} |
|
s->lim += cnt; |
|
} |
|
return cursor; |
|
} |
|
|
|
int scan(Scanner *s){ |
|
uchar *cursor = s->cur; |
|
std: |
|
s->tok = cursor; |
|
/*!re2c |
|
any = [\\000-\\377]; |
|
O = [0-7]; |
|
D = [0-9]; |
|
L = [a-zA-Z_]; |
|
H = [a-fA-F0-9]; |
|
E = [Ee] [+-]? D+; |
|
FS = [fFlL]; |
|
IS = [uUlL]*; |
|
ESC = [\\\\] ([abfnrtv?'"\\\\] | "x" H+ | O+); |
|
*/ |
|
|
|
/*!re2c |
|
"/*" { goto comment; } |
|
|
|
"auto" { RET(AUTO); } |
|
"break" { RET(BREAK); } |
|
"case" { RET(CASE); } |
|
"char" { RET(CHAR); } |
|
"const" { RET(CONST); } |
|
"continue" { RET(CONTINUE); } |
|
"default" { RET(DEFAULT); } |
|
"do" { RET(DO); } |
|
"double" { RET(DOUBLE); } |
|
"else" { RET(ELSE); } |
|
"enum" { RET(ENUM); } |
|
"extern" { RET(EXTERN); } |
|
"float" { RET(FLOAT); } |
|
"for" { RET(FOR); } |
|
"goto" { RET(GOTO); } |
|
"if" { RET(IF); } |
|
"int" { RET(INT); } |
|
"long" { RET(LONG); } |
|
"register" { RET(REGISTER); } |
|
"return" { RET(RETURN); } |
|
"short" { RET(SHORT); } |
|
"signed" { RET(SIGNED); } |
|
"sizeof" { RET(SIZEOF); } |
|
"static" { RET(STATIC); } |
|
"struct" { RET(STRUCT); } |
|
"switch" { RET(SWITCH); } |
|
"typedef" { RET(TYPEDEF); } |
|
"union" { RET(UNION); } |
|
"unsigned" { RET(UNSIGNED); } |
|
"void" { RET(VOID); } |
|
"volatile" { RET(VOLATILE); } |
|
"while" { RET(WHILE); } |
|
|
|
L (L|D)* { RET(ID); } |
|
|
|
("0" [xX] H+ IS?) | ("0" D+ IS?) | (D+ IS?) | |
|
(['] (ESC|any\\[\\n\\\\'])* [']) |
|
{ RET(ICON); } |
|
|
|
(D+ E FS?) | (D* "." D+ E? FS?) | (D+ "." D* E? FS?) |
|
{ RET(FCON); } |
|
|
|
(["] (ESC|any\\[\\n\\\\"])* ["]) |
|
{ RET(SCON); } |
|
|
|
"..." { RET(ELLIPSIS); } |
|
">>=" { RET(RSHIFTEQ); } |
|
"<<=" { RET(LSHIFTEQ); } |
|
"+=" { RET(ADDEQ); } |
|
"-=" { RET(SUBEQ); } |
|
"*=" { RET(MULEQ); } |
|
"/=" { RET(DIVEQ); } |
|
"%=" { RET(MODEQ); } |
|
"&=" { RET(ANDEQ); } |
|
"^=" { RET(XOREQ); } |
|
"|=" { RET(OREQ); } |
|
">>" { RET(RSHIFT); } |
|
"<<" { RET(LSHIFT); } |
|
"++" { RET(INCR); } |
|
"--" { RET(DECR); } |
|
"->" { RET(DEREF); } |
|
"&&" { RET(ANDAND); } |
|
"||" { RET(OROR); } |
|
"<=" { RET(LEQ); } |
|
">=" { RET(GEQ); } |
|
"==" { RET(EQL); } |
|
"!=" { RET(NEQ); } |
|
";" { RET(';'); } |
|
"{" { RET('{'); } |
|
"}" { RET('}'); } |
|
"," { RET(','); } |
|
":" { RET(':'); } |
|
"=" { RET('='); } |
|
"(" { RET('('); } |
|
")" { RET(')'); } |
|
"[" { RET('['); } |
|
"]" { RET(']'); } |
|
"." { RET('.'); } |
|
"&" { RET('&'); } |
|
"!" { RET('!'); } |
|
"~" { RET('~'); } |
|
"-" { RET('-'); } |
|
"+" { RET('+'); } |
|
"*" { RET('*'); } |
|
"/" { RET('/'); } |
|
"%" { RET('%'); } |
|
"<" { RET('<'); } |
|
">" { RET('>'); } |
|
"^" { RET('^'); } |
|
"|" { RET('|'); } |
|
"?" { RET('?'); } |
|
|
|
|
|
[ \\t\\v\\f]+ { goto std; } |
|
|
|
"\\n" |
|
{ |
|
if(cursor == s->eof) RET(EOI); |
|
s->pos = cursor; s->line++; |
|
goto std; |
|
} |
|
|
|
any |
|
{ |
|
printf("unexpected character: %c\\n", *s->tok); |
|
goto std; |
|
} |
|
*/ |
|
|
|
comment: |
|
/*!re2c |
|
"*/" { goto std; } |
|
"\\n" |
|
{ |
|
if(cursor == s->eof) RET(EOI); |
|
s->tok = s->pos = cursor; s->line++; |
|
goto comment; |
|
} |
|
any { goto comment; } |
|
*/ |
|
} |
|
|
|
main(){ |
|
Scanner in; |
|
int t; |
|
memset((char*) &in, 0, sizeof(in)); |
|
in.fd = 0; |
|
while((t = scan(&in)) != EOI){ |
|
/* |
|
printf("%d\\t%.*s\\n", t, in.cur - in.tok, in.tok); |
|
printf("%d\\n", t); |
|
*/ |
|
} |
|
close(in.fd); |
|
} |
|
.fi |
|
.in -3 |
|
|
|
.SH "SEE ALSO" |
|
.LP |
|
flex(1), lex(1). |
|
|
|
.SH FEATURES |
|
.LP |
|
\*(re does not provide a default action: |
|
the generated code assumes that the input |
|
will consist of a sequence of tokens. |
|
Typically this can be dealt with by adding a rule such as the one for |
|
unexpected characters in the example above. |
|
.LP |
|
The user must arrange for a sentinel token to appear at the end of input |
|
(and provide a rule for matching it): |
|
\*(re does not provide an \fC<<EOF>>\fP expression. |
|
If the source is from a null-byte terminated string, a |
|
rule matching a null character will suffice. If the source is from a |
|
file then the approach taken in the example can be used: pad the input with |
|
a newline (or some other character that can't appear within another token); |
|
upon recognizing such a character check to see if it is the sentinel |
|
and act accordingly. |
|
.LP |
|
\*(re does not provide start conditions: use a separate scanner |
|
specification for each start condition (as illustrated in the above example). |
|
.LP |
|
No [^x]. Use difference instead. |
|
.SH BUGS |
|
.LP |
|
Only fixed length trailing context can be handled. |
|
.LP |
|
The maximum value appearing as a parameter \fIn\fP to \fCYYFILL\fP is not |
|
provided to the generated code (this value is needed for constructing |
|
the interface code). |
|
Note that this value is usually relatively small: for |
|
typical programming languages \fIn\fP will be the length of the longest |
|
keyword plus one. |
|
.LP |
|
Difference only works for character sets. |
|
.LP |
|
The \*(re internal algorithms need documentation. |
|
|
|
.SH AUTHOR |
|
.LP |
|
Please send bug reports, fixes and feedback to: |
|
.LP |
|
.nf |
|
Peter Bumbulis |
|
Computer Systems Group |
|
University of Waterloo |
|
Waterloo, Ontario |
|
N2L 3G1 |
|
Internet: peterr@csg.uwaterloo.ca |
|
.fi
|
|
|