mirror of https://github.com/yasm/yasm.git
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
692 lines
14 KiB
692 lines
14 KiB
#include <time.h> |
|
#include <string.h> |
|
#include <stdio.h> |
|
#include <ctype.h> |
|
|
|
#include "tools/re2c/globals.h" |
|
#include "tools/re2c/parse.h" |
|
#include "tools/re2c/dfa.h" |
|
|
|
static Symbol *first = NULL; |
|
|
|
void |
|
Symbol_init(Symbol *r, const SubStr *str) |
|
{ |
|
r->next = first; |
|
Str_init(&r->name, str); |
|
r->re = NULL; |
|
first = r; |
|
} |
|
|
|
Symbol * |
|
Symbol_find(const SubStr *str) |
|
{ |
|
Symbol *sym; |
|
for(sym = first; sym; sym = sym->next) |
|
if(SubStr_eq(&sym->name, str)) return sym; |
|
return Symbol_new(str); |
|
} |
|
|
|
/* |
|
void showIns(FILE *o, const Ins *i, const Ins *base){ |
|
o.width(3); |
|
o << &i - &base << ": "; |
|
switch(i.i.tag){ |
|
case CHAR: { |
|
o << "match "; |
|
for(const Ins *j = &(&i)[1]; j < (Ins*) i.i.link; ++j) |
|
prtCh(o, j->c.value); |
|
break; |
|
} case GOTO: |
|
o << "goto " << ((Ins*) i.i.link - &base); |
|
break; |
|
case FORK: |
|
o << "fork " << ((Ins*) i.i.link - &base); |
|
break; |
|
case CTXT: |
|
o << "term " << ((RuleOp*) i.i.link)->accept; |
|
break; |
|
case TERM: |
|
o << "term " << ((RuleOp*) i.i.link)->accept; |
|
break; |
|
} |
|
o << "\n"; |
|
} |
|
*/ |
|
|
|
static unsigned int |
|
AltOp_fixedLength(RegExp *r) |
|
{ |
|
unsigned int l1 = RegExp_fixedLength(r->d.AltCatOp.exp1); |
|
/* XXX? Should be exp2? */ |
|
unsigned int l2 = RegExp_fixedLength(r->d.AltCatOp.exp1); |
|
if(l1 != l2 || l1 == ~0u) |
|
return ~0u; |
|
return l1; |
|
} |
|
|
|
static unsigned int |
|
CatOp_fixedLength(RegExp *r) |
|
{ |
|
unsigned int l1, l2; |
|
if((l1 = RegExp_fixedLength(r->d.AltCatOp.exp1)) != ~0u ) |
|
if((l2 = RegExp_fixedLength(r->d.AltCatOp.exp2)) != ~0u) |
|
return l1+l2; |
|
return ~0u; |
|
} |
|
|
|
unsigned int |
|
RegExp_fixedLength(RegExp *r) |
|
{ |
|
switch (r->type) { |
|
case NULLOP: |
|
return 0; |
|
case MATCHOP: |
|
return 1; |
|
case ALTOP: |
|
return AltOp_fixedLength(r); |
|
case CATOP: |
|
return CatOp_fixedLength(r); |
|
default: |
|
return ~0u; |
|
} |
|
return ~0u; |
|
} |
|
|
|
void |
|
RegExp_calcSize(RegExp *re, Char *rep) |
|
{ |
|
Range *r; |
|
unsigned int c; |
|
|
|
switch (re->type) { |
|
case NULLOP: |
|
re->size = 0; |
|
break; |
|
case MATCHOP: |
|
re->size = 1; |
|
for(r = re->d.match; r; r = r->next) |
|
for(c = r->lb; c < r->ub; ++c) |
|
if(rep[c] == c) |
|
++re->size; |
|
break; |
|
case RULEOP: |
|
RegExp_calcSize(re->d.RuleOp.exp, rep); |
|
RegExp_calcSize(re->d.RuleOp.ctx, rep); |
|
re->size = re->d.RuleOp.exp->size + re->d.RuleOp.ctx->size + 1; |
|
break; |
|
case ALTOP: |
|
RegExp_calcSize(re->d.AltCatOp.exp1, rep); |
|
RegExp_calcSize(re->d.AltCatOp.exp2, rep); |
|
re->size = re->d.AltCatOp.exp1->size |
|
+ re->d.AltCatOp.exp2->size + 2; |
|
break; |
|
case CATOP: |
|
RegExp_calcSize(re->d.AltCatOp.exp1, rep); |
|
RegExp_calcSize(re->d.AltCatOp.exp2, rep); |
|
re->size = re->d.AltCatOp.exp1->size + re->d.AltCatOp.exp2->size; |
|
break; |
|
case CLOSEOP: |
|
RegExp_calcSize(re->d.exp, rep); |
|
re->size = re->d.exp->size + 1; |
|
break; |
|
case CLOSEVOP: |
|
RegExp_calcSize(re->d.CloseVOp.exp, rep); |
|
|
|
if (re->d.CloseVOp.max >= 0) |
|
re->size = (re->d.CloseVOp.exp->size * re->d.CloseVOp.min) + |
|
((1 + re->d.CloseVOp.exp->size) * |
|
(re->d.CloseVOp.max - re->d.CloseVOp.min)); |
|
else |
|
re->size = (re->d.CloseVOp.exp->size * re->d.CloseVOp.min) + 1; |
|
break; |
|
} |
|
} |
|
|
|
static void |
|
MatchOp_compile(RegExp *re, Char *rep, Ins *i) |
|
{ |
|
Ins *j; |
|
unsigned int bump; |
|
Range *r; |
|
unsigned int c; |
|
|
|
i->i.tag = CHAR; |
|
i->i.link = &i[re->size]; |
|
j = &i[1]; |
|
bump = re->size; |
|
for(r = re->d.match; r; r = r->next){ |
|
for(c = r->lb; c < r->ub; ++c){ |
|
if(rep[c] == c){ |
|
j->c.value = c; |
|
j->c.bump = --bump; |
|
j++; |
|
} |
|
} |
|
} |
|
} |
|
|
|
static void |
|
AltOp_compile(RegExp *re, Char *rep, Ins *i){ |
|
Ins *j; |
|
|
|
i->i.tag = FORK; |
|
j = &i[re->d.AltCatOp.exp1->size + 1]; |
|
i->i.link = &j[1]; |
|
RegExp_compile(re->d.AltCatOp.exp1, rep, &i[1]); |
|
j->i.tag = GOTO; |
|
j->i.link = &j[re->d.AltCatOp.exp2->size + 1]; |
|
RegExp_compile(re->d.AltCatOp.exp2, rep, &j[1]); |
|
} |
|
|
|
void |
|
RegExp_compile(RegExp *re, Char *rep, Ins *i) |
|
{ |
|
Ins *jumppoint; |
|
int st = 0; |
|
|
|
switch (re->type) { |
|
case NULLOP: |
|
break; |
|
case MATCHOP: |
|
MatchOp_compile(re, rep, i); |
|
break; |
|
case RULEOP: |
|
re->d.RuleOp.ins = i; |
|
RegExp_compile(re->d.RuleOp.exp, rep, &i[0]); |
|
i += re->d.RuleOp.exp->size; |
|
RegExp_compile(re->d.RuleOp.ctx, rep, &i[0]); |
|
i += re->d.RuleOp.ctx->size; |
|
i->i.tag = TERM; |
|
i->i.link = re; |
|
break; |
|
case ALTOP: |
|
AltOp_compile(re, rep, i); |
|
break; |
|
case CATOP: |
|
RegExp_compile(re->d.AltCatOp.exp1, rep, &i[0]); |
|
RegExp_compile(re->d.AltCatOp.exp2, rep, |
|
&i[re->d.AltCatOp.exp1->size]); |
|
break; |
|
case CLOSEOP: |
|
RegExp_compile(re->d.exp, rep, &i[0]); |
|
i += re->d.exp->size; |
|
i->i.tag = FORK; |
|
i->i.link = i - re->d.exp->size; |
|
break; |
|
case CLOSEVOP: |
|
jumppoint = i + ((1 + re->d.CloseVOp.exp->size) * |
|
(re->d.CloseVOp.max - re->d.CloseVOp.min)); |
|
for(st = re->d.CloseVOp.min; st < re->d.CloseVOp.max; st++) { |
|
i->i.tag = FORK; |
|
i->i.link = jumppoint; |
|
i+=1; |
|
RegExp_compile(re->d.CloseVOp.exp, rep, &i[0]); |
|
i += re->d.CloseVOp.exp->size; |
|
} |
|
for(st = 0; st < re->d.CloseVOp.min; st++) { |
|
RegExp_compile(re->d.CloseVOp.exp, rep, &i[0]); |
|
i += re->d.CloseVOp.exp->size; |
|
if(re->d.CloseVOp.max < 0 && st == 0) { |
|
i->i.tag = FORK; |
|
i->i.link = i - re->d.CloseVOp.exp->size; |
|
i++; |
|
} |
|
} |
|
break; |
|
} |
|
} |
|
|
|
static void |
|
MatchOp_split(RegExp *re, CharSet *s) |
|
{ |
|
Range *r; |
|
unsigned int c; |
|
|
|
for(r = re->d.match; r; r = r->next){ |
|
for(c = r->lb; c < r->ub; ++c){ |
|
CharPtn *x = s->rep[c], *a = x->nxt; |
|
if(!a){ |
|
if(x->card == 1) |
|
continue; |
|
x->nxt = a = s->freeHead; |
|
if(!(s->freeHead = s->freeHead->nxt)) |
|
s->freeTail = &s->freeHead; |
|
a->nxt = NULL; |
|
x->fix = s->fix; |
|
s->fix = x; |
|
} |
|
if(--(x->card) == 0){ |
|
*s->freeTail = x; |
|
*(s->freeTail = &x->nxt) = NULL; |
|
} |
|
s->rep[c] = a; |
|
++(a->card); |
|
} |
|
} |
|
for(; s->fix; s->fix = s->fix->fix) |
|
if(s->fix->card) |
|
s->fix->nxt = NULL; |
|
} |
|
|
|
void |
|
RegExp_split(RegExp *re, CharSet *s) |
|
{ |
|
switch (re->type) { |
|
case NULLOP: |
|
break; |
|
case MATCHOP: |
|
MatchOp_split(re, s); |
|
break; |
|
case RULEOP: |
|
RegExp_split(re->d.RuleOp.exp, s); |
|
RegExp_split(re->d.RuleOp.ctx, s); |
|
break; |
|
case ALTOP: |
|
/* FALLTHROUGH */ |
|
case CATOP: |
|
RegExp_split(re->d.AltCatOp.exp1, s); |
|
RegExp_split(re->d.AltCatOp.exp2, s); |
|
break; |
|
case CLOSEOP: |
|
RegExp_split(re->d.exp, s); |
|
break; |
|
case CLOSEVOP: |
|
RegExp_split(re->d.CloseVOp.exp, s); |
|
break; |
|
} |
|
} |
|
|
|
void |
|
RegExp_display(RegExp *re, FILE *o) |
|
{ |
|
switch (re->type) { |
|
case NULLOP: |
|
fputc('_', o); |
|
break; |
|
case MATCHOP: |
|
Range_out(o, re->d.match); |
|
break; |
|
case RULEOP: |
|
RegExp_display(re->d.RuleOp.exp, o); |
|
fputc('/', o); |
|
RegExp_display(re->d.RuleOp.ctx, o); |
|
fputc(';', o); |
|
break; |
|
case ALTOP: |
|
RegExp_display(re->d.AltCatOp.exp1, o); |
|
fputc('|', o); |
|
RegExp_display(re->d.AltCatOp.exp2, o); |
|
break; |
|
case CATOP: |
|
RegExp_display(re->d.AltCatOp.exp1, o); |
|
RegExp_display(re->d.AltCatOp.exp2, o); |
|
break; |
|
case CLOSEOP: |
|
RegExp_display(re->d.exp, o); |
|
fputc('+', o); |
|
break; |
|
} |
|
} |
|
|
|
void |
|
Range_out(FILE *o, const Range *r) |
|
{ |
|
if(!r) |
|
return; |
|
|
|
if((r->ub - r->lb) == 1){ |
|
prtCh(o, r->lb); |
|
} else { |
|
prtCh(o, r->lb); |
|
fputc('-', o); |
|
prtCh(o, r->ub-1); |
|
} |
|
Range_out(o, r->next); |
|
} |
|
|
|
static Range *doUnion(Range *r1, Range *r2){ |
|
Range *r, **rP = &r; |
|
for(;;){ |
|
Range *s; |
|
if(r1->lb <= r2->lb){ |
|
s = Range_new_copy(r1); |
|
} else { |
|
s = Range_new_copy(r2); |
|
} |
|
*rP = s; |
|
rP = &s->next; |
|
for(;;){ |
|
if(r1->lb <= r2->lb){ |
|
if(r1->lb > s->ub) |
|
break; |
|
if(r1->ub > s->ub) |
|
s->ub = r1->ub; |
|
if(!(r1 = r1->next)){ |
|
unsigned int ub = 0; |
|
for(; r2 && r2->lb <= s->ub; r2 = r2->next) |
|
ub = r2->ub; |
|
if(ub > s->ub) |
|
s->ub = ub; |
|
*rP = r2; |
|
return r; |
|
} |
|
} else { |
|
if(r2->lb > s->ub) |
|
break; |
|
if(r2->ub > s->ub) |
|
s->ub = r2->ub; |
|
if(!(r2 = r2->next)){ |
|
unsigned int ub = 0; |
|
for(; r1 && r1->lb <= s->ub; r1 = r1->next) |
|
ub = r1->ub; |
|
if(ub > s->ub) |
|
s->ub = ub; |
|
*rP = r1; |
|
return r; |
|
} |
|
} |
|
} |
|
} |
|
*rP = NULL; |
|
return r; |
|
} |
|
|
|
static Range *doDiff(Range *r1, Range *r2){ |
|
Range *r, *s, **rP = &r; |
|
for(; r1; r1 = r1->next){ |
|
unsigned int lb = r1->lb; |
|
for(; r2 && r2->ub <= r1->lb; r2 = r2->next); |
|
for(; r2 && r2->lb < r1->ub; r2 = r2->next){ |
|
if(lb < r2->lb){ |
|
*rP = s = Range_new(lb, r2->lb); |
|
rP = &s->next; |
|
} |
|
if((lb = r2->ub) >= r1->ub) |
|
goto noMore; |
|
} |
|
*rP = s = Range_new(lb, r1->ub); |
|
rP = &s->next; |
|
noMore:; |
|
} |
|
*rP = NULL; |
|
return r; |
|
} |
|
|
|
static RegExp *merge(RegExp *m1, RegExp *m2){ |
|
if(!m1) |
|
return m2; |
|
if(!m2) |
|
return m1; |
|
return RegExp_new_MatchOp(doUnion(m1->d.match, m2->d.match)); |
|
} |
|
|
|
RegExp *mkDiff(RegExp *e1, RegExp *e2){ |
|
RegExp *m1, *m2; |
|
Range *r; |
|
if(!(m1 = RegExp_isA(e1, MATCHOP))) |
|
return NULL; |
|
if(!(m2 = RegExp_isA(e2, MATCHOP))) |
|
return NULL; |
|
r = doDiff(m1->d.match, m2->d.match); |
|
return r? RegExp_new_MatchOp(r) : RegExp_new_NullOp(); |
|
} |
|
|
|
static RegExp *doAlt(RegExp *e1, RegExp *e2){ |
|
if(!e1) |
|
return e2; |
|
if(!e2) |
|
return e1; |
|
return RegExp_new_AltOp(e1, e2); |
|
} |
|
|
|
RegExp *mkAlt(RegExp *e1, RegExp *e2){ |
|
RegExp *a; |
|
RegExp *m1, *m2; |
|
if((a = RegExp_isA(e1, ALTOP))){ |
|
if((m1 = RegExp_isA(a->d.AltCatOp.exp1, MATCHOP))) |
|
e1 = a->d.AltCatOp.exp2; |
|
} else if((m1 = RegExp_isA(e1, MATCHOP))){ |
|
e1 = NULL; |
|
} |
|
if((a = RegExp_isA(e2, ALTOP))){ |
|
if((m2 = RegExp_isA(a->d.AltCatOp.exp1, MATCHOP))) |
|
e2 = a->d.AltCatOp.exp2; |
|
} else if((m2 = RegExp_isA(e2, MATCHOP))){ |
|
e2 = NULL; |
|
} |
|
return doAlt(merge(m1, m2), doAlt(e1, e2)); |
|
} |
|
|
|
static unsigned char unescape(SubStr *s){ |
|
unsigned char c; |
|
unsigned char v; |
|
s->len--; |
|
if((c = *s->str++) != '\\' || s->len == 0) |
|
return xlat[c]; |
|
s->len--; |
|
switch(c = *s->str++){ |
|
case 'n': |
|
return xlat['\n']; |
|
case 't': |
|
return xlat['\t']; |
|
case 'v': |
|
return xlat['\v']; |
|
case 'b': |
|
return xlat['\b']; |
|
case 'r': |
|
return xlat['\r']; |
|
case 'f': |
|
return xlat['\f']; |
|
case 'a': |
|
return xlat['\a']; |
|
case '0': case '1': case '2': case '3': |
|
case '4': case '5': case '6': case '7': { |
|
v = c - '0'; |
|
for(; s->len != 0 && '0' <= (c = *s->str) && c <= '7'; s->len--, s->str++) |
|
v = v*8 + (c - '0'); |
|
return v; |
|
} default: |
|
return xlat[c]; |
|
} |
|
} |
|
|
|
static Range *getRange(SubStr *s){ |
|
unsigned char lb = unescape(s), ub; |
|
if(s->len < 2 || *s->str != '-'){ |
|
ub = lb; |
|
} else { |
|
s->len--; s->str++; |
|
ub = unescape(s); |
|
if(ub < lb){ |
|
unsigned char tmp; |
|
tmp = lb; lb = ub; ub = tmp; |
|
} |
|
} |
|
return Range_new(lb, ub+1); |
|
} |
|
|
|
static RegExp *matchChar(unsigned int c){ |
|
return RegExp_new_MatchOp(Range_new(c, c+1)); |
|
} |
|
|
|
RegExp *strToRE(SubStr s){ |
|
RegExp *re; |
|
s.len -= 2; s.str += 1; |
|
if(s.len == 0) |
|
return RegExp_new_NullOp(); |
|
re = matchChar(unescape(&s)); |
|
while(s.len > 0) |
|
re = RegExp_new_CatOp(re, matchChar(unescape(&s))); |
|
return re; |
|
} |
|
|
|
RegExp *strToCaseInsensitiveRE(SubStr s){ |
|
unsigned char c; |
|
RegExp *re, *reL, *reU; |
|
s.len -= 2; s.str += 1; |
|
if(s.len == 0) |
|
return RegExp_new_NullOp(); |
|
c = unescape(&s); |
|
if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')) { |
|
reL = matchChar(tolower(c)); |
|
reU = matchChar(toupper(c)); |
|
re = mkAlt(reL, reU); |
|
} else { |
|
re = matchChar(c); |
|
} |
|
while(s.len > 0) { |
|
c = unescape(&s); |
|
if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')) { |
|
reL = matchChar(tolower(c)); |
|
reU = matchChar(toupper(c)); |
|
re = RegExp_new_CatOp(re, mkAlt(reL, reU)); |
|
} else { |
|
re = RegExp_new_CatOp(re, matchChar(c)); |
|
} |
|
} |
|
return re; |
|
} |
|
|
|
RegExp *ranToRE(SubStr s){ |
|
Range *r; |
|
s.len -= 2; s.str += 1; |
|
if(s.len == 0) |
|
return RegExp_new_NullOp(); |
|
r = getRange(&s); |
|
while(s.len > 0) |
|
r = doUnion(r, getRange(&s)); |
|
return RegExp_new_MatchOp(r); |
|
} |
|
|
|
RegExp *invToRE(SubStr s) |
|
{ |
|
RegExp *any, *ran, *inv; |
|
SubStr *ss; |
|
|
|
|
|
s.len--; |
|
s.str++; |
|
|
|
ss = SubStr_new("[\\000-\\377]", strlen("[\\000-\\377]")); |
|
any = ranToRE(*ss); |
|
free(ss); |
|
if (s.len <= 2) |
|
return any; |
|
|
|
ran = ranToRE(s); |
|
inv = mkDiff(any, ran); |
|
|
|
free(ran); |
|
free(any); |
|
|
|
return inv; |
|
} |
|
|
|
RegExp *mkDot() |
|
{ |
|
SubStr *ss = SubStr_new("[\\000-\\377]", strlen("[\\000-\\377]")); |
|
RegExp * any = ranToRE(*ss); |
|
RegExp * ran = matchChar('\n'); |
|
RegExp * inv = mkDiff(any, ran); |
|
|
|
free(ss); |
|
free(ran); |
|
free(any); |
|
|
|
return inv; |
|
} |
|
|
|
RegExp * |
|
RegExp_new_RuleOp(RegExp *e, RegExp *c, Token *t, unsigned int a) |
|
{ |
|
RegExp *r = malloc(sizeof(RegExp)); |
|
r->type = RULEOP; |
|
r->d.RuleOp.exp = e; |
|
r->d.RuleOp.ctx = c; |
|
r->d.RuleOp.ins = NULL; |
|
r->d.RuleOp.accept = a; |
|
r->d.RuleOp.code = t; |
|
return r; |
|
} |
|
|
|
static void optimize(Ins *i){ |
|
while(!isMarked(i)){ |
|
mark(i); |
|
if(i->i.tag == CHAR){ |
|
i = (Ins*) i->i.link; |
|
} else if(i->i.tag == GOTO || i->i.tag == FORK){ |
|
Ins *target = (Ins*) i->i.link; |
|
optimize(target); |
|
if(target->i.tag == GOTO) |
|
i->i.link = target->i.link == target? i : target; |
|
if(i->i.tag == FORK){ |
|
Ins *follow = (Ins*) &i[1]; |
|
optimize(follow); |
|
if(follow->i.tag == GOTO && follow->i.link == follow){ |
|
i->i.tag = GOTO; |
|
} else if(i->i.link == i){ |
|
i->i.tag = GOTO; |
|
i->i.link = follow; |
|
} |
|
} |
|
return; |
|
} else { |
|
++i; |
|
} |
|
} |
|
} |
|
|
|
void genCode(FILE *o, RegExp *re){ |
|
CharSet cs; |
|
unsigned int j; |
|
Char rep[nChars]; |
|
Ins *ins, *eoi; |
|
DFA *dfa; |
|
|
|
memset(&cs, 0, sizeof(cs)); |
|
for(j = 0; j < nChars; ++j){ |
|
cs.rep[j] = &cs.ptn[0]; |
|
cs.ptn[j].nxt = &cs.ptn[j+1]; |
|
} |
|
cs.freeHead = &cs.ptn[1]; |
|
*(cs.freeTail = &cs.ptn[nChars-1].nxt) = NULL; |
|
cs.ptn[0].card = nChars; |
|
cs.ptn[0].nxt = NULL; |
|
RegExp_split(re, &cs); |
|
/* |
|
for(unsigned int k = 0; k < nChars;){ |
|
for(j = k; ++k < nChars && cs.rep[k] == cs.rep[j];); |
|
printSpan(cerr, j, k); |
|
cerr << "\t" << cs.rep[j] - &cs.ptn[0] << endl; |
|
} |
|
*/ |
|
for(j = 0; j < nChars; ++j){ |
|
if(!cs.rep[j]->nxt) |
|
cs.rep[j]->nxt = &cs.ptn[j]; |
|
rep[j] = (Char) (cs.rep[j]->nxt - &cs.ptn[0]); |
|
} |
|
|
|
RegExp_calcSize(re, rep); |
|
ins = malloc(sizeof(Ins)*(re->size+1)); |
|
memset(ins, 0, (re->size+1)*sizeof(Ins)); |
|
RegExp_compile(re, rep, ins); |
|
eoi = &ins[re->size]; |
|
eoi->i.tag = GOTO; |
|
eoi->i.link = eoi; |
|
|
|
optimize(ins); |
|
for(j = 0; j < re->size;){ |
|
unmark(&ins[j]); |
|
if(ins[j].i.tag == CHAR){ |
|
j = (Ins*) ins[j].i.link - ins; |
|
} else { |
|
j++; |
|
} |
|
} |
|
|
|
dfa = DFA_new(ins, re->size, 0, 256, rep); |
|
DFA_emit(dfa, o); |
|
DFA_delete(dfa); |
|
free(ins); |
|
}
|
|
|