#include #include #include #include #include "tools/re2c/globals.h" #include "tools/re2c/parse.h" #include "tools/re2c/dfa.h" static Symbol *first = NULL; void Symbol_init(Symbol *r, const SubStr *str) { r->next = first; Str_init(&r->name, str); r->re = NULL; first = r; } Symbol * Symbol_find(const SubStr *str) { Symbol *sym; for(sym = first; sym; sym = sym->next) if(SubStr_eq(&sym->name, str)) return sym; return Symbol_new(str); } /* void showIns(FILE *o, const Ins *i, const Ins *base){ o.width(3); o << &i - &base << ": "; switch(i.i.tag){ case CHAR: { o << "match "; for(const Ins *j = &(&i)[1]; j < (Ins*) i.i.link; ++j) prtCh(o, j->c.value); break; } case GOTO: o << "goto " << ((Ins*) i.i.link - &base); break; case FORK: o << "fork " << ((Ins*) i.i.link - &base); break; case CTXT: o << "term " << ((RuleOp*) i.i.link)->accept; break; case TERM: o << "term " << ((RuleOp*) i.i.link)->accept; break; } o << "\n"; } */ static unsigned int AltOp_fixedLength(RegExp *r) { unsigned int l1 = RegExp_fixedLength(r->d.AltCatOp.exp1); /* XXX? Should be exp2? */ unsigned int l2 = RegExp_fixedLength(r->d.AltCatOp.exp1); if(l1 != l2 || l1 == ~0u) return ~0u; return l1; } static unsigned int CatOp_fixedLength(RegExp *r) { unsigned int l1, l2; if((l1 = RegExp_fixedLength(r->d.AltCatOp.exp1)) != ~0u ) if((l2 = RegExp_fixedLength(r->d.AltCatOp.exp2)) != ~0u) return l1+l2; return ~0u; } unsigned int RegExp_fixedLength(RegExp *r) { switch (r->type) { case NULLOP: return 0; case MATCHOP: return 1; case ALTOP: return AltOp_fixedLength(r); case CATOP: return CatOp_fixedLength(r); default: return ~0u; } return ~0u; } void RegExp_calcSize(RegExp *re, Char *rep) { Range *r; unsigned int c; switch (re->type) { case NULLOP: re->size = 0; break; case MATCHOP: re->size = 1; for(r = re->d.match; r; r = r->next) for(c = r->lb; c < r->ub; ++c) if(rep[c] == c) ++re->size; break; case RULEOP: RegExp_calcSize(re->d.RuleOp.exp, rep); RegExp_calcSize(re->d.RuleOp.ctx, rep); re->size = re->d.RuleOp.exp->size + re->d.RuleOp.ctx->size + 1; break; case ALTOP: RegExp_calcSize(re->d.AltCatOp.exp1, rep); RegExp_calcSize(re->d.AltCatOp.exp2, rep); re->size = re->d.AltCatOp.exp1->size + re->d.AltCatOp.exp2->size + 2; break; case CATOP: RegExp_calcSize(re->d.AltCatOp.exp1, rep); RegExp_calcSize(re->d.AltCatOp.exp2, rep); re->size = re->d.AltCatOp.exp1->size + re->d.AltCatOp.exp2->size; break; case CLOSEOP: RegExp_calcSize(re->d.exp, rep); re->size = re->d.exp->size + 1; break; case CLOSEVOP: RegExp_calcSize(re->d.CloseVOp.exp, rep); if (re->d.CloseVOp.max >= 0) re->size = (re->d.CloseVOp.exp->size * re->d.CloseVOp.min) + ((1 + re->d.CloseVOp.exp->size) * (re->d.CloseVOp.max - re->d.CloseVOp.min)); else re->size = (re->d.CloseVOp.exp->size * re->d.CloseVOp.min) + 1; break; } } static void MatchOp_compile(RegExp *re, Char *rep, Ins *i) { Ins *j; unsigned int bump; Range *r; unsigned int c; i->i.tag = CHAR; i->i.link = &i[re->size]; j = &i[1]; bump = re->size; for(r = re->d.match; r; r = r->next){ for(c = r->lb; c < r->ub; ++c){ if(rep[c] == c){ j->c.value = c; j->c.bump = --bump; j++; } } } } static void AltOp_compile(RegExp *re, Char *rep, Ins *i){ Ins *j; i->i.tag = FORK; j = &i[re->d.AltCatOp.exp1->size + 1]; i->i.link = &j[1]; RegExp_compile(re->d.AltCatOp.exp1, rep, &i[1]); j->i.tag = GOTO; j->i.link = &j[re->d.AltCatOp.exp2->size + 1]; RegExp_compile(re->d.AltCatOp.exp2, rep, &j[1]); } void RegExp_compile(RegExp *re, Char *rep, Ins *i) { Ins *jumppoint; int st = 0; switch (re->type) { case NULLOP: break; case MATCHOP: MatchOp_compile(re, rep, i); break; case RULEOP: re->d.RuleOp.ins = i; RegExp_compile(re->d.RuleOp.exp, rep, &i[0]); i += re->d.RuleOp.exp->size; RegExp_compile(re->d.RuleOp.ctx, rep, &i[0]); i += re->d.RuleOp.ctx->size; i->i.tag = TERM; i->i.link = re; break; case ALTOP: AltOp_compile(re, rep, i); break; case CATOP: RegExp_compile(re->d.AltCatOp.exp1, rep, &i[0]); RegExp_compile(re->d.AltCatOp.exp2, rep, &i[re->d.AltCatOp.exp1->size]); break; case CLOSEOP: RegExp_compile(re->d.exp, rep, &i[0]); i += re->d.exp->size; i->i.tag = FORK; i->i.link = i - re->d.exp->size; break; case CLOSEVOP: jumppoint = i + ((1 + re->d.CloseVOp.exp->size) * (re->d.CloseVOp.max - re->d.CloseVOp.min)); for(st = re->d.CloseVOp.min; st < re->d.CloseVOp.max; st++) { i->i.tag = FORK; i->i.link = jumppoint; i+=1; RegExp_compile(re->d.CloseVOp.exp, rep, &i[0]); i += re->d.CloseVOp.exp->size; } for(st = 0; st < re->d.CloseVOp.min; st++) { RegExp_compile(re->d.CloseVOp.exp, rep, &i[0]); i += re->d.CloseVOp.exp->size; if(re->d.CloseVOp.max < 0 && st == 0) { i->i.tag = FORK; i->i.link = i - re->d.CloseVOp.exp->size; i++; } } break; } } static void MatchOp_split(RegExp *re, CharSet *s) { Range *r; unsigned int c; for(r = re->d.match; r; r = r->next){ for(c = r->lb; c < r->ub; ++c){ CharPtn *x = s->rep[c], *a = x->nxt; if(!a){ if(x->card == 1) continue; x->nxt = a = s->freeHead; if(!(s->freeHead = s->freeHead->nxt)) s->freeTail = &s->freeHead; a->nxt = NULL; x->fix = s->fix; s->fix = x; } if(--(x->card) == 0){ *s->freeTail = x; *(s->freeTail = &x->nxt) = NULL; } s->rep[c] = a; ++(a->card); } } for(; s->fix; s->fix = s->fix->fix) if(s->fix->card) s->fix->nxt = NULL; } void RegExp_split(RegExp *re, CharSet *s) { switch (re->type) { case NULLOP: break; case MATCHOP: MatchOp_split(re, s); break; case RULEOP: RegExp_split(re->d.RuleOp.exp, s); RegExp_split(re->d.RuleOp.ctx, s); break; case ALTOP: /* FALLTHROUGH */ case CATOP: RegExp_split(re->d.AltCatOp.exp1, s); RegExp_split(re->d.AltCatOp.exp2, s); break; case CLOSEOP: RegExp_split(re->d.exp, s); break; case CLOSEVOP: RegExp_split(re->d.CloseVOp.exp, s); break; } } void RegExp_display(RegExp *re, FILE *o) { switch (re->type) { case NULLOP: fputc('_', o); break; case MATCHOP: Range_out(o, re->d.match); break; case RULEOP: RegExp_display(re->d.RuleOp.exp, o); fputc('/', o); RegExp_display(re->d.RuleOp.ctx, o); fputc(';', o); break; case ALTOP: RegExp_display(re->d.AltCatOp.exp1, o); fputc('|', o); RegExp_display(re->d.AltCatOp.exp2, o); break; case CATOP: RegExp_display(re->d.AltCatOp.exp1, o); RegExp_display(re->d.AltCatOp.exp2, o); break; case CLOSEOP: RegExp_display(re->d.exp, o); fputc('+', o); break; } } void Range_out(FILE *o, const Range *r) { if(!r) return; if((r->ub - r->lb) == 1){ prtCh(o, r->lb); } else { prtCh(o, r->lb); fputc('-', o); prtCh(o, r->ub-1); } Range_out(o, r->next); } static Range *doUnion(Range *r1, Range *r2){ Range *r, **rP = &r; for(;;){ Range *s; if(r1->lb <= r2->lb){ s = Range_new_copy(r1); } else { s = Range_new_copy(r2); } *rP = s; rP = &s->next; for(;;){ if(r1->lb <= r2->lb){ if(r1->lb > s->ub) break; if(r1->ub > s->ub) s->ub = r1->ub; if(!(r1 = r1->next)){ unsigned int ub = 0; for(; r2 && r2->lb <= s->ub; r2 = r2->next) ub = r2->ub; if(ub > s->ub) s->ub = ub; *rP = r2; return r; } } else { if(r2->lb > s->ub) break; if(r2->ub > s->ub) s->ub = r2->ub; if(!(r2 = r2->next)){ unsigned int ub = 0; for(; r1 && r1->lb <= s->ub; r1 = r1->next) ub = r1->ub; if(ub > s->ub) s->ub = ub; *rP = r1; return r; } } } } *rP = NULL; return r; } static Range *doDiff(Range *r1, Range *r2){ Range *r, *s, **rP = &r; for(; r1; r1 = r1->next){ unsigned int lb = r1->lb; for(; r2 && r2->ub <= r1->lb; r2 = r2->next); for(; r2 && r2->lb < r1->ub; r2 = r2->next){ if(lb < r2->lb){ *rP = s = Range_new(lb, r2->lb); rP = &s->next; } if((lb = r2->ub) >= r1->ub) goto noMore; } *rP = s = Range_new(lb, r1->ub); rP = &s->next; noMore:; } *rP = NULL; return r; } static RegExp *merge(RegExp *m1, RegExp *m2){ if(!m1) return m2; if(!m2) return m1; return RegExp_new_MatchOp(doUnion(m1->d.match, m2->d.match)); } RegExp *mkDiff(RegExp *e1, RegExp *e2){ RegExp *m1, *m2; Range *r; if(!(m1 = RegExp_isA(e1, MATCHOP))) return NULL; if(!(m2 = RegExp_isA(e2, MATCHOP))) return NULL; r = doDiff(m1->d.match, m2->d.match); return r? RegExp_new_MatchOp(r) : RegExp_new_NullOp(); } static RegExp *doAlt(RegExp *e1, RegExp *e2){ if(!e1) return e2; if(!e2) return e1; return RegExp_new_AltOp(e1, e2); } RegExp *mkAlt(RegExp *e1, RegExp *e2){ RegExp *a; RegExp *m1, *m2; if((a = RegExp_isA(e1, ALTOP))){ if((m1 = RegExp_isA(a->d.AltCatOp.exp1, MATCHOP))) e1 = a->d.AltCatOp.exp2; } else if((m1 = RegExp_isA(e1, MATCHOP))){ e1 = NULL; } if((a = RegExp_isA(e2, ALTOP))){ if((m2 = RegExp_isA(a->d.AltCatOp.exp1, MATCHOP))) e2 = a->d.AltCatOp.exp2; } else if((m2 = RegExp_isA(e2, MATCHOP))){ e2 = NULL; } return doAlt(merge(m1, m2), doAlt(e1, e2)); } static unsigned char unescape(SubStr *s){ unsigned char c; unsigned char v; s->len--; if((c = *s->str++) != '\\' || s->len == 0) return xlat[c]; s->len--; switch(c = *s->str++){ case 'n': return xlat['\n']; case 't': return xlat['\t']; case 'v': return xlat['\v']; case 'b': return xlat['\b']; case 'r': return xlat['\r']; case 'f': return xlat['\f']; case 'a': return xlat['\a']; case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': { v = c - '0'; for(; s->len != 0 && '0' <= (c = *s->str) && c <= '7'; s->len--, s->str++) v = v*8 + (c - '0'); return v; } default: return xlat[c]; } } static Range *getRange(SubStr *s){ unsigned char lb = unescape(s), ub; if(s->len < 2 || *s->str != '-'){ ub = lb; } else { s->len--; s->str++; ub = unescape(s); if(ub < lb){ unsigned char tmp; tmp = lb; lb = ub; ub = tmp; } } return Range_new(lb, ub+1); } static RegExp *matchChar(unsigned int c){ return RegExp_new_MatchOp(Range_new(c, c+1)); } RegExp *strToRE(SubStr s){ RegExp *re; s.len -= 2; s.str += 1; if(s.len == 0) return RegExp_new_NullOp(); re = matchChar(unescape(&s)); while(s.len > 0) re = RegExp_new_CatOp(re, matchChar(unescape(&s))); return re; } RegExp *strToCaseInsensitiveRE(SubStr s){ unsigned char c; RegExp *re, *reL, *reU; s.len -= 2; s.str += 1; if(s.len == 0) return RegExp_new_NullOp(); c = unescape(&s); if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')) { reL = matchChar(tolower(c)); reU = matchChar(toupper(c)); re = mkAlt(reL, reU); } else { re = matchChar(c); } while(s.len > 0) { c = unescape(&s); if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')) { reL = matchChar(tolower(c)); reU = matchChar(toupper(c)); re = RegExp_new_CatOp(re, mkAlt(reL, reU)); } else { re = RegExp_new_CatOp(re, matchChar(c)); } } return re; } RegExp *ranToRE(SubStr s){ Range *r; s.len -= 2; s.str += 1; if(s.len == 0) return RegExp_new_NullOp(); r = getRange(&s); while(s.len > 0) r = doUnion(r, getRange(&s)); return RegExp_new_MatchOp(r); } RegExp *invToRE(SubStr s) { RegExp *any, *ran, *inv; SubStr *ss; s.len--; s.str++; ss = SubStr_new("[\\000-\\377]", strlen("[\\000-\\377]")); any = ranToRE(*ss); free(ss); if (s.len <= 2) return any; ran = ranToRE(s); inv = mkDiff(any, ran); free(ran); free(any); return inv; } RegExp *mkDot() { SubStr *ss = SubStr_new("[\\000-\\377]", strlen("[\\000-\\377]")); RegExp * any = ranToRE(*ss); RegExp * ran = matchChar('\n'); RegExp * inv = mkDiff(any, ran); free(ss); free(ran); free(any); return inv; } RegExp * RegExp_new_RuleOp(RegExp *e, RegExp *c, Token *t, unsigned int a) { RegExp *r = malloc(sizeof(RegExp)); r->type = RULEOP; r->d.RuleOp.exp = e; r->d.RuleOp.ctx = c; r->d.RuleOp.ins = NULL; r->d.RuleOp.accept = a; r->d.RuleOp.code = t; return r; } static void optimize(Ins *i){ while(!isMarked(i)){ mark(i); if(i->i.tag == CHAR){ i = (Ins*) i->i.link; } else if(i->i.tag == GOTO || i->i.tag == FORK){ Ins *target = (Ins*) i->i.link; optimize(target); if(target->i.tag == GOTO) i->i.link = target->i.link == target? i : target; if(i->i.tag == FORK){ Ins *follow = (Ins*) &i[1]; optimize(follow); if(follow->i.tag == GOTO && follow->i.link == follow){ i->i.tag = GOTO; } else if(i->i.link == i){ i->i.tag = GOTO; i->i.link = follow; } } return; } else { ++i; } } } void genCode(FILE *o, RegExp *re){ CharSet cs; unsigned int j; Char rep[nChars]; Ins *ins, *eoi; DFA *dfa; memset(&cs, 0, sizeof(cs)); for(j = 0; j < nChars; ++j){ cs.rep[j] = &cs.ptn[0]; cs.ptn[j].nxt = &cs.ptn[j+1]; } cs.freeHead = &cs.ptn[1]; *(cs.freeTail = &cs.ptn[nChars-1].nxt) = NULL; cs.ptn[0].card = nChars; cs.ptn[0].nxt = NULL; RegExp_split(re, &cs); /* for(unsigned int k = 0; k < nChars;){ for(j = k; ++k < nChars && cs.rep[k] == cs.rep[j];); printSpan(cerr, j, k); cerr << "\t" << cs.rep[j] - &cs.ptn[0] << endl; } */ for(j = 0; j < nChars; ++j){ if(!cs.rep[j]->nxt) cs.rep[j]->nxt = &cs.ptn[j]; rep[j] = (Char) (cs.rep[j]->nxt - &cs.ptn[0]); } RegExp_calcSize(re, rep); ins = malloc(sizeof(Ins)*(re->size+1)); memset(ins, 0, (re->size+1)*sizeof(Ins)); RegExp_compile(re, rep, ins); eoi = &ins[re->size]; eoi->i.tag = GOTO; eoi->i.link = eoi; optimize(ins); for(j = 0; j < re->size;){ unmark(&ins[j]); if(ins[j].i.tag == CHAR){ j = (Ins*) ins[j].i.link - ins; } else { j++; } } dfa = DFA_new(ins, re->size, 0, 256, rep); DFA_emit(dfa, o); DFA_delete(dfa); free(ins); }