/* subst.c
 *
 * This file provides a means of using Perl-Compatible Regular Expressions
 * to rewrite certain input pages.  The input data is parsed into a set of
 * URL-matching rules with associated substitution command sets.
 */
/* This software is copyrighted as detailed in the LICENSE file. */

#include <config.h>
#include <ctype.h>
#include <rbmake/rbfile.h>
#include <pcre.h>
#include "mbuf.h"

/* SubstRule flag: set by the 'g' suffix on a substitution rule; when it is
 * set, Subst_runRules() keeps searching after the first match. */
#define GLOBAL_SUBST	0x0001

/* One "s/pattern/replacement/flags" command, kept in a singly linked list. */
typedef struct SubstRule {
    struct SubstRule *next;	/* following rule in the same set, or NULL */
    pcre *pat;			/* compiled search pattern */
    char *subst;		/* replacement text (backslash escapes already decoded) */
    int flags;			/* GLOBAL_SUBST or 0 */
} SubstRule;

/* One name-matching rule together with the substitutions it triggers. */
typedef struct {
    char *wild; /* Only one of wild or pat will be non-NULL. */
    pcre *pat;
    SubstRule *substRuleList;	/* head of the rule list parsed from the "{...}" block */
} MatchRule;

/* One recorded length-changing edit in the current changeset. */
typedef struct {
    char *str;		/* bytes to insert, or NULL when the edit only removes text */
    int pos, len;	/* pos: offset with the preceding edits' shifts added in;
			 * len: replacement length minus matched length */
} StrMod;

/* The current changeset and its bookkeeping (see Subst_initChangeset()). */
static StrMod *substList;
/* ListPos: cursor used while applying; ListCnt: entries in use;
 * ListAlloc: entries allocated; BufLen: data length while sliding forward. */
static int substListPos, substListCnt, substListAlloc, substBufLen;
/* PosMod: running total of the edits' len values; LtCnt/GtCnt: how many
 * edits left that running total negative/positive. */
static int substPosMod, substLtCnt, substGtCnt;
/* True when Subst_addChange() must keep its own copy of inserted text. */
static bool substAllocStrs;

/* Bracketing delimiters: closeChars[i] is the closer for openChars[i]. */
static char openChars[] = "{([<";
static char closeChars[] = "})]>";

char *Subst_parseOneSet(int *lp, char **bpp, SubstRule **srp);
static StrMod *slideSubstBufFromStart(char *bp, StrMod *sm);
static bool slideSubstBufFromEnd(char *bp, int *endPosPtr);

/* Begin a new, empty changeset.  The change list itself is allocated on
 * the first call and then reused.  "allocStrs" says whether later calls to
 * Subst_addChange() must keep private copies of the text they are given. */
void
Subst_initChangeset(bool allocStrs)
{
    if (substList == NULL) {
	substListAlloc = 512;
	substList = Mem_alloc(substListAlloc * sizeof *substList);
    }

    substAllocStrs = allocStrs;

    /* Forget every change and every statistic from the previous set. */
    substGtCnt = 0;
    substLtCnt = 0;
    substPosMod = 0;
    substListCnt = 0;
}

/* Record the replacement of the "plen" bytes at bp + pos with the "slen"
 * bytes at "s".
 *
 * The part of the replacement that fits inside the matched span is written
 * into the buffer right away.  Any difference in length is only recorded
 * in substList here; the buffer is resized and shifted later by
 * Subst_applyChangeset().
 *
 * When substAllocStrs is set, the bytes that do not fit are copied so that
 * the caller may reuse "s"; otherwise the entry points into "s", which
 * must then stay valid until the changeset has been applied. */
void
Subst_addChange(char *bp, int pos, int plen, char *s, int slen)
{
    /* Overwrite in place as much as the old and new text have in common. */
    if (slen && plen)
	memcpy(bp + pos, s, slen >= plen? plen : slen);

    if (slen != plen) {
	StrMod *sm;

	if (substListCnt == substListAlloc) {
	    substListAlloc += 512;
	    substList = Mem_realloc(substList,
				substListAlloc * sizeof *substList);
	}
	sm = substList + substListCnt++;
	sm->len = slen - plen;

	if (sm->len <= 0)
	    sm->str = NULL;		/* pure removal: nothing to insert */
	else if (substAllocStrs) {
	    /* Copy exactly the bytes that will be inserted.  Only sm->len
	     * bytes are ever read back, so no terminator is needed, and
	     * the copy is not cut short by a NUL byte inside the text. */
	    sm->str = Mem_alloc(sm->len);
	    memcpy(sm->str, s + plen, sm->len);
	}
	else
	    sm->str = s + plen;

	/* Position of the end of the matched span once the shifts of all
	 * earlier changes have been added in. */
	sm->pos = pos + plen + substPosMod;

	/* Track which way the data has to move so that the cheaper
	 * sliding direction can be chosen when the set is applied. */
	if ((substPosMod += sm->len) < 0)
	    substLtCnt++;
	else if (substPosMod)
	    substGtCnt++;
    }
}

void
Subst_applyChangeset(char **bpp, int *blenp, int *alenp)
{
    if (substListCnt != 0) {
	int blen, alen = *alenp - 1;
	char *bp;

	if ((blen = *blenp += substPosMod) > alen) {
	    alen = (blen + 4095) & ~4095;
	    bp = *bpp = Mem_realloc(*bpp, *alenp = alen + 1);
	}
	else
	    bp = *bpp;

	if (substLtCnt >= substGtCnt) {
	    StrMod *sm = substList;
	    substPosMod = 0;
	    substListPos = 1;
	    substBufLen = blen;
	    while ((sm = slideSubstBufFromStart(bp, sm)) != 0) {}
	}
	else {
	    int endPos = blen;
	    substListPos = substListCnt - 1;
	    while (slideSubstBufFromEnd(bp, &endPos)) {}
	}
	bp[blen] = '\0';

	if (substAllocStrs) {
	    int i;
	    for (i = 0; i < substListCnt; i++) {
		if (substList[i].str)
		    Mem_free(substList[i].str);
	    }
	}
    }
}

/* Apply the recorded change "sm" to the buffer "bp" while working forward
 * through substList.  The text between this change and the next one (or
 * substBufLen, when this is the last change) is moved by the shift
 * accumulated in substPosMod, and then any bytes in sm->str are copied
 * into place at sm->pos.
 *
 * substListPos indexes the change that follows "sm" and is advanced here;
 * substPosMod has sm->len added to it.  The return value is the next
 * change that still has to be processed, or NULL when none is left. */
static StrMod *
slideSubstBufFromStart(char *bp, StrMod *sm)
{
    int segLen, end, mod;
    StrMod *nextSm;
    char *pp;

    if (substListPos == substListCnt) {
	/* No later change: the segment runs to the end of the data. */
	nextSm = NULL;
	end = substBufLen;
    }
    else {
	nextSm = substList + substListPos++;
	/* For a negative len the segment stops at pos + len, not at pos. */
	end = nextSm->pos + (nextSm->len < 0? nextSm->len : 0);
    }

    /* Length of the text that follows this change, and the total shift
     * that applies to it (this change included). */
    segLen = end - (sm->pos + sm->len);
    mod = substPosMod += sm->len;
 
    /* If there's no room yet, slide the next chunk out of the way first. */
    if (mod > 0 && nextSm)
	nextSm = slideSubstBufFromStart(bp, nextSm); /* Changes substPosMod */
 
    pp = bp + sm->pos;
    if (segLen && mod) {
	/* The segment currently sits "mod" bytes before its destination. */
	char *cp = pp + sm->len;
	memmove(cp, cp - mod, segLen);
    }
    if (sm->str)
	memcpy(pp, sm->str, sm->len);
 
    return nextSm;
}

/* Apply the change at substList[substListPos] to the buffer "bp" while
 * working backward through substList.  The text between this change and
 * *endPosPtr is moved by the shift held in substPosMod on entry, and then
 * any bytes in sm->str are copied into place at sm->pos.
 *
 * On return substListPos has been decremented, sm->len has been taken off
 * substPosMod, and *endPosPtr marks where the preceding segment ends.
 * Returns false (touching nothing) once substListPos has gone below zero,
 * true otherwise. */
static bool
slideSubstBufFromEnd(char *bp, int *endPosPtr)
{
    int segLen, mod;
    StrMod *sm;
    char *pp;

    if (substListPos < 0)
	return false;

    sm = substList + substListPos--;
 
    /* Length of the text that follows this change, the shift that applies
     * to it, and the end of the segment in front of this change. */
    segLen = *endPosPtr - (sm->pos + sm->len);
    mod = substPosMod;
    *endPosPtr = sm->pos + (sm->len < 0? sm->len : 0);
 
    /* If there's no room yet, slide the prior chunk out of the way first. */
    if ((substPosMod -= sm->len) < 0)
	slideSubstBufFromEnd(bp, endPosPtr); /* Changes substPosMod */
 
    pp = bp + sm->pos;
    if (segLen && mod) {
	/* The segment currently sits "mod" bytes before its destination. */
	char *cp = pp + sm->len;
	memmove(cp, cp - mod, segLen);
    }
    if (sm->str)
	memcpy(pp, sm->str, sm->len);
 
    return true;
}

/* Pass in a null-terminated buffer to be parsed.  The resulting
 * MatchRules are appended to the destRules array, each one pointing at
 * the linked list of SubstRule objects parsed from the brace block that
 * follows it.  The return value is NULL on success, otherwise it contains
 * a malloced error message.
 *
 * The input is a sequence of name-matching items, each ending in ':',
 * followed by a "{ ... }" block that is handed to Subst_parseOneSet():
 *
 *	"wildcard":		a quoted wildcard string
 *	/regex/flags:		a regular expression
 *	mXregexXflags:		the same with another delimiter; an opening
 *				bracket ("{([<") pairs with its closer
 *
 * Regex flags are 'i' (PCRE_CASELESS) and 'u' (PCRE_UNGREEDY).  Several
 * items may precede one block; they all share its rule list.  Text from
 * '#' to the end of the line is a comment.  The buffer is modified
 * temporarily while parsing but the match-rule text is restored. */
char *
Subst_parseRules(MArray *destRules, char *txt)
{
    MArray *nameArray = MArray_new(32, 0);
    MatchRule *mr;
    char ch, opener, closer, *start, *end, *line;
    int depth, lineno = 1;

    while (1) {
	while (ISSPACE(*txt)) {
	    if (*txt == '\n')
		lineno++;
	    txt++;
	}
	if (*txt == '#') {
	    /* Stop on the newline so that the loop above counts it. */
	    while (*++txt && *txt != '\n') {}
	    continue;
	}
	if (!*txt)
	    break;

	if (*txt == '{') {
	    SubstRule *sr;
	    char *err;
	    if (!MArray_itemCnt(nameArray)) {
		MBuf *msg = MBuf_new(64, 0);
		char buf[32];
		sprintf(buf, "%d", lineno);
		MBuf_vwrite(msg, "No filename-match items found before opening brace on line ",-1,
			    buf,-1, NULL);
		return MBuf_toBuffer(msg, NULL);
	    }
	    txt++;
	    err = Subst_parseOneSet(&lineno, &txt, &sr);
	    if (err)
		return err;
	    if (*txt != '}')
		return Mem_strdup("Failed to find closing brace before EOF");
	    /* Every pending match item shares this one rule list. */
	    while ((mr = MArray_fetchPtr(nameArray)) != NULL) {
		mr->substRuleList = sr;
		MArray_appendPtr(destRules, mr);
	    }
	    MArray_truncate(nameArray, 0);
	    txt++;
	    continue;
	}

	mr = Mem_alloc(sizeof *mr);
	line = txt;
	if (*txt == '"') {
	    mr->pat = NULL;
	    mr->wild = start = ++txt;
	    opener = closer = '"';
	}
	else if (*txt == '/' || (*txt == 'm' && *++txt && !ISSPACE(*txt))) {
	    char *cp;
	    mr->wild = NULL;
	    opener = closer = *txt++;
	    start = txt;
	    if ((cp = strchr(openChars, opener)) != NULL)
		closer = closeChars[cp - openChars];
	}
	else {
	    MBuf *err = MBuf_new(64, 0);
	    char *nl = strchr(line, '\n');
	    if (!nl)
		nl = line + strlen(line);
	    MBuf_vwrite(err, "Looking for file-match rule, found: ",-1,
			line,nl-line, NULL);
	    return MBuf_toBuffer(err, NULL);
	}

	/* Find the closing delimiter, allowing nested bracket pairs and
	 * backslash-escaped characters. */
	for (depth = 0; (ch = *txt) != '\0'; txt++) {
	    if (ch == closer) {
		if (!depth--)
		    break;
	    }
	    else if (ch == opener)
		depth++;
	    else if (ch == '\\' && txt[1])
		txt++;
	}
	if (*txt != closer) {
	    MBuf *err = MBuf_new(64, 0);
	    char *nl = strchr(line, '\n');
	    if (!nl)
		nl = line + strlen(line);
	    MBuf_vwrite(err, "Invalid syntax in file-match rule: ",-1,
			line,nl-line, NULL);
	    return MBuf_toBuffer(err, NULL);
	}
	end = txt;
	if (mr->wild) {
	    *end = '\0';
	    mr->wild = Mem_strdup(start);
	    *end = closer;
	    /* Skip to the ':', keeping the line count right on the way. */
	    for (txt++; ISSPACE(*txt); txt++) {
		if (*txt == '\n')
		    lineno++;
	    }
	}
	else {
	    const char *errstr;
	    int errpos, options = 0;

	    while (*++txt && *txt != ':' && *txt != '\n') {
		switch (*txt) {
		  case 'i':
		    options |= PCRE_CASELESS;
		    break;
		  case 'u':
		    options |= PCRE_UNGREEDY;
		    break;
		  case ' ':
		  case '\t':
		    break;
		  default:
		    {
			MBuf *err = MBuf_new(64, 0);
			char *nl = strchr(line, '\n');
			if (!nl)
			    nl = line + strlen(line);
			MBuf_vwrite(err, "Invalid file-match flag `",-1, txt,1,
				    "' on this rule: ",-1, line,nl-line, NULL);
			return MBuf_toBuffer(err, NULL);
		    }
		}
	    }
	    /* Terminate the pattern just long enough to compile it, and
	     * pass on the flags gathered above. */
	    *end = '\0';
	    mr->pat = pcre_compile(start, options, &errstr, &errpos, NULL);
	    *end = closer;
	    if (!mr->pat) {
		MBuf *err = MBuf_new(64, 0);
		/* Quote only the pattern, not the rest of the input. */
		MBuf_vwrite(err, "Invalid regex in file-match rule (",-1,
			    errstr,-1, "): ",3, start,end-start, NULL);
		return MBuf_toBuffer(err, NULL);
	    }
	}
	if (*txt != ':') {
	    MBuf *err = MBuf_new(64, 0);
	    char *nl = strchr(line, '\n');
	    if (!nl)
		nl = line + strlen(line);
	    MBuf_vwrite(err, "Failed to find ':' at end of file-match rule: ",-1,
			line,nl-line, NULL);
	    return MBuf_toBuffer(err, NULL);
	}
	MArray_appendPtr(nameArray, mr);
	txt++;
    }
    MArray_delete(nameArray);

    return NULL;
}

/* Parse the substitution rules inside one "{ ... }" block.
 *
 * *bpp points just past the opening brace and *lp holds the current line
 * number.  On success NULL is returned, *srp receives the head of the
 * parsed rule list (NULL for an empty block), *bpp is left on the closing
 * '}' (or on the terminating NUL if the block was never closed) and *lp is
 * brought up to date.  On failure the return value is an error message
 * and none of the three outputs is written.
 *
 * Each rule has the form  sXpatternXreplacementXflags;  where X is any
 * non-space delimiter.  An opening bracket ("{([<") pairs with its closer,
 * and then the replacement has delimiters of its own, optionally separated
 * from the pattern by whitespace.  Flags: i s m u x select the matching
 * PCRE options and g makes the rule global.  Backslash escapes in the
 * replacement are decoded here, in place, so the input buffer is modified.
 * Text from '#' to the end of the line is a comment. */
char *
Subst_parseOneSet(int *lp, char **bpp, SubstRule **srp)
{
    SubstRule *listHead = NULL, *listTail = NULL, *sr;
    char ch, opener, closer, *cp, *pat, *end, *line, *txt = *bpp;
    bool skipMiddleWhitespace;
    const char *errstr;
    int options, errpos, depth, lineno = *lp;

    for ( ; *txt && *txt != '}'; txt++) {
	if (ISSPACE(*txt)) {
	    if (*txt == '\n')
		lineno++;
	    continue;
	}
	if (*txt == '#') {
	    while (*++txt && *txt != '\n') {}
	    if (!*txt)
		break;
	    lineno++;
	    continue;
	}
	line = txt;
	/* The delimiter must be a real character.  A NUL here would match
	 * the terminator of openChars and send the scan past the input. */
	if (*txt != 's' || !txt[1] || ISSPACE(txt[1])) {
	    MBuf *err = MBuf_new(64, 0);
	    char *nl = strchr(line, '\n');
	    if (!nl)
		nl = line + strlen(line);
	    MBuf_vwrite(err, "Expecting substitution rule, found: ",-1,
			line,nl-line, NULL);
	    return MBuf_toBuffer(err, NULL);
	}
	opener = closer = *++txt;
	if ((cp = strchr(openChars, opener)) != NULL) {
	    closer = closeChars[cp - openChars];
	    skipMiddleWhitespace = true;
	}
	else
	    skipMiddleWhitespace = false;
	sr = Mem_alloc(sizeof *sr);

	/* Find the end of the pattern, allowing nested bracket pairs and
	 * backslash-escaped characters. */
	for (pat = ++txt, depth = 0; (ch = *txt) != '\0'; txt++) {
	    if (ch == closer) {
		if (!depth--)
		    break;
	    }
	    else if (ch == opener)
		depth++;
	    else if (ch == '\\' && txt[1])
		txt++;
	    else if (ch == '\n')
		lineno++;
	}
	if (*txt != closer) {
	    MBuf *err = MBuf_new(64, 0);
	    char *nl = strchr(line, '\n');
	    if (!nl)
		nl = line + strlen(line);
	    MBuf_vwrite(err, "Didn't find middle `",-1, &closer,1,
			"' before EOF near: ",-1, line,nl-line, NULL);
	    return MBuf_toBuffer(err, NULL);
	}
	end = txt;
	if (skipMiddleWhitespace) {
	    /* Bracketed form: the replacement brings its own delimiters. */
	    for (txt++; ISSPACE(*txt); txt++) {
		if (*txt == '\n')
		    lineno++;
	    }
	    if (!*txt) {
		MBuf *err = MBuf_new(64, 0);
		char *nl = strchr(line, '\n');
		if (!nl)
		    nl = line + strlen(line);
		MBuf_vwrite(err, "Didn't find replacement text before EOF near: ",-1,
			    line,nl-line, NULL);
		return MBuf_toBuffer(err, NULL);
	    }
	    opener = closer = *txt;
	    if ((cp = strchr(openChars, opener)) != NULL)
		closer = closeChars[cp - openChars];
	}

	/* Scan the replacement, decoding escapes into the same storage
	 * (the write pointer "cp" never gets ahead of the read pointer). */
	for (sr->subst = cp = ++txt, depth = 0; (ch = *txt) != '\0'; txt++) {
	    if (ch == closer) {
		if (!depth--)
		    break;
	    }
	    else if (ch == opener)
		depth++;
	    else if (ch == '\\') {
		int num;
		switch (ch = *++txt) {
		  case 'x':	/* any number of hex digits */
		    num = 0;
		    while (ISDIGIT(ch = txt[1])
		     || (ISALPHA(ch) && (ch = (ch&0x1F)+9+'0') < 16+'0'))
			num = num * 16 + ch - '0', txt++;
		    ch = num;
		    break;
		  case '0': case '1': case '2': case '3':
		  case '4': case '5': case '6': case '7':
		    num = ch - '0';
		    while ((ch = txt[1]) <= '7' && ch >= '0')
			num = num * 8 + ch - '0', txt++;
		    ch = num;
		    break;
		  case 'c':	/* control character */
		    if ((ch = txt[1]) != '\0')
			ch &= 0x1F, txt++;
		    else
			ch = 'c';
		    break;
		  case 'n':
		    ch = '\n';
		    break;
		  case 'r':
		    ch = '\r';
		    break;
		  case 't':
		    ch = '\t';
		    break;
		  case 'e':
		    ch = '\033';
		    break;
		  case 'f':
		    ch = '\f';
		    break;
		  case 'a':
		    ch = '\007';
		    break;
		  case 'b':	/* backspace, not the bell that \a gives */
		    ch = '\b';
		    break;
		  case '\0':
		    /* Backslash at EOF: step back so the loop stops on the
		     * terminator and the error below is reported. */
		    txt--;
		    break;
		  default:	/* any other character stands for itself */
		    break;
		}
	    }
	    else if (ch == '\n')
		lineno++;
	    *cp++ = ch;
	}
	if (*txt != closer) {
	    MBuf *err = MBuf_new(64, 0);
	    char *nl = strchr(line, '\n');
	    if (!nl)
		nl = line + strlen(line);
	    MBuf_vwrite(err, "Didn't find closing `",-1, &closer,1,
			"' before EOF near: ",-1, line,nl-line, NULL);
	    return MBuf_toBuffer(err, NULL);
	}
	*cp = '\0';
	sr->subst = Mem_strdup(sr->subst);
	sr->next = NULL;
	sr->flags = options = 0;
	while (*++txt && *txt != ';') {
	    switch (*txt) {
	      case 'i':
		options |= PCRE_CASELESS;
		break;
	      case 's':
		options |= PCRE_DOTALL;
		break;
	      case 'g':
		sr->flags |= GLOBAL_SUBST;
		break;
	      case 'm':
		options |= PCRE_MULTILINE;
		break;
	      case 'u':
		options |= PCRE_UNGREEDY;
		break;
	      case 'x':
		options |= PCRE_EXTENDED;
		break;
	      case ' ':
		break;
	      default:
		{
		    MBuf *err = MBuf_new(64, 0);
		    char *nl = strchr(line, '\n');
		    if (!nl)
			nl = line + strlen(line);
		    MBuf_vwrite(err, "Invalid substitution flag `",-1, txt,1,
				"' on this rule: ",-1, line,nl-line, NULL);
		    return MBuf_toBuffer(err, NULL);
		}
	    }
	}
	/* Terminate the pattern just long enough to compile it. */
	ch = *end;
	*end = '\0';
	sr->pat = pcre_compile(pat, options, &errstr, &errpos, NULL);
	*end = ch;
	if (!sr->pat) {
	    MBuf *err = MBuf_new(64, 0);
	    /* Quote only the pattern, not the rest of the input. */
	    MBuf_vwrite(err, "Invalid regex in substitution rule (",-1,
			errstr,-1, "): ",3, pat,end-pat, NULL);
	    return MBuf_toBuffer(err, NULL);
	}
	if (listHead)
	    listTail->next = sr;
	else
	    listHead = sr;
	listTail = sr;

	/* The input ended before the ';'.  Stop here so that the loop
	 * increment cannot step over the terminating NUL. */
	if (!*txt)
	    break;
    }
    *lp = lineno;
    *bpp = txt;
    *srp = listHead;

    return NULL;
}

/* Number of capture pairs (group 0 included) that the offset vectors hold.
 * pcre_exec() wants three ints of vector space per pair. */
#define REF_MAX 100

/* This destroys "buf" and returns a new MBuf that may have been rewritten.
 *
 * Each MatchRule in "rules" is tried against "name", in order.  For every
 * one that matches, its substitution rules are run over the data in turn;
 * a rule replaces its first match only, unless it has GLOBAL_SUBST set.
 *
 * In a replacement string, $N and $& expand to capture group N and to the
 * whole match of the substitution pattern, and $/N expands to group N of
 * the name match ($/ alone is $/0).  For a wildcard name match $/0 is the
 * whole name.  A reference to a group that does not exist or did not take
 * part in the match expands to nothing. */
MBuf *
Subst_runRules(MArray *rules, const char *name, MBuf *buf)
{
    int alen, blen, nlen = strlen(name);
    /* fnVectCnt is the number of usable pairs in fnVect. */
    int fnVectCnt, fnVect[REF_MAX*3], ovect[REF_MAX*3];
    MatchRule *mr;
    SubstRule *sr;
    char *bp, *cp;

    blen = buf->totalLen;
    bp = MBuf_toBuffer(buf, &alen);
    buf = MBuf_new(128, 0); /* Use this for substitution strings for a while */

    MArray_setFetchPos(rules, 0);
    while ((mr = MArray_fetchPtr(rules)) != NULL) {
	if (mr->wild) {
	    if (!Wild_EQ(mr->wild, name))
		continue;
	    /* No sub-captures here; give $/0 a defined value rather than
	     * whatever an earlier rule (or nothing at all) left behind. */
	    fnVect[0] = 0;
	    fnVect[1] = nlen;
	    fnVectCnt = 1;
	}
	else {
	    fnVectCnt = pcre_exec(mr->pat,NULL,name,nlen,0,0,fnVect,REF_MAX*3);
	    if (fnVectCnt < 0)
		continue;
	    if (fnVectCnt == 0)		/* more groups than the vector holds */
		fnVectCnt = REF_MAX;
	}
	for (sr = mr->substRuleList; sr; sr = sr->next) {
	    int ret, flen = 0, pos = 0;
	    char *f = NULL;
	    Subst_initChangeset(true);
	    while (pos < blen) {
		ret = pcre_exec(sr->pat,NULL,bp,blen,pos,0,ovect,REF_MAX*3);
		if (ret < 0)
		    break;
		if (ret == 0)		/* more groups than the vector holds */
		    ret = REF_MAX;
		/* The replacement is built for the first match and again
		 * for each later one only if it contains references;
		 * otherwise substAllocStrs stays false and the text in
		 * "buf" is reused as is. */
		if (substAllocStrs) {
		    substAllocStrs = false;
		    MBuf_truncate(buf, 0);
		    for (cp = sr->subst; *cp; cp++) {
			const char *src;
			int *vect, limit, num;

			if (*cp != '$'
			 || !(ISDIGIT(cp[1]) || cp[1] == '&' || cp[1] == '/')) {
			    MBuf_putc(buf, *cp);
			    continue;
			}
			if (cp[1] == '/') {
			    cp++;
			    src = name;
			    vect = fnVect;
			    limit = fnVectCnt;
			}
			else {
			    src = bp;
			    vect = ovect;
			    limit = ret;
			}
			num = 0;
			if (vect == ovect && cp[1] == '&')
			    cp++;	/* consume the '&' as well */
			else {
			    /* Only digits count (no sign, no blanks), and
			     * the value is kept from overflowing. */
			    while (ISDIGIT(cp[1])) {
				if (num < REF_MAX)
				    num = num * 10 + (cp[1] - '0');
				cp++;
			    }
			}
			/* limit never exceeds REF_MAX.  An offset of -1
			 * marks a group that did not take part. */
			if (num < limit && vect[num*2] >= 0) {
			    MBuf_write(buf, src + vect[num*2],
				       vect[num*2+1] - vect[num*2]);
			}
			substAllocStrs = true;
		    }
		    if ((flen = buf->totalLen) != 0)
			f = MBuf_dataPtr(buf, NULL);
		}
		Subst_addChange(bp, ovect[0], ovect[1] - ovect[0], f, flen);
		if (!(sr->flags & GLOBAL_SUBST))
		    break;
		pos = ovect[1];
		/* An empty match would be found again at the same spot
		 * forever, so step past it. */
		if (ovect[1] == ovect[0])
		    pos++;
	    }
	    Subst_applyChangeset(&bp, &blen, &alen);
	}
    }
    MBuf_delete(buf);

    buf = MBuf_new(4096, 4096);
    MBuf_appendBuffer(buf, bp, blen, alen);

    return buf;
}
