/* rbdump.c
 *
 * This is a test program that allows you to look at a .html page and its
 * associated .hidx page and see if the offsets all match up.
 */
/* This software is copyrighted as detailed in the LICENSE file. */

#include <config.h>
#ifdef HAVE_GETOPT_H
#include <getopt.h>
#endif
#include <ctype.h>
#include <rbmake/rbfile.h>

/* Option letters handed to getopt() in main(); none of them takes an
 * argument. */
static char opts[] = "hnptV";

/* Which .hidx sections to print, set by -t, -p and -n respectively.
 * main() turns all three on when none of them was requested. */
static bool outputTags, outputParagraphs, outputNames;

/* Forward declarations of the file-local helpers defined after main(). */
static void handlePage(RbFile *rf, ToC *toc);
static void dumpIt(MBuf *hidx, MBuf *html);
static int outputHtmlContext(MBuf *html, int off, int before, int after);
static void outputContextPart(const char *cp, int cnt, bool putStart);
static void usage(void);

int
main(int argc, char *argv[])
{
    ToC *toc;
    RbFile *rf;
    char *rbfile, *suf;
    int ch;

    while ((ch = getopt(argc, argv, opts)) != EOF) {
	switch (ch) {
	  case 'n':
	    outputNames = true;
	    break;
	  case 'p':
	    outputParagraphs = true;
	    break;
	  case 't':
	    outputTags = true;
	    break;
	  case 'V':
	    RbError_warn("rbdump v%s\n", RBMAKE_VERSION);
	    exit(0);
	  case 'h':
	  default:
	    usage();
	}
    }
    rbfile = argv[optind];
    if (optind >= argc || !rbIsRbSuf(rbGetFileSuffix(rbfile))) {
	RbError_exit("You didn't specify a .rb file to dump.  "
		     "Use the -h option to get help.\n");
    }

    if (!outputTags && !outputParagraphs && !outputNames)
	outputTags = outputParagraphs = outputNames = true;

    if (!(rf = RbFile_open(argv[optind], RB_OPENFLAG_INCLUDE_HIDDEN)))
	exit(1);

    if (++optind == argc) {
	for (toc = RbFile_getTocHead(rf); toc; toc = ToC_getNext(toc)) {
	    suf = rbGetFileSuffix(ToC_getName(toc));
	    if (rbIsHtmlSuf(suf) || rbIsTextSuf(suf)) {
		printf("\nDumping %s:\n", ToC_getName(toc));
		handlePage(rf, toc);
	    }
	}
    }
    else {
	do {
	    optarg = argv[optind];
	    suf = rbGetFileSuffix(optarg);
	    if (!rbIsHtmlSuf(suf) && !rbIsTextSuf(suf)) {
		printf("%s is not a .html or .txt file.\n", optarg);
		continue;
	    }
	    if (!(toc = RbFile_find(rf, optarg))) {
		printf("Couldn't find %s in %s.\n", optarg, rbfile);
		continue;
	    }
	    handlePage(rf, toc);
	} while (++optind < argc);
    }
    return 0;
}

/*
 * Dump one page of the .rb file together with its index page.
 *
 * The index is looked up under the page's own name with its suffix
 * replaced by "hidx".  When it exists, both pages are read into memory
 * and handed to dumpIt(); otherwise a message naming the missing index
 * page is printed.  A page whose name (or derived index name) does not
 * fit the local name buffer is reported and skipped.
 *
 *   rf   the open .rb file to read from
 *   toc  the table-of-contents entry of the page to dump
 */
static void
handlePage(RbFile *rf, ToC *toc)
{
    const char *name = ToC_getName(toc);
    char *suf, tn[64];
    MBuf *hidx, *html;
    ToC *htoc;

    /* tn is a fixed-size scratch buffer: refuse names that do not fit
     * instead of writing past its end. */
    if (strlen(name) >= sizeof tn) {
        printf("Name is too long to derive a .hidx name from: %s\n", name);
        return;
    }
    strcpy(tn, name);
    suf = rbGetFileSuffix(tn);
    /* "hidx" may be longer than the suffix it overwrites (e.g. "txt"), so
     * check that the replacement plus its terminator still fits.
     * NOTE(review): this assumes rbGetFileSuffix() returns a pointer into
     * tn, which the strcpy() below has always relied on -- confirm. */
    if ((size_t)(suf - tn) + sizeof "hidx" > sizeof tn) {
        printf("Name is too long to derive a .hidx name from: %s\n", name);
        return;
    }
    strcpy(suf, "hidx");

    hidx = MBuf_new(1024, 32*1024);
    html = MBuf_new(1024, 32*1024);
    if ((htoc = RbFile_find(rf, tn)) != NULL) {
        RbFile_readPage(rf, toc, html, NULL);
        RbFile_readPage(rf, htoc, hidx, NULL);
        dumpIt(hidx, html);
    }
    else
        printf("Didn't find %s to go with %s.\n", tn, name);
    MBuf_delete(hidx);
    MBuf_delete(html);
}

/* Which .hidx section dumpIt() is currently reading; NOTHING means it is
 * between sections.  The values double as indexes into sectionName[]. */
#define NOTHING 0
#define TAGS 1
#define PARAGRAPHS 2
#define NAMES 3

/* Printable section names, indexed by the mode constants above. */
char *sectionName[] = { "NOTHING", "tags", "paragraphs", "names" };

/*
 * Parse the text of a .hidx page and print the sections that were asked
 * for (see outputTags, outputParagraphs and outputNames), using the
 * matching .html page to show what each offset points at.
 *
 * The index is read one line at a time.  A line starting with '[' opens
 * a section ("tags N", "paragraphs N" or "names N", N being the number
 * of entries announced), an empty line closes the current section, and
 * any other line is an entry of the section that is open:
 *
 *   tags        <tag> PARENT   PARENT is the index of an earlier tag whose
 *                              expanded text is prefixed to this one; the
 *                              first entry's PARENT must start with '-'
 *   paragraphs  OFFSET TAG     an html offset and a tag index
 *   names       "name" OFFSET  an html offset
 *
 * A section holding a different number of entries than announced is
 * reported, including the last one when the index does not end with an
 * empty line.  A malformed line prints an ERROR message and abandons the
 * rest of the page.
 *
 *   hidx  the index page text
 *   html  the page the index describes
 */
static void
dumpIt(MBuf *hidx, MBuf *html)
{
    char buf[2048];
    int len, cnt = 0, limit = 0, pos, off, mode = NOTHING;
    char *s, *t, ch;
    bool firstTag = true;
    /* tagBuf collects the NUL-terminated expanded text of every tag seen;
     * tagOff holds, per tag index, where that text starts in tagBuf. */
    MBuf *tagBuf = MBuf_new(1024, 0);
    MArray *tagOff= MArray_new(1024, 0);

    /* NOTE(review): tagBuf and tagOff are not released on any path out of
     * this function, so each dumped page leaks them.  Releasing them needs
     * the MArray counterpart of MBuf_delete(), which is not visible from
     * this file -- confirm its name and add a common cleanup exit. */

    MBuf_setReadPos(hidx, 0, 0);
    while ((len = MBuf_gets(hidx, buf, sizeof buf)) > 0) {
        if (len == 1 && *buf == '\n') {
            /* An empty line ends the current section. */
            if (mode == NOTHING)
                printf("ERROR: incorrect empty lines.\n");
            else if (cnt != limit) {
                printf("\nFound %d items in the \"%s\" section instead of %d.\n", cnt,
                       sectionName[mode], limit);
            }
            mode = NOTHING;
            continue;
        }
        if (*buf == '[') {
            /* Drop the line terminator so the header can be echoed with a
             * trailing remark on the same line. */
            buf[len-1] = '\0';
            /* A lone "[" has no second-to-last character to inspect. */
            if (len < 2 || buf[len-2] != ']') {
                printf("ERROR: invalid section syntax:\n%s", buf);
                return;
            }
            else if (strnEQ(buf+1, "tags ", 5)) {
                mode = TAGS;
                limit = atoi(buf+6);
                cnt = 0;
                if (outputTags)
                    printf("\n%s\n", buf);
            }
            else if (strnEQ(buf+1, "paragraphs ", 11)) {
                mode = PARAGRAPHS;
                limit = atoi(buf+12);
                cnt = 0;
                if (outputParagraphs) {
                    printf("\n%s  (the offset points AFTER the para-tweaking tag)\n\n",
                           buf);
                }
            }
            else if (strnEQ(buf+1, "names ", 6)) {
                mode = NAMES;
                limit = atoi(buf+7);
                cnt = 0;
                if (outputNames) {
                    printf("\n%s  (the offset points BEFORE the name-including tag)\n\n",
                           buf);
                }
            }
            else {
                printf("ERROR: invalid section name:\n%s", buf);
                return;
            }
        }
        else {
            switch (mode) {
              case NOTHING:
                printf("ERROR: didn't find next section:\n%s", buf);
                return;
              case TAGS:
                if (*buf != '<' || !(s = strchr(buf+1, '>')) || s[1] != ' '
                 || (firstTag? s[2] != '-' : !ISDIGIT(s[2]))) {
                    printf("ERROR: invalid \"tags\" line:\n%s", buf);
                    return;
                }
                /* A parent index may only refer to a tag already stored. */
                if ((pos = atoi(s+2)) >= MArray_itemCnt(tagOff)) {
                    printf("ERROR: invalid \"tags\" index:\n%s", buf);
                    return;
                }
                cnt++;
                firstTag = false;
                /* Cut the line right after the closing '>'. */
                s[1] = '\0';
                len = MBuf_getLength(tagBuf);
                MArray_append(tagOff, len);
                if (pos >= 0) {
                    /* Start the new entry with the parent's expanded text.
                     * NOTE(review): s points into tagBuf while tagBuf is
                     * being appended to; this is only safe if MBuf_puts()
                     * never moves existing data -- confirm. */
                    off = MArray_fetchAt(tagOff, pos);
                    s = MBuf_dataPtrAt(tagBuf, off, NULL);
                    MBuf_puts(tagBuf, s);
                }
                MBuf_puts(tagBuf, buf);
                MBuf_putc(tagBuf, '\0');
                if (outputTags) {
                    t = MBuf_dataPtrAt(tagBuf, len, NULL);
                    printf("%d. %s\n", cnt-1, t);
                }
                break;
              case PARAGRAPHS:
                if (sscanf(buf, "%d %d%c", &off,&pos,&ch) != 3 || ch != '\n') {
                    printf("ERROR: invalid \"paragraphs\" line:\n%s", buf);
                    return;
                }
                cnt++;
                if (outputParagraphs) {
                    /* pos indexes tagOff below, so it has to name a tag
                     * that the "tags" section actually supplied. */
                    if (pos < 0 || pos >= MArray_itemCnt(tagOff)) {
                        printf("ERROR: invalid \"paragraphs\" tag index:\n%s",
                               buf);
                        return;
                    }
                    if (outputHtmlContext(html, off, 20, 10) < 0) {
                        printf("ERROR: bogus \"paragraphs\" offset:\n%s", buf);
                        return;
                    }
                    t = MBuf_dataPtrAt(tagBuf,MArray_fetchAt(tagOff,pos),NULL);
                    printf(" | %d %d %s\n", off, pos, t);
                }
                break;
              case NAMES:
                if (*buf != '"' || !(s = strchr(buf+1, '"')) || s[1] != ' '
                 || !ISDIGIT(s[2])) {
                    printf("ERROR: invalid \"names\" line:\n%s", buf);
                    return;
                }
                /* Read the offset before the line is cut after the name. */
                off = atoi(s+2);
                cnt++;
                s[1] = '\0';
                if (outputNames) {
                    if (outputHtmlContext(html, off, 10, 30) < 0) {
                        printf("ERROR: bogus \"names\" offset:\n%s", buf);
                        return;
                    }
                    printf(" | %s %d\n", buf, off);
                }
                break;
            }
        }
    }

    /* The index may end without a closing empty line; the section that was
     * still open then gets the same entry-count check. */
    if (mode != NOTHING && cnt != limit) {
        printf("\nFound %d items in the \"%s\" section instead of %d.\n", cnt,
               sectionName[mode], limit);
    }
}

/*
 * Print the html surrounding an offset as "BEFORE | AFTER" (no newline):
 * the `before` bytes that precede `off`, a separator, and up to `after`
 * bytes starting at `off`.  When `off` lies closer than `before` bytes to
 * the start of the page, the left part is padded with spaces so that it
 * keeps its width.
 *
 *   html    the page to take the context from
 *   off     offset into the page that the context is centred on
 *   before  number of bytes to show ahead of the offset
 *   after   maximum number of bytes to show from the offset on
 *
 * Returns 0 on success, or -1 without printing anything when an argument
 * is out of range, the read position cannot be set, or the page ends
 * before `off`.
 */
static int
outputHtmlContext(MBuf *html, int off, int before, int after)
{
    char data[1024];
    /* outputContextPart() may expand a byte into three characters inside
     * a buffer the size of data, so keep the window to a third of that. */
    const int maxTotal = ((int)sizeof data - 1) / 3;
    int total, fudge, got;

    if (off < 0 || before < 0 || after < 0 || before > maxTotal - after)
        return -1;
    total = before + after;

    /* fudge is the number of leading bytes that do not exist in the page
     * and are shown as spaces instead. */
    if (off < before) {
        fudge = before - off;
        memset(data, ' ', fudge);
    }
    else
        fudge = 0;

    /* Only the part of the window that is not padding is read, and the
     * padding counts toward the total, so a short page near offset 0 is
     * neither rejected nor shown with a truncated right-hand side. */
    if (MBuf_setReadPos(html, off - before + fudge, 0) < 0
     || (got = MBuf_read(html, data + fudge, total - fudge)) < 0)
        return -1;
    total = fudge + got;
    if (total < before)
        return -1;
    data[total] = '\0';

    outputContextPart(data, before, false);
    fputs(" | ", stdout);
    outputContextPart(data + before, total - before, true);

    return 0;
}

/*
 * Write `cnt` bytes starting at `cp` to stdout in a printable form that is
 * exactly `cnt` characters wide.
 *
 * Tab, CR and LF become \t, \r and \n; backslash and '^' get a backslash
 * prefix; other control characters are shown in caret notation (^@ for
 * 0, ^A for 1, ...); a number of bytes in the 128-160 range are replaced
 * by an ASCII look-alike; everything else is copied unchanged.  Because
 * escaping can lengthen the text, only part of it is printed: the first
 * `cnt` characters when putStart is true, the last `cnt` otherwise.
 *
 *   cp        bytes to show (need not be NUL-terminated)
 *   cnt       number of bytes to read and of characters to print; values
 *             below 1 print nothing, values too large for the local
 *             buffer are reduced to what fits
 *   putStart  true to keep the start of the escaped text, false to keep
 *             its end
 */
static void
outputContextPart(const char *cp, int cnt, bool putStart)
{
    char buf[1024], *t = buf;
    /* One input byte yields at most three characters (the ellipsis). */
    const int maxCnt = ((int)sizeof buf - 1) / 3;
    int ch, i;

    if (cnt <= 0)
        return;
    if (cnt > maxCnt)
        cnt = maxCnt;

    for (i = 0; i < cnt; i++) {
        ch = (unsigned char)cp[i];
        switch (ch) {
          case '\t':
            *t++ = '\\';
            *t++ = 't';
            break;
          case '\r':
            *t++ = '\\';
            *t++ = 'r';
            break;
          case '\n':
            *t++ = '\\';
            *t++ = 'n';
            break;
          case '\\':
          case '^':
            /* Escaped so they cannot be mistaken for the notations above. */
            *t++ = '\\';
            *t++ = ch;
            break;
          case 128: *t++ = '?'; break;  /* Euro sign */
          case 130: *t++ = '?'; break;  /* Single low-9 quotation mark */
          case 131: *t++ = 'f'; break;  /* Latin 'f' with hook */
          case 132: *t++ = '?'; break;  /* Double low-9 quotation mark */
          case 133:                     /* Horizontal ellipsis */
            *t++ = '.';
            *t++ = '.';
            *t++ = '.';
            break;
          case 134: *t++ = '?'; break;  /* Dagger */
          case 135: *t++ = '?'; break;  /* Double dagger */
          case 136: *t++ = '?'; break;  /* Modifier letter circumflex accent */
          case 137: *t++ = '?'; break;  /* Per mille sign */
          case 138: *t++ = 'S'; break;  /* Latin 'S' with caron */
          case 139: *t++ = '{'; break;  /* Single left-pointing angle quote */
          case 140:                     /* Latin capital ligature OE */
            *t++ = 'O';
            *t++ = 'E';
            break;
          case 142: *t++ = 'Z'; break;  /* Latin 'Z' with caron */
          case 145: *t++ = '\''; break; /* Left single quote */
          case 146: *t++ = '\''; break; /* Right single quote */
          case 147: *t++ = '"'; break;  /* Left double quote */
          case 148: *t++ = '"'; break;  /* Right double quote */
          case 149: *t++ = '*'; break;  /* Bullet */
          case 150: *t++ = '-'; break;  /* En dash */
          case 151: *t++ = '-'; break;  /* Em dash */
          case 152: *t++ = '*'; break;  /* Small tilde */
          case 153: *t++ = '?'; break;  /* Trade mark sign */
          case 154: *t++ = 's'; break;  /* Latin 's' with caron */
          case 155: *t++ = '}'; break;  /* Single right-pointing angle quote */
          case 156:                     /* Latin small ligature oe */
            *t++ = 'o';
            *t++ = 'e';
            break;
          case 158: *t++ = 'z'; break;  /* Latin 'z' with caron */
          case 159: *t++ = 'Y'; break;  /* Latin 'Y' with diaeresis */
          case 160: *t++ = ' '; break;  /* nbsp */
          default:
            if (ch < 32) {
                /* Caret notation: the control code offset by '@', so that
                 * every code maps to its own letter (1 -> ^A, 2 -> ^B). */
                *t++ = '^';
                *t++ = ch | '@';
            }
            else {
                /* Includes the remaining bytes above 127, passed as-is. */
                *t++ = ch;
            }
            break;
        }
    }
    *t = '\0';

    /* Every byte produced at least one character, so the escaped text is
     * at least cnt long and both slices below stay inside it. */
    if (putStart) {
        buf[cnt] = '\0';
        fputs(buf, stdout);
    }
    else
        fputs(t-cnt, stdout);
}

/*
 * Print the command-line help text on stdout and terminate the program
 * with exit status 0.  Does not return.
 */
static void
usage(void)
{
    static const char helpText[] =
        "Usage: rbdump [-OPTIONS] FILE.rb [FILE.html|FILE.txt ...]\n"
        "\n"
        "-h  Output this help message.\n"
        "-n  Output the name section.\n"
        "-p  Output the paragraphs section.\n"
        "-t  Output the tag section.\n"
        "-V  Output the version of rbdump.\n"
        "\n"
        "Note that if no output options are specified, all sections will be output.\n";

    fputs(helpText, stdout);
    exit(0);
}
