/* Web-view (cgit) residue, not part of the original source file:
 *   path: root/camel/camel-mime-filter-html.c
 *   blob: 06d0ef252b17c720909aaae33cd3cdcb1f9d28d6 (plain) (tree)
 */

































































































































































































































































































































                                                                                                                                   
/*
 *  Copyright (C) 2001 Ximian Inc.
 *
 *  Authors: Michael Zucchi <notzed@helixcode.com>
 *
 *  This program is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU Library General Public License
 *  as published by the Free Software Foundation; either version 2 of
 *  the License, or (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU Library General Public License for more details.
 *
 *  You should have received a copy of the GNU Library General Public
 *  License along with this program; if not, write to the Free Software
 *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

#include "camel-mime-filter-html.h"

#include <stdio.h>
#include <string.h>
#include <stdarg.h>


#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include <stdlib.h>

#include "xmlmemory.h"
#include "HTMLparser.h"
#include "HTMLtree.h"

/* Debug hook: expands to nothing, so every d(...) statement in this file
   is compiled out.  Redefine as "#define d(x) x" to enable the traces. */
#define d(x)

/* Type-system hooks, handed to camel_type_register() in
   camel_mime_filter_html_get_type(). */
static void camel_mime_filter_html_class_init (CamelMimeFilterHTMLClass *klass);
static void camel_mime_filter_html_init       (CamelObject *o);
static void camel_mime_filter_html_finalize   (CamelObject *o);

/* Parent class function table; assigned once in class_init. */
static CamelMimeFilterClass *camel_mime_filter_html_parent;

/* Per-instance private state of the HTML filter. */
struct _CamelMimeFilterHTMLPrivate {
    /* Push-parser context; NULL until the first chunk of data arrives,
       and NULL again after reset(). */
    htmlParserCtxtPtr ctxt;
};

/* ********************************************************************** */

/* HTML parser */

/* Element count of a real array (the result has type size_t).
   Must not be applied to a pointer. */
#define ARRAY_LEN(x) (sizeof(x)/sizeof((x)[0]))

/* Text emitted when the named element is opened, so that words and lines
   on either side of the tag do not run together in the output.
   The table is never modified, hence const. */
static const struct {
    const char *element;   /* element name, compared case-insensitively */
    const char *remap;     /* replacement text written to the output */
} map_start[] = {
    { "p",  "\n\n" },
    { "br", "\n" },
    { "h1", "\n" },
    { "h2", "\n" },
    { "h3", "\n" },
    { "h4", "\n" },
    { "h5", "\n" },
    { "h6", "\n" },
};


/* Text emitted when the named element is closed (headings only).
   The table is never modified, hence const. */
static const struct {
    const char *element;   /* element name, compared case-insensitively */
    const char *remap;     /* replacement text written to the output */
} map_end[] = {
    { "h1", "\n" },
    { "h2", "\n" },
    { "h3", "\n" },
    { "h4", "\n" },
    { "h5", "\n" },
    { "h6", "\n" },
};

static void
characters(void *ctx, const xmlChar *ch, int len)
{
    CamelMimeFilter *mf = ctx;

    memcpy(mf->outptr, ch, len);
    mf->outptr+= len;
}

#if 0
/* Compiled out: we probably don't want to index comments.
   If this is ever re-enabled, note that the sprintf() below writes the
   whole comment into the output buffer without any bounds check. */
static void
comment(void *ctx, const xmlChar *value)
{
    CamelMimeFilter *mf = ctx;

    mf->outptr += sprintf(mf->outptr, " %s \n", value);
}
#endif

/* we map element starts to stuff sometimes, so we can properly break up
   words and lines.
   This is very dumb, and needs to be smarter: e.g.
   <b>F</b>\nooBar should -> "FooBar"
*/
static void
startElement(void *ctx, const xmlChar *name, const xmlChar **atts)
{
    int i;
    CamelMimeFilter *mf = ctx;

    /* we grab all "content" from "meta" tags, and dump it in the output,
       it might be useful for searching with.  This should probably be pickier */
    if (!strcasecmp(name, "meta")) {
        if (atts) {
            for (i=0;atts[i];i+=2) {
                if (!strcmp(atts[i], "content"))
                    mf->outptr += sprintf(mf->outptr, " %s \n", atts[i+1]);
            }
        }
        return;
    }

    /* FIXME: use a hashtable */
    for (i=0;i<ARRAY_LEN(map_start);i++) {
        if (!strcasecmp(map_start[i].element, name)) {
            characters(ctx, map_start[i].remap, strlen(map_start[i].remap));
            break;
        }
    }
}

/* SAX "endElement" callback: when the element being closed is listed in
   map_end, write its replacement text to the output via characters().
   Only the first matching entry is used; unlisted elements produce
   nothing. */
static void
endElement(void *ctx, const xmlChar *name)
{
    int idx = 0;

    /* FIXME: use a hashtable */
    while (idx < ARRAY_LEN(map_end)) {
        if (strcasecmp(map_end[idx].element, name) == 0) {
            characters(ctx, map_end[idx].remap, strlen(map_end[idx].remap));
            return;
        }
        idx++;
    }
}

/* SAX "warning" callback.  For now the diagnostics are simply printed:
   the parser's printf-style message is written to stdout after a
   "SAX.warning: " prefix.  ctx is not used. */
static void
warning(void *ctx, const char *msg, ...)
{
    va_list ap;

    (void) ctx;

    fputs("SAX.warning: ", stdout);

    va_start(ap, msg);
    vprintf(msg, ap);
    va_end(ap);
}

/* SAX "error" callback: writes the parser's printf-style message to
   stdout after a "SAX.error: " prefix.  ctx is not used. */
static void
error(void *ctx, const char *msg, ...)
{
    va_list ap;

    (void) ctx;

    fputs("SAX.error: ", stdout);

    va_start(ap, msg);
    vprintf(msg, ap);
    va_end(ap);
}

/* SAX "fatalError" callback: writes the parser's printf-style message to
   stdout after a "SAX.fatalError: " prefix.  ctx is not used. */
static void
fatalError(void *ctx, const char *msg, ...)
{
    va_list ap;

    (void) ctx;

    fputs("SAX.fatalError: ", stdout);

    va_start(ap, msg);
    vprintf(msg, ap);
    va_end(ap);
}

/* SAX callback table handed to htmlCreatePushParserCtxt().
   Only element start/end, character data and the three diagnostic hooks
   are set; every other slot is NULL.
   The initializers are positional, so their order has to match the member
   order of xmlSAXHandler in the libxml headers (the trailing comments name
   the intended member) -- NOTE(review): confirm against the libxml version
   in use.  Any members after the last initializer are zero-initialized. */
static xmlSAXHandler indexSAXHandler = {
    NULL, /* internalSubset */
    NULL, /*isStandalone,*/
    NULL, /*hasInternalSubset,*/
    NULL, /*hasExternalSubset,*/
    NULL, /*resolveEntity,*/
    NULL, /*getEntity,*/
    NULL, /*entityDecl,*/
    NULL, /*notationDecl,*/
    NULL, /*attributeDecl,*/
    NULL, /*elementDecl,*/
    NULL, /*unparsedEntityDecl,*/
    NULL, /*setDocumentLocator,*/
    NULL, /*startDocument,*/
    NULL, /*endDocument,*/
    startElement,
    endElement,
    NULL, /*reference,*/
    characters,
    NULL, /*ignorableWhitespace,*/
    NULL, /*processingInstruction,*/
    NULL, /*comment,*/
    warning,
    error,
    fatalError,
    NULL, /*getParameterEntity,*/
};


/* ********************************************************************** */


/* Return the CamelType of CamelMimeFilterHTML.
   The type is registered as a subtype of CamelMimeFilter on the first
   call and cached in a function-local static for all later calls. */
CamelType
camel_mime_filter_html_get_type (void)
{
    static CamelType html_type = CAMEL_INVALID_TYPE;

    /* already registered: hand back the cached id */
    if (html_type != CAMEL_INVALID_TYPE)
        return html_type;

    html_type = camel_type_register (camel_mime_filter_get_type (), "CamelMimeFilterHTML",
                                     sizeof (CamelMimeFilterHTML),
                                     sizeof (CamelMimeFilterHTMLClass),
                                     (CamelObjectClassInitFunc) camel_mime_filter_html_class_init,
                                     NULL,
                                     (CamelObjectInitFunc) camel_mime_filter_html_init,
                                     (CamelObjectFinalizeFunc) camel_mime_filter_html_finalize);

    return html_type;
}

/* Instance finalizer: release the parser context, if one is still open,
   and the private structure allocated in camel_mime_filter_html_init().
   Previously the private structure was never freed, leaking it for every
   filter object destroyed. */
static void
camel_mime_filter_html_finalize(CamelObject *o)
{
    CamelMimeFilterHTML *f = (CamelMimeFilterHTML *)o;

    if (f->priv == NULL)
        return;

    if (f->priv->ctxt)
        htmlFreeParserCtxt(f->priv->ctxt);

    g_free(f->priv);
    f->priv = NULL;   /* guard against use after finalize */
}

/* Instance initializer: allocate the private structure with g_malloc0(),
   so the parser context starts out NULL until the first data arrives. */
static void
camel_mime_filter_html_init       (CamelObject *o)
{
    CamelMimeFilterHTML *html = (CamelMimeFilterHTML *) o;
    struct _CamelMimeFilterHTMLPrivate *priv;

    priv = g_malloc0(sizeof(struct _CamelMimeFilterHTMLPrivate));
    html->priv = priv;
}

/* CamelMimeFilter "complete" method: feed the final chunk of HTML to the
   parser, tell it the document has ended, and return the text collected
   by the SAX callbacks.

   mf           the filter instance
   in, len      final chunk of input (len bytes at in)
   prespace     unused here
   out          set to the start of the filter's output buffer
   outlenptr    set to the number of bytes of text produced
   outprespace  set to the free space in front of *out

   If no parser context exists yet, one is created with this chunk as its
   initial data, and the terminating parse call is then made with a length
   of zero so the data is not fed twice. */
static void
complete(CamelMimeFilter *mf, char *in, size_t len, size_t prespace, char **out, size_t *outlenptr, size_t *outprespace)
{
    CamelMimeFilterHTML *f = (CamelMimeFilterHTML *)mf;

    camel_mime_filter_set_size(mf, len*2+256, FALSE);
    mf->outptr = mf->outbuf;

    d(printf("converting html end:\n%.*s\n", (int)len, in));

    if (f->priv->ctxt == NULL) {
        f->priv->ctxt = htmlCreatePushParserCtxt(&indexSAXHandler, f, in, len, "", 0);
        len = 0;
    }

    /* Creating the context can fail; never pass a NULL context to the
       parser.  In that case the output is simply left empty. */
    if (f->priv->ctxt != NULL)
        htmlParseChunk(f->priv->ctxt, in, len, 1);

    *out = mf->outbuf;
    *outlenptr = mf->outptr - mf->outbuf;
    *outprespace = mf->outbuf - mf->outreal;

    d(printf("converted html end:\n%.*s\n", (int)*outlenptr, *out));
}

/* CamelMimeFilter "filter" method: push one intermediate chunk of HTML
   through the parser and return whatever text the SAX callbacks wrote.

   mf           the filter instance
   in, len      chunk of input (len bytes at in)
   prespace     unused here
   out          set to the start of the filter's output buffer
   outlenptr    set to the number of bytes of text produced
   outprespace  set to the free space in front of *out

   The first chunk only creates the parser context (with the chunk as its
   initial data); htmlParseChunk() is called for later chunks only. */
static void
filter(CamelMimeFilter *mf, char *in, size_t len, size_t prespace, char **out, size_t *outlenptr, size_t *outprespace)
{
    CamelMimeFilterHTML *html = (CamelMimeFilterHTML *) mf;

    /* start this call with an empty output buffer */
    camel_mime_filter_set_size(mf, len*2+16, FALSE);
    mf->outptr = mf->outbuf;

    d(printf("converting html:\n%.*s\n", (int)len, in));

    if (html->priv->ctxt != NULL) {
        htmlParseChunk(html->priv->ctxt, in, len, 0);
    } else {
        html->priv->ctxt = htmlCreatePushParserCtxt(&indexSAXHandler, html, in, len, "", 0);
    }

    *outprespace = mf->outbuf - mf->outreal;
    *outlenptr = mf->outptr - mf->outbuf;
    *out = mf->outbuf;

    d(printf("converted html:\n%.*s\n", (int)*outlenptr, *out));
}

/* CamelMimeFilter "reset" method: discard any parser context so that the
   next filter()/complete() call starts a fresh document. */
static void
reset(CamelMimeFilter *mf)
{
    CamelMimeFilterHTML *html = (CamelMimeFilterHTML *) mf;
    htmlParserCtxtPtr parser = html->priv->ctxt;

    /* nothing open, nothing to do */
    if (parser == NULL)
        return;

    html->priv->ctxt = NULL;
    htmlFreeParserCtxt(parser);
}

/* Class initializer: remember the parent class function table and install
   this filter's filter/complete/reset implementations. */
static void
camel_mime_filter_html_class_init (CamelMimeFilterHTMLClass *klass)
{
    CamelMimeFilterClass *mime_filter_class = (CamelMimeFilterClass *) klass;

    camel_mime_filter_html_parent = CAMEL_MIME_FILTER_CLASS (camel_type_get_global_classfuncs (camel_mime_filter_get_type ()));

    /* virtual method overrides */
    mime_filter_class->filter = filter;
    mime_filter_class->complete = complete;
    mime_filter_class->reset = reset;
}

/**
 * camel_mime_filter_html_new:
 *
 * Create a new CamelMimeFilterHTML object.
 * 
 * Return value: A new CamelMimeFilterHTML widget.
 **/
CamelMimeFilterHTML *
camel_mime_filter_html_new (void)
{
    CamelMimeFilterHTML *new = CAMEL_MIME_FILTER_HTML ( camel_object_new (camel_mime_filter_html_get_type ()));
    return new;
}