/* Copyright 2006 Joachim Zobel . * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /* * This is mod_expat. It is runs the expat parser and converts its input * into SAX buckets. These are SAX events wrapped into buckets. They * morph back into heap buckets if you call their bucket read function. * This means that nothing needs to be done to convert them back. * It also means that you have to be carefull, once you treat them as * ordinary buckets (e.g. by using an "ordinary" filter), they are. * So if you only run the expat filter on XML input you will rarely * notice it. Whitespace inside tags is normalized and all empty tags * are turned into empty start/end pairs. * * There is only one filter using SAX buckets by now. This is mod_xi, * which implements XInclude. * * The module should be able to run on large files, which is actually * the point with both SAX and Apache filters. Allocation of per request * memory is done once for every tag name, attribute name and namespace. * So as long as your XML file is not permanently introducing new tags * or new namespaces this is limited. Check sax_unify_name to see * what exactly happens. * * It is compiled and installed as expected with * /usr/local/apache2/bin/apxs -i -c mod_expat.c buckets_sax.c frag_buffer.c */ #include #include #include #include #include #include #include #include #include #include module AP_MODULE_DECLARE_DATA expat_module ; #include #include "mod_expat.h" #include "frag_buffer.h" #include "buckets_sax.h" typedef struct { /* The base struct */ sax_ctx sax ; XML_Parser parser ; apr_size_t sz_parsed ; } expat_ctx ; /***************************************************************************** * Expat Handlers *****************************************************************************/ /* * XML_StartNamespaceDeclHandler */ static void expat_bucket_create_start_ns(void *ctx, const XML_Char *prefix, const XML_Char *uri) { sax_ctx *c = ctx ; bucket_sax *bs = sax_bucket_create_ns(c, prefix, uri) ; sax_event_set_start_id(c, bs->event) ; apr_bucket *b = sax_bucket_append(c, bs) ; } /* * XML_EndNamespaceDeclHandler */ static void expat_bucket_create_end_ns(void *ctx, const XML_Char *prefix) { sax_ctx *c = ctx ; bucket_sax *bs = sax_bucket_create_ns(c, prefix, NULL) ; sax_bucket_set_which(bs, END_NS) ; sax_event_set_end_id(c, bs->event) ; sax_bucket_append(c, bs) ; } /* * XML_StartElementHandler */ static void expat_bucket_create_start_elt(void *ctx, const XML_Char* name, const XML_Char** atts) { sax_ctx *c = ctx ; bucket_sax *bs = sax_bucket_create_elt(c, name, atts) ; sax_event_set_start_id(c, bs->event) ; sax_bucket_append(c, bs) ; } /* * XML_EndElementHandler * This acually creates a start buckets and resets the type. This accepts a small * memory overhead for avoiding code duplication. */ static void expat_bucket_create_end_elt(void *ctx, const XML_Char* name) { sax_ctx *c = ctx ; const XML_Char *atts = NULL; bucket_sax *bs = sax_bucket_create_elt(c, name, &atts) ; sax_bucket_set_which(bs, END_ELT); sax_event_set_end_id(c, bs->event) ; sax_bucket_append(c, bs); } /* * XML_StartCdataSectionHandler */ static void expat_bucket_create_start_cd(void *ctx) { sax_ctx *c = ctx ; c->is_cdata = 1 ; bucket_sax *bs = sax_bucket_create_empty(c, START_CD) ; sax_bucket_append(c, bs); } /* * XML_EndCdataSectionHandler */ static void expat_bucket_create_end_cd(void *ctx) { sax_ctx *c = ctx ; c->is_cdata = 0 ; bucket_sax *bs = sax_bucket_create_empty(c, END_CD) ; sax_bucket_append(c, bs); } /* * XML_CharacterHandler */ static void expat_bucket_create_character(void* ctx, const XML_Char* buf, int len) { sax_ctx *c = ctx ; bucket_sax *bs = sax_bucket_create_char(c, buf, len, !c->is_cdata) ; sax_bucket_append(c, bs); } /* * XML_CommentHandler */ static void expat_bucket_create_comment(void* ctx, const XML_Char* buf) { sax_ctx *c = ctx ; bucket_sax *bs = sax_bucket_create_char(c, buf, strlen(buf), 0) ; sax_bucket_set_which(bs, COMMENT) ; sax_bucket_append(c, bs); } /* * XML_XmlDeclHandler */ static void expat_bucket_create_xml_decl(void *ctx, const XML_Char *version, const XML_Char *encoding, int standalone) { sax_ctx *c = ctx ; bucket_sax *bs = sax_bucket_create_xml_decl(c, version, encoding, standalone) ; sax_bucket_append(c, bs); } /* * XML_ProcessingInstructionHandler */ static void expat_bucket_create_proc_instr(void *ctx, const XML_Char *target, const XML_Char *data) { sax_ctx *c = ctx ; bucket_sax *bs = sax_bucket_create_proc_instr(c, target, data) ; sax_bucket_append(c, bs); } /* * XML_DefaultHandler */ static void expat_bucket_create_default(void* ctx, const XML_Char* buf, int len) { sax_ctx *c = ctx ; bucket_sax *bs = sax_bucket_create_char(c, buf, len, 0) ; sax_bucket_set_which(bs, DEFAULT) ; sax_bucket_append(c, bs); } /***************************************************************************** * Interface Functions *****************************************************************************/ /** * Aborts parsing for the given filter, assuming it is an expat filter * @param r - An expat filter */ static void mod_expat_abort_filter(ap_filter_t *f) { expat_ctx *ctx = f->ctx; XML_StopParser(ctx->parser, XML_FALSE) ; } /* * mod_expat_abort */ void mod_expat_abort(request_rec *r) { ap_filter_t *f = r->output_filters ; /* We walk the filter chain and abort all expat parsers */ for (;f;f=f->next) { if (strcmp(f->frec->name,"expat") == 0) { mod_expat_abort_filter(f) ; } } } /***************************************************************************** * Module Handlers *****************************************************************************/ /* * Helper from mod_xmlns */ static char* ctype2encoding(apr_pool_t* pool, const char* in) { char* x ; char* ptr ; char* ctype ; if ( ! in ) return 0 ; ctype = apr_pstrdup(pool, in) ; for ( ptr = ctype ; *ptr; ++ptr) if ( isupper(*ptr) ) *ptr = tolower(*ptr) ; if ( ptr = strstr(ctype, "charset=") , ptr > 0 ) { ptr += 8 ; /* jump over "charset=" and chop anything that follows charset*/ if ( x = strpbrk(ptr, " ;") , x != NULL ) *x = 0 ; } x = ptr ? apr_pstrdup(pool, ptr) : 0 ; return x ; } /***************************************************************************** * Expat memory tracking *****************************************************************************/ static volatile apr_size_t expat_mem = 0; static void *expat_alloc(size_t sz) { void *mem = malloc(sz+sizeof(size_t)) ; size_t *psz = mem ; *psz = sz ; expat_mem += sz ; return (char *)mem + sizeof(size_t); } static void expat_free(void *emem) { if (!emem) { return ; } void *mem = (char *)emem - sizeof(size_t) ; size_t *psz = mem ; expat_mem -= *psz ; free(mem) ; } static void *expat_realloc(void *emem, size_t sz) { if (!emem) { return expat_alloc(sz) ; } void *mem = (char *)emem - sizeof(size_t) ; size_t *psz = mem ; expat_mem -= *psz ; mem = realloc(mem, sz + sizeof(size_t)); psz = mem ; *psz = sz ; expat_mem += sz ; return (char *)mem + sizeof(size_t); } static XML_Memory_Handling_Suite expat_mm = { expat_alloc, expat_realloc, expat_free }; /* * expat_filter_init */ #define CALLBACK(fn,handler) fn ( fctx->parser , handler ) static int expat_filter_init(ap_filter_t* f) { expat_ctx *fctx = f->ctx = apr_pcalloc(f->r->pool, sizeof(expat_ctx)) ; sax_ctx *sctx = &fctx->sax ; request_rec *r = f->r ; ap_log_rerror(APLOG_MARK, APLOG_DEBUG, 0, f->r, "expat_filter_init called."); /* Chunked encoding enables HTTP keepalive * and removes Content-Length */ if ( r->proto_num >= 1001 ) { if ( ! r->main && ! r->prev ) r->chunked = 1 ; } /* but for the else cases */ apr_table_unset(r->headers_out, "Content-Length") ; /* Init the sax_ctx */ sax_ctx_init(sctx, /* with a newly created brigade */ apr_brigade_create(r->pool, f->c->bucket_alloc), f, mod_expat_abort_filter) ; char* enc = ctype2encoding(r->pool, r->content_type) ; /* set up the parser */ fctx->sz_parsed = 0 ; fctx->parser = XML_ParserCreate_MM(enc, (ap_server_conf->loglevel>=APLOG_INFO)?&expat_mm:NULL, &SEP_NS) ; if (!fctx->parser) { ap_log_rerror(APLOG_MARK, APLOG_ERR, 0, f->r, "XML_ParserCreateNS failed for enc. %s", enc); } apr_pool_cleanup_register(r->pool, fctx->parser, (void*)XML_ParserFree, apr_pool_cleanup_null) ; /* Turn on 3part names */ XML_SetReturnNSTriplet(fctx->parser, 1) ; /* * Set handlers */ CALLBACK(XML_SetDefaultHandler, expat_bucket_create_default) ; CALLBACK(XML_SetCommentHandler, expat_bucket_create_comment) ; CALLBACK(XML_SetStartElementHandler, expat_bucket_create_start_elt) ; CALLBACK(XML_SetEndElementHandler, expat_bucket_create_end_elt) ; CALLBACK(XML_SetStartNamespaceDeclHandler, expat_bucket_create_start_ns) ; CALLBACK(XML_SetEndNamespaceDeclHandler, expat_bucket_create_end_ns) ; CALLBACK(XML_SetXmlDeclHandler, expat_bucket_create_xml_decl) ; CALLBACK(XML_SetCharacterDataHandler, expat_bucket_create_character); CALLBACK(XML_SetStartCdataSectionHandler, expat_bucket_create_start_cd); CALLBACK(XML_SetEndCdataSectionHandler, expat_bucket_create_end_cd); CALLBACK(XML_SetProcessingInstructionHandler, expat_bucket_create_proc_instr); /* Set the context as user data */ XML_SetUserData(fctx->parser, fctx) ; return OK ; } /** * Logs memory usage related information * @param sctx - The SAX context * @param - r_log - The request */ static void mod_expat_log_mem_usage(sax_ctx *sctx) { request_rec *r_log = sctx->bctx.r_log ; // Report unique table counts, to detect growth const apr_array_header_t *set_n = apr_table_elts(sctx->bctx.unq.name) ; const apr_array_header_t *set_u = apr_table_elts(sctx->bctx.unq.uri) ; const apr_array_header_t *set_p = apr_table_elts(sctx->bctx.unq.prefix) ; ap_log_rerror(APLOG_MARK, APLOG_INFO, 0, r_log, "%d names, %d uris, %d prefixes.", set_n->nelts, set_u->nelts, set_p->nelts) ; ap_log_rerror(APLOG_MARK, APLOG_INFO, 0, r_log, "%d elts in frag_buf, %d elts in newns, %d elts in starts.", sctx->mctx->frag_buf->nelts, sctx->mctx->newns->nelts, sctx->starts->nelts) ; const char *unit; double sz = sax_hr_size(sctx->bctx.sum_mem, &unit) ; // Report bucket memory currently in use ap_log_rerror(APLOG_MARK, APLOG_INFO, 0, r_log, "%.4g %s of bucket memory in use.", sz, unit) ; // Report expat memory currently in use sz = sax_hr_size(expat_mem, &unit) ; ap_log_rerror(APLOG_MARK, APLOG_INFO, 0, r_log, "%.4g %s bytes of expat memory in global use.", sz, unit) ; } typedef enum { PARSE_OK, PARSE_ERR, PARSE_ABORT } parse_status_t ; #define MIN(x,y) ((x)<(y)?(x):(y)) /** * Parses the XML input * @param sctx - The SAX context * @param - r_log - The request * @return - 0 if sucessful */ static parse_status_t mod_expat_parse(expat_ctx *ectx, const char *buf, apr_size_t len, int end) { parse_status_t rv = PARSE_OK ; request_rec *r_log = ectx->sax.bctx.r_log ; apr_pool_t *p_tmp = ectx->sax.bctx.p_tmp ; ap_log_rerror(APLOG_MARK, APLOG_INFO, 0, r_log, "Expat is about to parse %d bytes.", len); if ( XML_Parse(ectx->parser, buf, len, end) != XML_STATUS_OK ) { enum XML_Error err = XML_GetErrorCode(ectx->parser) ; const XML_LChar* msg = XML_ErrorString(err) ; if ( ( err == XML_ERROR_FINISHED ) || ( err == XML_ERROR_ABORTED ) ) { rv = PARSE_ABORT ; } else { rv = PARSE_ERR ; } ap_log_rerror(APLOG_MARK, /* Abort is not an error */ (rv==PARSE_ERR)?APLOG_ERR:APLOG_DEBUG, 0, r_log, "Parse Error %d: %s", err, msg) ; if (rv == PARSE_ERR) { const int line = XML_GetCurrentLineNumber(ectx->parser) ; const int col = XML_GetCurrentColumnNumber(ectx->parser) ; const long pos = XML_GetCurrentByteIndex(ectx->parser) ; const apr_size_t bpos = pos - ectx->sz_parsed ; ap_log_rerror(APLOG_MARK, APLOG_ERR, 0, r_log, "at line %d, column %d:", line, col) ; ap_log_rerror(APLOG_MARK, APLOG_ERR, 0, r_log, "'%s'", apr_pstrmemdup(p_tmp, buf + bpos, MIN(len-bpos, 100)) ) ; } } return rv ; } /* static int mod_expat_no_mmap(apr_bucket *b) { if (strcmp(b->type->name, "FILE") == 0) { apr_bucket_file *a = b->data ; a->can_mmap = 0 ; } return APR_SUCCESS ; } */ /* * expat_filter */ static int expat_filter(ap_filter_t* f, apr_bucket_brigade* bb) { apr_bucket *b ; apr_bucket *del = NULL ; const char *buf = NULL ; apr_size_t bytes = 0 ; apr_status_t rv = APR_SUCCESS ; expat_ctx *ctxt = f->ctx ; sax_ctx *sctx = &ctxt->sax ; request_rec *r_log = sctx->bctx.r_log ; ap_log_rerror(APLOG_MARK, APLOG_DEBUG, 0, r_log, "expat_filter called. APR_BUCKET_ALLOC_SIZE is %d", APR_BUCKET_ALLOC_SIZE); for ( b = APR_BRIGADE_FIRST(bb) ; b != APR_BRIGADE_SENTINEL(bb) ; b = APR_BUCKET_NEXT(b) ) { // del can now be deleted if (del) { apr_bucket_delete(del) ; del = NULL ; } // Basic consistency ap_assert(b->list == bb->bucket_alloc) ; if ( APR_BUCKET_IS_EOS(b) ) { ap_assert(APR_BUCKET_NEXT(b) == APR_BRIGADE_SENTINEL(bb)) ; ap_log_rerror(APLOG_MARK, APLOG_INFO, 0, r_log, "EOS bucket found.") ; if (PARSE_ABORT == mod_expat_parse(ctxt, buf, 0, 1)) { return sctx->rv ; } // This the end anyway, so no special // handling of PARSE_ERR takes place. } if ( APR_BUCKET_IS_METADATA(b) ) { apr_bucket *p = NULL; apr_bucket_copy(b, &p) ; APR_BRIGADE_INSERT_TAIL(sctx->bb, p) ; } else { // this is the main parser call if ( apr_bucket_read(b, &buf, &bytes, APR_BLOCK_READ) == APR_SUCCESS ) { switch ( mod_expat_parse(ctxt, buf, bytes, 0) ) { case PARSE_OK: // We need to delete the bucket. // With a large file doing this at // brigades end is too far away. // We can however not delete it now, // since it knows the next bucket. del = b ; ctxt->sz_parsed += bytes ; break ; case PARSE_ABORT: return sctx->rv ; case PARSE_ERR: APR_BRIGADE_INSERT_TAIL(sctx->bb, apr_bucket_eos_create(sctx->bb->bucket_alloc) ) ; ap_pass_brigade(f->next, sctx->bb) ; //XXX: What is a good error return here? return -500 ; } mod_expat_log_mem_usage(sctx) ; } else { ap_log_rerror(APLOG_MARK, APLOG_ERR, 0, r_log, "Error in bucket read") ; } } } ap_log_rerror(APLOG_MARK, APLOG_INFO, 0, r_log, "End of brigade.") ; mod_expat_log_mem_usage(sctx) ; // This is not ours apr_brigade_cleanup(bb) ; // We are done, so we pass the new buckets return sax_pass_buckets(sctx, 0) ; } /***************************************************************************** * The usual module stuff *****************************************************************************/ static void expat_hooks(apr_pool_t* p) { ap_register_output_filter("expat", expat_filter, expat_filter_init, AP_FTYPE_RESOURCE) ; APR_REGISTER_OPTIONAL_FN(mod_expat_abort); } module AP_MODULE_DECLARE_DATA expat_module = { STANDARD20_MODULE_STUFF, NULL, NULL, NULL, NULL, NULL, expat_hooks } ;