Welcome! Log In Create A New Profile

Advanced

[PATCH 3 of 4] xslt_html_parser directive

Laurence Rowe
March 21, 2012 05:30PM
# HG changeset patch
# User Laurence Rowe <laurence@lrowe.co.uk>
# Date 1331329666 0
# Node ID 65fd4892a78371e863d43e31d4430cdb7333a35d
# Parent 151124d060d3f725c02b656d39c10575ff009cdb
xslt_html_parser directive

When ``xslt_html_parser on;`` is set, the HTMLParser is used. Parsing is performed
with HTML_PARSE_RECOVER as real-world HTML may not be well formed, so only
fatal error handling is enabled when this option is set.

diff --git a/src/http/modules/ngx_http_xslt_filter_module.c b/src/http/modules/ngx_http_xslt_filter_module.c
--- a/src/http/modules/ngx_http_xslt_filter_module.c
+++ b/src/http/modules/ngx_http_xslt_filter_module.c
@@ -10,6 +10,7 @@
#include <ngx_http.h>

#include <libxml/parser.h>
+#include <libxml/HTMLparser.h>
#include <libxml/tree.h>
#include <libxslt/xslt.h>
#include <libxslt/xsltInternals.h>
@@ -58,6 +59,7 @@
ngx_hash_t types;
ngx_array_t *types_keys;
ngx_array_t *params; /* ngx_http_xslt_param_t */
+ ngx_flag_t html_parser;
} ngx_http_xslt_filter_loc_conf_t;


@@ -67,6 +69,7 @@
xsltTransformContextPtr transform;
ngx_http_request_t *request;
ngx_array_t params;
+ ngx_flag_t html_parser;

ngx_uint_t done; /* unsigned done:1; */
} ngx_http_xslt_filter_ctx_t;
@@ -150,6 +153,13 @@
offsetof(ngx_http_xslt_filter_loc_conf_t, types_keys),
&ngx_http_xslt_default_types[0] },

+ { ngx_string("xslt_html_parser"),
+ NGX_HTTP_LOC_CONF|NGX_CONF_FLAG,
+ ngx_conf_set_flag_slot,
+ NGX_HTTP_LOC_CONF_OFFSET,
+ offsetof(ngx_http_xslt_filter_loc_conf_t, html_parser),
+ NULL },
+
ngx_null_command
};

@@ -225,6 +235,8 @@

r->main_filter_need_in_memory = 1;

+ ctx->html_parser = conf->html_parser;
+
return NGX_OK;
}

@@ -261,7 +273,11 @@
xmlFreeDoc(ctx->ctxt->myDoc);
}

- xmlFreeParserCtxt(ctx->ctxt);
+ if (ctx->html_parser) {
+ htmlFreeParserCtxt(ctx->ctxt);
+ } else {
+ xmlFreeParserCtxt(ctx->ctxt);
+ }

return ngx_http_xslt_send(r, ctx, NULL);
}
@@ -276,9 +292,13 @@

wellFormed = ctx->ctxt->wellFormed;

- xmlFreeParserCtxt(ctx->ctxt);
+ if (ctx->html_parser) {
+ htmlFreeParserCtxt(ctx->ctxt);
+ } else {
+ xmlFreeParserCtxt(ctx->ctxt);
+ }

- if (wellFormed) {
+ if (wellFormed || ctx->html_parser) {
return ngx_http_xslt_send(r, ctx,
ngx_http_xslt_apply_stylesheet(r, ctx));
}
@@ -352,22 +372,48 @@
ngx_buf_t *b)
{
int err;
- xmlParserCtxtPtr ctxt;
+ xmlParserCtxtPtr ctxt = NULL;
+ xmlCharEncoding enc = XML_CHAR_ENCODING_NONE;

if (ctx->ctxt == NULL) {

- ctxt = xmlCreatePushParserCtxt(NULL, NULL, NULL, 0, NULL);
- if (ctxt == NULL) {
- ngx_log_error(NGX_LOG_ERR, r->connection->log, 0,
- "xmlCreatePushParserCtxt() failed");
- return NGX_ERROR;
+ if (ctx->html_parser) {
+ if (r->headers_out.charset.len) {
+ enc = xmlParseCharEncoding(
+ (const char *) r->headers_out.charset.data);
+ if (enc == XML_CHAR_ENCODING_ERROR) {
+ ngx_log_error(NGX_LOG_ERR, r->connection->log, 0,
+ "xmlParseCharEncoding() failed charset: %s",
+ r->headers_out.charset.data);
+ return NGX_ERROR;
+ }
+ }
+
+ ctxt = htmlCreatePushParserCtxt(NULL, NULL, NULL, 0, NULL, enc);
+ if (ctxt == NULL) {
+ ngx_log_error(NGX_LOG_ERR, r->connection->log, 0,
+ "htmlCreatePushParserCtxt() failed");
+ return NGX_ERROR;
+ }
+
+ htmlCtxtUseOptions(ctxt, HTML_PARSE_RECOVER|HTML_PARSE_NOERROR
+ |HTML_PARSE_NOWARNING);
+
+ } else {
+ ctxt = xmlCreatePushParserCtxt(NULL, NULL, NULL, 0, NULL);
+ if (ctxt == NULL) {
+ ngx_log_error(NGX_LOG_ERR, r->connection->log, 0,
+ "xmlCreatePushParserCtxt() failed");
+ return NGX_ERROR;
+ }
+
+ xmlCtxtUseOptions(ctxt, XML_PARSE_NOENT|XML_PARSE_DTDLOAD
+ |XML_PARSE_NOWARNING);
+ ctxt->sax->externalSubset = ngx_http_xslt_sax_external_subset;
+ ctxt->sax->error = ngx_http_xslt_sax_error;
}
- xmlCtxtUseOptions(ctxt, XML_PARSE_NOENT|XML_PARSE_DTDLOAD
- |XML_PARSE_NOWARNING);

- ctxt->sax->externalSubset = ngx_http_xslt_sax_external_subset;
ctxt->sax->setDocumentLocator = NULL;
- ctxt->sax->error = ngx_http_xslt_sax_error;
ctxt->sax->fatalError = ngx_http_xslt_sax_error;
ctxt->sax->_private = ctx;

@@ -375,8 +421,16 @@
ctx->request = r;
}

- err = xmlParseChunk(ctx->ctxt, (char *) b->pos, (int) (b->last - b->pos),
- (b->last_buf) || (b->last_in_chain));
+ if (ctx->html_parser) {
+ err = htmlParseChunk(ctx->ctxt, (char *) b->pos,
+ (int) (b->last - b->pos),
+ (b->last_buf) || (b->last_in_chain));
+
+ } else {
+ err = xmlParseChunk(ctx->ctxt, (char *) b->pos,
+ (int) (b->last - b->pos),
+ (b->last_buf) || (b->last_in_chain));
+ }

if (ctx->done == 0) {
b->pos = b->last;
@@ -1059,6 +1113,8 @@
* conf->params = NULL;
*/

+ conf->html_parser = NGX_CONF_UNSET;
+
return conf;
}

@@ -1081,6 +1137,8 @@
conf->params = prev->params;
}

+ ngx_conf_merge_value(conf->html_parser, prev->html_parser, 0);
+
if (ngx_http_merge_types(cf, &conf->types_keys, &conf->types,
&prev->types_keys, &prev->types,
ngx_http_xslt_default_types)

_______________________________________________
nginx-devel mailing list
nginx-devel@nginx.org
http://mailman.nginx.org/mailman/listinfo/nginx-devel
Subject Author Views Posted

[PATCH 0 of 4] XSLT HTML parsing

Laurence Rowe 1295 March 21, 2012 05:30PM

[PATCH 3 of 4] xslt_html_parser directive

Laurence Rowe 574 March 21, 2012 05:30PM

[PATCH 2 of 4] Set done flag on module context to stop further chunk parsing

Laurence Rowe 505 March 21, 2012 05:40PM

Re: [PATCH 2 of 4] Set done flag on module context to stop further chunk parsing

Maxim Dounin 582 March 27, 2012 10:04PM

[PATCH 4 of 4] Handle empty response body

Laurence Rowe 493 March 21, 2012 05:40PM

Re: [PATCH 4 of 4] Handle empty response body

Maxim Dounin 514 March 27, 2012 10:04PM

[PATCH 1 of 4] Set parser options with xmlCtxtUseOptions

Laurence Rowe 556 March 21, 2012 05:40PM

Re: [PATCH 1 of 4] Set parser options with xmlCtxtUseOptions

Maxim Dounin 554 March 27, 2012 09:58PM



Sorry, you do not have permission to post/reply in this forum.

Online Users

Guests: 165
Record Number of Users: 8 on April 13, 2023
Record Number of Guests: 421 on December 02, 2018
Powered by nginx      Powered by FreeBSD      PHP Powered      Powered by MariaDB      ipv6 ready