diff options
author | Shimeng (Simon) Wang <swang@google.com> | 2010-04-01 11:46:35 -0700 |
---|---|---|
committer | Shimeng (Simon) Wang <swang@google.com> | 2010-04-02 10:21:34 -0700 |
commit | 218647ebd721a259319b28f5719e89a4d0d1aaef (patch) | |
tree | b6cb84b4b51883557305ba83e784fc6964c7481a | |
parent | d1bb91585716a46886a95d3f40af584d471ab706 (diff) | |
download | libxml2-froyo.tar.gz |
Add ICU support for libxml. (android-sdk-tools_r7, android-sdk-tools_r6, android-sdk-2.2_r2, android-sdk-2.2_r1, android-cts-2.2_r8, android-cts-2.2_r7, android-cts-2.2_r6, android-cts-2.2_r5, android-cts-2.2_r4, android-cts-2.2_r3, android-cts-2.2_r2, android-cts-2.2_r1, android-adt-0.9.9, android-adt-0.9.8, android-2.2_r1.3, android-2.2_r1.2, android-2.2_r1.1, android-2.2_r1, android-2.2.3_r2.1, android-2.2.3_r2, android-2.2.3_r1, android-2.2.2_r1, android-2.2.1_r2, android-2.2.1_r1, tools_r9, tools_r8, tools_r7, froyo-release, froyo-plus-aosp, froyo)
This is derived from Jungshik's libxml patch for Chrome.
Issue:2557315
Change-Id: I8e4c9e544660f3f943a15042756f7248d5afff8e
-rw-r--r-- | Android.mk | 4 | ||||
-rw-r--r-- | encoding.c | 248 | ||||
-rw-r--r-- | include/libxml/encoding.h | 29 | ||||
-rw-r--r-- | include/libxml/parser.h | 3 | ||||
-rw-r--r-- | include/libxml/xmlversion.h | 11 | ||||
-rw-r--r-- | parser.c | 9 | ||||
-rw-r--r-- | patches/0001-Add-ICU-support-for-libxml.patch | 559 | ||||
-rw-r--r-- | xmlregexp.c | 2 |
8 files changed, 853 insertions, 12 deletions
@@ -57,7 +57,7 @@ common_C_INCLUDES += \ include $(CLEAR_VARS) LOCAL_SRC_FILES := $(common_SRC_FILES) -LOCAL_C_INCLUDES += $(common_C_INCLUDES) +LOCAL_C_INCLUDES += $(common_C_INCLUDES) external/icu4c/common LOCAL_SHARED_LIBRARIES += $(common_SHARED_LIBRARIES) LOCAL_CFLAGS += -fvisibility=hidden @@ -71,7 +71,7 @@ include $(BUILD_STATIC_LIBRARY) include $(CLEAR_VARS) LOCAL_SRC_FILES := $(common_SRC_FILES) -LOCAL_C_INCLUDES += $(common_C_INCLUDES) +LOCAL_C_INCLUDES += $(common_C_INCLUDES) external/icu4c/common LOCAL_SHARED_LIBRARIES += $(common_SHARED_LIBRARIES) LOCAL_MODULE:= libxml2 include $(BUILD_HOST_STATIC_LIBRARY) @@ -58,7 +58,7 @@ static xmlCharEncodingAliasPtr xmlCharEncodingAliases = NULL; static int xmlCharEncodingAliasesNb = 0; static int xmlCharEncodingAliasesMax = 0; -#ifdef LIBXML_ICONV_ENABLED +#if defined(LIBXML_ICONV_ENABLED) || defined(LIBXML_ICU_ENABLED) #if 0 #define DEBUG_ENCODING /* Define this to get encoding traces */ #endif @@ -97,6 +97,54 @@ xmlEncodingErr(xmlParserErrors error, const char *msg, const char *val) NULL, 0, val, NULL, NULL, 0, 0, msg, val); } +#ifdef LIBXML_ICU_ENABLED +static uconv_t* +openIcuConverter(const char* name, int toUnicode) +{ + UErrorCode status = U_ZERO_ERROR; + uconv_t *conv = (uconv_t *) xmlMalloc(sizeof(uconv_t)); + if (conv == NULL) + return NULL; + + conv->uconv = ucnv_open(name, &status); + if (U_FAILURE(status)) + goto error; + + status = U_ZERO_ERROR; + if (toUnicode) { + ucnv_setToUCallBack(conv->uconv, UCNV_TO_U_CALLBACK_STOP, + NULL, NULL, NULL, &status); + } + else { + ucnv_setFromUCallBack(conv->uconv, UCNV_FROM_U_CALLBACK_STOP, + NULL, NULL, NULL, &status); + } + if (U_FAILURE(status)) + goto error; + + status = U_ZERO_ERROR; + conv->utf8 = ucnv_open("UTF-8", &status); + if (U_SUCCESS(status)) + return conv; + +error: + if (conv->uconv) + ucnv_close(conv->uconv); + xmlFree(conv); + return NULL; +} + +static void +closeIcuConverter(uconv_t *conv) +{ + if (conv != NULL) { + ucnv_close(conv->uconv); + 
ucnv_close(conv->utf8); + xmlFree(conv); + } +} +#endif /* LIBXML_ICU_ENABLED */ + /************************************************************************ * * * Conversions To/From UTF8 encoding * @@ -1306,7 +1354,11 @@ xmlNewCharEncodingHandler(const char *name, #ifdef LIBXML_ICONV_ENABLED handler->iconv_in = NULL; handler->iconv_out = NULL; -#endif /* LIBXML_ICONV_ENABLED */ +#endif +#ifdef LIBXML_ICU_ENABLED + handler->uconv_in = NULL; + handler->uconv_out = NULL; +#endif /* * registers and returns the handler. @@ -1371,7 +1423,7 @@ xmlInitCharEncodingHandlers(void) { xmlNewCharEncodingHandler("ASCII", asciiToUTF8, NULL); xmlNewCharEncodingHandler("US-ASCII", asciiToUTF8, NULL); #endif /* LIBXML_OUTPUT_ENABLED */ -#ifndef LIBXML_ICONV_ENABLED +#if !defined(LIBXML_ICONV_ENABLED) && !defined(LIBXML_ICU_ENABLED) #ifdef LIBXML_ISO8859X_ENABLED xmlRegisterCharEncodingHandlersISO8859x (); #endif @@ -1576,6 +1628,10 @@ xmlFindCharEncodingHandler(const char *name) { xmlCharEncodingHandlerPtr enc; iconv_t icv_in, icv_out; #endif /* LIBXML_ICONV_ENABLED */ +#ifdef LIBXML_ICU_ENABLED + xmlCharEncodingHandlerPtr enc; + uconv_t *ucv_in, *ucv_out; +#endif /* LIBXML_ICU_ENABLED */ char upper[100]; int i; @@ -1642,6 +1698,35 @@ xmlFindCharEncodingHandler(const char *name) { "iconv : problems with filters for '%s'\n", name); } #endif /* LIBXML_ICONV_ENABLED */ +#ifdef LIBXML_ICU_ENABLED + /* check whether icu can handle this */ + ucv_in = openIcuConverter(name, 1); + ucv_out = openIcuConverter(name, 0); + if (ucv_in != NULL && ucv_out != NULL) { + enc = (xmlCharEncodingHandlerPtr) + xmlMalloc(sizeof(xmlCharEncodingHandler)); + if (enc == NULL) { + closeIcuConverter(ucv_in); + closeIcuConverter(ucv_out); + return(NULL); + } + enc->name = xmlMemStrdup(name); + enc->input = NULL; + enc->output = NULL; + enc->uconv_in = ucv_in; + enc->uconv_out = ucv_out; +#ifdef DEBUG_ENCODING + xmlGenericError(xmlGenericErrorContext, + "Found ICU converter handler for encoding %s\n", name); 
+#endif + return enc; + } else if (ucv_in != NULL || ucv_out != NULL) { + closeIcuConverter(ucv_in); + closeIcuConverter(ucv_out); + xmlEncodingErr(XML_ERR_INTERNAL_ERROR, + "ICU converter : problems with filters for '%s'\n", name); + } +#endif /* LIBXML_ICU_ENABLED */ #ifdef DEBUG_ENCODING xmlGenericError(xmlGenericErrorContext, @@ -1732,6 +1817,75 @@ xmlIconvWrapper(iconv_t cd, unsigned char *out, int *outlen, /************************************************************************ * * + * ICU based generic conversion functions * + * * + ************************************************************************/ + +#ifdef LIBXML_ICU_ENABLED +/** + * xmlUconvWrapper: + * @cd: ICU uconverter data structure + * @toUnicode : non-zero if toUnicode. 0 otherwise. + * @out: a pointer to an array of bytes to store the result + * @outlen: the length of @out + * @in: a pointer to an array of ISO Latin 1 chars + * @inlen: the length of @in + * + * Returns 0 if success, or + * -1 by lack of space, or + * -2 if the transcoding fails (for *in is not valid utf8 string or + * the result of transformation can't fit into the encoding we want), or + * -3 if there the last byte can't form a single output char. + * + * The value of @inlen after return is the number of octets consumed + * as the return value is positive, else unpredictable. + * The value of @outlen after return is the number of ocetes consumed. + */ +static int +xmlUconvWrapper(uconv_t *cd, int toUnicode, unsigned char *out, int *outlen, + const unsigned char *in, int *inlen) { + const char *ucv_in = (const char *) in; + char *ucv_out = (char *) out; + UErrorCode err = U_ZERO_ERROR; + + if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL)) { + if (outlen != NULL) *outlen = 0; + return(-1); + } + + /* + * TODO(jungshik) + * 1. is ucnv_convert(To|From)Algorithmic better? + * 2. had we better use an explicit pivot buffer? + * 3. 
error returned comes from 'fromUnicode' only even + * when toUnicode is true ! + */ + if (toUnicode) { + /* encoding => UTF-16 => UTF-8 */ + ucnv_convertEx(cd->utf8, cd->uconv, &ucv_out, ucv_out + *outlen, + &ucv_in, ucv_in + *inlen, NULL, NULL, NULL, NULL, + 0, TRUE, &err); + } else { + /* UTF-8 => UTF-16 => encoding */ + ucnv_convertEx(cd->uconv, cd->utf8, &ucv_out, ucv_out + *outlen, + &ucv_in, ucv_in + *inlen, NULL, NULL, NULL, NULL, + 0, TRUE, &err); + } + *inlen = ucv_in - (const char*) in; + *outlen = ucv_out - (char *) out; + if (U_SUCCESS(err)) + return 0; + if (err == U_BUFFER_OVERFLOW_ERROR) + return -1; + if (err == U_INVALID_CHAR_FOUND || err == U_ILLEGAL_CHAR_FOUND) + return -2; + /* if (err == U_TRUNCATED_CHAR_FOUND) */ + return -3; +} +#endif /* LIBXML_ICU_ENABLED */ + +/************************************************************************ + * * * The real API used by libxml for on-the-fly conversion * * * ************************************************************************/ @@ -1794,6 +1948,16 @@ xmlCharEncFirstLine(xmlCharEncodingHandler *handler, xmlBufferPtr out, if (ret == -1) ret = -3; } #endif /* LIBXML_ICONV_ENABLED */ +#ifdef LIBXML_ICU_ENABLED + else if (handler->uconv_in != NULL) { + ret = xmlUconvWrapper(handler->uconv_in, 1, &out->content[out->use], + &written, in->content, &toconv); + xmlBufferShrink(in, toconv); + out->use += written; + out->content[out->use] = 0; + if (ret == -1) ret = -3; + } +#endif /* LIBXML_ICU_ENABLED */ #ifdef DEBUG_ENCODING switch (ret) { case 0: @@ -1879,6 +2043,17 @@ xmlCharEncInFunc(xmlCharEncodingHandler * handler, xmlBufferPtr out, ret = -3; } #endif /* LIBXML_ICONV_ENABLED */ +#ifdef LIBXML_ICU_ENABLED + else if (handler->uconv_in != NULL) { + ret = xmlUconvWrapper(handler->uconv_in, 1, &out->content[out->use], + &written, in->content, &toconv); + xmlBufferShrink(in, toconv); + out->use += written; + out->content[out->use] = 0; + if (ret == -1) + ret = -3; + } +#endif /* LIBXML_ICU_ENABLED */ 
switch (ret) { case 0: #ifdef DEBUG_ENCODING @@ -1979,6 +2154,15 @@ retry: out->content[out->use] = 0; } #endif /* LIBXML_ICONV_ENABLED */ +#ifdef LIBXML_ICU_ENABLED + else if (handler->uconv_out != NULL) { + ret = xmlUconvWrapper(handler->uconv_out, 0, + &out->content[out->use], + &written, NULL, &toconv); + out->use += written; + out->content[out->use] = 0; + } +#endif /* LIBXML_ICU_ENABLED */ #ifdef DEBUG_ENCODING xmlGenericError(xmlGenericErrorContext, "initialized encoder\n"); @@ -2003,7 +2187,7 @@ retry: xmlBufferShrink(in, toconv); out->use += written; writtentot += written; - } + } out->content[out->use] = 0; } #ifdef LIBXML_ICONV_ENABLED @@ -2025,6 +2209,26 @@ retry: } } #endif /* LIBXML_ICONV_ENABLED */ +#ifdef LIBXML_ICU_ENABLED + else if (handler->uconv_out != NULL) { + ret = xmlUconvWrapper(handler->uconv_out, 0, + &out->content[out->use], + &written, in->content, &toconv); + xmlBufferShrink(in, toconv); + out->use += written; + writtentot += written; + out->content[out->use] = 0; + if (ret == -1) { + if (written > 0) { + /* + * Can be a limitation of iconv + */ + goto retry; + } + ret = -3; + } + } +#endif /* LIBXML_ICU_ENABLED */ else { xmlEncodingErr(XML_I18N_NO_OUTPUT, "xmlCharEncOutFunc: no output function !\n", NULL); @@ -2137,6 +2341,22 @@ xmlCharEncCloseFunc(xmlCharEncodingHandler *handler) { xmlFree(handler); } #endif /* LIBXML_ICONV_ENABLED */ +#ifdef LIBXML_ICU_ENABLED + if ((handler->uconv_out != NULL) || (handler->uconv_in != NULL)) { + if (handler->name != NULL) + xmlFree(handler->name); + handler->name = NULL; + if (handler->uconv_out != NULL) { + closeIcuConverter(handler->uconv_out); + handler->uconv_out = NULL; + } + if (handler->uconv_in != NULL) { + closeIcuConverter(handler->uconv_in); + handler->uconv_in = NULL; + } + xmlFree(handler); + } +#endif #ifdef DEBUG_ENCODING if (ret) xmlGenericError(xmlGenericErrorContext, @@ -2212,6 +2432,22 @@ xmlByteConsumed(xmlParserCtxtPtr ctxt) { cur += toconv; } while (ret == -2); #endif +#ifdef 
LIBXML_ICU_ENABLED + } else if (handler->uconv_out != NULL) { + do { + toconv = in->end - cur; + written = 32000; + ret = xmlUconvWrapper(handler->uconv_out, 0, &convbuf[0], + &written, cur, &toconv); + if (ret < 0) { + if (written > 0) + ret = -2; + else + return(-1); + } + unused += written; + cur += toconv; + } while (ret == -2); } else { /* could not find a converter */ return(-1); @@ -2223,8 +2459,9 @@ xmlByteConsumed(xmlParserCtxtPtr ctxt) { } return(in->consumed + (in->cur - in->base)); } +#endif -#ifndef LIBXML_ICONV_ENABLED +#if !defined(LIBXML_ICONV_ENABLED) && !defined(LIBXML_ICU_ENABLED) #ifdef LIBXML_ISO8859X_ENABLED /** @@ -3296,4 +3533,3 @@ xmlRegisterCharEncodingHandlersISO8859x (void) { #define bottom_encoding #include "elfgcchack.h" - diff --git a/include/libxml/encoding.h b/include/libxml/encoding.h index c74b25f3..c68ec109 100644 --- a/include/libxml/encoding.h +++ b/include/libxml/encoding.h @@ -26,6 +26,24 @@ #ifdef LIBXML_ICONV_ENABLED #include <iconv.h> +#else +#ifdef LIBXML_ICU_ENABLED +#include <unicode/ucnv.h> +#if 0 +/* Forward-declare UConverter here rather than pulling in <unicode/ucnv.h> + * to prevent unwanted ICU symbols being exposed to users of libxml2. + * One particular case is Qt4 conflicting on UChar32. + */ +#include <stdint.h> +struct UConverter; +typedef struct UConverter UConverter; +#ifdef _MSC_VER +typedef wchar_t UChar; +#else +typedef uint16_t UChar; +#endif +#endif +#endif #endif #ifdef __cplusplus extern "C" { @@ -125,6 +143,13 @@ typedef int (* xmlCharEncodingOutputFunc)(unsigned char *out, int *outlen, * Block defining the handlers for non UTF-8 encodings. * If iconv is supported, there are two extra fields. 
*/ +#ifdef LIBXML_ICU_ENABLED +struct _uconv_t { + UConverter *uconv; /* for conversion between an encoding and UTF-16 */ + UConverter *utf8; /* for conversion between UTF-8 and UTF-16 */ +}; +typedef struct _uconv_t uconv_t; +#endif typedef struct _xmlCharEncodingHandler xmlCharEncodingHandler; typedef xmlCharEncodingHandler *xmlCharEncodingHandlerPtr; @@ -136,6 +161,10 @@ struct _xmlCharEncodingHandler { iconv_t iconv_in; iconv_t iconv_out; #endif /* LIBXML_ICONV_ENABLED */ +#ifdef LIBXML_ICU_ENABLED + uconv_t *uconv_in; + uconv_t *uconv_out; +#endif /* LIBXML_ICU_ENABLED */ }; #ifdef __cplusplus diff --git a/include/libxml/parser.h b/include/libxml/parser.h index 567addba..bd9de248 100644 --- a/include/libxml/parser.h +++ b/include/libxml/parser.h @@ -276,6 +276,7 @@ struct _xmlParserCtxt { int nsNr; /* the number of inherited namespaces */ int nsMax; /* the size of the arrays */ const xmlChar * *nsTab; /* the array of prefix/namespace name */ + struct _xmlParserCtxt *nsParent; /* parent context to inherit namespaces from * */ int *attallocs; /* which attribute were allocated */ void * *pushTab; /* array of data for push */ xmlHashTablePtr attsDefault; /* defaulted attributes if any */ @@ -1213,6 +1214,7 @@ typedef enum { XML_WITH_DEBUG_MEM = 29, XML_WITH_DEBUG_RUN = 30, XML_WITH_ZLIB = 31, + XML_WITH_ICU = 32, XML_WITH_NONE = 99999 /* just to be sure of allocation size */ } xmlFeature; @@ -1223,4 +1225,3 @@ XMLPUBFUN int XMLCALL } #endif #endif /* __XML_PARSER_H__ */ - diff --git a/include/libxml/xmlversion.h b/include/libxml/xmlversion.h index a98e00c2..fb2b8cad 100644 --- a/include/libxml/xmlversion.h +++ b/include/libxml/xmlversion.h @@ -269,6 +269,15 @@ XMLPUBFUN void XMLCALL xmlCheckVersion(int version); #endif /** + * LIBXML_ICU_ENABLED: + * + * Whether icu support is available + */ +#if 1 +#define LIBXML_ICU_ENABLED +#endif + +/** * LIBXML_ISO8859X_ENABLED: * * Whether ISO-8859-* support is made available in case iconv is not @@ -454,5 +463,3 @@ 
XMLPUBFUN void XMLCALL xmlCheckVersion(int version); } #endif /* __cplusplus */ #endif - - @@ -937,6 +937,12 @@ xmlHasFeature(xmlFeature feature) #else return(0); #endif + case XML_WITH_ICU: +#ifdef LIBXML_ICU_ENABLED + return(1); +#else + return(0); +#endif default: break; } @@ -8189,6 +8195,7 @@ xmlGetNamespace(xmlParserCtxtPtr ctxt, const xmlChar *prefix) { return(NULL); return(ctxt->nsTab[i + 1]); } + if (ctxt->nsParent) return xmlGetNamespace(ctxt->nsParent, prefix); return(NULL); } @@ -12538,6 +12545,8 @@ xmlParseBalancedChunkMemoryInternal(xmlParserCtxtPtr oldctxt, ctxt->str_xmlns = xmlDictLookup(ctxt->dict, BAD_CAST "xmlns", 5); ctxt->str_xml_ns = xmlDictLookup(ctxt->dict, XML_XML_NAMESPACE, 36); + ctxt->nsParent = oldctxt; + oldsax = ctxt->sax; ctxt->sax = oldctxt->sax; xmlDetectSAX2(ctxt); diff --git a/patches/0001-Add-ICU-support-for-libxml.patch b/patches/0001-Add-ICU-support-for-libxml.patch new file mode 100644 index 00000000..401099dd --- /dev/null +++ b/patches/0001-Add-ICU-support-for-libxml.patch @@ -0,0 +1,559 @@ +From f1121648d0762cf9bf4e5117bfc1008447fb4080 Mon Sep 17 00:00:00 2001 +From: android +Date: Thu, 1 Apr 2010 11:46:35 -0700 +Subject: [PATCH] Add ICU support for libxml. + +This is derived from Jungshik's patch. The encoding.c is a copy from Chrome's source, +which has one extra modification than Jungshik's patch. 
+ +Issue:2557315 +Change-Id: I8e4c9e544660f3f943a15042756f7248d5afff8e +--- + Android.mk | 4 +- + encoding.c | 248 +++++++++++++++++++++++++++++++++++++++++- + include/libxml/encoding.h | 29 +++++ + include/libxml/parser.h | 3 +- + include/libxml/xmlversion.h | 11 ++- + parser.c | 9 ++ + xmlregexp.c | 2 +- + 7 files changed, 294 insertions(+), 12 deletions(-) + +diff --git a/Android.mk b/Android.mk +index 3d0ede8..08bf11f 100644 +--- a/Android.mk ++++ b/Android.mk +@@ -57,7 +57,7 @@ common_C_INCLUDES += \ + include $(CLEAR_VARS) + + LOCAL_SRC_FILES := $(common_SRC_FILES) +-LOCAL_C_INCLUDES += $(common_C_INCLUDES) ++LOCAL_C_INCLUDES += $(common_C_INCLUDES) external/icu4c/common + LOCAL_SHARED_LIBRARIES += $(common_SHARED_LIBRARIES) + LOCAL_CFLAGS += -fvisibility=hidden + +@@ -71,7 +71,7 @@ include $(BUILD_STATIC_LIBRARY) + + include $(CLEAR_VARS) + LOCAL_SRC_FILES := $(common_SRC_FILES) +-LOCAL_C_INCLUDES += $(common_C_INCLUDES) ++LOCAL_C_INCLUDES += $(common_C_INCLUDES) external/icu4c/common + LOCAL_SHARED_LIBRARIES += $(common_SHARED_LIBRARIES) + LOCAL_MODULE:= libxml2 + include $(BUILD_HOST_STATIC_LIBRARY) +diff --git a/encoding.c b/encoding.c +index e2df797..2abc32e 100644 +--- a/encoding.c ++++ b/encoding.c +@@ -58,7 +58,7 @@ static xmlCharEncodingAliasPtr xmlCharEncodingAliases = NULL; + static int xmlCharEncodingAliasesNb = 0; + static int xmlCharEncodingAliasesMax = 0; + +-#ifdef LIBXML_ICONV_ENABLED ++#if defined(LIBXML_ICONV_ENABLED) || defined(LIBXML_ICU_ENABLED) + #if 0 + #define DEBUG_ENCODING /* Define this to get encoding traces */ + #endif +@@ -97,6 +97,54 @@ xmlEncodingErr(xmlParserErrors error, const char *msg, const char *val) + NULL, 0, val, NULL, NULL, 0, 0, msg, val); + } + ++#ifdef LIBXML_ICU_ENABLED ++static uconv_t* ++openIcuConverter(const char* name, int toUnicode) ++{ ++ UErrorCode status = U_ZERO_ERROR; ++ uconv_t *conv = (uconv_t *) xmlMalloc(sizeof(uconv_t)); ++ if (conv == NULL) ++ return NULL; ++ ++ conv->uconv = ucnv_open(name, 
&status); ++ if (U_FAILURE(status)) ++ goto error; ++ ++ status = U_ZERO_ERROR; ++ if (toUnicode) { ++ ucnv_setToUCallBack(conv->uconv, UCNV_TO_U_CALLBACK_STOP, ++ NULL, NULL, NULL, &status); ++ } ++ else { ++ ucnv_setFromUCallBack(conv->uconv, UCNV_FROM_U_CALLBACK_STOP, ++ NULL, NULL, NULL, &status); ++ } ++ if (U_FAILURE(status)) ++ goto error; ++ ++ status = U_ZERO_ERROR; ++ conv->utf8 = ucnv_open("UTF-8", &status); ++ if (U_SUCCESS(status)) ++ return conv; ++ ++error: ++ if (conv->uconv) ++ ucnv_close(conv->uconv); ++ xmlFree(conv); ++ return NULL; ++} ++ ++static void ++closeIcuConverter(uconv_t *conv) ++{ ++ if (conv != NULL) { ++ ucnv_close(conv->uconv); ++ ucnv_close(conv->utf8); ++ xmlFree(conv); ++ } ++} ++#endif /* LIBXML_ICU_ENABLED */ ++ + /************************************************************************ + * * + * Conversions To/From UTF8 encoding * +@@ -1306,7 +1354,11 @@ xmlNewCharEncodingHandler(const char *name, + #ifdef LIBXML_ICONV_ENABLED + handler->iconv_in = NULL; + handler->iconv_out = NULL; +-#endif /* LIBXML_ICONV_ENABLED */ ++#endif ++#ifdef LIBXML_ICU_ENABLED ++ handler->uconv_in = NULL; ++ handler->uconv_out = NULL; ++#endif + + /* + * registers and returns the handler. 
+@@ -1371,7 +1423,7 @@ xmlInitCharEncodingHandlers(void) { + xmlNewCharEncodingHandler("ASCII", asciiToUTF8, NULL); + xmlNewCharEncodingHandler("US-ASCII", asciiToUTF8, NULL); + #endif /* LIBXML_OUTPUT_ENABLED */ +-#ifndef LIBXML_ICONV_ENABLED ++#if !defined(LIBXML_ICONV_ENABLED) && !defined(LIBXML_ICU_ENABLED) + #ifdef LIBXML_ISO8859X_ENABLED + xmlRegisterCharEncodingHandlersISO8859x (); + #endif +@@ -1576,6 +1628,10 @@ xmlFindCharEncodingHandler(const char *name) { + xmlCharEncodingHandlerPtr enc; + iconv_t icv_in, icv_out; + #endif /* LIBXML_ICONV_ENABLED */ ++#ifdef LIBXML_ICU_ENABLED ++ xmlCharEncodingHandlerPtr enc; ++ uconv_t *ucv_in, *ucv_out; ++#endif /* LIBXML_ICU_ENABLED */ + char upper[100]; + int i; + +@@ -1642,6 +1698,35 @@ xmlFindCharEncodingHandler(const char *name) { + "iconv : problems with filters for '%s'\n", name); + } + #endif /* LIBXML_ICONV_ENABLED */ ++#ifdef LIBXML_ICU_ENABLED ++ /* check whether icu can handle this */ ++ ucv_in = openIcuConverter(name, 1); ++ ucv_out = openIcuConverter(name, 0); ++ if (ucv_in != NULL && ucv_out != NULL) { ++ enc = (xmlCharEncodingHandlerPtr) ++ xmlMalloc(sizeof(xmlCharEncodingHandler)); ++ if (enc == NULL) { ++ closeIcuConverter(ucv_in); ++ closeIcuConverter(ucv_out); ++ return(NULL); ++ } ++ enc->name = xmlMemStrdup(name); ++ enc->input = NULL; ++ enc->output = NULL; ++ enc->uconv_in = ucv_in; ++ enc->uconv_out = ucv_out; ++#ifdef DEBUG_ENCODING ++ xmlGenericError(xmlGenericErrorContext, ++ "Found ICU converter handler for encoding %s\n", name); ++#endif ++ return enc; ++ } else if (ucv_in != NULL || ucv_out != NULL) { ++ closeIcuConverter(ucv_in); ++ closeIcuConverter(ucv_out); ++ xmlEncodingErr(XML_ERR_INTERNAL_ERROR, ++ "ICU converter : problems with filters for '%s'\n", name); ++ } ++#endif /* LIBXML_ICU_ENABLED */ + + #ifdef DEBUG_ENCODING + xmlGenericError(xmlGenericErrorContext, +@@ -1732,6 +1817,75 @@ xmlIconvWrapper(iconv_t cd, unsigned char *out, int *outlen, + + 
/************************************************************************ + * * ++ * ICU based generic conversion functions * ++ * * ++ ************************************************************************/ ++ ++#ifdef LIBXML_ICU_ENABLED ++/** ++ * xmlUconvWrapper: ++ * @cd: ICU uconverter data structure ++ * @toUnicode : non-zero if toUnicode. 0 otherwise. ++ * @out: a pointer to an array of bytes to store the result ++ * @outlen: the length of @out ++ * @in: a pointer to an array of ISO Latin 1 chars ++ * @inlen: the length of @in ++ * ++ * Returns 0 if success, or ++ * -1 by lack of space, or ++ * -2 if the transcoding fails (for *in is not valid utf8 string or ++ * the result of transformation can't fit into the encoding we want), or ++ * -3 if there the last byte can't form a single output char. ++ * ++ * The value of @inlen after return is the number of octets consumed ++ * as the return value is positive, else unpredictable. ++ * The value of @outlen after return is the number of ocetes consumed. ++ */ ++static int ++xmlUconvWrapper(uconv_t *cd, int toUnicode, unsigned char *out, int *outlen, ++ const unsigned char *in, int *inlen) { ++ const char *ucv_in = (const char *) in; ++ char *ucv_out = (char *) out; ++ UErrorCode err = U_ZERO_ERROR; ++ ++ if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL)) { ++ if (outlen != NULL) *outlen = 0; ++ return(-1); ++ } ++ ++ /* ++ * TODO(jungshik) ++ * 1. is ucnv_convert(To|From)Algorithmic better? ++ * 2. had we better use an explicit pivot buffer? ++ * 3. error returned comes from 'fromUnicode' only even ++ * when toUnicode is true ! 
++ */ ++ if (toUnicode) { ++ /* encoding => UTF-16 => UTF-8 */ ++ ucnv_convertEx(cd->utf8, cd->uconv, &ucv_out, ucv_out + *outlen, ++ &ucv_in, ucv_in + *inlen, NULL, NULL, NULL, NULL, ++ 0, TRUE, &err); ++ } else { ++ /* UTF-8 => UTF-16 => encoding */ ++ ucnv_convertEx(cd->uconv, cd->utf8, &ucv_out, ucv_out + *outlen, ++ &ucv_in, ucv_in + *inlen, NULL, NULL, NULL, NULL, ++ 0, TRUE, &err); ++ } ++ *inlen = ucv_in - (const char*) in; ++ *outlen = ucv_out - (char *) out; ++ if (U_SUCCESS(err)) ++ return 0; ++ if (err == U_BUFFER_OVERFLOW_ERROR) ++ return -1; ++ if (err == U_INVALID_CHAR_FOUND || err == U_ILLEGAL_CHAR_FOUND) ++ return -2; ++ /* if (err == U_TRUNCATED_CHAR_FOUND) */ ++ return -3; ++} ++#endif /* LIBXML_ICU_ENABLED */ ++ ++/************************************************************************ ++ * * + * The real API used by libxml for on-the-fly conversion * + * * + ************************************************************************/ +@@ -1794,6 +1948,16 @@ xmlCharEncFirstLine(xmlCharEncodingHandler *handler, xmlBufferPtr out, + if (ret == -1) ret = -3; + } + #endif /* LIBXML_ICONV_ENABLED */ ++#ifdef LIBXML_ICU_ENABLED ++ else if (handler->uconv_in != NULL) { ++ ret = xmlUconvWrapper(handler->uconv_in, 1, &out->content[out->use], ++ &written, in->content, &toconv); ++ xmlBufferShrink(in, toconv); ++ out->use += written; ++ out->content[out->use] = 0; ++ if (ret == -1) ret = -3; ++ } ++#endif /* LIBXML_ICU_ENABLED */ + #ifdef DEBUG_ENCODING + switch (ret) { + case 0: +@@ -1879,6 +2043,17 @@ xmlCharEncInFunc(xmlCharEncodingHandler * handler, xmlBufferPtr out, + ret = -3; + } + #endif /* LIBXML_ICONV_ENABLED */ ++#ifdef LIBXML_ICU_ENABLED ++ else if (handler->uconv_in != NULL) { ++ ret = xmlUconvWrapper(handler->uconv_in, 1, &out->content[out->use], ++ &written, in->content, &toconv); ++ xmlBufferShrink(in, toconv); ++ out->use += written; ++ out->content[out->use] = 0; ++ if (ret == -1) ++ ret = -3; ++ } ++#endif /* LIBXML_ICU_ENABLED */ + switch 
(ret) { + case 0: + #ifdef DEBUG_ENCODING +@@ -1979,6 +2154,15 @@ retry: + out->content[out->use] = 0; + } + #endif /* LIBXML_ICONV_ENABLED */ ++#ifdef LIBXML_ICU_ENABLED ++ else if (handler->uconv_out != NULL) { ++ ret = xmlUconvWrapper(handler->uconv_out, 0, ++ &out->content[out->use], ++ &written, NULL, &toconv); ++ out->use += written; ++ out->content[out->use] = 0; ++ } ++#endif /* LIBXML_ICU_ENABLED */ + #ifdef DEBUG_ENCODING + xmlGenericError(xmlGenericErrorContext, + "initialized encoder\n"); +@@ -2003,7 +2187,7 @@ retry: + xmlBufferShrink(in, toconv); + out->use += written; + writtentot += written; +- } ++ } + out->content[out->use] = 0; + } + #ifdef LIBXML_ICONV_ENABLED +@@ -2025,6 +2209,26 @@ retry: + } + } + #endif /* LIBXML_ICONV_ENABLED */ ++#ifdef LIBXML_ICU_ENABLED ++ else if (handler->uconv_out != NULL) { ++ ret = xmlUconvWrapper(handler->uconv_out, 0, ++ &out->content[out->use], ++ &written, in->content, &toconv); ++ xmlBufferShrink(in, toconv); ++ out->use += written; ++ writtentot += written; ++ out->content[out->use] = 0; ++ if (ret == -1) { ++ if (written > 0) { ++ /* ++ * Can be a limitation of iconv ++ */ ++ goto retry; ++ } ++ ret = -3; ++ } ++ } ++#endif /* LIBXML_ICU_ENABLED */ + else { + xmlEncodingErr(XML_I18N_NO_OUTPUT, + "xmlCharEncOutFunc: no output function !\n", NULL); +@@ -2137,6 +2341,22 @@ xmlCharEncCloseFunc(xmlCharEncodingHandler *handler) { + xmlFree(handler); + } + #endif /* LIBXML_ICONV_ENABLED */ ++#ifdef LIBXML_ICU_ENABLED ++ if ((handler->uconv_out != NULL) || (handler->uconv_in != NULL)) { ++ if (handler->name != NULL) ++ xmlFree(handler->name); ++ handler->name = NULL; ++ if (handler->uconv_out != NULL) { ++ closeIcuConverter(handler->uconv_out); ++ handler->uconv_out = NULL; ++ } ++ if (handler->uconv_in != NULL) { ++ closeIcuConverter(handler->uconv_in); ++ handler->uconv_in = NULL; ++ } ++ xmlFree(handler); ++ } ++#endif + #ifdef DEBUG_ENCODING + if (ret) + xmlGenericError(xmlGenericErrorContext, +@@ -2212,6 
+2432,22 @@ xmlByteConsumed(xmlParserCtxtPtr ctxt) { + cur += toconv; + } while (ret == -2); + #endif ++#ifdef LIBXML_ICU_ENABLED ++ } else if (handler->uconv_out != NULL) { ++ do { ++ toconv = in->end - cur; ++ written = 32000; ++ ret = xmlUconvWrapper(handler->uconv_out, 0, &convbuf[0], ++ &written, cur, &toconv); ++ if (ret < 0) { ++ if (written > 0) ++ ret = -2; ++ else ++ return(-1); ++ } ++ unused += written; ++ cur += toconv; ++ } while (ret == -2); + } else { + /* could not find a converter */ + return(-1); +@@ -2223,8 +2459,9 @@ xmlByteConsumed(xmlParserCtxtPtr ctxt) { + } + return(in->consumed + (in->cur - in->base)); + } ++#endif + +-#ifndef LIBXML_ICONV_ENABLED ++#if !defined(LIBXML_ICONV_ENABLED) && !defined(LIBXML_ICU_ENABLED) + #ifdef LIBXML_ISO8859X_ENABLED + + /** +@@ -3296,4 +3533,3 @@ xmlRegisterCharEncodingHandlersISO8859x (void) { + + #define bottom_encoding + #include "elfgcchack.h" +- +diff --git a/include/libxml/encoding.h b/include/libxml/encoding.h +index c74b25f..c68ec10 100644 +--- a/include/libxml/encoding.h ++++ b/include/libxml/encoding.h +@@ -26,6 +26,24 @@ + + #ifdef LIBXML_ICONV_ENABLED + #include <iconv.h> ++#else ++#ifdef LIBXML_ICU_ENABLED ++#include <unicode/ucnv.h> ++#if 0 ++/* Forward-declare UConverter here rather than pulling in <unicode/ucnv.h> ++ * to prevent unwanted ICU symbols being exposed to users of libxml2. ++ * One particular case is Qt4 conflicting on UChar32. ++ */ ++#include <stdint.h> ++struct UConverter; ++typedef struct UConverter UConverter; ++#ifdef _MSC_VER ++typedef wchar_t UChar; ++#else ++typedef uint16_t UChar; ++#endif ++#endif ++#endif + #endif + #ifdef __cplusplus + extern "C" { +@@ -125,6 +143,13 @@ typedef int (* xmlCharEncodingOutputFunc)(unsigned char *out, int *outlen, + * Block defining the handlers for non UTF-8 encodings. + * If iconv is supported, there are two extra fields. 
+ */ ++#ifdef LIBXML_ICU_ENABLED ++struct _uconv_t { ++ UConverter *uconv; /* for conversion between an encoding and UTF-16 */ ++ UConverter *utf8; /* for conversion between UTF-8 and UTF-16 */ ++}; ++typedef struct _uconv_t uconv_t; ++#endif + + typedef struct _xmlCharEncodingHandler xmlCharEncodingHandler; + typedef xmlCharEncodingHandler *xmlCharEncodingHandlerPtr; +@@ -136,6 +161,10 @@ struct _xmlCharEncodingHandler { + iconv_t iconv_in; + iconv_t iconv_out; + #endif /* LIBXML_ICONV_ENABLED */ ++#ifdef LIBXML_ICU_ENABLED ++ uconv_t *uconv_in; ++ uconv_t *uconv_out; ++#endif /* LIBXML_ICU_ENABLED */ + }; + + #ifdef __cplusplus +diff --git a/include/libxml/parser.h b/include/libxml/parser.h +index 567addb..bd9de24 100644 +--- a/include/libxml/parser.h ++++ b/include/libxml/parser.h +@@ -276,6 +276,7 @@ struct _xmlParserCtxt { + int nsNr; /* the number of inherited namespaces */ + int nsMax; /* the size of the arrays */ + const xmlChar * *nsTab; /* the array of prefix/namespace name */ ++ struct _xmlParserCtxt *nsParent; /* parent context to inherit namespaces from * */ + int *attallocs; /* which attribute were allocated */ + void * *pushTab; /* array of data for push */ + xmlHashTablePtr attsDefault; /* defaulted attributes if any */ +@@ -1213,6 +1214,7 @@ typedef enum { + XML_WITH_DEBUG_MEM = 29, + XML_WITH_DEBUG_RUN = 30, + XML_WITH_ZLIB = 31, ++ XML_WITH_ICU = 32, + XML_WITH_NONE = 99999 /* just to be sure of allocation size */ + } xmlFeature; + +@@ -1223,4 +1225,3 @@ XMLPUBFUN int XMLCALL + } + #endif + #endif /* __XML_PARSER_H__ */ +- +diff --git a/include/libxml/xmlversion.h b/include/libxml/xmlversion.h +index a98e00c..fb2b8ca 100644 +--- a/include/libxml/xmlversion.h ++++ b/include/libxml/xmlversion.h +@@ -269,6 +269,15 @@ XMLPUBFUN void XMLCALL xmlCheckVersion(int version); + #endif + + /** ++ * LIBXML_ICU_ENABLED: ++ * ++ * Whether icu support is available ++ */ ++#if 1 ++#define LIBXML_ICU_ENABLED ++#endif ++ ++/** + * LIBXML_ISO8859X_ENABLED: + * + * 
Whether ISO-8859-* support is made available in case iconv is not +@@ -454,5 +463,3 @@ XMLPUBFUN void XMLCALL xmlCheckVersion(int version); + } + #endif /* __cplusplus */ + #endif +- +- +diff --git a/parser.c b/parser.c +index 9db664f..306b84d 100644 +--- a/parser.c ++++ b/parser.c +@@ -937,6 +937,12 @@ xmlHasFeature(xmlFeature feature) + #else + return(0); + #endif ++ case XML_WITH_ICU: ++#ifdef LIBXML_ICU_ENABLED ++ return(1); ++#else ++ return(0); ++#endif + default: + break; + } +@@ -8189,6 +8195,7 @@ xmlGetNamespace(xmlParserCtxtPtr ctxt, const xmlChar *prefix) { + return(NULL); + return(ctxt->nsTab[i + 1]); + } ++ if (ctxt->nsParent) return xmlGetNamespace(ctxt->nsParent, prefix); + return(NULL); + } + +@@ -12538,6 +12545,8 @@ xmlParseBalancedChunkMemoryInternal(xmlParserCtxtPtr oldctxt, + ctxt->str_xmlns = xmlDictLookup(ctxt->dict, BAD_CAST "xmlns", 5); + ctxt->str_xml_ns = xmlDictLookup(ctxt->dict, XML_XML_NAMESPACE, 36); + ++ ctxt->nsParent = oldctxt; ++ + oldsax = ctxt->sax; + ctxt->sax = oldctxt->sax; + xmlDetectSAX2(ctxt); +diff --git a/xmlregexp.c b/xmlregexp.c +index 73598a5..4258a08 100644 +--- a/xmlregexp.c ++++ b/xmlregexp.c +@@ -6401,7 +6401,7 @@ xmlExpHashNameComputeKey(const xmlChar *name) { + if (name != NULL) { + value += 30 * (*name); + while ((ch = *name++) != 0) { +- value = value ^ ((value << 5) + (value >> 3) + (unsigned long)ch); ++ value = value ^ ((value << 5) + (value >> 3) + (unsigned short)ch); + } + } + return (value); +-- +1.7.0.1 + diff --git a/xmlregexp.c b/xmlregexp.c index 73598a53..4258a086 100644 --- a/xmlregexp.c +++ b/xmlregexp.c @@ -6401,7 +6401,7 @@ xmlExpHashNameComputeKey(const xmlChar *name) { if (name != NULL) { value += 30 * (*name); while ((ch = *name++) != 0) { - value = value ^ ((value << 5) + (value >> 3) + (unsigned long)ch); + value = value ^ ((value << 5) + (value >> 3) + (unsigned short)ch); } } return (value); |