diff options
author | Lucas Eckels <eckels@google.com> | 2016-08-23 23:30:19 +0000 |
---|---|---|
committer | android-build-merger <android-build-merger@google.com> | 2016-08-23 23:30:19 +0000 |
commit | 680f240bda39c798480bef3bb36c671af1f575c1 (patch) | |
tree | 099ebc3ecf8b51d0df490fedb34240896e93d3bb | |
parent | f919a5e238f35a8c7359ea4b8b43e9da2bb0da81 (diff) | |
parent | 1189ea57585f7c1d0365fbda1c696e96d3a25d37 (diff) | |
download | unicode-o-preview.tar.gz |
Add unicode source. am: dc4699f0a7 am: 92ae600936 — android-wear-o-preview-4, android-wear-o-preview-3, android-o-preview-4, android-o-preview-3, android-o-preview-2, android-o-preview-1, android-o-iot-preview-5, android-n-iot-preview-4, o-preview, o-iot-preview-5, n-iot-preview-4
am: 1189ea5758
Change-Id: Ic6780efcaf9168eccd8ba755a83f75cf78afdac6
-rw-r--r-- | .gitignore | 31 | ||||
-rw-r--r-- | CVTUTF7.C | 299 | ||||
-rw-r--r-- | CVTUTF7.H | 79 | ||||
-rw-r--r-- | ConvertUTF.c | 549 | ||||
-rw-r--r-- | ConvertUTF.h | 149 | ||||
-rw-r--r-- | ExpectedOutput.txt | 21 | ||||
-rw-r--r-- | harness.c | 446 | ||||
-rw-r--r-- | readme.txt | 44 | ||||
-rw-r--r-- | testunicode.sln | 20 | ||||
-rw-r--r-- | testunicode.vcproj | 209 | ||||
-rw-r--r-- | testunicode.xcodeproj/project.pbxproj | 188 |
11 files changed, 2035 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..ff6170f --- /dev/null +++ b/.gitignore @@ -0,0 +1,31 @@ +/build +/source/build +*.mode1v3 +*.pbxuser +*.pbxindex/ +!user.pbxuser +/*.log +*.user +*.ncb +*.suo +*.pdb +*.pdf +*.html +*.idb +*.o +*.lo +*.a +*.so +*.so.0 +*.la +.deps +.libs +*.pyc +.DS_Store +# Emacs and other editor backup files +*~ + +# builds on Windows +Debug/ +Release/ +release/ diff --git a/CVTUTF7.C b/CVTUTF7.C new file mode 100644 index 0000000..6958753 --- /dev/null +++ b/CVTUTF7.C @@ -0,0 +1,299 @@ +/* ================================================================ */ +/* +File: ConvertUTF7.c +Author: David B. Goldsmith +Copyright (C) 1994, 1996 IBM Corporation All rights reserved. +Revisions: Header update only July, 2001. + +This code is copyrighted. Under the copyright laws, this code may not +be copied, in whole or part, without prior written consent of IBM Corporation. + +IBM Corporation grants the right to use this code as long as this ENTIRE +copyright notice is reproduced in the code. The code is provided +AS-IS, AND IBM CORPORATION DISCLAIMS ALL WARRANTIES, EITHER EXPRESS OR +IMPLIED, INCLUDING, BUT NOT LIMITED TO IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. IN NO EVENT +WILL IBM CORPORATION BE LIABLE FOR ANY DAMAGES WHATSOEVER (INCLUDING, +WITHOUT LIMITATION, DAMAGES FOR LOSS OF BUSINESS PROFITS, BUSINESS +INTERRUPTION, LOSS OF BUSINESS INFORMATION, OR OTHER PECUNIARY +LOSS) ARISING OUT OF THE USE OR INABILITY TO USE THIS CODE, EVEN +IF IBM CORPORATION HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. +BECAUSE SOME STATES DO NOT ALLOW THE EXCLUSION OR LIMITATION OF +LIABILITY FOR CONSEQUENTIAL OR INCIDENTAL DAMAGES, THE ABOVE +LIMITATION MAY NOT APPLY TO YOU. 
+ +RESTRICTED RIGHTS LEGEND: Use, duplication, or disclosure by the +government is subject to restrictions as set forth in subparagraph +(c)(l)(ii) of the Rights in Technical Data and Computer Software +clause at DFARS 252.227-7013 and FAR 52.227-19. + +This code may be protected by one or more U.S. and International +Patents. + +*/ + +#include "CVTUTF7.H" + +static char base64[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; +static short invbase64[128]; + +static char direct[] = + "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789'(),-./:?"; +static char optional[] = "!\"#$%&*;<=>@[]^_`{|}"; +static char spaces[] = " \011\015\012"; /* space, tab, return, line feed */ +static char mustshiftsafe[128]; +static char mustshiftopt[128]; + +static int needtables = 1; + +#define SHIFT_IN '+' +#define SHIFT_OUT '-' + +static void +tabinit() +{ + int i, limit; + + for (i = 0; i < 128; ++i) + { + mustshiftopt[i] = mustshiftsafe[i] = 1; + invbase64[i] = -1; + } + limit = strlen(direct); + for (i = 0; i < limit; ++i) + mustshiftopt[direct[i]] = mustshiftsafe[direct[i]] = 0; + limit = strlen(spaces); + for (i = 0; i < limit; ++i) + mustshiftopt[spaces[i]] = mustshiftsafe[spaces[i]] = 0; + limit = strlen(optional); + for (i = 0; i < limit; ++i) + mustshiftopt[optional[i]] = 0; + limit = strlen(base64); + for (i = 0; i < limit; ++i) + invbase64[base64[i]] = i; + + needtables = 0; +} + +#define DECLARE_BIT_BUFFER register unsigned long BITbuffer = 0, buffertemp = 0; int bufferbits = 0 +#define BITS_IN_BUFFER bufferbits +#define WRITE_N_BITS(x, n) ((BITbuffer |= ( ((x) & ~(-1L<<(n))) << (32-(n)-bufferbits) ) ), bufferbits += (n) ) +#define READ_N_BITS(n) ((buffertemp = (BITbuffer >> (32-(n)))), (BITbuffer <<= (n)), (bufferbits -= (n)), buffertemp) +#define TARGETCHECK {if (target >= targetEnd) {result = targetExhausted; break;}} + +ConversionResult ConvertUCS2toUTF7( + UCS2** sourceStart, UCS2* sourceEnd, + char** targetStart, char* targetEnd, + 
int optional, int verbose) +{ + ConversionResult result = ok; + DECLARE_BIT_BUFFER; + int shifted = 0, needshift = 0, done = 0; + register UCS2 *source = *sourceStart; + register char *target = *targetStart; + char *mustshift; + + if (needtables) + tabinit(); + + if (optional) + mustshift = mustshiftopt; + else + mustshift = mustshiftsafe; + + do + { + register UCS2 r; + + if (!(done = (source >= sourceEnd))) + r = *source++; + needshift = (!done && ((r > 0x7f) || mustshift[r])); + + if (needshift && !shifted) + { + TARGETCHECK; + *target++ = SHIFT_IN; + /* Special case handling of the SHIFT_IN character */ + if (r == (UCS2)SHIFT_IN) { + TARGETCHECK; + *target++ = SHIFT_OUT; + } + else + shifted = 1; + } + + if (shifted) + { + /* Either write the character to the bit buffer, or pad + the bit buffer out to a full base64 character. + */ + if (needshift) + WRITE_N_BITS(r, 16); + else + WRITE_N_BITS(0, (6 - (BITS_IN_BUFFER % 6))%6); + + /* Flush out as many full base64 characters as possible + from the bit buffer. + */ + while ((target < targetEnd) && BITS_IN_BUFFER >= 6) + { + *target++ = base64[READ_N_BITS(6)]; + } + + if (BITS_IN_BUFFER >= 6) + TARGETCHECK; + + if (!needshift) + { + /* Write the explicit shift out character if + 1) The caller has requested we always do it, or + 2) The directly encoded character is in the + base64 set, or + 3) The directly encoded character is SHIFT_OUT. + */ + if (verbose || ((!done) && (invbase64[r] >=0 || r == SHIFT_OUT))) + { + TARGETCHECK; + *target++ = SHIFT_OUT; + } + shifted = 0; + } + } + + /* The character can be directly encoded as ASCII. 
*/ + if (!needshift && !done) + { + TARGETCHECK; + *target++ = (char) r; + } + + } + while (!done); + + *sourceStart = source; + *targetStart = target; + return result; +} + +ConversionResult ConvertUTF7toUCS2( + char** sourceStart, char* sourceEnd, + UCS2** targetStart, UCS2* targetEnd) +{ + ConversionResult result = ok; + DECLARE_BIT_BUFFER; + int shifted = 0, first = 0, wroteone = 0, base64EOF, base64value, done; + unsigned int c, prevc; + unsigned long junk; + register char *source = *sourceStart; + register UCS2 *target = *targetStart; + + if (needtables) + tabinit(); + + do + { + /* read an ASCII character c */ + if (!(done = (source >= sourceEnd))) + c = *source++; + if (shifted) + { + /* We're done with a base64 string if we hit EOF, it's not a valid + ASCII character, or it's not in the base64 set. + */ + base64EOF = done || (c > 0x7f) || (base64value = invbase64[c]) < 0; + if (base64EOF) + { + shifted = 0; + /* If the character causing us to drop out was SHIFT_IN or + SHIFT_OUT, it may be a special escape for SHIFT_IN. The + test for SHIFT_IN is not necessary, but allows an alternate + form of UTF-7 where SHIFT_IN is escaped by SHIFT_IN. This + only works for some values of SHIFT_IN. + */ + if (!done && (c == SHIFT_IN || c == SHIFT_OUT)) + { + /* get another character c */ + prevc = c; + if (!(done = (source >= sourceEnd))) + c = *source++; + /* If no base64 characters were encountered, and the + character terminating the shift sequence was + SHIFT_OUT, then it's a special escape for SHIFT_IN. 
+ */ + if (first && prevc == SHIFT_OUT) + { + /* write SHIFT_IN unicode */ + TARGETCHECK; + *target++ = (UCS2)SHIFT_IN; + } + else if (!wroteone) + { + result = sourceCorrupt; + /* fprintf(stderr, "UTF7: empty sequence near byte %ld in input\n", source-sourceStart) */; + } + } + else if (!wroteone) + { + result = sourceCorrupt; + /* fprintf(stderr, "UTF7: empty sequence near byte %ld in input\n", source-sourceStart) */; + } + } + else + { + /* Add another 6 bits of base64 to the bit buffer. */ + WRITE_N_BITS(base64value, 6); + first = 0; + } + + /* Extract as many full 16 bit characters as possible from the + bit buffer. + */ + while (BITS_IN_BUFFER >= 16 && (target < targetEnd)) + { + /* write a unicode */ + *target++ = READ_N_BITS(16); + wroteone = 1; + } + + if (BITS_IN_BUFFER >= 16) + TARGETCHECK; + + if (base64EOF) + { + junk = READ_N_BITS(BITS_IN_BUFFER); + if (junk) + { + result = sourceCorrupt; + /* fprintf(stderr, "UTF7: non-zero pad bits near byte %ld in input\n", source-sourceStart) */; + } + } + } + + if (!shifted && !done) + { + if (c == SHIFT_IN) + { + shifted = 1; + first = 1; + wroteone = 0; + } + else + { + /* It must be a directly encoded character. */ + if (c > 0x7f) + { + result = sourceCorrupt; + /* fprintf(stderr, "UTF7: non-ASCII character near byte %ld in input\n", source-sourceStart) */; + } + /* write a unicode */ + TARGETCHECK; + *target++ = c; + } + } + } + while (!done); + + *sourceStart = source; + *targetStart = target; + return result; +} diff --git a/CVTUTF7.H b/CVTUTF7.H new file mode 100644 index 0000000..362fdd0 --- /dev/null +++ b/CVTUTF7.H @@ -0,0 +1,79 @@ +/* ================================================================ */ +/* +File: ConvertUTF7.h +Author: David B. Goldsmith +Copyright (C) 1994 IBM Corporation All rights reserved. +Revisions: Header update only July, 2001. + +This code is copyrighted. Under the copyright laws, this code may not +be copied, in whole or part, without prior written consent of IBM Corporation. 
+ +IBM Corporation grants the right to use this code as long as this ENTIRE +copyright notice is reproduced in the code. The code is provided +AS-IS, AND IBM CORPORATION DISCLAIMS ALL WARRANTIES, EITHER EXPRESS OR +IMPLIED, INCLUDING, BUT NOT LIMITED TO IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. IN NO EVENT +WILL IBM CORPORATION BE LIABLE FOR ANY DAMAGES WHATSOEVER (INCLUDING, +WITHOUT LIMITATION, DAMAGES FOR LOSS OF BUSINESS PROFITS, BUSINESS +INTERRUPTION, LOSS OF BUSINESS INFORMATION, OR OTHER PECUNIARY +LOSS) ARISING OUT OF THE USE OR INABILITY TO USE THIS CODE, EVEN +IF IBM CORPORATION HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. +BECAUSE SOME STATES DO NOT ALLOW THE EXCLUSION OR LIMITATION OF +LIABILITY FOR CONSEQUENTIAL OR INCIDENTAL DAMAGES, THE ABOVE +LIMITATION MAY NOT APPLY TO YOU. + +RESTRICTED RIGHTS LEGEND: Use, duplication, or disclosure by the +government is subject to restrictions as set forth in subparagraph +(c)(l)(ii) of the Rights in Technical Data and Computer Software +clause at DFARS 252.227-7013 and FAR 52.227-19. + +This code may be protected by one or more U.S. and International +Patents. + +*/ +/* ================================================================ */ + +/* ================================================================ */ +/* The following definitions are compiler-specific. + I would use wchar_t for UCS2/UTF16, except that the C standard + does not guarantee that it has at least 16 bits, so wchar_t is + no more portable than unsigned short! +*/ + +typedef unsigned short UCS2; + +/* ================================================================ */ +/* Each of these routines converts the text between *sourceStart and +sourceEnd, putting the result into the buffer between *targetStart and +targetEnd. Note: the end pointers are *after* the last item: e.g. +*(sourceEnd - 1) is the last item. 
+ + The return result indicates whether the conversion was successful, +and if not, whether the problem was in the source or target buffers. + + After the conversion, *sourceStart and *targetStart are both +updated to point to the end of last text successfully converted in +the respective buffers. + + In ConvertUCS2toUTF7, optional indicates whether UTF-7 optional +characters should be directly encoded, and verbose controls whether the +shift-out character, "-", is always emitted at the end of a shifted +sequence. +*/ + +typedef enum { + ok, /* conversion successful */ + sourceCorrupt, /* source contains invalid UTF-7 */ + targetExhausted /* insuff. room in target for conversion */ +} ConversionResult; + +extern ConversionResult ConvertUCS2toUTF7 ( + UCS2** sourceStart, UCS2* sourceEnd, + char** targetStart, char* targetEnd, + int optional, int verbose); + +extern ConversionResult ConvertUTF7toUCS2 ( + char** sourceStart, char* sourceEnd, + UCS2** targetStart, UCS2* targetEnd); + +/* ================================================================ */ diff --git a/ConvertUTF.c b/ConvertUTF.c new file mode 100644 index 0000000..9e836fa --- /dev/null +++ b/ConvertUTF.c @@ -0,0 +1,549 @@ +/* + * Copyright 2001-2004 Unicode, Inc. + * + * Disclaimer + * + * This source code is provided as is by Unicode, Inc. No claims are + * made as to fitness for any particular purpose. No warranties of any + * kind are expressed or implied. The recipient agrees to determine + * applicability of information provided. If this file has been + * purchased on magnetic or optical media from Unicode, Inc., the + * sole remedy for any claim will be exchange of defective media + * within 90 days of receipt. + * + * Limitations on Rights to Redistribute This Code + * + * Unicode, Inc. 
hereby grants the right to freely use the information + * supplied in this file in the creation of products supporting the + * Unicode Standard, and to make copies of this file in any form + * for internal or external distribution as long as this notice + * remains attached. + */ + +/* --------------------------------------------------------------------- + + Conversions between UTF32, UTF-16, and UTF-8. Source code file. + Author: Mark E. Davis, 1994. + Rev History: Rick McGowan, fixes & updates May 2001. + Sept 2001: fixed const & error conditions per + mods suggested by S. Parent & A. Lillich. + June 2002: Tim Dodd added detection and handling of incomplete + source sequences, enhanced error detection, added casts + to eliminate compiler warnings. + July 2003: slight mods to back out aggressive FFFE detection. + Jan 2004: updated switches in from-UTF8 conversions. + Oct 2004: updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions. + + See the header file "ConvertUTF.h" for complete documentation. 
+ +------------------------------------------------------------------------ */ + + +#include "ConvertUTF.h" +#ifdef CVTUTF_DEBUG +#include <stdio.h> +#endif + +static const int halfShift = 10; /* used for shifting by 10 bits */ + +static const UTF32 halfBase = 0x0010000UL; +static const UTF32 halfMask = 0x3FFUL; + +#define UNI_SUR_HIGH_START (UTF32)0xD800 +#define UNI_SUR_HIGH_END (UTF32)0xDBFF +#define UNI_SUR_LOW_START (UTF32)0xDC00 +#define UNI_SUR_LOW_END (UTF32)0xDFFF +#define false 0 +#define true 1 + +/* --------------------------------------------------------------------- */ + +ConversionResult ConvertUTF32toUTF16 ( + const UTF32** sourceStart, const UTF32* sourceEnd, + UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) { + ConversionResult result = conversionOK; + const UTF32* source = *sourceStart; + UTF16* target = *targetStart; + while (source < sourceEnd) { + UTF32 ch; + if (target >= targetEnd) { + result = targetExhausted; break; + } + ch = *source++; + if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */ + /* UTF-16 surrogate values are illegal in UTF-32; 0xffff or 0xfffe are both reserved values */ + if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) { + if (flags == strictConversion) { + --source; /* return to the illegal value itself */ + result = sourceIllegal; + break; + } else { + *target++ = UNI_REPLACEMENT_CHAR; + } + } else { + *target++ = (UTF16)ch; /* normal case */ + } + } else if (ch > UNI_MAX_LEGAL_UTF32) { + if (flags == strictConversion) { + result = sourceIllegal; + } else { + *target++ = UNI_REPLACEMENT_CHAR; + } + } else { + /* target is a character in range 0xFFFF - 0x10FFFF. */ + if (target + 1 >= targetEnd) { + --source; /* Back up source pointer! 
*/ + result = targetExhausted; break; + } + ch -= halfBase; + *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START); + *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START); + } + } + *sourceStart = source; + *targetStart = target; + return result; +} + +/* --------------------------------------------------------------------- */ + +ConversionResult ConvertUTF16toUTF32 ( + const UTF16** sourceStart, const UTF16* sourceEnd, + UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) { + ConversionResult result = conversionOK; + const UTF16* source = *sourceStart; + UTF32* target = *targetStart; + UTF32 ch, ch2; + while (source < sourceEnd) { + const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */ + ch = *source++; + /* If we have a surrogate pair, convert to UTF32 first. */ + if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) { + /* If the 16 bits following the high surrogate are in the source buffer... */ + if (source < sourceEnd) { + ch2 = *source; + /* If it's a low surrogate, convert to UTF32. */ + if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) { + ch = ((ch - UNI_SUR_HIGH_START) << halfShift) + + (ch2 - UNI_SUR_LOW_START) + halfBase; + ++source; + } else if (flags == strictConversion) { /* it's an unpaired high surrogate */ + --source; /* return to the illegal value itself */ + result = sourceIllegal; + break; + } + } else { /* We don't have the 16 bits following the high surrogate. */ + --source; /* return to the high surrogate */ + result = sourceExhausted; + break; + } + } else if (flags == strictConversion) { + /* UTF-16 surrogate values are illegal in UTF-32 */ + if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) { + --source; /* return to the illegal value itself */ + result = sourceIllegal; + break; + } + } + if (target >= targetEnd) { + source = oldSource; /* Back up source pointer! 
*/ + result = targetExhausted; break; + } + *target++ = ch; + } + *sourceStart = source; + *targetStart = target; +#ifdef CVTUTF_DEBUG +if (result == sourceIllegal) { + fprintf(stderr, "ConvertUTF16toUTF32 illegal seq 0x%04x,%04x\n", ch, ch2); + fflush(stderr); +} +#endif + return result; +} + +/* --------------------------------------------------------------------- */ + +/* + * Index into the table below with the first byte of a UTF-8 sequence to + * get the number of trailing bytes that are supposed to follow it. + * Note that *legal* UTF-8 values can't have 4 or 5-bytes. The table is + * left as-is for anyone who may want to do such conversion, which was + * allowed in earlier algorithms. + */ +static const char trailingBytesForUTF8[256] = { + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 +}; + +/* + * Magic values subtracted from a buffer value during UTF8 conversion. + * This table contains as many values as there might be trailing bytes + * in a UTF-8 sequence. + */ +static const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL, + 0x03C82080UL, 0xFA082080UL, 0x82082080UL }; + +/* + * Once the bits are split out into bytes of UTF-8, this is a mask OR-ed + * into the first byte, depending on how many bytes follow. There are + * as many entries in this table as there are UTF-8 sequence types. + * (I.e., one byte sequence, two byte... etc.). Remember that sequencs + * for *legal* UTF-8 will be 4 or fewer bytes total. 
+ */ +static const UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC }; + +/* --------------------------------------------------------------------- */ + +/* The interface converts a whole buffer to avoid function-call overhead. + * Constants have been gathered. Loops & conditionals have been removed as + * much as possible for efficiency, in favor of drop-through switches. + * (See "Note A" at the bottom of the file for equivalent code.) + * If your compiler supports it, the "isLegalUTF8" call can be turned + * into an inline function. + */ + +/* --------------------------------------------------------------------- */ + +ConversionResult ConvertUTF16toUTF8 ( + const UTF16** sourceStart, const UTF16* sourceEnd, + UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) { + ConversionResult result = conversionOK; + const UTF16* source = *sourceStart; + UTF8* target = *targetStart; + while (source < sourceEnd) { + UTF32 ch; + unsigned short bytesToWrite = 0; + const UTF32 byteMask = 0xBF; + const UTF32 byteMark = 0x80; + const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */ + ch = *source++; + /* If we have a surrogate pair, convert to UTF32 first. */ + if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) { + /* If the 16 bits following the high surrogate are in the source buffer... */ + if (source < sourceEnd) { + UTF32 ch2 = *source; + /* If it's a low surrogate, convert to UTF32. */ + if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) { + ch = ((ch - UNI_SUR_HIGH_START) << halfShift) + + (ch2 - UNI_SUR_LOW_START) + halfBase; + ++source; + } else if (flags == strictConversion) { /* it's an unpaired high surrogate */ + --source; /* return to the illegal value itself */ + result = sourceIllegal; + break; + } + } else { /* We don't have the 16 bits following the high surrogate. 
*/ + --source; /* return to the high surrogate */ + result = sourceExhausted; + break; + } + } else if (flags == strictConversion) { + /* UTF-16 surrogate values are illegal in UTF-32 */ + if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) { + --source; /* return to the illegal value itself */ + result = sourceIllegal; + break; + } + } + + // TPN: substitute all control characters except for NULL, TAB, LF or CR + if (ch && (ch != (UTF32)0x09) && (ch != (UTF32)0x0a) && (ch != (UTF32)0x0d) && (ch < (UTF32)0x20) ) { + ch = (UTF32)0x3f; + } + // TPN: filter out byte order marks and invalid character 0xFFFF + if((ch == (UTF32)0xFEFF) || (ch == (UTF32)0xFFFE)|| (ch == (UTF32)0xFFFF)) { + continue; + } + + /* Figure out how many bytes the result will require */ + if (ch < (UTF32)0x80) { bytesToWrite = 1; + } else if (ch < (UTF32)0x800) { bytesToWrite = 2; + } else if (ch < (UTF32)0x10000) { bytesToWrite = 3; + } else if (ch < (UTF32)0x110000) { bytesToWrite = 4; + } else { bytesToWrite = 3; + ch = UNI_REPLACEMENT_CHAR; + } + + target += bytesToWrite; + if (target > targetEnd) { + source = oldSource; /* Back up source pointer! */ + target -= bytesToWrite; result = targetExhausted; break; + } + switch (bytesToWrite) { /* note: everything falls through. */ + case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; + case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; + case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; + case 1: *--target = (UTF8)(ch | firstByteMark[bytesToWrite]); + } + target += bytesToWrite; + } + *sourceStart = source; + *targetStart = target; + return result; +} + +/* --------------------------------------------------------------------- */ + +/* + * Utility routine to tell whether a sequence of bytes is legal UTF-8. + * This must be called with the length pre-determined by the first byte. 
+ * If not calling this from ConvertUTF8to*, then the length can be set by: + * length = trailingBytesForUTF8[*source]+1; + * and the sequence is illegal right away if there aren't that many bytes + * available. + * If presented with a length > 4, this returns false. The Unicode + * definition of UTF-8 goes up to 4-byte sequences. + */ + +inline Boolean isLegalUTF8(const UTF8 *source, int length) { + UTF8 a; + const UTF8 *srcptr = source+length; + switch (length) { + default: return false; + /* Everything else falls through when "true"... */ + case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false; + case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false; + case 2: if ((a = (*--srcptr)) > 0xBF) return false; + + switch (*source) { + /* no fall-through in this inner switch */ + case 0xE0: if (a < 0xA0) return false; break; + case 0xED: if (a > 0x9F) return false; break; + case 0xF0: if (a < 0x90) return false; break; + case 0xF4: if (a > 0x8F) return false; break; + default: if (a < 0x80) return false; + } + + case 1: if (*source >= 0x80 && *source < 0xC2) return false; + } + if (*source > 0xF4) return false; + return true; +} + +/* --------------------------------------------------------------------- */ + +/* + * Exported function to return whether a UTF-8 sequence is legal or not. + * This is not used here; it's just exported. 
+ */ +Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd) { + int length = trailingBytesForUTF8[*source]+1; + if (source+length > sourceEnd) { + return false; + } + return isLegalUTF8(source, length); +} + +/* --------------------------------------------------------------------- */ + +ConversionResult ConvertUTF8toUTF16 ( + const UTF8** sourceStart, const UTF8* sourceEnd, + UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) { + ConversionResult result = conversionOK; + const UTF8* source = *sourceStart; + UTF16* target = *targetStart; + while (source < sourceEnd) { + UTF32 ch = 0; + unsigned short extraBytesToRead = trailingBytesForUTF8[*source]; + if (source + extraBytesToRead >= sourceEnd) { + result = sourceExhausted; break; + } + /* Do this check whether lenient or strict */ + if (! isLegalUTF8(source, extraBytesToRead+1)) { + result = sourceIllegal; + break; + } + /* + * The cases all fall through. See "Note A" below. + */ + switch (extraBytesToRead) { + case 5: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */ + case 4: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */ + case 3: ch += *source++; ch <<= 6; + case 2: ch += *source++; ch <<= 6; + case 1: ch += *source++; ch <<= 6; + case 0: ch += *source++; + } + ch -= offsetsFromUTF8[extraBytesToRead]; + + if (target >= targetEnd) { + source -= (extraBytesToRead+1); /* Back up source pointer! 
*/ + result = targetExhausted; break; + } + if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */ + /* UTF-16 surrogate values are illegal in UTF-32 */ + if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) { + if (flags == strictConversion) { + source -= (extraBytesToRead+1); /* return to the illegal value itself */ + result = sourceIllegal; + break; + } else { + *target++ = UNI_REPLACEMENT_CHAR; + } + } else { + *target++ = (UTF16)ch; /* normal case */ + } + } else if (ch > UNI_MAX_UTF16) { + if (flags == strictConversion) { + result = sourceIllegal; + source -= (extraBytesToRead+1); /* return to the start */ + break; /* Bail out; shouldn't continue */ + } else { + *target++ = UNI_REPLACEMENT_CHAR; + } + } else { + /* target is a character in range 0xFFFF - 0x10FFFF. */ + if (target + 1 >= targetEnd) { + source -= (extraBytesToRead+1); /* Back up source pointer! */ + result = targetExhausted; break; + } + ch -= halfBase; + *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START); + *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START); + } + } + *sourceStart = source; + *targetStart = target; + return result; +} + +/* --------------------------------------------------------------------- */ + +ConversionResult ConvertUTF32toUTF8 ( + const UTF32** sourceStart, const UTF32* sourceEnd, + UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) { + ConversionResult result = conversionOK; + const UTF32* source = *sourceStart; + UTF8* target = *targetStart; + while (source < sourceEnd) { + UTF32 ch; + unsigned short bytesToWrite = 0; + const UTF32 byteMask = 0xBF; + const UTF32 byteMark = 0x80; + ch = *source++; + if (flags == strictConversion ) { + /* UTF-16 surrogate values are illegal in UTF-32 */ + if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) { + --source; /* return to the illegal value itself */ + result = sourceIllegal; + break; + } + } + /* + * Figure out how many bytes the result will require. 
Turn any + * illegally large UTF32 things (> Plane 17) into replacement chars. + */ + if (ch < (UTF32)0x80) { bytesToWrite = 1; + } else if (ch < (UTF32)0x800) { bytesToWrite = 2; + } else if (ch < (UTF32)0x10000) { bytesToWrite = 3; + } else if (ch <= UNI_MAX_LEGAL_UTF32) { bytesToWrite = 4; + } else { bytesToWrite = 3; + ch = UNI_REPLACEMENT_CHAR; + result = sourceIllegal; + } + + target += bytesToWrite; + if (target > targetEnd) { + --source; /* Back up source pointer! */ + target -= bytesToWrite; result = targetExhausted; break; + } + switch (bytesToWrite) { /* note: everything falls through. */ + case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; + case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; + case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; + case 1: *--target = (UTF8) (ch | firstByteMark[bytesToWrite]); + } + target += bytesToWrite; + } + *sourceStart = source; + *targetStart = target; + return result; +} + +/* --------------------------------------------------------------------- */ + +ConversionResult ConvertUTF8toUTF32 ( + const UTF8** sourceStart, const UTF8* sourceEnd, + UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) { + ConversionResult result = conversionOK; + const UTF8* source = *sourceStart; + UTF32* target = *targetStart; + while (source < sourceEnd) { + UTF32 ch = 0; + unsigned short extraBytesToRead = trailingBytesForUTF8[*source]; + if (source + extraBytesToRead >= sourceEnd) { + result = sourceExhausted; break; + } + /* Do this check whether lenient or strict */ + if (! isLegalUTF8(source, extraBytesToRead+1)) { + result = sourceIllegal; + break; + } + /* + * The cases all fall through. See "Note A" below. 
+ */ + switch (extraBytesToRead) { + case 5: ch += *source++; ch <<= 6; + case 4: ch += *source++; ch <<= 6; + case 3: ch += *source++; ch <<= 6; + case 2: ch += *source++; ch <<= 6; + case 1: ch += *source++; ch <<= 6; + case 0: ch += *source++; + } + ch -= offsetsFromUTF8[extraBytesToRead]; + + if (target >= targetEnd) { + source -= (extraBytesToRead+1); /* Back up the source pointer! */ + result = targetExhausted; break; + } + if (ch <= UNI_MAX_LEGAL_UTF32) { + /* + * UTF-16 surrogate values are illegal in UTF-32, and anything + * over Plane 17 (> 0x10FFFF) is illegal. + */ + if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) { + if (flags == strictConversion) { + source -= (extraBytesToRead+1); /* return to the illegal value itself */ + result = sourceIllegal; + break; + } else { + *target++ = UNI_REPLACEMENT_CHAR; + } + } else { + *target++ = ch; + } + } else { /* i.e., ch > UNI_MAX_LEGAL_UTF32 */ + result = sourceIllegal; + *target++ = UNI_REPLACEMENT_CHAR; + } + } + *sourceStart = source; + *targetStart = target; + return result; +} + +/* --------------------------------------------------------------------- + + Note A. + The fall-through switches in UTF-8 reading code save a + temp variable, some decrements & conditionals. The switches + are equivalent to the following loop: + { + int tmpBytesToRead = extraBytesToRead+1; + do { + ch += *source++; + --tmpBytesToRead; + if (tmpBytesToRead) ch <<= 6; + } while (tmpBytesToRead > 0); + } + In UTF-8 writing code, the switches on "bytesToWrite" are + similarly unrolled loops. + + --------------------------------------------------------------------- */ diff --git a/ConvertUTF.h b/ConvertUTF.h new file mode 100644 index 0000000..e264915 --- /dev/null +++ b/ConvertUTF.h @@ -0,0 +1,149 @@ +/* + * Copyright 2001-2004 Unicode, Inc. + * + * Disclaimer + * + * This source code is provided as is by Unicode, Inc. No claims are + * made as to fitness for any particular purpose. 
No warranties of any + * kind are expressed or implied. The recipient agrees to determine + * applicability of information provided. If this file has been + * purchased on magnetic or optical media from Unicode, Inc., the + * sole remedy for any claim will be exchange of defective media + * within 90 days of receipt. + * + * Limitations on Rights to Redistribute This Code + * + * Unicode, Inc. hereby grants the right to freely use the information + * supplied in this file in the creation of products supporting the + * Unicode Standard, and to make copies of this file in any form + * for internal or external distribution as long as this notice + * remains attached. + */ + +/* --------------------------------------------------------------------- + + Conversions between UTF32, UTF-16, and UTF-8. Header file. + + Several functions are included here, forming a complete set of + conversions between the three formats. UTF-7 is not included + here, but is handled in a separate source file. + + Each of these routines takes pointers to input buffers and output + buffers. The input buffers are const. + + Each routine converts the text between *sourceStart and sourceEnd, + putting the result into the buffer between *targetStart and + targetEnd. Note: the end pointers are *after* the last item: e.g. + *(sourceEnd - 1) is the last item. + + The return result indicates whether the conversion was successful, + and if not, whether the problem was in the source or target buffers. + (Only the first encountered problem is indicated.) + + After the conversion, *sourceStart and *targetStart are both + updated to point to the end of last text successfully converted in + the respective buffers. + + Input parameters: + sourceStart - pointer to a pointer to the source buffer. + The contents of this are modified on return so that + it points at the next thing to be converted. + targetStart - similarly, pointer to pointer to the target buffer. 
+ sourceEnd, targetEnd - respectively pointers to the ends of the + two buffers, for overflow checking only. + + These conversion functions take a ConversionFlags argument. When this + flag is set to strict, both irregular sequences and isolated surrogates + will cause an error. When the flag is set to lenient, both irregular + sequences and isolated surrogates are converted. + + Whether the flag is strict or lenient, all illegal sequences will cause + an error return. This includes sequences such as: <F4 90 80 80>, <C0 80>, + or <A0> in UTF-8, and values above 0x10FFFF in UTF-32. Conformant code + must check for illegal sequences. + + When the flag is set to lenient, characters over 0x10FFFF are converted + to the replacement character; otherwise (when the flag is set to strict) + they constitute an error. + + Output parameters: + The value "sourceIllegal" is returned from some routines if the input + sequence is malformed. When "sourceIllegal" is returned, the source + value will point to the illegal value that caused the problem. E.g., + in UTF-8 when a sequence is malformed, it points to the start of the + malformed sequence. + + Author: Mark E. Davis, 1994. + Rev History: Rick McGowan, fixes & updates May 2001. + Fixes & updates, Sept 2001. + +------------------------------------------------------------------------ */ + +/* --------------------------------------------------------------------- + The following 4 definitions are compiler-specific. + The C standard does not guarantee that wchar_t has at least + 16 bits, so wchar_t is no less portable than unsigned short! + All should be unsigned values to avoid sign extension during + bit mask & shift operations. 
+------------------------------------------------------------------------ */ + +typedef unsigned long UTF32; /* at least 32 bits */ +typedef unsigned short UTF16; /* at least 16 bits */ +typedef unsigned char UTF8; /* typically 8 bits */ +typedef unsigned char Boolean; /* 0 or 1 */ + +/* Some fundamental constants */ +#define UNI_REPLACEMENT_CHAR (UTF32)0x0000FFFD +#define UNI_MAX_BMP (UTF32)0x0000FFFF +#define UNI_MAX_UTF16 (UTF32)0x0010FFFF +#define UNI_MAX_UTF32 (UTF32)0x7FFFFFFF +#define UNI_MAX_LEGAL_UTF32 (UTF32)0x0010FFFF + +typedef enum { + conversionOK, /* conversion successful */ + sourceExhausted, /* partial character in source, but hit end */ + targetExhausted, /* insuff. room in target for conversion */ + sourceIllegal /* source sequence is illegal/malformed */ +} ConversionResult; + +typedef enum { + strictConversion = 0, + lenientConversion +} ConversionFlags; + +/* This is for C++ and does no harm in C */ +#ifdef __cplusplus +extern "C" { +#endif + +ConversionResult ConvertUTF8toUTF16 ( + const UTF8** sourceStart, const UTF8* sourceEnd, + UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags); + +ConversionResult ConvertUTF16toUTF8 ( + const UTF16** sourceStart, const UTF16* sourceEnd, + UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags); + +ConversionResult ConvertUTF8toUTF32 ( + const UTF8** sourceStart, const UTF8* sourceEnd, + UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags); + +ConversionResult ConvertUTF32toUTF8 ( + const UTF32** sourceStart, const UTF32* sourceEnd, + UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags); + +ConversionResult ConvertUTF16toUTF32 ( + const UTF16** sourceStart, const UTF16* sourceEnd, + UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags); + +ConversionResult ConvertUTF32toUTF16 ( + const UTF32** sourceStart, const UTF32* sourceEnd, + UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags); + +Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd); 
+ +#ifdef __cplusplus +} +#endif + +/* --------------------------------------------------------------------- */ diff --git a/ExpectedOutput.txt b/ExpectedOutput.txt new file mode 100644 index 0000000..e09d844 --- /dev/null +++ b/ExpectedOutput.txt @@ -0,0 +1,21 @@ +Three tests of round-trip conversions will be performed. +One test of illegal UTF-32 will be peroformed. +Two illegal result messages are expected; one in test 02A; one in test 03A. +These are for tests of Surrogate conversion. + +Begin Test01 +******** Test01 succeeded without error. ******** + +Begin Test02 +Test02A for 55296, input 0000d800, output 0000,0000, result 3 +!!! Test02A: note expected illegal result for 0x0000D800 +******** Test02 succeeded without error. ******** + +Begin Test03 +sourceIllegal Test03A for 55296 (0xd800); output ; result 3 +!!! Test03A: note expected illegal result for 0x0000D800 +******** Test03 succeeded without error. ******** + +Begin Test04 +******** Test04 succeeded without error. ******** + diff --git a/harness.c b/harness.c new file mode 100644 index 0000000..c3b0b3a --- /dev/null +++ b/harness.c @@ -0,0 +1,446 @@ +/* + * Copyright 2001-2004 Unicode, Inc. + * + * Disclaimer + * + * This source code is provided as is by Unicode, Inc. No claims are + * made as to fitness for any particular purpose. No warranties of any + * kind are expressed or implied. The recipient agrees to determine + * applicability of information provided. If this file has been + * purchased on magnetic or optical media from Unicode, Inc., the + * sole remedy for any claim will be exchange of defective media + * within 90 days of receipt. + * + * Limitations on Rights to Redistribute This Code + * + * Unicode, Inc. hereby grants the right to freely use the information + * supplied in this file in the creation of products supporting the + * Unicode Standard, and to make copies of this file in any form + * for internal or external distribution as long as this notice + * remains attached. 
+ * + * harness.c + * + * This is a test harness for "ConvertUTF.c". Compile this + * and run without arguments. It will exhaustively test + * the conversion routines, and print a few lines of diagnostic + * output. You don't need to compile ConvertUTF.c itself, + * since it gets #included here along with the header. + * Example of a compile line: + * + * $ gcc -g harness.c -o harness + * + * Rev History: Rick McGowan, new file April 2001. + * Sept 19, 2002: Corrected error on line 234: utf16_buf[2] becomes utf16_result[2] + * per report from Iain Murray. + * July 3, 2003: Updated printout message. + * Oct 19, 2004: Updated isLegalUTF8 test data and corrected switch statements to catch + * illegal surrogate use in UTF-8, per report from Frank Tang. + * + */ + +#define CVTUTF_DEBUG 1 + +#include <stdio.h> +#include "ConvertUTF.c" + +/* --------------------------------------------------------------------- + test01 - Spot check a few legal & illegal UTF-8 values only. + This is not an exhaustive test, just a brief one that was + used to develop the "isLegalUTF8" routine. + + Legal UTF-8 sequences are: + + 1st---- 2nd---- 3rd---- 4th---- Codepoints--- + + 00-7F 0000- 007F + C2-DF 80-BF 0080- 07FF + E0 A0-BF 80-BF 0800- 0FFF + E1-EC 80-BF 80-BF 1000- CFFF + ED 80-9F 80-BF D000- D7FF + EE-EF 80-BF 80-BF E000- FFFF + F0 90-BF 80-BF 80-BF 10000- 3FFFF + F1-F3 80-BF 80-BF 80-BF 40000- FFFFF + F4 80-8F 80-BF 80-BF 100000-10FFFF + + --------------------------------------------------------------------- */ + + +struct utf8_test { + Boolean utf8_legal; /* is legal sequence? 
*/ + int utf8_len; /* length of sequence */ + unsigned char utf8_seq[5]; /* the sequence */ +}; + +struct utf8_test utf8_testData[] = { + { 1, 1, { 0x7A, 0x00, 0x00, 0x00, 0x00 }}, /* 0 */ + { 1, 2, { 0xC2, 0xAC, 0x00, 0x00, 0x00 }}, /* 1 */ + { 1, 2, { 0xDF, 0xB2, 0x00, 0x00, 0x00 }}, /* 2 */ + { 1, 3, { 0xE0, 0xA1, 0x81, 0x00, 0x00 }}, /* 3 */ + { 1, 3, { 0xE1, 0xAC, 0x90, 0x00, 0x00 }}, /* 4 */ + { 1, 3, { 0xF0, 0x93, 0xB2, 0xA1, 0x00 }}, /* 5 */ + { 1, 4, { 0xF1, 0x87, 0x9A, 0xB0, 0x00 }}, /* 6 */ + { 1, 4, { 0xF3, 0x88, 0x9B, 0xAD, 0x00 }}, /* 7 */ + { 1, 4, { 0xF4, 0x82, 0x89, 0x8F, 0x00 }}, /* 8 */ + + { 0, 3, { 0x82, 0x00, 0x00, 0x00, 0x00 }}, /* 9 */ + { 0, 2, { 0xF8, 0xAC, 0x00, 0x00, 0x00 }}, /* 10 */ + { 0, 2, { 0xE1, 0xFC, 0xFF, 0x00, 0x00 }}, /* 11 */ + { 0, 3, { 0xC2, 0xFC, 0x00, 0x00, 0x00 }}, /* 12 */ + { 0, 3, { 0xE1, 0xC2, 0x81, 0x00, 0x00 }}, /* 13 */ + { 0, 2, { 0xC2, 0xC1, 0x00, 0x00, 0x00 }}, /* 14 */ + { 0, 2, { 0xC0, 0xAF, 0x00, 0x00, 0x00 }}, /* 15 */ + { 0, 3, { 0xE0, 0x9F, 0x80, 0x00, 0x00 }}, /* 16 */ + { 0, 4, { 0xF0, 0x93, 0xB2, 0xC1, 0x00 }}, /* 17 */ + + { 1, 3, { 0xED, 0x9F, 0xBF, 0x00, 0x00 }}, /* 18 */ + { 1, 3, { 0xEE, 0x80, 0x80, 0x00, 0x00 }}, /* 19 */ + { 0, 3, { 0xED, 0xA0, 0x80, 0x00, 0x00 }}, /* 20 */ + { 0, 3, { 0xED, 0xBF, 0xBF, 0x00, 0x00 }}, /* 21 */ + +/* for all > 21 use "short" buffer lengths to detect over-run */ + { 0, 4, { 0xF0, 0x93, 0xB2, 0xC3, 0x00 }}, /* 22 use short buflen */ + { 0, 0, { 0x00, 0x00, 0x00, 0x00, 0x00 }}, + +}; + +int test01() { + int i; + int rval, wantVal1, wantVal2, gotVal1, gotVal2, len2; + + printf("Begin Test01\n"); fflush(stdout); + + rval = 0; + for (i = 0; utf8_testData[i].utf8_len; i++) { + wantVal1 = wantVal2 = utf8_testData[i].utf8_legal; + gotVal1 = isLegalUTF8(&(utf8_testData[i].utf8_seq[0]), utf8_testData[i].utf8_len); + /* use truncated length for tests over 21 */ + if (i <= 21) { len2 = 4; } else { len2 = utf8_testData[i].utf8_len-1; wantVal2 = 0; } + gotVal2 = 
isLegalUTF8Sequence(&(utf8_testData[i].utf8_seq[0]), &(utf8_testData[i].utf8_seq[0])+len2); + if ((gotVal1 != wantVal1) || (gotVal2 != wantVal2)) { + printf("Test01 error: seq %d is %d & %d (should be %d & %d) for bytes (%x,%x,%x,%x,%x,) & len %d\n", + i, gotVal1, gotVal2, wantVal1, wantVal2, utf8_testData[i].utf8_seq[0], + utf8_testData[i].utf8_seq[1], utf8_testData[i].utf8_seq[2], + utf8_testData[i].utf8_seq[3], utf8_testData[i].utf8_seq[4], + utf8_testData[i].utf8_len); + ++rval; + } + } + + return (rval ? 0 : 1); +} + + +/* --------------------------------------------------------------------- + test02 - Test round trip UTF32 -> UTF16 -> UTF8 -> UTF16 -> UTF32 + + This is an exhaustive test of values 0 through 0x10FFFF. It + takes each integer value and converts from UTF-32 through the + other encoding forms, and back to UTF-32, checking the results + along the way. + + It does not check the un-paired surrogates, except for + the first high surrogate (0xD800). It intends to get that one illegal + result, prints a message, and continues with tests. 
+ + --------------------------------------------------------------------- */ + +int test02() { + int i, n; + ConversionResult result; + UTF32 utf32_buf[2], utf32_result[2]; + UTF16 utf16_buf[3], utf16_result[3]; + UTF8 utf8_buf[8]; + UTF32 *utf32SourceStart, *utf32TargetStart; + UTF16 *utf16SourceStart, *utf16TargetStart; + UTF8 *utf8SourceStart, *utf8TargetStart; + + printf("Begin Test02\n"); fflush(stdout); + + for (i = 0; i <= 0x10FFFF; i++) { + utf32_buf[0] = i; utf32_buf[1] = 0; + utf32_result[0] = utf32_result[1] = 0; + utf16_buf[0] = utf16_buf[1] = utf16_buf[2] = 0; + utf16_result[0] = utf16_result[1] = utf16_result[2] = 0; + for (n = 0; n < 8; n++) utf8_buf[n] = 0; + + utf32SourceStart = utf32_buf; utf32TargetStart = utf32_result; + utf16TargetStart = utf16SourceStart = utf16_buf; + utf8TargetStart = utf8SourceStart = utf8_buf; + + /* + * Test UTF32 -> UTF16 + */ + result = ConvertUTF32toUTF16((const UTF32 **) &utf32SourceStart, &(utf32_buf[1]), &utf16TargetStart, &(utf16_buf[2]), strictConversion); + if (i < UNI_SUR_HIGH_START || i > UNI_SUR_LOW_END) { + /* skip result checking for all but 0000d800, which we know to be illegal */ + switch (result) { + default: fprintf(stderr, "Test02A fatal error: result %d for input %08x\n", result, utf32_buf[0]); exit(1); + case conversionOK: break; + case sourceExhausted: printf("sourceExhausted\t"); break; + case targetExhausted: printf("targetExhausted\t"); break; + case sourceIllegal: printf("sourceIllegal\t"); break; + } + } + if (result != conversionOK) { + if (i <= UNI_SUR_HIGH_START || i > UNI_SUR_LOW_END) { + printf("Test02A for %d, input %08x, output %04x,%04x, result %d\n", + i, utf32_buf[0], utf16_buf[0], utf16_buf[1], result); + if ((i != UNI_SUR_HIGH_START) || (result != sourceIllegal)) { + return 0; + } else { + printf("!!! 
Test02A: note expected illegal result for 0x0000D800\n"); + } + } + } + if (i > UNI_SUR_HIGH_START && i <= UNI_SUR_LOW_END) continue; + + /* + * Test UTF16 -> UTF8, with legality check on. We check for everything except + * for unpaired low surrogates. We do make one check that the lowest low + * surrogate, when unpaired, is illegal. + */ + result = ConvertUTF16toUTF8((const UTF16 **) &utf16SourceStart, &(utf16_buf[2]), &utf8TargetStart, &(utf8_buf[7]), strictConversion); + switch (result) { + default: fprintf(stderr, "Test02B fatal error: result %d for input %08x\n", result, utf32_buf[0]); exit(1); + case conversionOK: break; + case sourceExhausted: printf("sourceExhausted\t"); break; + case targetExhausted: printf("targetExhausted\t"); break; + case sourceIllegal: printf("sourceIllegal\t"); break; + } + if (result != conversionOK) { + printf("Test02B for %d (0x%x), input %04x,%04x; output %s; result %d\n", + i, utf32_buf[0], utf16_buf[0], utf16_buf[1], utf8_buf, result); + if ((i != UNI_SUR_LOW_START) && (i != UNI_SUR_HIGH_START)) { + return 0; + } else { + /* Note: This illegal result only happens if we remove the surrogate + check in Test02A. So it shouldn't be seen unless that check and + the "continue" are removed in the test above. + */ + if (i == UNI_SUR_LOW_START) + printf("!!! Test02B: note expected illegal result for 0xDC00,0000\n"); + else if (i == UNI_SUR_HIGH_START) + printf("!!! Test02B: note expected illegal result for 0xD800,0000\n"); + } + } + if ((i == UNI_SUR_LOW_START) && result != sourceIllegal) { + printf("Test02B for %d (0x%x), input %04x,%04x; output %s; result %d\n", + i, utf32_buf[0], utf16_buf[0], utf16_buf[1], utf8_buf, result); + printf("Test02B: expected illegal result for 0xDC00,0000 was not flagged illegal.\n"); + return 0; + } + + if ((i >= UNI_SUR_HIGH_START) & (i <= UNI_SUR_LOW_END)) continue; + + /* + * Reset some result buffer pointers for the trip back. 
+ */ + utf32SourceStart = utf32_buf; utf32TargetStart = utf32_result; + utf16TargetStart = utf16SourceStart = utf16_result; + utf8TargetStart = utf8SourceStart = utf8_buf; + + /* + * Test UTF8 -> UTF16, with legality check on. + */ + result = ConvertUTF8toUTF16((const UTF8 **) &utf8SourceStart, &(utf8_buf[trailingBytesForUTF8[utf8_buf[0]]+1]), &utf16TargetStart, &(utf16_result[2]), strictConversion); + switch (result) { + default: fprintf(stderr, "Test02C fatal error: result %d for input %08x\n", result, utf32_buf[0]); exit(1); + case conversionOK: break; + case sourceExhausted: printf("sourceExhausted\t"); break; + case targetExhausted: printf("targetExhausted\t"); break; + case sourceIllegal: printf("sourceIllegal\t"); break; + } + if (result != conversionOK) { + printf("Test02C for %d (0x%x), input %s; output %04x,%04x; result %d\n", + i, utf32_buf[0], utf8_buf, utf16_buf[0], utf16_buf[1], result); + return 0; + } + for (n = 0; n < 3; n++) { /* check that the utf16 result is the same as what went in. */ + if (utf16_buf[n] != utf16_result[n]) { + printf("Test02C error: input = 0x%08x; utf16_buf = 0x%04x,0x%04x; utf16_result = 0x%04x,0x%04x\n", + utf32_buf[0], utf16_buf[0], utf16_buf[1], utf16_result[0], utf16_result[1]); + return 0; + } + } + + /* + * Test UTF16 -> UTF32, with legality check on. If the result of our previous + * conversion gave us a "surrogate pair", then we need to convert 2 entities + * back to UTF32. 
+ */ + if (utf16_result[0] >= UNI_SUR_HIGH_START && utf16_result[0] <= UNI_SUR_HIGH_END) { + result = ConvertUTF16toUTF32((const UTF16 **) &utf16SourceStart, &(utf16_result[2]), &utf32TargetStart, &(utf32_result[1]), strictConversion); + } else { + result = ConvertUTF16toUTF32((const UTF16 **) &utf16SourceStart, &(utf16_result[1]), &utf32TargetStart, &(utf32_result[1]), strictConversion); + } + switch (result) { + default: fprintf(stderr, "Test02D fatal error: result %d for input %08x\n", result, utf32_buf[0]); exit(1); + case conversionOK: break; + case sourceExhausted: printf("sourceExhausted\t"); break; + case targetExhausted: printf("targetExhausted\t"); break; + case sourceIllegal: printf("sourceIllegal\t"); break; + } + if (result != conversionOK) { + printf("Test02D for %d (0x%x), input %04x,%04x; output %08x; result %d\n", + i, utf32_buf[0], utf16_buf[0], utf16_buf[1], utf32_result[0], result); + return 0; + } + + /* + * Now, check the final round-trip value. + */ + if (utf32_buf[0] != utf32_result[0]) { + printf("Test02E for %d: utf32 input %08x; trip output %08x (utf_16buf is %04x,%04x)\n", i, utf32_buf[0], utf32_result[0], utf16_buf[0], utf16_buf[1]); + return 0; + } + } + return 1; +} + +/* --------------------------------------------------------------------- + test03 - Test round trip UTF32 -> UTF8 -> UTF32 + + This tests the functions that were not tested by test02 above. + For each UTF32 value 0 through 0x10FFFF, it tests the conversion + to UTF-8 and back. The test is exhaustive. + + --------------------------------------------------------------------- */ + +int test03() { + int i, n; + ConversionResult result; + UTF32 utf32_buf[2], utf32_result[2]; + UTF8 utf8_buf[8]; + UTF32 *utf32SourceStart, *utf32TargetStart; + UTF8 *utf8SourceStart, *utf8TargetStart; + + printf("Begin Test03\n"); fflush(stdout); + + for (i = 0; i <= 0x10FFFF; i++) { + /* Skip all surrogates except UNI_SUR_HIGH_START, which we test for illegality. 
*/ + if (i > UNI_SUR_HIGH_START && i <= UNI_SUR_LOW_END) continue; + + utf32_buf[0] = i; utf32_buf[1] = 0; + utf32_result[0] = utf32_result[1] = 0; + for (n = 0; n < 8; n++) utf8_buf[n] = 0; + + utf32SourceStart = utf32_buf; utf32TargetStart = utf32_result; + utf8TargetStart = utf8SourceStart = utf8_buf; + + /* + * Test UTF32 -> UTF8, with legality check on. + */ + result = ConvertUTF32toUTF8((const UTF32 **) &utf32SourceStart, &(utf32_buf[1]), & utf8TargetStart, &(utf8_buf[7]), strictConversion); + switch (result) { + default: fprintf(stderr, "Test03A fatal error: result %d for input %08x\n", result, utf32_buf[0]); exit(1); + case conversionOK: break; + case sourceExhausted: printf("sourceExhausted\t"); break; + case targetExhausted: printf("targetExhausted\t"); break; + case sourceIllegal: printf("sourceIllegal\t"); break; + } + if (result != conversionOK) { + printf("Test03A for %d (0x%x); output %s; result %d\n", + i, utf32_buf[0], utf8_buf, result); + if (i != UNI_SUR_HIGH_START) { + return 0; + } else { + printf("!!! Test03A: note expected illegal result for 0x0000D800\n"); + } + } + if ((i == UNI_SUR_HIGH_START) && result != sourceIllegal) { + printf("Test03A for %d (0x%x); output %s; result %d\n", + i, utf32_buf[0], utf8_buf, result); + printf("Test03A: expected illegal result for 0x0000D800 was not flagged illegal.\n"); + return 0; + } + + if ((i >= UNI_SUR_HIGH_START) & (i <= UNI_SUR_LOW_END)) continue; + + /* + * Reset some result buffer pointers for the trip back. + */ + utf32SourceStart = utf32_buf; utf32TargetStart = utf32_result; + utf8TargetStart = utf8SourceStart = utf8_buf; + + /* + * Test UTF8 -> UTF32, with legality check on. 
+ */ + result = ConvertUTF8toUTF32((const UTF8 **) &utf8SourceStart, &(utf8_buf[trailingBytesForUTF8[utf8_buf[0]]+1]), &utf32TargetStart, &(utf32_result[1]), strictConversion); + switch (result) { + default: fprintf(stderr, "Test03B fatal error: result %d for input %08x\n", result, utf32_buf[0]); exit(1); + case conversionOK: break; + case sourceExhausted: printf("sourceExhausted\t"); break; + case targetExhausted: printf("targetExhausted\t"); break; + case sourceIllegal: printf("sourceIllegal\t"); break; + } + if (result != conversionOK) { + printf("Test03B for %d (0x%x), input %s; output 0x%08x; result %d\n", + i, utf32_buf[0], utf8_buf, utf32_result[0], result); + return 0; + } + + /* + * Now, check the final round-trip value. + */ + if (utf32_buf[0] != utf32_result[0]) { + printf("Test03C for %d: utf32 input %08x; utf8 buf %s; trip output %08x\n", i, utf32_buf[0], utf8_buf, utf32_result[0]); + return 0; + } + } + return 1; +} + +/* --------------------------------------------------------------------- + test04 - Test an illegal UTF-32 value > 10FFFF conversion to UTF-8. + Expect it will be turned into UNI_REPLACEMENT_CHAR. + + --------------------------------------------------------------------- */ + +int test04() { + int i, n; + ConversionResult result; + UTF32 utf32_buf[2]; + UTF8 utf8_buf[8]; + UTF32 *utf32SourceStart, *utf32TargetStart; + UTF8 *utf8SourceStart, *utf8TargetStart; + + printf("Begin Test04\n"); fflush(stdout); + + i = 0x10FFFF + 21; /* an arbitrary value > legal */ + + utf32_buf[0] = i; utf32_buf[1] = 0; + for (n = 0; n < 8; n++) utf8_buf[n] = 0; + + utf32SourceStart = utf32_buf; + utf8TargetStart = utf8_buf; + + /* + * Test UTF32 -> UTF8, with legality check on. 
+ */ + result = ConvertUTF32toUTF8((const UTF32 **) &utf32SourceStart, &(utf32_buf[1]), & utf8TargetStart, &(utf8_buf[7]), strictConversion); + if (result != sourceIllegal) { + fprintf(stderr, "Test04A fatal error: result %d for input %08x\n", result, utf32_buf[0]); exit(1); + } + + return 1; +} + +/* --------------------------------------------------------------------- */ + +int main() { + printf("Three tests of round-trip conversions will be performed.\n"); + printf("One test of illegal UTF-32 will be peroformed.\n"); + printf("Two illegal result messages are expected; one in test 02A; one in test 03A.\n"); + printf("These are for tests of Surrogate conversion.\n\n"); + fflush(stdout); + if (test01()) { printf("******** Test01 succeeded without error. ********\n\n"); } + else { printf("-------- Test01 failed. --------\n\n"); } + if (test02()) { printf("******** Test02 succeeded without error. ********\n\n"); } + else { printf("-------- Test02 failed. --------\n\n"); } + if (test03()) { printf("******** Test03 succeeded without error. ********\n\n"); } + else { printf("-------- Test03 failed. --------\n\n"); } + if (test04()) { printf("******** Test04 succeeded without error. ********\n\n"); } + else { printf("-------- Test04 failed. --------\n\n"); } + return 0; +} diff --git a/readme.txt b/readme.txt new file mode 100644 index 0000000..78dcd9f --- /dev/null +++ b/readme.txt @@ -0,0 +1,44 @@ +NOTE: Code downloaded from http://www.unicode.org + +The accompanying C source code file "ConvertUTF.c" and the associated header +file "ConvertUTF.h" provide for conversion between various transformation +formats of Unicode characters. The following conversions are supported: + + UTF-32 to UTF-16 + UTF-32 to UTF-8 + UTF-16 to UTF-32 + UTF-16 to UTF-8 + UTF-8 to UTF-16 + UTF-8 to UTF-32 + +In addition, there is a test harness which runs various tests. + +The files "CVTUTF7.C" and "CVTUTF7.H" are for archival and historical purposes +only. 
They have not been updated to Unicode 3.0 or later and should be +considered obsolescent. "CVTUTF7.C" contains two functions that can convert +between UCS2 (i.e., the BMP characters only) and UTF-7. Surrogates are +not supported, the code has not been tested, and it should be considered +unsuitable for general purpose use. + +Please submit any bug reports about these programs here: + + http://www.unicode.org/unicode/reporting.html + +Version 1.0: initial version. + +Version 1.1: corrected some minor problems; added stricter checks. + +Version 1.2: corrected switch statements associated with "extraBytesToRead" + in 4 & 5 byte cases, in functions for conversion from UTF8. + Note: formally, the 4 & 5 byte cases are illegal in the latest + UTF8, but the table and this code have always catered for those + cases, since at one time they were legal. + +Version 1.3: Updated UTF-8 legality check; + updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions + Updated UTF-8 legality tests in harness.c + + +Last update: October 19, 2004 + + diff --git a/testunicode.sln b/testunicode.sln new file mode 100644 index 0000000..46aaad5 --- /dev/null +++ b/testunicode.sln @@ -0,0 +1,20 @@ + +Microsoft Visual Studio Solution File, Format Version 9.00 +# Visual Studio 2005 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "testunicode", "testunicode.vcproj", "{09995AAF-A9E7-442E-B40A-F1F94A9A4EC8}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|Win32 = Debug|Win32 + Release|Win32 = Release|Win32 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {09995AAF-A9E7-442E-B40A-F1F94A9A4EC8}.Debug|Win32.ActiveCfg = Debug|Win32 + {09995AAF-A9E7-442E-B40A-F1F94A9A4EC8}.Debug|Win32.Build.0 = Debug|Win32 + {09995AAF-A9E7-442E-B40A-F1F94A9A4EC8}.Release|Win32.ActiveCfg = Release|Win32 + {09995AAF-A9E7-442E-B40A-F1F94A9A4EC8}.Release|Win32.Build.0 = Release|Win32 + EndGlobalSection + GlobalSection(SolutionProperties) = 
preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal diff --git a/testunicode.vcproj b/testunicode.vcproj new file mode 100644 index 0000000..e3cd31c --- /dev/null +++ b/testunicode.vcproj @@ -0,0 +1,209 @@ +<?xml version="1.0" encoding="Windows-1252"?>
+<VisualStudioProject
+ ProjectType="Visual C++"
+ Version="8.00"
+ Name="testunicode"
+ ProjectGUID="{09995AAF-A9E7-442E-B40A-F1F94A9A4EC8}"
+ RootNamespace="testunicode"
+ Keyword="Win32Proj"
+ >
+ <Platforms>
+ <Platform
+ Name="Win32"
+ />
+ </Platforms>
+ <ToolFiles>
+ </ToolFiles>
+ <Configurations>
+ <Configuration
+ Name="Debug|Win32"
+ OutputDirectory="$(SolutionDir)$(ConfigurationName)"
+ IntermediateDirectory="$(ConfigurationName)"
+ ConfigurationType="1"
+ CharacterSet="0"
+ >
+ <Tool
+ Name="VCPreBuildEventTool"
+ />
+ <Tool
+ Name="VCCustomBuildTool"
+ />
+ <Tool
+ Name="VCXMLDataGeneratorTool"
+ />
+ <Tool
+ Name="VCWebServiceProxyGeneratorTool"
+ />
+ <Tool
+ Name="VCMIDLTool"
+ />
+ <Tool
+ Name="VCCLCompilerTool"
+
+ PreprocessorDefinitions="_WIN32_WINNT=0x0501;WINVER=0x0501;_WIN32_IE=0x0603;WIN32;_DEBUG;_CONSOLE"
+
+
+
+
+
+ Detect64BitPortabilityProblems="false"
+
+ />
+ <Tool
+ Name="VCManagedResourceCompilerTool"
+ />
+ <Tool
+ Name="VCResourceCompilerTool"
+ />
+ <Tool
+ Name="VCPreLinkEventTool"
+ />
+ <Tool
+ Name="VCLinkerTool"
+
+
+ SubSystem="1"
+ TargetMachine="1"
+ />
+ <Tool
+ Name="VCALinkTool"
+ />
+ <Tool
+ Name="VCManifestTool"
+ />
+ <Tool
+ Name="VCXDCMakeTool"
+ />
+ <Tool
+ Name="VCBscMakeTool"
+ />
+ <Tool
+ Name="VCFxCopTool"
+ />
+ <Tool
+ Name="VCAppVerifierTool"
+ />
+ <Tool
+ Name="VCWebDeploymentTool"
+ />
+ <Tool
+ Name="VCPostBuildEventTool"
+ />
+ </Configuration>
+ <Configuration
+ Name="Release|Win32"
+ OutputDirectory="$(SolutionDir)$(ConfigurationName)"
+ IntermediateDirectory="$(ConfigurationName)"
+ ConfigurationType="1"
+ CharacterSet="1"
+
+ >
+ <Tool
+ Name="VCPreBuildEventTool"
+ />
+ <Tool
+ Name="VCCustomBuildTool"
+ />
+ <Tool
+ Name="VCXMLDataGeneratorTool"
+ />
+ <Tool
+ Name="VCWebServiceProxyGeneratorTool"
+ />
+ <Tool
+ Name="VCMIDLTool"
+ />
+ <Tool
+ Name="VCCLCompilerTool"
+ PreprocessorDefinitions="_WIN32_WINNT=0x0501;WINVER=0x0501;_WIN32_IE=0x0603;WIN32;NDEBUG;_CONSOLE"
+
+
+
+ Detect64BitPortabilityProblems="false"
+
+ />
+ <Tool
+ Name="VCManagedResourceCompilerTool"
+ />
+ <Tool
+ Name="VCResourceCompilerTool"
+ />
+ <Tool
+ Name="VCPreLinkEventTool"
+ />
+ <Tool
+ Name="VCLinkerTool"
+
+
+ SubSystem="1"
+ OptimizeReferences="2"
+ EnableCOMDATFolding="2"
+ TargetMachine="1"
+ />
+ <Tool
+ Name="VCALinkTool"
+ />
+ <Tool
+ Name="VCManifestTool"
+ />
+ <Tool
+ Name="VCXDCMakeTool"
+ />
+ <Tool
+ Name="VCBscMakeTool"
+ />
+ <Tool
+ Name="VCFxCopTool"
+ />
+ <Tool
+ Name="VCAppVerifierTool"
+ />
+ <Tool
+ Name="VCWebDeploymentTool"
+ />
+ <Tool
+ Name="VCPostBuildEventTool"
+ />
+ </Configuration>
+ </Configurations>
+ <References>
+ </References>
+ <Files>
+ <Filter
+ Name="Source Files"
+ Filter="cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx"
+ UniqueIdentifier="{4FC737F1-C7A5-4376-A066-2A32D752A2FF}"
+ >
+ <File
+ RelativePath=".\harness.c"
+ >
+ </File>
+ </Filter>
+ <Filter
+ Name="Header Files"
+ Filter="h;hpp;hxx;hm;inl;inc;xsd"
+ UniqueIdentifier="{93995380-89BD-4b04-88EB-625FBE52EBFB}"
+ >
+ <File
+ RelativePath=".\ConvertUTF.h"
+ >
+ </File>
+ </Filter>
+ <Filter
+ Name="Resource Files"
+ Filter="rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav"
+ UniqueIdentifier="{67DA6AB6-F800-4c08-8B7A-83BB121AAD01}"
+ >
+ </Filter>
+ <File
+ RelativePath=".\ExpectedOutput.txt"
+ >
+ </File>
+ <File
+ RelativePath=".\readme.txt"
+ >
+ </File>
+ </Files>
+ <Globals>
+ </Globals>
+</VisualStudioProject>
diff --git a/testunicode.xcodeproj/project.pbxproj b/testunicode.xcodeproj/project.pbxproj new file mode 100644 index 0000000..941d573 --- /dev/null +++ b/testunicode.xcodeproj/project.pbxproj @@ -0,0 +1,188 @@ +// !$*UTF8*$! +{ + archiveVersion = 1; + classes = { + }; + objectVersion = 44; + objects = { + +/* Begin PBXBuildFile section */ + B045A37709F1CE3C0028F52B /* harness.c in Sources */ = {isa = PBXBuildFile; fileRef = B045A37609F1CE3C0028F52B /* harness.c */; }; + B045A38209F1CE9C0028F52B /* ExpectedOutput.txt in CopyFiles */ = {isa = PBXBuildFile; fileRef = B045A38009F1CE9C0028F52B /* ExpectedOutput.txt */; }; + B045A38309F1CE9C0028F52B /* readme.txt in CopyFiles */ = {isa = PBXBuildFile; fileRef = B045A38109F1CE9C0028F52B /* readme.txt */; }; +/* End PBXBuildFile section */ + +/* Begin PBXCopyFilesBuildPhase section */ + 8DD76F690486A84900D96B5E /* CopyFiles */ = { + isa = PBXCopyFilesBuildPhase; + buildActionMask = 8; + dstPath = /usr/share/man/man1/; + dstSubfolderSpec = 0; + files = ( + B045A38209F1CE9C0028F52B /* ExpectedOutput.txt in CopyFiles */, + B045A38309F1CE9C0028F52B /* readme.txt in CopyFiles */, + ); + runOnlyForDeploymentPostprocessing = 1; + }; +/* End PBXCopyFilesBuildPhase section */ + +/* Begin PBXFileReference section */ + 8DD76F6C0486A84900D96B5E /* testunicode */ = {isa = PBXFileReference; explicitFileType = "compiled.mach-o.executable"; includeInIndex = 0; path = testunicode; sourceTree = BUILT_PRODUCTS_DIR; }; + B045A37609F1CE3C0028F52B /* harness.c */ = {isa = PBXFileReference; fileEncoding = 30; lastKnownFileType = sourcecode.c.c; path = harness.c; sourceTree = SOURCE_ROOT; }; + B045A38009F1CE9C0028F52B /* ExpectedOutput.txt */ = {isa = PBXFileReference; fileEncoding = 30; lastKnownFileType = text; path = ExpectedOutput.txt; sourceTree = SOURCE_ROOT; }; + B045A38109F1CE9C0028F52B /* readme.txt */ = {isa = PBXFileReference; fileEncoding = 30; lastKnownFileType = text; path = readme.txt; sourceTree = SOURCE_ROOT; }; +/* End 
PBXFileReference section */ + +/* Begin PBXFrameworksBuildPhase section */ + 8DD76F660486A84900D96B5E /* Frameworks */ = { + isa = PBXFrameworksBuildPhase; + buildActionMask = 2147483647; + files = ( + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXFrameworksBuildPhase section */ + +/* Begin PBXGroup section */ + 08FB7794FE84155DC02AAC07 /* testunicode */ = { + isa = PBXGroup; + children = ( + B045A38009F1CE9C0028F52B /* ExpectedOutput.txt */, + B045A38109F1CE9C0028F52B /* readme.txt */, + B045A37609F1CE3C0028F52B /* harness.c */, + 1AB674ADFE9D54B511CA2CBB /* Products */, + ); + name = testunicode; + sourceTree = "<group>"; + }; + 1AB674ADFE9D54B511CA2CBB /* Products */ = { + isa = PBXGroup; + children = ( + 8DD76F6C0486A84900D96B5E /* testunicode */, + ); + name = Products; + sourceTree = "<group>"; + }; +/* End PBXGroup section */ + +/* Begin PBXNativeTarget section */ + 8DD76F620486A84900D96B5E /* testunicode */ = { + isa = PBXNativeTarget; + buildConfigurationList = 1DEB923108733DC60010E9CD /* Build configuration list for PBXNativeTarget "testunicode" */; + buildPhases = ( + 8DD76F640486A84900D96B5E /* Sources */, + 8DD76F660486A84900D96B5E /* Frameworks */, + 8DD76F690486A84900D96B5E /* CopyFiles */, + ); + buildRules = ( + ); + dependencies = ( + ); + name = testunicode; + productInstallPath = "$(HOME)/bin"; + productName = testunicode; + productReference = 8DD76F6C0486A84900D96B5E /* testunicode */; + productType = "com.apple.product-type.tool"; + }; +/* End PBXNativeTarget section */ + +/* Begin PBXProject section */ + 08FB7793FE84155DC02AAC07 /* Project object */ = { + isa = PBXProject; + buildConfigurationList = 1DEB923508733DC60010E9CD /* Build configuration list for PBXProject "testunicode" */; + compatibilityVersion = "Xcode 3.0"; + hasScannedForEncodings = 1; + mainGroup = 08FB7794FE84155DC02AAC07 /* testunicode */; + projectDirPath = ""; + projectRoot = ""; + targets = ( + 8DD76F620486A84900D96B5E /* testunicode */, + ); + }; +/* End 
PBXProject section */ + +/* Begin PBXSourcesBuildPhase section */ + 8DD76F640486A84900D96B5E /* Sources */ = { + isa = PBXSourcesBuildPhase; + buildActionMask = 2147483647; + files = ( + B045A37709F1CE3C0028F52B /* harness.c in Sources */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXSourcesBuildPhase section */ + +/* Begin XCBuildConfiguration section */ + 1DEB923208733DC60010E9CD /* Debug */ = { + isa = XCBuildConfiguration; + buildSettings = { + COPY_PHASE_STRIP = NO; + GCC_DYNAMIC_NO_PIC = NO; + GCC_ENABLE_FIX_AND_CONTINUE = YES; + GCC_MODEL_TUNING = G5; + GCC_OPTIMIZATION_LEVEL = 0; + INSTALL_PATH = "$(HOME)/bin"; + PRODUCT_NAME = testunicode; + ZERO_LINK = YES; + }; + name = Debug; + }; + 1DEB923308733DC60010E9CD /* Release */ = { + isa = XCBuildConfiguration; + buildSettings = { + ARCHS = ( + ppc, + i386, + ); + GCC_GENERATE_DEBUGGING_SYMBOLS = NO; + GCC_MODEL_TUNING = G5; + INSTALL_PATH = "$(HOME)/bin"; + PRODUCT_NAME = testunicode; + }; + name = Release; + }; + 1DEB923608733DC60010E9CD /* Debug */ = { + isa = XCBuildConfiguration; + buildSettings = { + GCC_WARN_ABOUT_RETURN_TYPE = YES; + GCC_WARN_UNUSED_VARIABLE = YES; + PREBINDING = NO; + + }; + name = Debug; + }; + 1DEB923708733DC60010E9CD /* Release */ = { + isa = XCBuildConfiguration; + buildSettings = { + GCC_WARN_ABOUT_RETURN_TYPE = YES; + GCC_WARN_UNUSED_VARIABLE = YES; + PREBINDING = NO; + + }; + name = Release; + }; +/* End XCBuildConfiguration section */ + +/* Begin XCConfigurationList section */ + 1DEB923108733DC60010E9CD /* Build configuration list for PBXNativeTarget "testunicode" */ = { + isa = XCConfigurationList; + buildConfigurations = ( + 1DEB923208733DC60010E9CD /* Debug */, + 1DEB923308733DC60010E9CD /* Release */, + ); + defaultConfigurationIsVisible = 0; + defaultConfigurationName = Release; + }; + 1DEB923508733DC60010E9CD /* Build configuration list for PBXProject "testunicode" */ = { + isa = XCConfigurationList; + buildConfigurations = ( + 1DEB923608733DC60010E9CD 
/* Debug */, + 1DEB923708733DC60010E9CD /* Release */, + ); + defaultConfigurationIsVisible = 0; + defaultConfigurationName = Release; + }; +/* End XCConfigurationList section */ + }; + rootObject = 08FB7793FE84155DC02AAC07 /* Project object */; +} |