diff options
Diffstat (limited to 'XMPCore/src/com/adobe/xmp/impl/Latin1Converter.java')
-rw-r--r-- | XMPCore/src/com/adobe/xmp/impl/Latin1Converter.java | 197 |
1 files changed, 197 insertions, 0 deletions
diff --git a/XMPCore/src/com/adobe/xmp/impl/Latin1Converter.java b/XMPCore/src/com/adobe/xmp/impl/Latin1Converter.java new file mode 100644 index 0000000..118d77d --- /dev/null +++ b/XMPCore/src/com/adobe/xmp/impl/Latin1Converter.java @@ -0,0 +1,197 @@ +// ================================================================================================= +// ADOBE SYSTEMS INCORPORATED +// Copyright 2006 Adobe Systems Incorporated +// All Rights Reserved +// +// NOTICE: Adobe permits you to use, modify, and distribute this file in accordance with the terms +// of the Adobe license agreement accompanying it. +// ================================================================================================= + + + +package com.adobe.xmp.impl; + +import java.io.UnsupportedEncodingException; + + +/** + * @since 12.10.2006 + */ +public class Latin1Converter +{ + /** */ + private static final int STATE_START = 0; + /** */ + private static final int STATE_UTF8CHAR = 11; + + + /** + * Private constructor + */ + private Latin1Converter() + { + // EMPTY + } + + + /** + * A converter that processes a byte buffer containing a mix of UTF8 and Latin-1/Cp1252 chars. + * The result is a buffer where those chars have been converted to UTF-8; + * that means it contains only valid UTF-8 chars. + * <p> + * <em>Explanation of the processing:</em> First the encoding of the buffer is detected looking + * at the first four bytes (that works only if the buffer starts with an ASCII-char, + * like xmls '<'). UTF-16/32 flavours do not require further proccessing. + * <p> + * In the case, UTF-8 is detected, it assumes wrong UTF8 chars to be a sequence of + * Latin-1/Cp1252 encoded bytes and converts the chars to their corresponding UTF-8 byte + * sequence. + * <p> + * The 0x80..0x9F range is undefined in Latin-1, but is defined in Windows code + * page 1252. The bytes 0x81, 0x8D, 0x8F, 0x90, and 0x9D are formally undefined + * by Windows 1252. These are in XML's RestrictedChar set, so we map them to a + * space. + * <p> + * The official Latin-1 characters in the range 0xA0..0xFF are converted into + * the Unicode Latin Supplement range U+00A0 - U+00FF. + * <p> + * <em>Example:</em> If an Euro-symbol (€) appears in the byte buffer (0xE2, 0x82, 0xAC), + * it will be left as is. But if only the first two bytes are appearing, + * followed by an ASCII char a (0xE2 - 0x82 - 0x41), it will be converted to + * 0xC3, 0xA2 (â) - 0xE2, 0x80, 0x9A (‚) - 0x41 (a). + * + * @param buffer a byte buffer contain + * @return Returns a new buffer containing valid UTF-8 + */ + public static ByteBuffer convert(ByteBuffer buffer) + { + if ("UTF-8".equals(buffer.getEncoding())) + { + // the buffer containing one UTF-8 char (up to 8 bytes) + byte[] readAheadBuffer = new byte[8]; + // the number of bytes read ahead. + int readAhead = 0; + // expected UTF8 bytesto come + int expectedBytes = 0; + // output buffer with estimated length + ByteBuffer out = new ByteBuffer(buffer.length() * 4 / 3); + + int state = STATE_START; + for (int i = 0; i < buffer.length(); i++) + { + int b = buffer.charAt(i); + + switch (state) + { + default: + case STATE_START: + if (b < 0x7F) + { + out.append((byte) b); + } + else if (b >= 0xC0) + { + // start of UTF8 sequence + expectedBytes = -1; + int test = b; + for (; expectedBytes < 8 && (test & 0x80) == 0x80; test = test << 1) + { + expectedBytes++; + } + readAheadBuffer[readAhead++] = (byte) b; + state = STATE_UTF8CHAR; + } + else // implicitly: b >= 0x80 && b < 0xC0 + { + // invalid UTF8 start char, assume to be Latin-1 + byte[] utf8 = convertToUTF8((byte) b); + out.append(utf8); + } + break; + + case STATE_UTF8CHAR: + if (expectedBytes > 0 && (b & 0xC0) == 0x80) + { + // valid UTF8 char, add to readAheadBuffer + readAheadBuffer[readAhead++] = (byte) b; + expectedBytes--; + + if (expectedBytes == 0) + { + out.append(readAheadBuffer, 0, readAhead); + readAhead = 0; + + state = STATE_START; + } + } + else + { + // invalid UTF8 char: + // 1. convert first of seq to UTF8 + byte[] utf8 = convertToUTF8(readAheadBuffer[0]); + out.append(utf8); + + // 2. continue processing at second byte of sequence + i = i - readAhead; + readAhead = 0; + + state = STATE_START; + } + break; + } + } + + // loop ends with "half" Utf8 char --> assume that the bytes are Latin-1 + if (state == STATE_UTF8CHAR) + { + for (int j = 0; j < readAhead; j++) + { + byte b = readAheadBuffer[j]; + byte[] utf8 = convertToUTF8(b); + out.append(utf8); + } + } + + return out; + } + else + { + // Latin-1 fixing applies only to UTF-8 + return buffer; + } + } + + + /** + * Converts a Cp1252 char (contains all Latin-1 chars above 0x80) into a + * UTF-8 byte sequence. The bytes 0x81, 0x8D, 0x8F, 0x90, and 0x9D are + * formally undefined by Windows 1252 and therefore replaced by a space + * (0x20). + * + * @param ch + * an Cp1252 / Latin-1 byte + * @return Returns a byte array containing a UTF-8 byte sequence. + */ + private static byte[] convertToUTF8(byte ch) + { + int c = ch & 0xFF; + try + { + if (c >= 0x80) + { + if (c == 0x81 || c == 0x8D || c == 0x8F || c == 0x90 || c == 0x9D) + { + return new byte[] { 0x20 }; // space for undefined + } + + // interpret byte as Windows Cp1252 char + return new String(new byte[] { ch }, "cp1252").getBytes("UTF-8"); + } + } + catch (UnsupportedEncodingException e) + { + // EMPTY + } + return new byte[] { ch }; + } +} |