summaryrefslogtreecommitdiff
path: root/XMPCore/src/com/adobe/xmp/impl/Latin1Converter.java
diff options
context:
space:
mode:
Diffstat (limited to 'XMPCore/src/com/adobe/xmp/impl/Latin1Converter.java')
-rw-r--r--XMPCore/src/com/adobe/xmp/impl/Latin1Converter.java197
1 files changed, 197 insertions, 0 deletions
diff --git a/XMPCore/src/com/adobe/xmp/impl/Latin1Converter.java b/XMPCore/src/com/adobe/xmp/impl/Latin1Converter.java
new file mode 100644
index 0000000..118d77d
--- /dev/null
+++ b/XMPCore/src/com/adobe/xmp/impl/Latin1Converter.java
@@ -0,0 +1,197 @@
+// =================================================================================================
+// ADOBE SYSTEMS INCORPORATED
+// Copyright 2006 Adobe Systems Incorporated
+// All Rights Reserved
+//
+// NOTICE: Adobe permits you to use, modify, and distribute this file in accordance with the terms
+// of the Adobe license agreement accompanying it.
+// =================================================================================================
+
+
+
+package com.adobe.xmp.impl;
+
+import java.io.UnsupportedEncodingException;
+
+
+/**
+ * @since 12.10.2006
+ */
+public class Latin1Converter
+{
+ /** */
+ private static final int STATE_START = 0;
+ /** */
+ private static final int STATE_UTF8CHAR = 11;
+
+
+ /**
+ * Private constructor
+ */
+ private Latin1Converter()
+ {
+ // EMPTY
+ }
+
+
+ /**
+ * A converter that processes a byte buffer containing a mix of UTF8 and Latin-1/Cp1252 chars.
+ * The result is a buffer where those chars have been converted to UTF-8;
+ * that means it contains only valid UTF-8 chars.
+ * <p>
+ * <em>Explanation of the processing:</em> First the encoding of the buffer is detected looking
+ * at the first four bytes (that works only if the buffer starts with an ASCII-char,
+ * like xmls &apos;&lt;&apos;). UTF-16/32 flavours do not require further proccessing.
+ * <p>
+ * In the case, UTF-8 is detected, it assumes wrong UTF8 chars to be a sequence of
+ * Latin-1/Cp1252 encoded bytes and converts the chars to their corresponding UTF-8 byte
+ * sequence.
+ * <p>
+ * The 0x80..0x9F range is undefined in Latin-1, but is defined in Windows code
+ * page 1252. The bytes 0x81, 0x8D, 0x8F, 0x90, and 0x9D are formally undefined
+ * by Windows 1252. These are in XML's RestrictedChar set, so we map them to a
+ * space.
+ * <p>
+ * The official Latin-1 characters in the range 0xA0..0xFF are converted into
+ * the Unicode Latin Supplement range U+00A0 - U+00FF.
+ * <p>
+ * <em>Example:</em> If an Euro-symbol (€) appears in the byte buffer (0xE2, 0x82, 0xAC),
+ * it will be left as is. But if only the first two bytes are appearing,
+ * followed by an ASCII char a (0xE2 - 0x82 - 0x41), it will be converted to
+ * 0xC3, 0xA2 (â) - 0xE2, 0x80, 0x9A (‚) - 0x41 (a).
+ *
+ * @param buffer a byte buffer contain
+ * @return Returns a new buffer containing valid UTF-8
+ */
+ public static ByteBuffer convert(ByteBuffer buffer)
+ {
+ if ("UTF-8".equals(buffer.getEncoding()))
+ {
+ // the buffer containing one UTF-8 char (up to 8 bytes)
+ byte[] readAheadBuffer = new byte[8];
+ // the number of bytes read ahead.
+ int readAhead = 0;
+ // expected UTF8 bytesto come
+ int expectedBytes = 0;
+ // output buffer with estimated length
+ ByteBuffer out = new ByteBuffer(buffer.length() * 4 / 3);
+
+ int state = STATE_START;
+ for (int i = 0; i < buffer.length(); i++)
+ {
+ int b = buffer.charAt(i);
+
+ switch (state)
+ {
+ default:
+ case STATE_START:
+ if (b < 0x7F)
+ {
+ out.append((byte) b);
+ }
+ else if (b >= 0xC0)
+ {
+ // start of UTF8 sequence
+ expectedBytes = -1;
+ int test = b;
+ for (; expectedBytes < 8 && (test & 0x80) == 0x80; test = test << 1)
+ {
+ expectedBytes++;
+ }
+ readAheadBuffer[readAhead++] = (byte) b;
+ state = STATE_UTF8CHAR;
+ }
+ else // implicitly: b >= 0x80 && b < 0xC0
+ {
+ // invalid UTF8 start char, assume to be Latin-1
+ byte[] utf8 = convertToUTF8((byte) b);
+ out.append(utf8);
+ }
+ break;
+
+ case STATE_UTF8CHAR:
+ if (expectedBytes > 0 && (b & 0xC0) == 0x80)
+ {
+ // valid UTF8 char, add to readAheadBuffer
+ readAheadBuffer[readAhead++] = (byte) b;
+ expectedBytes--;
+
+ if (expectedBytes == 0)
+ {
+ out.append(readAheadBuffer, 0, readAhead);
+ readAhead = 0;
+
+ state = STATE_START;
+ }
+ }
+ else
+ {
+ // invalid UTF8 char:
+ // 1. convert first of seq to UTF8
+ byte[] utf8 = convertToUTF8(readAheadBuffer[0]);
+ out.append(utf8);
+
+ // 2. continue processing at second byte of sequence
+ i = i - readAhead;
+ readAhead = 0;
+
+ state = STATE_START;
+ }
+ break;
+ }
+ }
+
+ // loop ends with "half" Utf8 char --> assume that the bytes are Latin-1
+ if (state == STATE_UTF8CHAR)
+ {
+ for (int j = 0; j < readAhead; j++)
+ {
+ byte b = readAheadBuffer[j];
+ byte[] utf8 = convertToUTF8(b);
+ out.append(utf8);
+ }
+ }
+
+ return out;
+ }
+ else
+ {
+ // Latin-1 fixing applies only to UTF-8
+ return buffer;
+ }
+ }
+
+
+ /**
+ * Converts a Cp1252 char (contains all Latin-1 chars above 0x80) into a
+ * UTF-8 byte sequence. The bytes 0x81, 0x8D, 0x8F, 0x90, and 0x9D are
+ * formally undefined by Windows 1252 and therefore replaced by a space
+ * (0x20).
+ *
+ * @param ch
+ * an Cp1252 / Latin-1 byte
+ * @return Returns a byte array containing a UTF-8 byte sequence.
+ */
+ private static byte[] convertToUTF8(byte ch)
+ {
+ int c = ch & 0xFF;
+ try
+ {
+ if (c >= 0x80)
+ {
+ if (c == 0x81 || c == 0x8D || c == 0x8F || c == 0x90 || c == 0x9D)
+ {
+ return new byte[] { 0x20 }; // space for undefined
+ }
+
+ // interpret byte as Windows Cp1252 char
+ return new String(new byte[] { ch }, "cp1252").getBytes("UTF-8");
+ }
+ }
+ catch (UnsupportedEncodingException e)
+ {
+ // EMPTY
+ }
+ return new byte[] { ch };
+ }
+}