src/unicode.c


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174

/**
 * \file unicode.c
 *
 * This file contains general Unicode string manipulation functions.
 * It mainly consist of functions for converting between UCS-2 (used on
 * the devices) and UTF-8 (used by several applications).
 *
 * For a deeper understanding of Unicode encoding formats see the
 * Wikipedia entries for
 * <a href="http://en.wikipedia.org/wiki/UTF-16/UCS-2">UTF-16/UCS-2</a>
 * and <a href="http://en.wikipedia.org/wiki/UTF-8">UTF-8</a>.
 *
 * Copyright (C) 2005-2009 Linus Walleij <triad@df.lth.se>
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 *
 */

#include "config.h"

#include <stdlib.h>
#include <string.h>
#ifdef HAVE_ICONV
#include "iconv.h"
#else
#error "libmtp unicode.c needs fixing to work without iconv()!"
#endif
#include "libmtp.h"
#include "unicode.h"
#include "util.h"
#include "ptp.h"

/**
 * The size of the buffer (in characters) used for creating string copies.
 */
#define STRING_BUFFER_LENGTH 1024

/**
 * Gets the length (in characters, not bytes) of a unicode
 * UCS-2 string, eg a string which physically is 0x00 0x41 0x00 0x00
 * will return a value of 1.
 *
 * @param unicstr a UCS-2 Unicode string
 * @return the length of the string, in number of characters. If you
 *         want to know the length in bytes, multiply this by two and
 *         add two (for zero terminator).
 */
int ucs2_strlen(uint16_t const * const unicstr)
{
  int length;

  /* Unicode strings are terminated with 2 * 0x00 */
  for(length = 0; unicstr[length] != 0x0000U; length ++);
  return length;
}

/**
 * Converts a big-endian UTF-16 2-byte string
 * to a UTF-8 string. Actually just a UCS-2 internal conversion
 * routine that strips off the BOM if there is one.
 *
 * @param device a pointer to the current device.
 * @param unicstr the UTF-16 unicode string to convert
 * @return a UTF-8 string.
 */
char *utf16_to_utf8(LIBMTP_mtpdevice_t *device, const uint16_t *unicstr)
{
  PTPParams *params = (PTPParams *) device->params;
  char *stringp = (char *) unicstr;
  char loclstr[STRING_BUFFER_LENGTH*3+1]; // UTF-8 encoding is max 3 bytes per UCS2 char.
  char *locp = loclstr;
  size_t nconv;
  size_t convlen = (ucs2_strlen(unicstr)+1) * sizeof(uint16_t); // UCS-2 is 16 bit wide, include terminator
  size_t convmax = STRING_BUFFER_LENGTH*3;

  loclstr[0]='\0';
  /* Do the conversion.  */
  nconv = iconv(params->cd_ucs2_to_locale, &stringp, &convlen, &locp, &convmax);
  if (nconv == (size_t) -1) {
    // Return partial string anyway.
    *locp = '\0';
  }
  loclstr[STRING_BUFFER_LENGTH*3] = '\0';
  // Strip off any BOM, it's totally useless...
  if ((uint8_t) loclstr[0] == 0xEFU && (uint8_t) loclstr[1] == 0xBBU && (uint8_t) loclstr[2] == 0xBFU) {
    return strdup(loclstr+3);
  }
  return strdup(loclstr);
}

/**
 * Converts a UTF-8 string to a big-endian UTF-16 2-byte string
 * Actually just a UCS-2 internal conversion.
 *
 * @param device a pointer to the current device.
 * @param localstr the UTF-8 unicode string to convert
 * @return a UTF-16 string.
 */
uint16_t *utf8_to_utf16(LIBMTP_mtpdevice_t *device, const char *localstr)
{
  PTPParams *params = (PTPParams *) device->params;
  char *stringp = (char *) localstr; // cast away "const"
  char unicstr[(STRING_BUFFER_LENGTH+1)*2]; // UCS2 encoding is 2 bytes per UTF-8 char.
  char *unip = unicstr;
  size_t nconv = 0;
  size_t convlen = strlen(localstr)+1; // utf8 bytes, include terminator
  size_t convmax = STRING_BUFFER_LENGTH*2;

  unicstr[0]='\0';
  unicstr[1]='\0';

  /* Do the conversion.  */
  nconv = iconv(params->cd_locale_to_ucs2, &stringp, &convlen, &unip, &convmax);

  if (nconv == (size_t) -1) {
    // Return partial string anyway.
    unip[0] = '\0';
    unip[1] = '\0';
  }
  // make sure the string is null terminated
  unicstr[STRING_BUFFER_LENGTH*2] = '\0';
  unicstr[STRING_BUFFER_LENGTH*2+1] = '\0';

  // allocate the string to be returned
  // Note: can't use strdup since every other byte is a null byte
  int ret_len = ucs2_strlen((uint16_t*)unicstr)*sizeof(uint16_t)+2;
  uint16_t* ret = malloc(ret_len);
  memcpy(ret,unicstr,(size_t)ret_len);
  return ret;
}

/**
 * This helper function simply removes any consecutive chars
 * > 0x7F and replace then with an underscore. In UTF-8
 * consequtive chars > 0x7F represent one single character so
 * it has to be done like this (and it's elegant). It will only
 * shrink the string in size so no copying is needed.
 */
void strip_7bit_from_utf8(char *str)
{
  int i,j,k;
  i = 0;
  j = 0;
  k = strlen(str);
  while (i < k) {
    if ((uint8_t) str[i] > 0x7FU) {
      str[j] = '_';
      i++;
      // Skip over any consequtive > 0x7F chars.
      while((uint8_t) str[i] > 0x7FU) {
	i++;
      }
    } else {
      str[j] = str[i];
      i++;
    }
    j++;
  }
  // Terminate stripped string...
  str[j] = '\0';
}