Edinburgh Speech Tools 2.4-release
 
Loading...
Searching...
No Matches
charset.c
/*************************************************************************/
/*                                                                       */
/* Copyright (c) 1997-98 Richard Tobin, Language Technology Group, HCRC, */
/* University of Edinburgh.                                              */
/*                                                                       */
/* THE SOFTWARE IS PROVIDED ``AS IS'', WITHOUT WARRANTY OF ANY KIND,     */
/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF    */
/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.*/
/* IN NO EVENT SHALL THE AUTHOR OR THE UNIVERSITY OF EDINBURGH BE LIABLE */
/* FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF    */
/* CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION    */
/* WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.       */
/*                                                                       */
/*************************************************************************/
15#include <stdio.h>
16#include <stdlib.h>
17
18#ifdef FOR_LT
19
20#include "lt-memory.h"
21
22#define Malloc salloc
23
24#else
25
26#include "system.h"
27
28#endif
29
30#include "charset.h"
31#include "string16.h"
32
33int iso_to_unicode[8][256]; /* latin-2 ... latin-9 */
34int iso_max_val[8];
35char8 *unicode_to_iso[8];
36
/*
 * Source data for the conversion tables built by init_charset().
 *
 * Each row gives the code points for byte values 0xa0..0xff of one
 * encoding (entry k describes byte 0xa0+k); -1 marks a byte value with no
 * assigned character.  Byte values below 0xa0 are not listed because
 * init_charset() maps them to themselves.
 *
 * The rows were originally labelled "latin2" ... "latin9"; they follow the
 * order of the ISO-8859-2 ... ISO-8859-9 entries in the encoding name
 * tables, so they are labelled by ISO-8859 part number here.
 */

static const int latin_table[8][96] = {

/* ISO-8859-2 (originally labelled latin2) */
{
    0x00a0, 0x0104, 0x02d8, 0x0141, 0x00a4, 0x013d, 0x015a, 0x00a7,
    0x00a8, 0x0160, 0x015e, 0x0164, 0x0179, 0x00ad, 0x017d, 0x017b,
    0x00b0, 0x0105, 0x02db, 0x0142, 0x00b4, 0x013e, 0x015b, 0x02c7,
    0x00b8, 0x0161, 0x015f, 0x0165, 0x017a, 0x02dd, 0x017e, 0x017c,
    0x0154, 0x00c1, 0x00c2, 0x0102, 0x00c4, 0x0139, 0x0106, 0x00c7,
    0x010c, 0x00c9, 0x0118, 0x00cb, 0x011a, 0x00cd, 0x00ce, 0x010e,
    0x0110, 0x0143, 0x0147, 0x00d3, 0x00d4, 0x0150, 0x00d6, 0x00d7,
    0x0158, 0x016e, 0x00da, 0x0170, 0x00dc, 0x00dd, 0x0162, 0x00df,
    0x0155, 0x00e1, 0x00e2, 0x0103, 0x00e4, 0x013a, 0x0107, 0x00e7,
    0x010d, 0x00e9, 0x0119, 0x00eb, 0x011b, 0x00ed, 0x00ee, 0x010f,
    0x0111, 0x0144, 0x0148, 0x00f3, 0x00f4, 0x0151, 0x00f6, 0x00f7,
    0x0159, 0x016f, 0x00fa, 0x0171, 0x00fc, 0x00fd, 0x0163, 0x02d9,
},

/* ISO-8859-3 (originally labelled latin3) */
{
    0x00a0, 0x0126, 0x02d8, 0x00a3, 0x00a4,     -1, 0x0124, 0x00a7,
    0x00a8, 0x0130, 0x015e, 0x011e, 0x0134, 0x00ad,     -1, 0x017b,
    0x00b0, 0x0127, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x0125, 0x00b7,
    0x00b8, 0x0131, 0x015f, 0x011f, 0x0135, 0x00bd,     -1, 0x017c,
    0x00c0, 0x00c1, 0x00c2,     -1, 0x00c4, 0x010a, 0x0108, 0x00c7,
    0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf,
        -1, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x0120, 0x00d6, 0x00d7,
    0x011c, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x016c, 0x015c, 0x00df,
    0x00e0, 0x00e1, 0x00e2,     -1, 0x00e4, 0x010b, 0x0109, 0x00e7,
    0x00e8, 0x00e9, 0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef,
        -1, 0x00f1, 0x00f2, 0x00f3, 0x00f4, 0x0121, 0x00f6, 0x00f7,
    0x011d, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x016d, 0x015d, 0x02d9,
},

/* ISO-8859-4 (originally labelled latin4) */
{
    0x00a0, 0x0104, 0x0138, 0x0156, 0x00a4, 0x0128, 0x013b, 0x00a7,
    0x00a8, 0x0160, 0x0112, 0x0122, 0x0166, 0x00ad, 0x017d, 0x00af,
    0x00b0, 0x0105, 0x02db, 0x0157, 0x00b4, 0x0129, 0x013c, 0x02c7,
    0x00b8, 0x0161, 0x0113, 0x0123, 0x0167, 0x014a, 0x017e, 0x014b,
    0x0100, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x012e,
    0x010c, 0x00c9, 0x0118, 0x00cb, 0x0116, 0x00cd, 0x00ce, 0x012a,
    0x0110, 0x0145, 0x014c, 0x0136, 0x00d4, 0x00d5, 0x00d6, 0x00d7,
    0x00d8, 0x0172, 0x00da, 0x00db, 0x00dc, 0x0168, 0x016a, 0x00df,
    0x0101, 0x00e1, 0x00e2, 0x00e3, 0x00e4, 0x00e5, 0x00e6, 0x012f,
    0x010d, 0x00e9, 0x0119, 0x00eb, 0x0117, 0x00ed, 0x00ee, 0x012b,
    0x0111, 0x0146, 0x014d, 0x0137, 0x00f4, 0x00f5, 0x00f6, 0x00f7,
    0x00f8, 0x0173, 0x00fa, 0x00fb, 0x00fc, 0x0169, 0x016b, 0x02d9,
},

/* ISO-8859-5 (originally labelled latin5) */
{
    0x00a0, 0x0401, 0x0402, 0x0403, 0x0404, 0x0405, 0x0406, 0x0407,
    0x0408, 0x0409, 0x040a, 0x040b, 0x040c, 0x00ad, 0x040e, 0x040f,
    0x0410, 0x0411, 0x0412, 0x0413, 0x0414, 0x0415, 0x0416, 0x0417,
    0x0418, 0x0419, 0x041a, 0x041b, 0x041c, 0x041d, 0x041e, 0x041f,
    0x0420, 0x0421, 0x0422, 0x0423, 0x0424, 0x0425, 0x0426, 0x0427,
    0x0428, 0x0429, 0x042a, 0x042b, 0x042c, 0x042d, 0x042e, 0x042f,
    0x0430, 0x0431, 0x0432, 0x0433, 0x0434, 0x0435, 0x0436, 0x0437,
    0x0438, 0x0439, 0x043a, 0x043b, 0x043c, 0x043d, 0x043e, 0x043f,
    0x0440, 0x0441, 0x0442, 0x0443, 0x0444, 0x0445, 0x0446, 0x0447,
    0x0448, 0x0449, 0x044a, 0x044b, 0x044c, 0x044d, 0x044e, 0x044f,
    0x2116, 0x0451, 0x0452, 0x0453, 0x0454, 0x0455, 0x0456, 0x0457,
    0x0458, 0x0459, 0x045a, 0x045b, 0x045c, 0x00a7, 0x045e, 0x045f,
},

/* ISO-8859-6 (originally labelled latin6) */
{
    0x00a0,     -1,     -1,     -1, 0x00a4,     -1,     -1,     -1,
        -1,     -1,     -1,     -1, 0x060c, 0x00ad,     -1,     -1,
        -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,
        -1,     -1,     -1, 0x061b,     -1,     -1,     -1, 0x061f,
        -1, 0x0621, 0x0622, 0x0623, 0x0624, 0x0625, 0x0626, 0x0627,
    0x0628, 0x0629, 0x062a, 0x062b, 0x062c, 0x062d, 0x062e, 0x062f,
    0x0630, 0x0631, 0x0632, 0x0633, 0x0634, 0x0635, 0x0636, 0x0637,
    0x0638, 0x0639, 0x063a,     -1,     -1,     -1,     -1,     -1,
    0x0640, 0x0641, 0x0642, 0x0643, 0x0644, 0x0645, 0x0646, 0x0647,
    0x0648, 0x0649, 0x064a, 0x064b, 0x064c, 0x064d, 0x064e, 0x064f,
    0x0650, 0x0651, 0x0652,     -1,     -1,     -1,     -1,     -1,
        -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,
},

/* ISO-8859-7 (originally labelled latin7) */
{
    0x00a0, 0x02bd, 0x02bc, 0x00a3,     -1,     -1, 0x00a6, 0x00a7,
    0x00a8, 0x00a9,     -1, 0x00ab, 0x00ac, 0x00ad,     -1, 0x2015,
    0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x0384, 0x0385, 0x0386, 0x00b7,
    0x0388, 0x0389, 0x038a, 0x00bb, 0x038c, 0x00bd, 0x038e, 0x038f,
    0x0390, 0x0391, 0x0392, 0x0393, 0x0394, 0x0395, 0x0396, 0x0397,
    0x0398, 0x0399, 0x039a, 0x039b, 0x039c, 0x039d, 0x039e, 0x039f,
    0x03a0, 0x03a1,     -1, 0x03a3, 0x03a4, 0x03a5, 0x03a6, 0x03a7,
    0x03a8, 0x03a9, 0x03aa, 0x03ab, 0x03ac, 0x03ad, 0x03ae, 0x03af,
    0x03b0, 0x03b1, 0x03b2, 0x03b3, 0x03b4, 0x03b5, 0x03b6, 0x03b7,
    0x03b8, 0x03b9, 0x03ba, 0x03bb, 0x03bc, 0x03bd, 0x03be, 0x03bf,
    0x03c0, 0x03c1, 0x03c2, 0x03c3, 0x03c4, 0x03c5, 0x03c6, 0x03c7,
    0x03c8, 0x03c9, 0x03ca, 0x03cb, 0x03cc, 0x03cd, 0x03ce,     -1,
},

/* ISO-8859-8 (originally labelled latin8) */
{
    0x00a0,     -1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7,
    0x00a8, 0x00a9, 0x00d7, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x203e,
    0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7,
    0x00b8, 0x00b9, 0x00f7, 0x00bb, 0x00bc, 0x00bd, 0x00be,     -1,
        -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,
        -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,
        -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,
        -1,     -1,     -1,     -1,     -1,     -1,     -1, 0x2017,
    0x05d0, 0x05d1, 0x05d2, 0x05d3, 0x05d4, 0x05d5, 0x05d6, 0x05d7,
    0x05d8, 0x05d9, 0x05da, 0x05db, 0x05dc, 0x05dd, 0x05de, 0x05df,
    0x05e0, 0x05e1, 0x05e2, 0x05e3, 0x05e4, 0x05e5, 0x05e6, 0x05e7,
    0x05e8, 0x05e9, 0x05ea,     -1,     -1,     -1,     -1,     -1,
},

/* ISO-8859-9 (originally labelled latin9) */
{
    0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7,
    0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af,
    0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7,
    0x00b8, 0x00b9, 0x00ba, 0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf,
    0x00c0, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x00c7,
    0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf,
    0x011e, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x00d5, 0x00d6, 0x00d7,
    0x00d8, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x0130, 0x015e, 0x00df,
    0x00e0, 0x00e1, 0x00e2, 0x00e3, 0x00e4, 0x00e5, 0x00e6, 0x00e7,
    0x00e8, 0x00e9, 0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef,
    0x011f, 0x00f1, 0x00f2, 0x00f3, 0x00f4, 0x00f5, 0x00f6, 0x00f7,
    0x00f8, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x0131, 0x015f, 0x00ff,
}
};
169
170const char8 *CharacterEncodingName[CE_enum_count] = {
171 "unknown",
172 "unspecified-ascii-superset",
173
174 "UTF-8",
175 "ISO-646",
176
177 "ISO-8859-1",
178 "ISO-8859-2",
179 "ISO-8859-3",
180 "ISO-8859-4",
181 "ISO-8859-5",
182 "ISO-8859-6",
183 "ISO-8859-7",
184 "ISO-8859-8",
185 "ISO-8859-9",
186
187 "UTF-16",
188 "UTF-16",
189 "ISO-10646-UCS-2",
190 "ISO-10646-UCS-2",
191};
192
193const char8 *CharacterEncodingNameAndByteOrder[CE_enum_count] = {
194 "unknown",
195 "unspecified_ascii_superset",
196
197 "UTF-8",
198 "ISO-646",
199
200 "ISO-8859-1",
201 "ISO-8859-2",
202 "ISO-8859-3",
203 "ISO-8859-4",
204 "ISO-8859-5",
205 "ISO-8859-6",
206 "ISO-8859-7",
207 "ISO-8859-8",
208 "ISO-8859-9",
209
210 "UTF-16-B",
211 "UTF-16-L",
212 "ISO-10646-UCS-2-B",
213 "ISO-10646-UCS-2-L",
214};
215
216struct character_encoding_alias CharacterEncodingAlias[] = {
217 {"ASCII", CE_ISO_646},
218 {"ISO-Latin-1", CE_ISO_8859_1},
219 {"ISO-Latin-2", CE_ISO_8859_2},
220 {"ISO-Latin-3", CE_ISO_8859_3},
221 {"ISO-Latin-4", CE_ISO_8859_4},
222 {"ISO-Latin-5", CE_ISO_8859_5},
223 {"ISO-Latin-6", CE_ISO_8859_6},
224 {"ISO-Latin-7", CE_ISO_8859_7},
225 {"ISO-Latin-8", CE_ISO_8859_8},
226 {"UCS-2", CE_ISO_10646_UCS_2B},
227};
228const int CE_alias_count =
229 sizeof(CharacterEncodingAlias)/sizeof(CharacterEncodingAlias[0]);
230
231CharacterEncoding InternalCharacterEncoding;
232
233void init_charset(void)
234{
235 int i, j;
236
237 /* Determine internal encoding */
238
239#if CHAR_SIZE == 8
240 InternalCharacterEncoding = CE_unspecified_ascii_superset;
241#else
242 union {char b[2]; short s;} bytes;
243 bytes.s = 1;
244
245 InternalCharacterEncoding = (bytes.b[0] == 0) ? CE_UTF_16B : CE_UTF_16L;
246#endif
247
248 /* Make ISO-Latin-N tables */
249
250 for(i=0; i<8; i++)
251 {
252 int max = 0x9f;
253
254 for(j=0; j<0xa0; j++)
255 iso_to_unicode[i][j] = j;
256 for(j=0xa0; j<0x100; j++)
257 {
258 int code = latin_table[i][j-0xa0];
259 iso_to_unicode[i][j] = code;
260 if(code > max) max = code;
261 }
262
263 iso_max_val[i] = max;
264
265 if(!(unicode_to_iso[i] = Malloc(max+1)))
266 {
267 fprintf(stderr, "Malloc failed in charset initialisation\n");
268 exit(1);
269 }
270
271 for(j=0; j<0xa0; j++)
272 unicode_to_iso[i][j] = j;
273 for(j=0xa0; j<=max; j++)
274 unicode_to_iso[i][j] = '?';
275 for(j=0xa0; j<0x100; j++)
276 {
277 int code = latin_table[i][j-0xa0];
278 if(code != -1)
279 unicode_to_iso[i][code] = j;
280 }
281 }
282}
283
284/* Return true if the encoding has 8-bit input units and is the same
285 as ascii for characters <= 127 */
286
287int EncodingIsAsciiSuperset(CharacterEncoding enc)
288{
289 return enc >= CE_unspecified_ascii_superset && enc <= CE_ISO_8859_9;
290}
291
292/*
293 * Return true if enc1 and enc2 have the same size input units, and are
294 * the same for Unicode <= 127.
295 * If so, *enc3 is set to enc2 modified to have the same byte order as enc1.
296 */
297
298int EncodingsCompatible(CharacterEncoding enc1, CharacterEncoding enc2,
299 CharacterEncoding *enc3)
300{
301 if(EncodingIsAsciiSuperset(enc1))
302 {
303 if(EncodingIsAsciiSuperset(enc2))
304 {
305 *enc3 = enc2;
306 return 1;
307 }
308 return 0;
309 }
310
311 if(enc1 == CE_UTF_16B || enc1 == CE_ISO_10646_UCS_2B)
312 {
313 if(enc2 == CE_UTF_16B || enc2 == CE_UTF_16L)
314 *enc3 = CE_UTF_16B;
315 else if(enc2 == CE_ISO_10646_UCS_2B || enc2 == CE_ISO_10646_UCS_2L)
316 *enc3 = CE_ISO_10646_UCS_2B;
317 else
318 return 0;
319 return 1;
320 }
321
322 if(enc1 == CE_UTF_16L || enc1 == CE_ISO_10646_UCS_2L)
323 {
324 if(enc2 == CE_UTF_16B || enc2 == CE_UTF_16L)
325 *enc3 = CE_UTF_16L;
326 else if(enc2 == CE_ISO_10646_UCS_2B || enc2 == CE_ISO_10646_UCS_2L)
327 *enc3 = CE_ISO_10646_UCS_2L;
328 else
329 return 0;
330 return 1;
331 }
332
333 return 0;
334}
335
336CharacterEncoding FindEncoding(char8 *name)
337{
338 int i;
339
340 for(i=0; i<CE_enum_count; i++)
341 if(strcasecmp8(name, CharacterEncodingNameAndByteOrder[i]) == 0)
342 return (CharacterEncoding)i;
343
344 for(i=0; i<CE_enum_count; i++)
345 if(strcasecmp8(name, CharacterEncodingName[i]) == 0)
346 return (CharacterEncoding)i;
347
348 for(i=0; i<CE_alias_count; i++)
349 if(strcasecmp8(name, CharacterEncodingAlias[i].name) == 0)
350 return CharacterEncodingAlias[i].enc;
351
352 return CE_unknown;
353}
354