prism/encoding.h


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248

/**
 * @file encoding.h
 *
 * The encoding interface and implementations used by the parser.
 */
#ifndef PRISM_ENCODING_H
#define PRISM_ENCODING_H

#include "prism/defines.h"
#include "prism/util/pm_strncasecmp.h"

#include <assert.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

/**
 * This struct defines the functions necessary to implement the encoding
 * interface so we can determine how many bytes the subsequent character takes.
 * The width-returning callbacks (char_width, alpha_char, alnum_char) should
 * return the number of bytes, or 0 if the next bytes are invalid for the
 * encoding and type. The isupper_char callback returns a boolean instead.
 */
typedef struct {
    /**
     * Return the number of bytes that the next character takes if it is valid
     * in the encoding. Does not read more than n bytes. It is assumed that n is
     * at least 1.
     */
    size_t (*char_width)(const uint8_t *b, ptrdiff_t n);

    /**
     * Return the number of bytes that the next character takes if it is valid
     * in the encoding and is alphabetical. Does not read more than n bytes. It
     * is assumed that n is at least 1.
     */
    size_t (*alpha_char)(const uint8_t *b, ptrdiff_t n);

    /**
     * Return the number of bytes that the next character takes if it is valid
     * in the encoding and is alphanumeric. Does not read more than n bytes. It
     * is assumed that n is at least 1.
     */
    size_t (*alnum_char)(const uint8_t *b, ptrdiff_t n);

    /**
     * Return true if the next character is valid in the encoding and is an
     * uppercase character. Does not read more than n bytes. It is assumed that
     * n is at least 1.
     */
    bool (*isupper_char)(const uint8_t *b, ptrdiff_t n);

    /**
     * The name of the encoding. This should correspond to a value that can be
     * passed to Encoding.find in Ruby.
     */
    const char *name;

    /**
     * True if the encoding is a multibyte encoding, false otherwise.
     */
    bool multibyte;
} pm_encoding_t;

/**
 * All of the lookup tables use the first bit of each embedded byte to indicate
 * whether the codepoint is alphabetical.
 *
 * The replacement list is parenthesized so that the macro expands to a single
 * operand regardless of the operators surrounding it at the use site.
 */
#define PRISM_ENCODING_ALPHABETIC_BIT (1 << 0)

/**
 * All of the lookup tables use the second bit of each embedded byte to indicate
 * whether the codepoint is alphanumeric.
 *
 * The replacement list is parenthesized so that the macro expands to a single
 * operand regardless of the operators surrounding it at the use site.
 */
#define PRISM_ENCODING_ALPHANUMERIC_BIT (1 << 1)

/**
 * All of the lookup tables use the third bit of each embedded byte to indicate
 * whether the codepoint is uppercase.
 *
 * The replacement list is parenthesized so that the macro expands to a single
 * operand regardless of the operators surrounding it at the use site.
 */
#define PRISM_ENCODING_UPPERCASE_BIT (1 << 2)

/**
 * Return the size of the next character in the UTF-8 encoding if it is an
 * alphabetical character.
 *
 * @param b The bytes to read.
 * @param n The number of bytes that can be read.
 * @returns The number of bytes that the next character takes if it is valid in
 *     the encoding and is alphabetical, or 0 if it is not.
 */
size_t pm_encoding_utf_8_alpha_char(const uint8_t *b, ptrdiff_t n);

/**
 * Return the size of the next character in the UTF-8 encoding if it is an
 * alphanumeric character.
 *
 * @param b The bytes to read.
 * @param n The number of bytes that can be read.
 * @returns The number of bytes that the next character takes if it is valid in
 *     the encoding and is alphanumeric, or 0 if it is not.
 */
size_t pm_encoding_utf_8_alnum_char(const uint8_t *b, ptrdiff_t n);

/**
 * Return true if the next character in the UTF-8 encoding is an uppercase
 * character.
 *
 * @param b The bytes to read.
 * @param n The number of bytes that can be read.
 * @returns True if the next character is valid in the encoding and is an
 *     uppercase character, or false if it is not.
 */
bool pm_encoding_utf_8_isupper_char(const uint8_t *b, ptrdiff_t n);

/**
 * This lookup table is referenced in both the UTF-8 encoding file and the
 * parser directly in order to speed up the default encoding processing. It is
 * used to indicate whether a character is alphabetical, alphanumeric, or
 * uppercase in unicode mappings.
 *
 * The table has 256 entries. As with the other lookup tables, the
 * PRISM_ENCODING_ALPHABETIC_BIT, PRISM_ENCODING_ALPHANUMERIC_BIT, and
 * PRISM_ENCODING_UPPERCASE_BIT flags describe the bits stored in each entry.
 * NOTE(review): presumably indexed by a single byte value; confirm against the
 * definition of the table.
 */
extern const uint8_t pm_encoding_unicode_table[256];

/**
 * These are all of the encodings that prism supports.
 *
 * The enumerators are used as indices into the pm_encodings table (see
 * PM_ENCODING_UTF_8_ENTRY and PM_ENCODING_US_ASCII_ENTRY), and
 * PM_ENCODING_MAXIMUM is used as the length of that table.
 * NOTE(review): presumably the order here must match the order of the entries
 * in the definition of pm_encodings; confirm before inserting or reordering
 * enumerators.
 */
typedef enum {
    /* Explicitly 0, so UTF-8 is the first entry of the table. */
    PM_ENCODING_UTF_8 = 0,
    PM_ENCODING_ASCII_8BIT,
    PM_ENCODING_BIG5,
    PM_ENCODING_BIG5_HKSCS,
    PM_ENCODING_BIG5_UAO,
    PM_ENCODING_CESU_8,
    PM_ENCODING_CP51932,
    PM_ENCODING_CP850,
    PM_ENCODING_CP852,
    PM_ENCODING_CP855,
    PM_ENCODING_CP949,
    PM_ENCODING_CP950,
    PM_ENCODING_CP951,
    PM_ENCODING_EMACS_MULE,
    PM_ENCODING_EUC_JP,
    PM_ENCODING_EUC_JP_MS,
    PM_ENCODING_EUC_JIS_2004,
    PM_ENCODING_EUC_KR,
    PM_ENCODING_EUC_TW,
    PM_ENCODING_GB12345,
    PM_ENCODING_GB18030,
    PM_ENCODING_GB1988,
    PM_ENCODING_GB2312,
    PM_ENCODING_GBK,
    PM_ENCODING_IBM437,
    PM_ENCODING_IBM720,
    PM_ENCODING_IBM737,
    PM_ENCODING_IBM775,
    PM_ENCODING_IBM852,
    PM_ENCODING_IBM855,
    PM_ENCODING_IBM857,
    PM_ENCODING_IBM860,
    PM_ENCODING_IBM861,
    PM_ENCODING_IBM862,
    PM_ENCODING_IBM863,
    PM_ENCODING_IBM864,
    PM_ENCODING_IBM865,
    PM_ENCODING_IBM866,
    PM_ENCODING_IBM869,
    PM_ENCODING_ISO_8859_1,
    PM_ENCODING_ISO_8859_2,
    PM_ENCODING_ISO_8859_3,
    PM_ENCODING_ISO_8859_4,
    PM_ENCODING_ISO_8859_5,
    PM_ENCODING_ISO_8859_6,
    PM_ENCODING_ISO_8859_7,
    PM_ENCODING_ISO_8859_8,
    PM_ENCODING_ISO_8859_9,
    PM_ENCODING_ISO_8859_10,
    PM_ENCODING_ISO_8859_11,
    PM_ENCODING_ISO_8859_13,
    PM_ENCODING_ISO_8859_14,
    PM_ENCODING_ISO_8859_15,
    PM_ENCODING_ISO_8859_16,
    PM_ENCODING_KOI8_R,
    PM_ENCODING_KOI8_U,
    PM_ENCODING_MAC_CENT_EURO,
    PM_ENCODING_MAC_CROATIAN,
    PM_ENCODING_MAC_CYRILLIC,
    PM_ENCODING_MAC_GREEK,
    PM_ENCODING_MAC_ICELAND,
    PM_ENCODING_MAC_JAPANESE,
    PM_ENCODING_MAC_ROMAN,
    PM_ENCODING_MAC_ROMANIA,
    PM_ENCODING_MAC_THAI,
    PM_ENCODING_MAC_TURKISH,
    PM_ENCODING_MAC_UKRAINE,
    PM_ENCODING_SHIFT_JIS,
    PM_ENCODING_SJIS_DOCOMO,
    PM_ENCODING_SJIS_KDDI,
    PM_ENCODING_SJIS_SOFTBANK,
    PM_ENCODING_STATELESS_ISO_2022_JP,
    PM_ENCODING_STATELESS_ISO_2022_JP_KDDI,
    PM_ENCODING_TIS_620,
    PM_ENCODING_US_ASCII,
    PM_ENCODING_UTF8_MAC,
    PM_ENCODING_UTF8_DOCOMO,
    PM_ENCODING_UTF8_KDDI,
    PM_ENCODING_UTF8_SOFTBANK,
    PM_ENCODING_WINDOWS_1250,
    PM_ENCODING_WINDOWS_1251,
    PM_ENCODING_WINDOWS_1252,
    PM_ENCODING_WINDOWS_1253,
    PM_ENCODING_WINDOWS_1254,
    PM_ENCODING_WINDOWS_1255,
    PM_ENCODING_WINDOWS_1256,
    PM_ENCODING_WINDOWS_1257,
    PM_ENCODING_WINDOWS_1258,
    PM_ENCODING_WINDOWS_31J,
    PM_ENCODING_WINDOWS_874,
    /* Not an encoding: the number of enumerators above, used to size pm_encodings. */
    PM_ENCODING_MAXIMUM
} pm_encoding_type_t;

/**
 * This is the table of all of the encodings that prism supports. It holds
 * PM_ENCODING_MAXIMUM entries and is indexed by pm_encoding_type_t values.
 */
extern const pm_encoding_t pm_encodings[PM_ENCODING_MAXIMUM];

/**
 * This is the default UTF-8 encoding. We need a reference to it to quickly
 * create parsers. Expands to a pointer to the PM_ENCODING_UTF_8 entry of the
 * pm_encodings table.
 */
#define PM_ENCODING_UTF_8_ENTRY (&pm_encodings[PM_ENCODING_UTF_8])

/**
 * This is the US-ASCII encoding. We need a reference to it to be able to
 * compare against it when a string is being created because it could possibly
 * need to fall back to ASCII-8BIT. Expands to a pointer to the
 * PM_ENCODING_US_ASCII entry of the pm_encodings table.
 */
#define PM_ENCODING_US_ASCII_ENTRY (&pm_encodings[PM_ENCODING_US_ASCII])

/**
 * Parse the given name of an encoding and return a pointer to the corresponding
 * encoding struct if one can be found, otherwise return NULL.
 *
 * @param start A pointer to the first byte of the name.
 * @param end A pointer to the last byte of the name.
 *     NOTE(review): confirm against the implementation whether this points at
 *     the last byte itself or one past it; the declaration alone does not say.
 * @returns A pointer to the encoding struct if one is found, otherwise NULL.
 */
const pm_encoding_t * pm_encoding_find(const uint8_t *start, const uint8_t *end);

#endif