Ruby 1.9.3p547 (2014-05-14 revision 45962)
gbk.c
Go to the documentation of this file.
1 /**********************************************************************
2  gbk.c - Oniguruma (regular expression library)
3 **********************************************************************/
4 /*-
5  * Copyright (c) 2002-2007 K.Kosako <sndgk393 AT ybb DOT ne DOT jp>
6  * All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  * notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  * notice, this list of conditions and the following disclaimer in the
15  * documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  */
29 
30 #include "regenc.h"
31 
/*
 * Length in bytes of a GBK character, indexed by its first byte.
 * Bytes 0x00-0x80 and 0xff map to 1; bytes 0x81-0xfe map to 2.
 */
static const int EncLen_GBK[] = {
  /* 0x00 */ 1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
  /* 0x10 */ 1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
  /* 0x20 */ 1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
  /* 0x30 */ 1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
  /* 0x40 */ 1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
  /* 0x50 */ 1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
  /* 0x60 */ 1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
  /* 0x70 */ 1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
  /* 0x80 */ 1, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,
  /* 0x90 */ 2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,
  /* 0xa0 */ 2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,
  /* 0xb0 */ 2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,
  /* 0xc0 */ 2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,
  /* 0xd0 */ 2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,
  /* 0xe0 */ 2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,
  /* 0xf0 */ 2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 1
};
50 
/*
 * Flags, indexed by byte value, marking bytes accepted as the second byte
 * of a two-byte sequence: 1 for 0x40-0x7e and 0x80-0xfe, 0 for everything
 * else (0x00-0x3f, 0x7f and 0xff).
 */
static const char GBK_CAN_BE_TRAIL_TABLE[256] = {
  /* 0x00 */ 0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
  /* 0x10 */ 0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
  /* 0x20 */ 0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
  /* 0x30 */ 0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
  /* 0x40 */ 1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
  /* 0x50 */ 1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
  /* 0x60 */ 1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
  /* 0x70 */ 1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 0,
  /* 0x80 */ 1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
  /* 0x90 */ 1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
  /* 0xa0 */ 1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
  /* 0xb0 */ 1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
  /* 0xc0 */ 1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
  /* 0xd0 */ 1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
  /* 0xe0 */ 1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
  /* 0xf0 */ 1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 0
};
69 
/* Non-zero when EncLen_GBK gives `byte` a character length above one. */
#define GBK_ISMB_FIRST(byte)  (EncLen_GBK[(byte)] > 1)
/* The GBK_CAN_BE_TRAIL_TABLE flag for `byte` (non-zero: usable as a trail byte). */
#define GBK_ISMB_TRAIL(byte)  (GBK_CAN_BE_TRAIL_TABLE[(byte)])
72 
/*
 * States of the byte-sequence validator.  Negative values are terminal
 * (FAILURE, ACCEPT); non-negative values index a row of trans[].
 */
typedef enum {
  FAILURE = -2,
  ACCEPT  = -1,
  S0      = 0,
  S1
} state_t;

/*
 * Transition table: trans[state][byte] is the next state.
 * Row S0 classifies the first byte, row S1 the second byte.
 * The literal 1 in row S0 is the value of S1.
 */
#define A ACCEPT
#define F FAILURE
static const signed char trans[][0x100] = {
  /* ---- S0: first byte ---- */
  {
    /* 0x00 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0x10 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0x20 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0x30 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0x40 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0x50 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0x60 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0x70 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0x80 */ A, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x90 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0xa0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0xb0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0xc0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0xd0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0xe0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0xf0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, F
  },
  /* ---- S1: second byte ---- */
  {
    /* 0x00 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 0x10 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 0x20 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 0x30 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 0x40 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0x50 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0x60 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0x70 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, F,
    /* 0x80 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0x90 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0xa0 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0xb0 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0xc0 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0xd0 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0xe0 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0xf0 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, F
  }
};
#undef A
#undef F
116 
117 static int
119 {
120  int firstbyte = *p++;
121  state_t s = trans[0][firstbyte];
122 #define RETURN(n) \
123  return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(n) : \
124  ONIGENC_CONSTRUCT_MBCLEN_INVALID()
125  if (s < 0) RETURN(1);
126  if (p == e) return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(EncLen_GBK[firstbyte]-1);
127  s = trans[s][*p++];
128  RETURN(2);
129 #undef RETURN
130 }
131 
132 static OnigCodePoint
133 gbk_mbc_to_code(const UChar* p, const UChar* end, OnigEncoding enc)
134 {
135  return onigenc_mbn_mbc_to_code(enc, p, end);
136 }
137 
138 static int
140 {
141  return onigenc_mb2_code_to_mbc(enc, code, buf);
142 }
143 
144 static int
145 gbk_mbc_case_fold(OnigCaseFoldType flag, const UChar** pp, const UChar* end,
146  UChar* lower, OnigEncoding enc)
147 {
148  return onigenc_mbn_mbc_case_fold(enc, flag,
149  pp, end, lower);
150 }
151 
#if 0
/* Compiled out: forwards to onigenc_mbn_is_mbc_ambiguous(). */
static int
gbk_is_mbc_ambiguous(OnigCaseFoldType flag,
                     const UChar** pp, const UChar* end, OnigEncoding enc)
{
  const int result = onigenc_mbn_is_mbc_ambiguous(enc, flag, pp, end);

  return result;
}
#endif
160 
161 static int
162 gbk_is_code_ctype(OnigCodePoint code, unsigned int ctype, OnigEncoding enc)
163 {
164  return onigenc_mb2_is_code_ctype(enc, code, ctype);
165 }
166 
167 static UChar*
168 gbk_left_adjust_char_head(const UChar* start, const UChar* s, const UChar* end, OnigEncoding enc)
169 {
170  const UChar *p;
171  int len;
172 
173  if (s <= start) return (UChar* )s;
174  p = s;
175 
176  if (GBK_ISMB_TRAIL(*p)) {
177  while (p > start) {
178  if (! GBK_ISMB_FIRST(*--p)) {
179  p++;
180  break;
181  }
182  }
183  }
184  len = enclen(enc, p, end);
185  if (p + len > s) return (UChar* )p;
186  p += len;
187  return (UChar* )(p + ((s - p) & ~1));
188 }
189 
190 static int
191 gbk_is_allowed_reverse_match(const UChar* s, const UChar* end ARG_UNUSED, OnigEncoding enc ARG_UNUSED)
192 {
193  const UChar c = *s;
194  return (GBK_ISMB_TRAIL(c) ? FALSE : TRUE);
195 }
196 
197 OnigEncodingDefine(gbk, GBK) = {
199  "GBK", /* name */
200  2, /* max enc length */
201  1, /* min enc length */
214 };
215 /*
216  * Name: GBK
217  * MIBenum: 113
218  * Link: http://www.iana.org/assignments/character-sets
219  * Link: http://www.iana.org/assignments/charset-reg/GBK
220  * Link: http://www.microsoft.com/globaldev/reference/dbcs/936.mspx
221  */
222 ENC_ALIAS("CP936", "GBK")
#define ARG_UNUSED
Definition: regenc.h:68
unsigned int OnigCodePoint
Definition: oniguruma.h:111
Definition: gbk.c:73
#define FALSE
Definition: nkf.h:185
Definition: gbk.c:73
unsigned int OnigCaseFoldType
Definition: oniguruma.h:117
int onigenc_mbn_mbc_case_fold(OnigEncoding enc, OnigCaseFoldType flag ARG_UNUSED, const UChar **pp, const UChar *end ARG_UNUSED, UChar *lower)
Definition: regenc.c:691
int onigenc_mb2_code_to_mbclen(OnigCodePoint code, OnigEncoding enc ARG_UNUSED)
Definition: regenc.c:733
int onigenc_mb2_code_to_mbc(OnigEncoding enc, OnigCodePoint code, UChar *buf)
Definition: regenc.c:750
state_t
Definition: big5.c:90
int onigenc_ascii_apply_all_case_fold(OnigCaseFoldType flag ARG_UNUSED, OnigApplyAllCaseFoldFunc f, void *arg, OnigEncoding enc ARG_UNUSED)
Definition: regenc.c:409
#define ENC_ALIAS(name, orig)
Definition: encdb.c:18
Definition: gbk.c:73
#define GBK_ISMB_FIRST(byte)
Definition: gbk.c:70
int onigenc_mb2_is_code_ctype(OnigEncoding enc, OnigCodePoint code, unsigned int ctype)
Definition: regenc.c:823
Win32OLEIDispatch * p
Definition: win32ole.c:778
static UChar * gbk_left_adjust_char_head(const UChar *start, const UChar *s, const UChar *end, OnigEncoding enc)
Definition: gbk.c:168
static OnigCodePoint gbk_mbc_to_code(const UChar *p, const UChar *end, OnigEncoding enc)
Definition: gbk.c:133
static const signed char trans[][0x100]
Definition: gbk.c:76
static int gbk_mbc_case_fold(OnigCaseFoldType flag, const UChar **pp, const UChar *end, UChar *lower, OnigEncoding enc)
Definition: gbk.c:145
Definition: gbk.c:73
static int gbk_is_allowed_reverse_match(const UChar *s, const UChar *end ARG_UNUSED, OnigEncoding enc ARG_UNUSED)
Definition: gbk.c:191
#define ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(n)
Definition: oniguruma.h:250
int onigenc_is_mbc_newline_0x0a(const UChar *p, const UChar *end, OnigEncoding enc ARG_UNUSED)
Definition: regenc.c:580
#define GBK_ISMB_TRAIL(byte)
Definition: gbk.c:71
#define RETURN(n)
#define TRUE
Definition: nkf.h:186
#define A
Definition: gbk.c:74
unsigned char buf[MIME_BUF_SIZE]
Definition: nkf.c:3913
#define F
Definition: gbk.c:75
state_t
Definition: gbk.c:73
register unsigned int len
Definition: name2ctype.h:22210
static const char GBK_CAN_BE_TRAIL_TABLE[256]
Definition: gbk.c:51
int onigenc_not_support_get_ctype_code_range(OnigCtype ctype, OnigCodePoint *sb_out, const OnigCodePoint *ranges[], OnigEncoding enc)
Definition: regenc.c:572
static int gbk_mbc_enc_len(const UChar *p, const UChar *e, OnigEncoding enc ARG_UNUSED)
Definition: gbk.c:118
int onigenc_ascii_get_case_fold_codes_by_str(OnigCaseFoldType flag ARG_UNUSED, const OnigUChar *p, const OnigUChar *end ARG_UNUSED, OnigCaseFoldCodeItem items[], OnigEncoding enc ARG_UNUSED)
Definition: regenc.c:432
static const int EncLen_GBK[]
Definition: gbk.c:32
#define UChar
Definition: oniguruma.h:107
static int gbk_code_to_mbc(OnigCodePoint code, UChar *buf, OnigEncoding enc)
Definition: gbk.c:139
#define enclen(enc, p, e)
OnigEncodingDefine(gbk, GBK)
OnigCodePoint onigenc_mbn_mbc_to_code(OnigEncoding enc, const UChar *p, const UChar *end)
Definition: regenc.c:673
static int gbk_is_code_ctype(OnigCodePoint code, unsigned int ctype, OnigEncoding enc)
Definition: gbk.c:162
int onigenc_minimum_property_name_to_ctype(OnigEncoding enc, UChar *p, UChar *end)
Definition: regenc.c:790