aboutsummaryrefslogtreecommitdiffstats
path: root/regparse.c
diff options
context:
space:
mode:
Diffstat (limited to 'regparse.c')
-rw-r--r--regparse.c301
1 files changed, 143 insertions, 158 deletions
diff --git a/regparse.c b/regparse.c
index 440cb5e637..e8af6f3f99 100644
--- a/regparse.c
+++ b/regparse.c
@@ -5807,17 +5807,17 @@ create_node_from_array(int kind, Node **np, Node **node_array)
*
* Target Array name Index
*
- * node_array 0 1 2 3 4 5 6 7 8 9 A B C D E
- * top_alts alts[4] 0 1 2 3*
- * alts+1 list[4] 0 1 2 3*
- * list+1 core_alts[7] 0 1 2 3 4 5 6*
- * core_alts+0 H_list[4] 0 1 2 3*
- * H_list+1 H_alt2[4] 0 1 2 3*
- * h_alt2+1 H_list2[3] 0 1 2*
- * core_alts+4 XP_list[4] 0 1 2 3*
- * XP_list+1 Ex_list[4] 0 1 2 3*
+ * node_array 0 1 2 3 4 5 6 7 8 9 A B C D E F
+ * top_alts alts[5] 0 1 2 3 4*
+ * alts+2 list[4] 0 1 2 3*
+ * list+1 core_alts[7] 0 1 2 3 4 5 6*
+ * core_alts+0 H_list[4] 0 1 2 3*
+ * H_list+1 H_alt2[4] 0 1 2 3*
+ * H_alt2+1 H_list2[3] 0 1 2*
+ * core_alts+4 XP_list[4] 0 1 2 3*
+ * XP_list+1 Ex_list[4] 0 1 2 3*
*/
-#define NODE_COMMON_SIZE 15
+#define NODE_COMMON_SIZE 16
static int
node_extended_grapheme_cluster(Node** np, ScanEnv* env)
@@ -5828,208 +5828,193 @@ node_extended_grapheme_cluster(Node** np, ScanEnv* env)
int r = 0;
int num1;
int i;
+ int any_target_position;
UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN * 2];
OnigOptionType option;
- /* node_array is function-global so that we can free all nodes
+ /* node_common is function-global so that we can free all nodes
* in case of error. Unused slots are set to NULL_NODE at all times. */
Node *node_common[NODE_COMMON_SIZE];
+ Node **alts = node_common+0; /* size: 5 */
+
+ for (i=0; i<NODE_COMMON_SIZE; i++)
+ node_common[i] = NULL_NODE;
+
+ /* CRLF, common for both Unicode and non-Unicode */
+ /* \x0D\x0A */
+ r = ONIGENC_CODE_TO_MBC(env->enc, 0x0D, buf);
+ if (r < 0) goto err;
+ num1 = r;
+ r = ONIGENC_CODE_TO_MBC(env->enc, 0x0A, buf + num1);
+ if (r < 0) goto err;
+ alts[0] = node_new_str_raw(buf, buf + num1 + r);
+ if (IS_NULL(alts[0])) goto err;
#ifdef USE_UNICODE_PROPERTIES
if (ONIGENC_IS_UNICODE(env->enc)) { /* UTF-8, UTF-16BE/LE, UTF-32BE/LE */
CClassNode* cc;
- for (i=0; i<NODE_COMMON_SIZE; i++)
- node_common[i] = NULL_NODE;
-
if (propname2ctype(env, "Grapheme_Cluster_Break=Extend") < 0) goto err;
/* Unicode 11.0.0
- * CRLF (this is added last because it is common with non-Unicode encodings)
+ * CRLF (already done)
* | [Control CR LF]
* | precore* core postcore*
* | . (to catch invalid stuff, because this seems to be spec for String#grapheme_clusters) */
+
+ /* [Control CR LF] (CR and LF are not in the spec, but this is a conformed fix) */
+ alts[1] = node_new_cclass();
+ if (IS_NULL(alts[1])) goto err;
+ cc = NCCLASS(alts[1]);
+ R_ERR(add_property_to_cc(cc, "Grapheme_Cluster_Break=Control", 0, env));
+ if (ONIGENC_MBC_MINLEN(env->enc) > 1) { /* UTF-16/UTF-32 */
+ R_ERR(add_code_range(&(cc->mbuf), env, 0x000A, 0x000A)); /* LF */
+ R_ERR(add_code_range(&(cc->mbuf), env, 0x000D, 0x000D)); /* CR */
+ }
+ else {
+ BITSET_SET_BIT(cc->bs, 0x0a);
+ BITSET_SET_BIT(cc->bs, 0x0d);
+ }
+
+ /* precore* core postcore* */
{
- Node **alts = node_common+0; /* size: 4 */
+ Node **list = alts + 3; /* size: 4 */
- /* [Control CR LF] (CR and LF are not in the spec, but this is a conformed fix) */
- alts[0] = node_new_cclass();
- if (IS_NULL(alts[0])) goto err;
- cc = NCCLASS(alts[0]);
- R_ERR(add_property_to_cc(cc, "Grapheme_Cluster_Break=Control", 0, env));
- if (ONIGENC_MBC_MINLEN(env->enc) > 1) { /* UTF-16/UTF-32 */
- R_ERR(add_code_range(&(cc->mbuf), env, 0x000A, 0x000A)); /* CR */
- R_ERR(add_code_range(&(cc->mbuf), env, 0x000D, 0x000D)); /* LF */
- }
- else {
- BITSET_SET_BIT(cc->bs, 0x0a);
- BITSET_SET_BIT(cc->bs, 0x0d);
- }
+ /* precore*; precore := Prepend */
+ R_ERR(quantify_property_node(list+0, env, "Grapheme_Cluster_Break=Prepend", '*'));
- /* precore* core postcore* */
+ /* core := hangul-syllable
+ * | ri-sequence
+ * | xpicto-sequence
+ * | [^Control CR LF] */
{
- Node **list = alts + 2; /* size: 4 */
+ Node **core_alts = list + 2; /* size: 7 */
- /* precore*; precore := Prepend */
- R_ERR(quantify_property_node(list+0, env, "Grapheme_Cluster_Break=Prepend", '*'));
+ /* hangul-syllable :=
+ * L* (V+ | LV V* | LVT) T*
+ * | L+
+ * | T+ */
+ /* hangul-syllable is an alternative (would be called H_alt)
+ * inside an alternative, but we flatten it into core_alts */
- /* core := hangul-syllable
- * | ri-sequence
- * | xpicto-sequence
- * | [^Control CR LF] */
+ /* L* (V+ | LV V* | LVT) T* */
{
- Node **core_alts = list + 2; /* size: 7 */
+ Node **H_list = core_alts + 1; /* size: 4 */
+ R_ERR(quantify_property_node(H_list+0, env, "Grapheme_Cluster_Break=L", '*'));
- /* hangul-syllable :=
- * L* (V+ | LV V* | LVT) T*
- * | L+
- * | T+ */
- /* hangul-syllable is an alternative (would be called H_alt)
- * inside an alternative, but we flatten it into core_alts */
-
- /* L* (V+ | LV V* | LVT) T* */
+ /* V+ | LV V* | LVT */
{
- Node **H_list = core_alts + 1; /* size: 4 */
- R_ERR(quantify_property_node(H_list+0, env, "Grapheme_Cluster_Break=L", '*'));
+ Node **H_alt2 = H_list + 2; /* size: 4 */
+ R_ERR(quantify_property_node(H_alt2+0, env, "Grapheme_Cluster_Break=V", '+'));
- /* V+ | LV V* | LVT */
+ /* LV V* */
{
- Node **H_alt2 = H_list + 2; /* size: 4 */
- R_ERR(quantify_property_node(H_alt2+0, env, "Grapheme_Cluster_Break=V", '+'));
-
- /* LV V* */
- {
- Node **H_list2 = H_alt2 + 2; /* size: 3 */
-
- R_ERR(create_property_node(H_list2+0, env, "Grapheme_Cluster_Break=LV"));
- R_ERR(quantify_property_node(H_list2+1, env, "Grapheme_Cluster_Break=V", '*'));
- R_ERR(create_node_from_array(LIST, H_alt2+1, H_list2));
- }
+ Node **H_list2 = H_alt2 + 2; /* size: 3 */
- R_ERR(create_property_node(H_alt2+2, env, "Grapheme_Cluster_Break=LVT"));
- R_ERR(create_node_from_array(ALT, H_list+1, H_alt2));
+ R_ERR(create_property_node(H_list2+0, env, "Grapheme_Cluster_Break=LV"));
+ R_ERR(quantify_property_node(H_list2+1, env, "Grapheme_Cluster_Break=V", '*'));
+ R_ERR(create_node_from_array(LIST, H_alt2+1, H_list2));
}
- R_ERR(quantify_property_node(H_list+2, env, "Grapheme_Cluster_Break=T", '*'));
- R_ERR(create_node_from_array(LIST, core_alts+0, H_list));
+ R_ERR(create_property_node(H_alt2+2, env, "Grapheme_Cluster_Break=LVT"));
+ R_ERR(create_node_from_array(ALT, H_list+1, H_alt2));
}
- R_ERR(quantify_property_node(core_alts+1, env, "Grapheme_Cluster_Break=L", '+'));
- R_ERR(quantify_property_node(core_alts+2, env, "Grapheme_Cluster_Break=T", '+'));
- /* end of hangul-syllable */
+ R_ERR(quantify_property_node(H_list+2, env, "Grapheme_Cluster_Break=T", '*'));
+ R_ERR(create_node_from_array(LIST, core_alts+0, H_list));
+ }
- /* ri-sequence := RI RI */
- R_ERR(quantify_property_node(core_alts+3, env, "Regional_Indicator", '2'));
+ R_ERR(quantify_property_node(core_alts+1, env, "Grapheme_Cluster_Break=L", '+'));
+ R_ERR(quantify_property_node(core_alts+2, env, "Grapheme_Cluster_Break=T", '+'));
+ /* end of hangul-syllable */
- /* xpicto-sequence := \p{Extended_Pictographic} (Extend* ZWJ \p{Extended_Pictographic})* */
- {
- Node **XP_list = core_alts + 5; /* size: 3 */
- R_ERR(create_property_node(XP_list+0, env, "Extended_Pictographic"));
+ /* ri-sequence := RI RI */
+ R_ERR(quantify_property_node(core_alts+3, env, "Regional_Indicator", '2'));
- /* (Extend* ZWJ \p{Extended_Pictographic})* */
- {
- Node **Ex_list = XP_list + 2; /* size: 4 */
- /* assert(Ex_list+4 <= node_common+NODE_COMMON_SIZE) */
- R_ERR(quantify_property_node(Ex_list+0, env, "Grapheme_Cluster_Break=Extend", '*'));
-
- /* ZWJ (ZERO WIDTH JOINER) */
- r = ONIGENC_CODE_TO_MBC(env->enc, 0x200D, buf);
- if (r < 0) goto err;
- Ex_list[1] = node_new_str_raw(buf, buf + r);
- if (IS_NULL(Ex_list[1])) goto err;
-
- R_ERR(create_property_node(Ex_list+2, env, "Extended_Pictographic"));
- R_ERR(create_node_from_array(LIST, XP_list+1, Ex_list));
- }
- R_ERR(quantify_node(XP_list+1, 0, REPEAT_INFINITE)); /* TODO: Check about node freeing */
-
- R_ERR(create_node_from_array(LIST, core_alts+4, XP_list));
- }
+ /* xpicto-sequence := \p{Extended_Pictographic} (Extend* ZWJ \p{Extended_Pictographic})* */
+ {
+ Node **XP_list = core_alts + 5; /* size: 3 */
+ R_ERR(create_property_node(XP_list+0, env, "Extended_Pictographic"));
- /* [^Control CR LF] */
- core_alts[5] = node_new_cclass();
- if (IS_NULL(core_alts[5])) goto err;
- cc = NCCLASS(core_alts[5]);
- if (ONIGENC_MBC_MINLEN(env->enc) > 1) { /* UTF-16/UTF-32 */
- BBuf *inverted_buf = NULL;
-
- /* Start with a positive buffer and invert at the end.
- * Otherwise, adding single-character ranges work the wrong way. */
- R_ERR(add_property_to_cc(cc, "Grapheme_Cluster_Break=Control", 0, env));
- R_ERR(add_code_range(&(cc->mbuf), env, 0x000A, 0x000A)); /* CR */
- R_ERR(add_code_range(&(cc->mbuf), env, 0x000D, 0x000D)); /* LF */
- R_ERR(not_code_range_buf(env->enc, cc->mbuf, &inverted_buf, env));
- cc->mbuf = inverted_buf; /* TODO: check what to do with buffer before inversion */
- }
- else {
- R_ERR(add_property_to_cc(cc, "Grapheme_Cluster_Break=Control", 1, env));
- BITSET_CLEAR_BIT(cc->bs, 0x0a);
- BITSET_CLEAR_BIT(cc->bs, 0x0d);
+ /* (Extend* ZWJ \p{Extended_Pictographic})* */
+ {
+ Node **Ex_list = XP_list + 2; /* size: 4 */
+ if (!(Ex_list+4 == node_common+NODE_COMMON_SIZE)) exit(1);
+ R_ERR(quantify_property_node(Ex_list+0, env, "Grapheme_Cluster_Break=Extend", '*'));
+
+ /* ZWJ (ZERO WIDTH JOINER) */
+ r = ONIGENC_CODE_TO_MBC(env->enc, 0x200D, buf);
+ if (r < 0) goto err;
+ Ex_list[1] = node_new_str_raw(buf, buf + r);
+ if (IS_NULL(Ex_list[1])) goto err;
+
+ R_ERR(create_property_node(Ex_list+2, env, "Extended_Pictographic"));
+ R_ERR(create_node_from_array(LIST, XP_list+1, Ex_list));
}
+ R_ERR(quantify_node(XP_list+1, 0, REPEAT_INFINITE)); /* TODO: Check about node freeing */
- R_ERR(create_node_from_array(ALT, list+1, core_alts));
+ R_ERR(create_node_from_array(LIST, core_alts+4, XP_list));
}
- /* postcore*; postcore = [Extend ZWJ SpacingMark] */
- R_ERR(create_property_node(list+2, env, "Grapheme_Cluster_Break=Extend"));
- cc = NCCLASS(list[2]);
- R_ERR(add_property_to_cc(cc, "Grapheme_Cluster_Break=SpacingMark", 0, env));
- R_ERR(add_code_range(&(cc->mbuf), env, 0x200D, 0x200D));
- R_ERR(quantify_node(list+2, 0, REPEAT_INFINITE));
+ /* [^Control CR LF] */
+ core_alts[5] = node_new_cclass();
+ if (IS_NULL(core_alts[5])) goto err;
+ cc = NCCLASS(core_alts[5]);
+ if (ONIGENC_MBC_MINLEN(env->enc) > 1) { /* UTF-16/UTF-32 */
+ BBuf *inverted_buf = NULL;
+
+ /* Start with a positive buffer and invert at the end.
+ * Otherwise, adding single-character ranges work the wrong way. */
+ R_ERR(add_property_to_cc(cc, "Grapheme_Cluster_Break=Control", 0, env));
+ R_ERR(add_code_range(&(cc->mbuf), env, 0x000A, 0x000A)); /* LF */
+ R_ERR(add_code_range(&(cc->mbuf), env, 0x000D, 0x000D)); /* CR */
+ R_ERR(not_code_range_buf(env->enc, cc->mbuf, &inverted_buf, env));
+ cc->mbuf = inverted_buf; /* TODO: check what to do with buffer before inversion */
+ }
+ else {
+ R_ERR(add_property_to_cc(cc, "Grapheme_Cluster_Break=Control", 1, env));
+ BITSET_CLEAR_BIT(cc->bs, 0x0a);
+ BITSET_CLEAR_BIT(cc->bs, 0x0d);
+ }
- R_ERR(create_node_from_array(LIST, alts+1, list));
+ R_ERR(create_node_from_array(ALT, list+1, core_alts));
}
- /* PerlSyntax: (?s:.), RubySyntax: (?m:.) */
- /* Not in Unicode spec (UAX #29), but added to catch invalid stuff,
- * because this is Ruby spec for String#grapheme_clusters. */
- np1 = node_new_anychar();
- if (IS_NULL(np1)) goto err;
+ /* postcore*; postcore = [Extend ZWJ SpacingMark] */
+ R_ERR(create_property_node(list+2, env, "Grapheme_Cluster_Break=Extend"));
+ cc = NCCLASS(list[2]);
+ R_ERR(add_property_to_cc(cc, "Grapheme_Cluster_Break=SpacingMark", 0, env));
+ R_ERR(add_code_range(&(cc->mbuf), env, 0x200D, 0x200D));
+ R_ERR(quantify_node(list+2, 0, REPEAT_INFINITE));
- option = env->option;
- ONOFF(option, ONIG_OPTION_MULTILINE, 0);
- tmp = node_new_option(option);
- if (IS_NULL(tmp)) goto err;
- NENCLOSE(tmp)->target = np1;
- alts[2] = tmp;
-
- R_ERR(create_node_from_array(ALT, &top_alt, alts));
+ R_ERR(create_node_from_array(LIST, alts+2, list));
}
+
+ any_target_position = 3;
}
else
#endif /* USE_UNICODE_PROPERTIES */
{
- /* PerlSyntax: (?s:.), RubySyntax: (?m:.) */
- np1 = node_new_anychar();
- if (IS_NULL(np1)) goto err;
-
- option = env->option;
- ONOFF(option, ONIG_OPTION_MULTILINE, 0);
- tmp = node_new_option(option);
- if (IS_NULL(tmp)) goto err;
- NENCLOSE(tmp)->target = np1;
- np1 = tmp;
-
- top_alt = onig_node_new_alt(np1, NULL_NODE);
- if (IS_NULL(top_alt)) goto err;
- np1 = NULL;
+ any_target_position = 1;
}
- /* add in CRLF to complete (CRLF | Control | precore* core postcore* | .) */
- /* \x0D\x0A */
- r = ONIGENC_CODE_TO_MBC(env->enc, 0x0D, buf);
- if (r < 0) goto err;
- num1 = r;
- r = ONIGENC_CODE_TO_MBC(env->enc, 0x0A, buf + num1);
- if (r < 0) goto err;
- np1 = node_new_str_raw(buf, buf + num1 + r);
+ /* PerlSyntax: (?s:.), RubySyntax: (?m:.), common for both Unicode and non-Unicode */
+ /* Not in Unicode spec (UAX #29), but added to catch invalid stuff,
+ * because this is Ruby spec for String#grapheme_clusters. */
+ np1 = node_new_anychar();
if (IS_NULL(np1)) goto err;
- tmp = onig_node_new_alt(np1, top_alt);
+ option = env->option;
+ ONOFF(option, ONIG_OPTION_MULTILINE, 0);
+ tmp = node_new_option(option);
if (IS_NULL(tmp)) goto err;
- top_alt = tmp;
+ NENCLOSE(tmp)->target = np1;
+ alts[any_target_position] = tmp;
np1 = NULL;
- /* (?>): For efficiency, because there is nothing that isn't in a grapheme cluster,
- and there is only one way to split a string into grapheme clusters. */
+ R_ERR(create_node_from_array(ALT, &top_alt, alts));
+
+ /* (?>): For efficiency, because there is no text piece
+ * that is not in a grapheme cluster, and there is only one way
+ * to split a string into grapheme clusters. */
tmp = node_new_enclose(ENCLOSE_STOP_BACKTRACK);
if (IS_NULL(tmp)) goto err;
NENCLOSE(tmp)->target = top_alt;