From 8130ee5c9dea6d1323d41271cc01c8dc5d8bcc5d Mon Sep 17 00:00:00 2001
From: nobu <nobu@b2dd03c8-39d4-4d8f-98ff-823fe69b080e>
Date: Fri, 2 Dec 2016 03:33:54 +0000
Subject: parse.y: simplify parsing utf-8 string

* parse.y (parser_tokadd_codepoint): move the hex scanning and its
  error checks into this function, and always add the character.

git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@56964 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
---
 parse.y | 58 +++++++++++++++++++++-------------------------------------
 1 file changed, 21 insertions(+), 37 deletions(-)

(limited to 'parse.y')

diff --git a/parse.y b/parse.y
index 80c9667b63..50f7b04211 100644
--- a/parse.y
+++ b/parse.y
@@ -5759,22 +5759,31 @@ parser_tok_hex(struct parser_params *parser, size_t *numlen)
 
 static int
 parser_tokadd_codepoint(struct parser_params *parser, rb_encoding **encp,
-			int string_literal, int regexp_literal,
-			int codepoint, int numlen)
+			int regexp_literal, int wide)
 {
+    size_t numlen;
+    int codepoint = scan_hex(lex_p, wide ? 6 : 4, &numlen);
+    if (wide ? (numlen == 0) : (numlen < 4))  {
+	yyerror("invalid Unicode escape");
+	return FALSE;
+    }
+    if (codepoint > 0x10ffff) {
+	yyerror("invalid Unicode codepoint (too large)");
+	return FALSE;
+    }
     if ((codepoint & 0xfffff800) == 0xd800) {
 	yyerror("invalid Unicode codepoint");
 	return FALSE;
     }
     lex_p += numlen;
     if (regexp_literal) {
-	tokcopy(numlen);
+	tokcopy((int)numlen);
     }
     else if (codepoint >= 0x80) {
 	*encp = rb_utf8_encoding();
-	if (string_literal) tokaddmbc(codepoint, *encp);
+	tokaddmbc(codepoint, *encp);
     }
-    else if (string_literal) {
+    else {
 	tokadd(codepoint);
     }
     return TRUE;
@@ -5783,7 +5792,7 @@ parser_tokadd_codepoint(struct parser_params *parser, rb_encoding **encp,
 /* return value is for ?\u3042 */
 static int
 parser_tokadd_utf8(struct parser_params *parser, rb_encoding **encp,
-                   int string_literal, int symbol_literal, int regexp_literal)
+		   int string_literal, int symbol_literal, int regexp_literal)
 {
     /*
      * If string_literal is true, then we allow multiple codepoints
@@ -5792,8 +5801,6 @@ parser_tokadd_utf8(struct parser_params *parser, rb_encoding **encp,
      * codepoint without adding it
      */
 
-    int codepoint;
-    size_t numlen;
     const int open_brace = '{', close_brace = '}';
 
     if (regexp_literal) { tokadd('\\'); tokadd('u'); }
@@ -5804,18 +5811,7 @@ parser_tokadd_utf8(struct parser_params *parser, rb_encoding **encp,
 	pushback(c);
 	do {
 	    if (regexp_literal) tokadd(last);
-	    codepoint = scan_hex(lex_p, 6, &numlen);
-	    if (numlen == 0)  {
-		yyerror("invalid Unicode escape");
-		return 0;
-	    }
-	    if (codepoint > 0x10ffff) {
-		yyerror("invalid Unicode codepoint (too large)");
-		return 0;
-	    }
-	    if (!parser_tokadd_codepoint(parser, encp,
-					 string_literal, regexp_literal,
-					 codepoint, (int)numlen)) {
+	    if (!parser_tokadd_codepoint(parser, encp, regexp_literal, TRUE)) {
 		return 0;
 	    }
 	    while (ISSPACE(c = nextc())) last = c;
@@ -5831,19 +5827,12 @@ parser_tokadd_utf8(struct parser_params *parser, rb_encoding **encp,
 	nextc();
     }
     else {			/* handle \uxxxx form */
-	codepoint = scan_hex(lex_p, 4, &numlen);
-	if (numlen < 4) {
-	    yyerror("invalid Unicode escape");
-	    return 0;
-	}
-	if (!parser_tokadd_codepoint(parser, encp,
-				     string_literal, regexp_literal,
-				     codepoint, 4)) {
+	if (!parser_tokadd_codepoint(parser, encp, regexp_literal, FALSE)) {
 	    return 0;
 	}
     }
 
-    return codepoint;
+    return TRUE;
 }
 
 #define ESCAPE_CONTROL 1
@@ -6189,7 +6178,7 @@ parser_tokadd_string(struct parser_params *parser,
 		}
 		parser_tokadd_utf8(parser, &enc, 1,
 				   func & STR_FUNC_SYMBOL,
-                                   func & STR_FUNC_REGEXP);
+				   func & STR_FUNC_REGEXP);
 		if (has_nonascii && enc != *encp) {
 		    mixed_escape(beg, enc, *encp);
 		}
@@ -7556,13 +7545,8 @@ parse_qmark(struct parser_params *parser, int space_seen)
     else if (c == '\\') {
 	if (peek('u')) {
 	    nextc();
-	    c = parser_tokadd_utf8(parser, &enc, 0, 0, 0);
-	    if (0x80 <= c) {
-		tokaddmbc(c, enc);
-	    }
-	    else {
-		tokadd(c);
-	    }
+	    if (!parser_tokadd_utf8(parser, &enc, 0, 0, 0))
+		return 0;
 	}
 	else if (!lex_eol_p() && !(c = *lex_p, ISASCII(c))) {
 	    nextc();
-- 
cgit v1.2.3