From 3b6c4dad0c0e3e7a27fd1e89f9cf9579e9f53079 Mon Sep 17 00:00:00 2001
From: nobu <nobu@b2dd03c8-39d4-4d8f-98ff-823fe69b080e>
Date: Fri, 8 Apr 2016 17:05:12 +0000
Subject: bignum.c: rb_cstr_parse_inum

* bignum.c (rb_cstr_parse_inum): [EXPERIMENTAL] new function to
  parse an integer from a C string with an explicit length.  the
  name and the arguments may be changed in the future.
* bignum.c (rb_str_to_inum): preserve the encoding of the argument
  in error messages, and no longer copy non-terminated strings
  before parsing.
* bignum.c (rb_str2big_{poweroftwo,normal,karatsuba,gmp}): ditto.

git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@54518 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
---
 ChangeLog                 |  12 +++
 bignum.c                  | 245 +++++++++++++++++++++++++++++-----------------
 internal.h                |   1 +
 test/ruby/test_integer.rb |   2 +
 4 files changed, 169 insertions(+), 91 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index 7cddd7ce23..b3b4d4d8da 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,15 @@
+Sat Apr  9 02:05:10 2016  Nobuyoshi Nakada  <nobu@ruby-lang.org>
+
+	* bignum.c (rb_cstr_parse_inum): [EXPERIMENTAL] new function to
+	  parse integer in C-string with length.  the name and the
+	  arguments may be changed in the future.
+
+	* bignum.c (rb_str_to_inum): preserve encoding of the argument in
+	  error messages, and no longer needs to copy non-terminated
+	  strings.
+
+	* bignum.c (rb_str2big_{poweroftwo,normal,karatsuba,gmp}): ditto.
+
 Thu Apr  7 19:04:03 2016  Nobuyoshi Nakada  <nobu@ruby-lang.org>
 
 	* doc/regexp.rdoc (comments): [DOC] terminators cannot appear in
diff --git a/bignum.c b/bignum.c
index d6ebeb5ed5..a6d669e887 100644
--- a/bignum.c
+++ b/bignum.c
@@ -3686,6 +3686,7 @@ rb_integer_unpack(const void *words, size_t numwords, size_t wordsize, size_t na
 #define conv_digit(c) (ruby_digit36_to_number_table[(unsigned char)(c)])
 
 NORETURN(static inline void invalid_radix(int base));
+NORETURN(static inline void invalid_integer(VALUE s));
 
 static inline int
 valid_radix_p(int base)
@@ -3699,16 +3700,29 @@ invalid_radix(int base)
     rb_raise(rb_eArgError, "invalid radix %d", base);
 }
 
-static void
-str2big_scan_digits(const char *s, const char *str, int base, int badcheck, size_t *num_digits_p, size_t *len_p)
+static inline void
+invalid_integer(VALUE s)
+{
+    rb_raise(rb_eArgError, "invalid value for Integer(): %+"PRIsVALUE, s);
+}
+
+static int
+str2big_scan_digits(const char *s, const char *str, int base, int badcheck, size_t *num_digits_p, ssize_t *len_p)
 {
     char nondigit = 0;
     size_t num_digits = 0;
     const char *digits_start = str;
     const char *digits_end = str;
+    ssize_t len = *len_p;
 
     int c;
 
+    if (!len) {
+	*num_digits_p = 0;
+	*len_p = 0;
+	return TRUE;
+    }
+
     if (badcheck && *str == '_') goto bad;
 
     while ((c = *str++) != 0) {
@@ -3718,27 +3732,32 @@ str2big_scan_digits(const char *s, const char *str, int base, int badcheck, size
 		break;
 	    }
 	    nondigit = (char) c;
-	    continue;
 	}
-	else if ((c = conv_digit(c)) < 0) {
+	else if ((c = conv_digit(c)) < 0 || c >= base) {
 	    break;
 	}
-	if (c >= base) break;
-	nondigit = 0;
-        num_digits++;
-        digits_end = str;
+	else {
+	    nondigit = 0;
+	    num_digits++;
+	    digits_end = str;
+	}
+	if (len > 0 && !--len) break;
     }
-    if (badcheck) {
+    if (badcheck && nondigit) goto bad;
+    if (badcheck && len) {
 	str--;
-	if (s+1 < str && str[-1] == '_') goto bad;
-	while (*str && ISSPACE(*str)) str++;
-	if (*str) {
+	while (*str && ISSPACE(*str)) {
+	    str++;
+	    if (len > 0 && !--len) break;
+	}
+	if (len && *str) {
 	  bad:
-	    rb_invalid_str(s, "Integer()");
+	    return FALSE;
 	}
     }
     *num_digits_p = num_digits;
     *len_p = digits_end - digits_start;
+    return TRUE;
 }
 
 static VALUE
@@ -3972,56 +3991,92 @@ str2big_gmp(
 VALUE
 rb_cstr_to_inum(const char *str, int base, int badcheck)
 {
-    const char *s = str;
+    char *end;
+    VALUE ret = rb_cstr_parse_inum(str, -1, (badcheck ? NULL : &end), base);
+    if (NIL_P(ret)) {
+	if (badcheck) rb_invalid_str(str, "Integer()");
+	ret = INT2FIX(0);
+    }
+    return ret;
+}
+
+/*
+ * Parse +str+ as a Ruby Integer literal, accepting underscores and radix prefixes (0b, 0o, 0d, 0x).
+ *
+ * str:  pointer to the string to be parsed.
+ *       should be NUL-terminated if +len+ is negative.
+ * len:  length of +str+ if >= 0.  if +len+ is negative, +str+ should
+ *       be NUL-terminated.
+ * endp: if non-NULL, the address after parsed part is stored.  if
+ *       NULL, Qnil is returned when +str+ is not valid as an Integer.
+ * base: see +rb_cstr_to_inum+
+ */
+
+VALUE
+rb_cstr_parse_inum(const char *str, ssize_t len, char **endp, int base)
+{
+    const char *const s = str;
     char sign = 1;
     int c;
     VALUE z;
 
-    int bits_per_digit;
+    unsigned long val;
+    int ov;
 
     const char *digits_start, *digits_end;
     size_t num_digits;
     size_t num_bdigits;
-    size_t len;
+    const ssize_t len0 = len;
+    const int badcheck = !endp;
+
+#define ADV(n) do {\
+	if (len > 0 && len <= (n)) goto bad; \
+	str += (n); \
+	len -= (n); \
+    } while (0)
+#define ASSERT_LEN() do {\
+	assert(len != 0); \
+	if (len0 >= 0) assert(s + len0 == str + len); \
+    } while (0)
 
     if (!str) {
-	if (badcheck) {
-          bad:
-            rb_invalid_str(s, "Integer()");
-        }
-	return INT2FIX(0);
+      bad:
+	if (endp) *endp = (char *)str;
+	return Qnil;
     }
-    while (ISSPACE(*str)) str++;
+    if (len) {
+	while (ISSPACE(*str)) ADV(1);
 
-    if (str[0] == '+') {
-	str++;
-    }
-    else if (str[0] == '-') {
-	str++;
-	sign = 0;
-    }
-    if (str[0] == '+' || str[0] == '-') {
-	if (badcheck) goto bad;
-	return INT2FIX(0);
+	if (str[0] == '+') {
+	    ADV(1);
+	}
+	else if (str[0] == '-') {
+	    ADV(1);
+	    sign = 0;
+	}
+	ASSERT_LEN();
+	if (str[0] == '+' || str[0] == '-') {
+	    goto bad;
+	}
     }
     if (base <= 0) {
-	if (str[0] == '0') {
+	if (str[0] == '0' && len > 1) {
 	    switch (str[1]) {
 	      case 'x': case 'X':
 		base = 16;
-                str += 2;
+		ADV(2);
 		break;
 	      case 'b': case 'B':
 		base = 2;
-                str += 2;
+		ADV(2);
 		break;
 	      case 'o': case 'O':
 		base = 8;
-                str += 2;
+		ADV(2);
 		break;
 	      case 'd': case 'D':
 		base = 10;
-                str += 2;
+		ADV(2);
 		break;
 	      default:
 		base = 8;
@@ -4034,31 +4089,36 @@ rb_cstr_to_inum(const char *str, int base, int badcheck)
 	    base = 10;
 	}
     }
+    else if (len == 1) {
+	/* no prefix */
+    }
     else if (base == 2) {
 	if (str[0] == '0' && (str[1] == 'b'||str[1] == 'B')) {
-	    str += 2;
+	    ADV(2);
 	}
     }
     else if (base == 8) {
 	if (str[0] == '0' && (str[1] == 'o'||str[1] == 'O')) {
-	    str += 2;
+	    ADV(2);
 	}
     }
     else if (base == 10) {
 	if (str[0] == '0' && (str[1] == 'd'||str[1] == 'D')) {
-	    str += 2;
+	    ADV(2);
 	}
     }
     else if (base == 16) {
 	if (str[0] == '0' && (str[1] == 'x'||str[1] == 'X')) {
-	    str += 2;
+	    ADV(2);
 	}
     }
     if (!valid_radix_p(base)) {
         invalid_radix(base);
     }
-    if (*str == '0') {		/* squeeze preceding 0s */
+    if (!len) goto bad;
+    if (*str == '0' && len != 1) { /* squeeze preceding 0s */
 	int us = 0;
+	const char *end = len < 0 ? NULL : str + len;
 	while ((c = *++str) == '0' || c == '_') {
 	    if (c == '_') {
 		if (++us >= 2)
@@ -4067,26 +4127,29 @@ rb_cstr_to_inum(const char *str, int base, int badcheck)
 	    else {
 		us = 0;
 	    }
+	    if (str == end) break;
 	}
-	if (!(c = *str) || ISSPACE(c)) --str;
+	if (!c || ISSPACE(c)) --str;
+	if (end) len = end - str;
+	ASSERT_LEN();
     }
     c = *str;
     c = conv_digit(c);
     if (c < 0 || c >= base) {
-	if (badcheck) goto bad;
-	return INT2FIX(0);
+	goto bad;
     }
 
-    bits_per_digit = bit_length(base-1);
-    if (bits_per_digit * strlen(str) <= sizeof(long) * CHAR_BIT) {
-        char *end;
-	unsigned long val = STRTOUL(str, &end, base);
-
-	if (str < end && *end == '_') goto bigparse;
+    val = ruby_scan_digits(str, len, base, &num_digits, &ov);
+    if (!ov) {
+	const char *end = &str[num_digits];
+	if (num_digits > 0 && *end == '_') goto bigparse;
+	if (endp) *endp = (char *)end;
 	if (badcheck) {
-	    if (end == str) goto bad; /* no number */
-	    while (*end && ISSPACE(*end)) end++;
-	    if (*end) goto bad;	      /* trailing garbage */
+	    if (num_digits == 0) return Qnil; /* no number */
+	    while (len < 0 ? *end : end < str + len) {
+		if (!ISSPACE(*end)) return Qnil; /* trailing garbage */
+		end++;
+	    }
 	}
 
 	if (POSFIXABLE(val)) {
@@ -4105,12 +4168,13 @@ rb_cstr_to_inum(const char *str, int base, int badcheck)
 
   bigparse:
     digits_start = str;
-    str2big_scan_digits(s, str, base, badcheck, &num_digits, &len);
+    if (!str2big_scan_digits(s, str, base, badcheck, &num_digits, &len))
+	goto bad;
     digits_end = digits_start + len;
 
     if (POW2_P(base)) {
         z = str2big_poweroftwo(sign, digits_start, digits_end, num_digits,
-                bits_per_digit);
+			       bit_length(base-1));
     }
     else {
         int digits_per_bdigits_dbl;
@@ -4140,32 +4204,19 @@ rb_cstr_to_inum(const char *str, int base, int badcheck)
 VALUE
 rb_str_to_inum(VALUE str, int base, int badcheck)
 {
-    char *s;
-    long len;
-    VALUE v = 0;
     VALUE ret;
+    const char *s;
+    long len;
+    char *end;
 
     StringValue(str);
     rb_must_asciicompat(str);
-    if (badcheck) {
-	s = StringValueCStr(str);
-    }
-    else {
-	s = RSTRING_PTR(str);
+    RSTRING_GETMEM(str, s, len);
+    ret = rb_cstr_parse_inum(s, len, (badcheck ? NULL : &end), base);
+    if (NIL_P(ret)) {
+	if (badcheck) invalid_integer(str);
+	ret = INT2FIX(0);
     }
-    if (s) {
-	len = RSTRING_LEN(str);
-	if (s[len]) {		/* no sentinel somehow */
-	    char *p = ALLOCV(v, len+1);
-
-	    MEMCPY(p, s, char, len);
-	    p[len] = '\0';
-	    s = p;
-	}
-    }
-    ret = rb_cstr_to_inum(s, base, badcheck);
-    if (v)
-	ALLOCV_END(v);
     return ret;
 }
 
@@ -4176,7 +4227,7 @@ rb_str2big_poweroftwo(VALUE arg, int base, int badcheck)
     const char *s, *str;
     const char *digits_start, *digits_end;
     size_t num_digits;
-    size_t len;
+    ssize_t len;
     VALUE z;
 
     if (!valid_radix_p(base) || !POW2_P(base)) {
@@ -4185,13 +4236,16 @@ rb_str2big_poweroftwo(VALUE arg, int base, int badcheck)
 
     rb_must_asciicompat(arg);
     s = str = StringValueCStr(arg);
+    len = RSTRING_LEN(arg);
     if (*str == '-') {
+	len--;
         str++;
         positive_p = 0;
     }
 
     digits_start = str;
-    str2big_scan_digits(s, str, base, badcheck, &num_digits, &len);
+    if (!str2big_scan_digits(s, str, base, badcheck, &num_digits, &len))
+	invalid_integer(arg);
     digits_end = digits_start + len;
 
     z = str2big_poweroftwo(positive_p, digits_start, digits_end, num_digits,
@@ -4209,7 +4263,7 @@ rb_str2big_normal(VALUE arg, int base, int badcheck)
     const char *s, *str;
     const char *digits_start, *digits_end;
     size_t num_digits;
-    size_t len;
+    ssize_t len;
     VALUE z;
 
     int digits_per_bdigits_dbl;
@@ -4220,14 +4274,17 @@ rb_str2big_normal(VALUE arg, int base, int badcheck)
     }
 
     rb_must_asciicompat(arg);
-    s = str = StringValueCStr(arg);
-    if (*str == '-') {
+    s = str = StringValuePtr(arg);
+    len = RSTRING_LEN(arg);
+    if (len > 0 && *str == '-') {
+	len--;
         str++;
         positive_p = 0;
     }
 
     digits_start = str;
-    str2big_scan_digits(s, str, base, badcheck, &num_digits, &len);
+    if (!str2big_scan_digits(s, str, base, badcheck, &num_digits, &len))
+	invalid_integer(arg);
     digits_end = digits_start + len;
 
     maxpow_in_bdigit_dbl(base, &digits_per_bdigits_dbl);
@@ -4248,7 +4305,7 @@ rb_str2big_karatsuba(VALUE arg, int base, int badcheck)
     const char *s, *str;
     const char *digits_start, *digits_end;
     size_t num_digits;
-    size_t len;
+    ssize_t len;
     VALUE z;
 
     int digits_per_bdigits_dbl;
@@ -4259,14 +4316,17 @@ rb_str2big_karatsuba(VALUE arg, int base, int badcheck)
     }
 
     rb_must_asciicompat(arg);
-    s = str = StringValueCStr(arg);
-    if (*str == '-') {
+    s = str = StringValuePtr(arg);
+    len = RSTRING_LEN(arg);
+    if (len > 0 && *str == '-') {
+	len--;
         str++;
         positive_p = 0;
     }
 
     digits_start = str;
-    str2big_scan_digits(s, str, base, badcheck, &num_digits, &len);
+    if (!str2big_scan_digits(s, str, base, badcheck, &num_digits, &len))
+	invalid_integer(arg);
     digits_end = digits_start + len;
 
     maxpow_in_bdigit_dbl(base, &digits_per_bdigits_dbl);
@@ -4288,7 +4348,7 @@ rb_str2big_gmp(VALUE arg, int base, int badcheck)
     const char *s, *str;
     const char *digits_start, *digits_end;
     size_t num_digits;
-    size_t len;
+    ssize_t len;
     VALUE z;
 
     int digits_per_bdigits_dbl;
@@ -4299,14 +4359,17 @@ rb_str2big_gmp(VALUE arg, int base, int badcheck)
     }
 
     rb_must_asciicompat(arg);
-    s = str = StringValueCStr(arg);
-    if (*str == '-') {
+    s = str = StringValuePtr(arg);
+    len = RSTRING_LEN(arg);
+    if (len > 0 && *str == '-') {
+	len--;
         str++;
         positive_p = 0;
     }
 
     digits_start = str;
-    str2big_scan_digits(s, str, base, badcheck, &num_digits, &len);
+    if (!str2big_scan_digits(s, str, base, badcheck, &num_digits, &len))
+	invalid_integer(arg);
     digits_end = digits_start + len;
 
     maxpow_in_bdigit_dbl(base, &digits_per_bdigits_dbl);
diff --git a/internal.h b/internal.h
index 3970431fcb..86a0852cc1 100644
--- a/internal.h
+++ b/internal.h
@@ -782,6 +782,7 @@ VALUE rb_big_odd_p(VALUE);
 VALUE rb_big_even_p(VALUE);
 VALUE rb_integer_float_cmp(VALUE x, VALUE y);
 VALUE rb_integer_float_eq(VALUE x, VALUE y);
+VALUE rb_cstr_parse_inum(const char *str, ssize_t len, char **endp, int base);
 
 /* class.c */
 VALUE rb_class_boot(VALUE);
diff --git a/test/ruby/test_integer.rb b/test/ruby/test_integer.rb
index 3f5fca75d7..a82562f887 100644
--- a/test/ruby/test_integer.rb
+++ b/test/ruby/test_integer.rb
@@ -98,6 +98,8 @@ class TestInteger < Test::Unit::TestCase
     assert_raise(Encoding::CompatibilityError, bug6192) {Integer("0".encode("utf-32be"))}
     assert_raise(Encoding::CompatibilityError, bug6192) {Integer("0".encode("utf-32le"))}
     assert_raise(Encoding::CompatibilityError, bug6192) {Integer("0".encode("iso-2022-jp"))}
+
+    assert_raise_with_message(ArgumentError, /\u{1f4a1}/) {Integer("\u{1f4a1}")}
   end
 
   def test_int_p
-- 
cgit v1.2.3