From 3b6c4dad0c0e3e7a27fd1e89f9cf9579e9f53079 Mon Sep 17 00:00:00 2001 From: nobu Date: Fri, 8 Apr 2016 17:05:12 +0000 Subject: bignum.c: rb_cstr_parse_inum * bignum.c (rb_cstr_parse_inum): [EXPERIMENTAL] new function to parse integer in C-string with length. the name and the arguments may be changed in the future. * bignum.c (rb_str_to_inum): preserve encoding of the argument in error messages, and no longer needs to copy non-terminated strings. * bignum.c (rb_str2big_{poweroftwo,normal,karatsuba,gmp}): ditto. git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@54518 b2dd03c8-39d4-4d8f-98ff-823fe69b080e --- ChangeLog | 12 +++ bignum.c | 245 +++++++++++++++++++++++++++++----------------- internal.h | 1 + test/ruby/test_integer.rb | 2 + 4 files changed, 169 insertions(+), 91 deletions(-) diff --git a/ChangeLog b/ChangeLog index 7cddd7ce23..b3b4d4d8da 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,15 @@ +Sat Apr 9 02:05:10 2016 Nobuyoshi Nakada + + * bignum.c (rb_cstr_parse_inum): [EXPERIMENTAL] new function to + parse integer in C-string with length. the name and the + arguments may be changed in the future. + + * bignum.c (rb_str_to_inum): preserve encoding of the argument in + error messages, and no longer needs to copy non-terminated + strings. + + * bignum.c (rb_str2big_{poweroftwo,normal,karatsuba,gmp}): ditto. + Thu Apr 7 19:04:03 2016 Nobuyoshi Nakada * doc/regexp.rdoc (comments): [DOC] terminators cannot appear in diff --git a/bignum.c b/bignum.c index d6ebeb5ed5..a6d669e887 100644 --- a/bignum.c +++ b/bignum.c @@ -3686,6 +3686,7 @@ rb_integer_unpack(const void *words, size_t numwords, size_t wordsize, size_t na #define conv_digit(c) (ruby_digit36_to_number_table[(unsigned char)(c)]) NORETURN(static inline void invalid_radix(int base)); +NORETURN(static inline void invalid_integer(VALUE s)); static inline int valid_radix_p(int base) @@ -3699,16 +3700,29 @@ invalid_radix(int base) rb_raise(rb_eArgError, "invalid radix %d", base); } -static void -str2big_scan_digits(const char *s, const char *str, int base, int badcheck, size_t *num_digits_p, size_t *len_p) +static inline void +invalid_integer(VALUE s) +{ + rb_raise(rb_eArgError, "invalid value for Integer(): %+"PRIsVALUE, s); +} + +static int +str2big_scan_digits(const char *s, const char *str, int base, int badcheck, size_t *num_digits_p, ssize_t *len_p) { char nondigit = 0; size_t num_digits = 0; const char *digits_start = str; const char *digits_end = str; + ssize_t len = *len_p; int c; + if (!len) { + *num_digits_p = 0; + *len_p = 0; + return TRUE; + } + if (badcheck && *str == '_') goto bad; while ((c = *str++) != 0) { @@ -3718,27 +3732,32 @@ str2big_scan_digits(const char *s, const char *str, int base, int badcheck, size break; } nondigit = (char) c; - continue; } - else if ((c = conv_digit(c)) < 0) { + else if ((c = conv_digit(c)) < 0 || c >= base) { break; } - if (c >= base) break; - nondigit = 0; - num_digits++; - digits_end = str; + else { + nondigit = 0; + num_digits++; + digits_end = str; + } + if (len > 0 && !--len) break; } - if (badcheck) { + if (badcheck && nondigit) goto bad; + if (badcheck && len) { str--; - if (s+1 < str && str[-1] == '_') goto bad; - while (*str && ISSPACE(*str)) str++; - if (*str) { + while (*str && ISSPACE(*str)) { + str++; + if (len > 0 && !--len) break; + } + if (len && *str) { bad: - rb_invalid_str(s, "Integer()"); + return FALSE; } } *num_digits_p = num_digits; *len_p = digits_end - digits_start; + return TRUE; } static VALUE @@ -3972,56 +3991,92 @@ str2big_gmp( VALUE rb_cstr_to_inum(const char *str, int base, int badcheck) { - const char *s = str; + char *end; + VALUE ret = rb_cstr_parse_inum(str, -1, (badcheck ? NULL : &end), base); + if (NIL_P(ret)) { + if (badcheck) rb_invalid_str(str, "Integer()"); + ret = INT2FIX(0); + } + return ret; +} + +/* + * Parse +str+ as Ruby Integer, i.e., underscores, 0d and 0b prefixes. + * + * str: pointer to the string to be parsed. + * should be NUL-terminated if +len+ is negative. + * len: length of +str+ if >= 0. if +len+ is negative, +str+ should + * be NUL-terminated. + * endp: if non-NULL, the address after parsed part is stored. if + * NULL, Qnil is returned when +str+ is not valid as an Integer. + * base: see +rb_cstr_to_inum+ + */ + +VALUE +rb_cstr_parse_inum(const char *str, ssize_t len, char **endp, int base) +{ + const char *const s = str; char sign = 1; int c; VALUE z; - int bits_per_digit; + unsigned long val; + int ov; const char *digits_start, *digits_end; size_t num_digits; size_t num_bdigits; - size_t len; + const ssize_t len0 = len; + const int badcheck = !endp; + +#define ADV(n) do {\ + if (len > 0 && len <= (n)) goto bad; \ + str += (n); \ + len -= (n); \ + } while (0) +#define ASSERT_LEN() do {\ + assert(len != 0); \ + if (len0 >= 0) assert(s + len0 == str + len); \ + } while (0) if (!str) { - if (badcheck) { - bad: - rb_invalid_str(s, "Integer()"); - } - return INT2FIX(0); + bad: + if (endp) *endp = (char *)str; + return Qnil; } - while (ISSPACE(*str)) str++; + if (len) { + while (ISSPACE(*str)) ADV(1); - if (str[0] == '+') { - str++; - } - else if (str[0] == '-') { - str++; - sign = 0; - } - if (str[0] == '+' || str[0] == '-') { - if (badcheck) goto bad; - return INT2FIX(0); + if (str[0] == '+') { + ADV(1); + } + else if (str[0] == '-') { + ADV(1); + sign = 0; + } + ASSERT_LEN(); + if (str[0] == '+' || str[0] == '-') { + goto bad; + } } if (base <= 0) { - if (str[0] == '0') { + if (str[0] == '0' && len > 1) { switch (str[1]) { case 'x': case 'X': base = 16; - str += 2; + ADV(2); break; case 'b': case 'B': base = 2; - str += 2; + ADV(2); break; case 'o': case 'O': base = 8; - str += 2; + ADV(2); break; case 'd': case 'D': base = 10; - str += 2; + ADV(2); break; default: base = 8; @@ -4034,31 +4089,36 @@ rb_cstr_to_inum(const char *str, int base, int badcheck) base = 10; } } + else if (len == 1) { + /* no prefix */ + } else if (base == 2) { if (str[0] == '0' && (str[1] == 'b'||str[1] == 'B')) { - str += 2; + ADV(2); } } else if (base == 8) { if (str[0] == '0' && (str[1] == 'o'||str[1] == 'O')) { - str += 2; + ADV(2); } } else if (base == 10) { if (str[0] == '0' && (str[1] == 'd'||str[1] == 'D')) { - str += 2; + ADV(2); } } else if (base == 16) { if (str[0] == '0' && (str[1] == 'x'||str[1] == 'X')) { - str += 2; + ADV(2); } } if (!valid_radix_p(base)) { invalid_radix(base); } - if (*str == '0') { /* squeeze preceding 0s */ + if (!len) goto bad; + if (*str == '0' && len != 1) { /* squeeze preceding 0s */ int us = 0; + const char *end = len < 0 ? NULL : str + len; while ((c = *++str) == '0' || c == '_') { if (c == '_') { if (++us >= 2) @@ -4067,26 +4127,29 @@ rb_cstr_to_inum(const char *str, int base, int badcheck) else { us = 0; } + if (str == end) break; } - if (!(c = *str) || ISSPACE(c)) --str; + if (!c || ISSPACE(c)) --str; + if (end) len = end - str; + ASSERT_LEN(); } c = *str; c = conv_digit(c); if (c < 0 || c >= base) { - if (badcheck) goto bad; - return INT2FIX(0); + goto bad; } - bits_per_digit = bit_length(base-1); - if (bits_per_digit * strlen(str) <= sizeof(long) * CHAR_BIT) { - char *end; - unsigned long val = STRTOUL(str, &end, base); - - if (str < end && *end == '_') goto bigparse; + val = ruby_scan_digits(str, len, base, &num_digits, &ov); + if (!ov) { + const char *end = &str[num_digits]; + if (num_digits > 0 && *end == '_') goto bigparse; + if (endp) *endp = (char *)end; if (badcheck) { - if (end == str) goto bad; /* no number */ - while (*end && ISSPACE(*end)) end++; - if (*end) goto bad; /* trailing garbage */ + if (num_digits == 0) return Qnil; /* no number */ + while (len < 0 ? *end : end < str + len) { + if (!ISSPACE(*end)) return Qnil; /* trailing garbage */ + end++; + } } if (POSFIXABLE(val)) { @@ -4105,12 +4168,13 @@ rb_cstr_to_inum(const char *str, int base, int badcheck) bigparse: digits_start = str; - str2big_scan_digits(s, str, base, badcheck, &num_digits, &len); + if (!str2big_scan_digits(s, str, base, badcheck, &num_digits, &len)) + goto bad; digits_end = digits_start + len; if (POW2_P(base)) { z = str2big_poweroftwo(sign, digits_start, digits_end, num_digits, - bits_per_digit); + bit_length(base-1)); } else { int digits_per_bdigits_dbl; @@ -4140,32 +4204,19 @@ rb_cstr_to_inum(const char *str, int base, int badcheck) VALUE rb_str_to_inum(VALUE str, int base, int badcheck) { - char *s; - long len; - VALUE v = 0; VALUE ret; + const char *s; + long len; + char *end; StringValue(str); rb_must_asciicompat(str); - if (badcheck) { - s = StringValueCStr(str); - } - else { - s = RSTRING_PTR(str); + RSTRING_GETMEM(str, s, len); + ret = rb_cstr_parse_inum(s, len, (badcheck ? NULL : &end), base); + if (NIL_P(ret)) { + if (badcheck) invalid_integer(str); + ret = INT2FIX(0); } - if (s) { - len = RSTRING_LEN(str); - if (s[len]) { /* no sentinel somehow */ - char *p = ALLOCV(v, len+1); - - MEMCPY(p, s, char, len); - p[len] = '\0'; - s = p; - } - } - ret = rb_cstr_to_inum(s, base, badcheck); - if (v) - ALLOCV_END(v); return ret; } @@ -4176,7 +4227,7 @@ rb_str2big_poweroftwo(VALUE arg, int base, int badcheck) const char *s, *str; const char *digits_start, *digits_end; size_t num_digits; - size_t len; + ssize_t len; VALUE z; if (!valid_radix_p(base) || !POW2_P(base)) { @@ -4185,13 +4236,16 @@ rb_str2big_poweroftwo(VALUE arg, int base, int badcheck) rb_must_asciicompat(arg); s = str = StringValueCStr(arg); + len = RSTRING_LEN(arg); if (*str == '-') { + len--; str++; positive_p = 0; } digits_start = str; - str2big_scan_digits(s, str, base, badcheck, &num_digits, &len); + if (!str2big_scan_digits(s, str, base, badcheck, &num_digits, &len)) + invalid_integer(arg); digits_end = digits_start + len; z = str2big_poweroftwo(positive_p, digits_start, digits_end, num_digits, @@ -4209,7 +4263,7 @@ rb_str2big_normal(VALUE arg, int base, int badcheck) const char *s, *str; const char *digits_start, *digits_end; size_t num_digits; - size_t len; + ssize_t len; VALUE z; int digits_per_bdigits_dbl; @@ -4220,14 +4274,17 @@ rb_str2big_normal(VALUE arg, int base, int badcheck) } rb_must_asciicompat(arg); - s = str = StringValueCStr(arg); - if (*str == '-') { + s = str = StringValuePtr(arg); + len = RSTRING_LEN(arg); + if (len > 0 && *str == '-') { + len--; str++; positive_p = 0; } digits_start = str; - str2big_scan_digits(s, str, base, badcheck, &num_digits, &len); + if (!str2big_scan_digits(s, str, base, badcheck, &num_digits, &len)) + invalid_integer(arg); digits_end = digits_start + len; maxpow_in_bdigit_dbl(base, &digits_per_bdigits_dbl); @@ -4248,7 +4305,7 @@ rb_str2big_karatsuba(VALUE arg, int base, int badcheck) const char *s, *str; const char *digits_start, *digits_end; size_t num_digits; - size_t len; + ssize_t len; VALUE z; int digits_per_bdigits_dbl; @@ -4259,14 +4316,17 @@ rb_str2big_karatsuba(VALUE arg, int base, int badcheck) } rb_must_asciicompat(arg); - s = str = StringValueCStr(arg); - if (*str == '-') { + s = str = StringValuePtr(arg); + len = RSTRING_LEN(arg); + if (len > 0 && *str == '-') { + len--; str++; positive_p = 0; } digits_start = str; - str2big_scan_digits(s, str, base, badcheck, &num_digits, &len); + if (!str2big_scan_digits(s, str, base, badcheck, &num_digits, &len)) + invalid_integer(arg); digits_end = digits_start + len; maxpow_in_bdigit_dbl(base, &digits_per_bdigits_dbl); @@ -4288,7 +4348,7 @@ rb_str2big_gmp(VALUE arg, int base, int badcheck) const char *s, *str; const char *digits_start, *digits_end; size_t num_digits; - size_t len; + ssize_t len; VALUE z; int digits_per_bdigits_dbl; @@ -4299,14 +4359,17 @@ rb_str2big_gmp(VALUE arg, int base, int badcheck) } rb_must_asciicompat(arg); - s = str = StringValueCStr(arg); - if (*str == '-') { + s = str = StringValuePtr(arg); + len = RSTRING_LEN(arg); + if (len > 0 && *str == '-') { + len--; str++; positive_p = 0; } digits_start = str; - str2big_scan_digits(s, str, base, badcheck, &num_digits, &len); + if (!str2big_scan_digits(s, str, base, badcheck, &num_digits, &len)) + invalid_integer(arg); digits_end = digits_start + len; maxpow_in_bdigit_dbl(base, &digits_per_bdigits_dbl); diff --git a/internal.h b/internal.h index 3970431fcb..86a0852cc1 100644 --- a/internal.h +++ b/internal.h @@ -782,6 +782,7 @@ VALUE rb_big_odd_p(VALUE); VALUE rb_big_even_p(VALUE); VALUE rb_integer_float_cmp(VALUE x, VALUE y); VALUE rb_integer_float_eq(VALUE x, VALUE y); +VALUE rb_cstr_parse_inum(const char *str, ssize_t len, char **endp, int base); /* class.c */ VALUE rb_class_boot(VALUE); diff --git a/test/ruby/test_integer.rb b/test/ruby/test_integer.rb index 3f5fca75d7..a82562f887 100644 --- a/test/ruby/test_integer.rb +++ b/test/ruby/test_integer.rb @@ -98,6 +98,8 @@ class TestInteger < Test::Unit::TestCase assert_raise(Encoding::CompatibilityError, bug6192) {Integer("0".encode("utf-32be"))} assert_raise(Encoding::CompatibilityError, bug6192) {Integer("0".encode("utf-32le"))} assert_raise(Encoding::CompatibilityError, bug6192) {Integer("0".encode("iso-2022-jp"))} + + assert_raise_with_message(ArgumentError, /\u{1f4a1}/) {Integer("\u{1f4a1}")} end def test_int_p -- cgit v1.2.3