From 3f9b0936aa846bdf6984019ca40bc629fe05d929 Mon Sep 17 00:00:00 2001
From: knu <knu@b2dd03c8-39d4-4d8f-98ff-823fe69b080e>
Date: Sat, 24 Nov 2012 18:46:15 +0000
Subject: String#{lines,chars,codepoints,bytes} now return an array.

* string.c (rb_str_each_line, rb_str_lines): String#lines now
  returns an array instead of an enumerator.  Passing a block is
  deprecated but still supported for backwards compatibility.
  Based on the patch by yhara. [Feature #6670]

* string.c (rb_str_each_char, rb_str_chars): Ditto for
  String#chars.

* string.c (rb_str_each_codepoint, rb_str_codepoints): Ditto for
  String#codepoints.

* string.c (rb_str_each_byte, rb_str_bytes): Ditto for
  String#bytes.

* NEWS: Add notes for the above changes.

git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@37838 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
---
 string.c | 397 +++++++++++++++++++++++++++++++++++++++++++++++----------------
 1 file changed, 298 insertions(+), 99 deletions(-)

(limited to 'string.c')
diff --git a/string.c b/string.c
index c63f59ad90..68a4e46bb6 100644
--- a/string.c
+++ b/string.c
@@ -6098,45 +6098,8 @@ rb_str_split(VALUE str, const char *sep0)
 }
 
 
-/*
- *  call-seq:
- *     str.each_line(separator=$/) {|substr| block }   -> str
- *     str.each_line(separator=$/)                     -> an_enumerator
- *
- *     str.lines(separator=$/) {|substr| block }       -> str
- *     str.lines(separator=$/)                         -> an_enumerator
- *
- *  Splits <i>str</i> using the supplied parameter as the record separator
- *  (<code>$/</code> by default), passing each substring in turn to the supplied
- *  block. If a zero-length record separator is supplied, the string is split
- *  into paragraphs delimited by multiple successive newlines.
- *
- *  If no block is given, an enumerator is returned instead.
- *
- *     print "Example one\n"
- *     "hello\nworld".each_line {|s| p s}
- *     print "Example two\n"
- *     "hello\nworld".each_line('l') {|s| p s}
- *     print "Example three\n"
- *     "hello\n\n\nworld".each_line('') {|s| p s}
- *
- *  <em>produces:</em>
- *
- *     Example one
- *     "hello\n"
- *     "world"
- *     Example two
- *     "hel"
- *     "l"
- *     "o\nworl"
- *     "d"
- *     Example three
- *     "hello\n\n\n"
- *     "world"
- */
-
 static VALUE
-rb_str_each_line(int argc, VALUE *argv, VALUE str)
+rb_str_enumerate_lines(int argc, VALUE *argv, VALUE str, int wantarray)
 {
     rb_encoding *enc;
     VALUE rs;
@@ -6146,6 +6109,7 @@ rb_str_each_line(int argc, VALUE *argv, VALUE str)
     VALUE line;
     int n;
     VALUE orig = str;
+    VALUE ary;
 
     if (argc == 0) {
 	rs = rb_rs;
@@ -6153,10 +6117,34 @@ rb_str_each_line(int argc, VALUE *argv, VALUE str)
     else {
 	rb_scan_args(argc, argv, "01", &rs);
     }
-    RETURN_ENUMERATOR(str, argc, argv);
+
+    if (rb_block_given_p()) {
+	if (wantarray) {
+#if 0 /* next major */
+	    rb_warn("given block not used");
+	    ary = rb_ary_new();
+#else
+	    rb_warning("passing a block to String#lines is deprecated");
+	    wantarray = 0;
+#endif
+	}
+    }
+    else {
+	if (wantarray)
+	    ary = rb_ary_new();
+	else
+	    RETURN_ENUMERATOR(str, argc, argv);
+    }
+
     if (NIL_P(rs)) {
-	rb_yield(str);
-	return orig;
+	if (wantarray) {
+	    rb_ary_push(ary, str);
+	    return ary;
+	}
+	else {
+	    rb_yield(str);
+	    return orig;
+	}
     }
     str = rb_str_new4(str);
     ptr = p = s = RSTRING_PTR(str);
@@ -6179,7 +6167,10 @@ rb_str_each_line(int argc, VALUE *argv, VALUE str)
 	    line = rb_str_new5(str, s, p - s);
 	    OBJ_INFECT(line, str);
 	    rb_enc_cr_str_copy_for_substr(line, str);
-	    rb_yield(line);
+	    if (wantarray)
+		rb_ary_push(ary, line);
+	    else
+		rb_yield(line);
 	    str_mod_check(str, ptr, len);
 	    s = p;
 	}
@@ -6215,7 +6206,10 @@ rb_str_each_line(int argc, VALUE *argv, VALUE str)
 	    line = rb_str_new5(str, s, p - s + (rslen ? rslen : n));
 	    OBJ_INFECT(line, str);
 	    rb_enc_cr_str_copy_for_substr(line, str);
-	    rb_yield(line);
+	    if (wantarray)
+		rb_ary_push(ary, line);
+	    else
+		rb_yield(line);
 	    str_mod_check(str, ptr, len);
 	    s = p + (rslen ? rslen : n);
 	}
@@ -6227,11 +6221,76 @@ rb_str_each_line(int argc, VALUE *argv, VALUE str)
 	line = rb_str_new5(str, s, pend - s);
 	OBJ_INFECT(line, str);
 	rb_enc_cr_str_copy_for_substr(line, str);
-	rb_yield(line);
+	if (wantarray)
+	    rb_ary_push(ary, line);
+	else
+	    rb_yield(line);
 	RB_GC_GUARD(str);
     }
 
-    return orig;
+    if (wantarray)
+	return ary;
+    else
+	return orig;
+}
+
+/*
+ *  call-seq:
+ *     str.each_line(separator=$/) {|substr| block }   -> str
+ *     str.each_line(separator=$/)                     -> an_enumerator
+ *
+ *  Splits <i>str</i> using the supplied parameter as the record
+ *  separator (<code>$/</code> by default), passing each substring in
+ *  turn to the supplied block.  If a zero-length record separator is
+ *  supplied, the string is split into paragraphs delimited by
+ *  multiple successive newlines.
+ *
+ *  If no block is given, an enumerator is returned instead.
+ *
+ *     print "Example one\n"
+ *     "hello\nworld".each_line {|s| p s}
+ *     print "Example two\n"
+ *     "hello\nworld".each_line('l') {|s| p s}
+ *     print "Example three\n"
+ *     "hello\n\n\nworld".each_line('') {|s| p s}
+ *
+ *  <em>produces:</em>
+ *
+ *     Example one
+ *     "hello\n"
+ *     "world"
+ *     Example two
+ *     "hel"
+ *     "l"
+ *     "o\nworl"
+ *     "d"
+ *     Example three
+ *     "hello\n\n\n"
+ *     "world"
+ */
+
+static VALUE
+rb_str_each_line(int argc, VALUE *argv, VALUE str)
+{
+    return rb_str_enumerate_lines(argc, argv, str, 0);
+}
+
+/*
+ *  call-seq:
+ *     str.lines(separator=$/)  -> an_array
+ *
+ *  Returns an array of lines in <i>str</i> split using the supplied
+ *  record separator (<code>$/</code> by default).  This is a
+ *  shorthand for <code>str.each_line(separator).to_a</code>.
+ *
+ *  If a block is given (a deprecated form), this method works the
+ *  same as <code>each_line</code>.
+ */
+
+static VALUE
+rb_str_lines(int argc, VALUE *argv, VALUE str)
+{
+    return rb_str_enumerate_lines(argc, argv, str, 1);
 }
 
 static VALUE
@@ -6240,16 +6299,49 @@ rb_str_each_byte_size(VALUE str, VALUE args)
     return LONG2FIX(RSTRING_LEN(str));
 }
 
+static VALUE
+rb_str_enumerate_bytes(VALUE str, int wantarray)
+{
+    long i;
+    VALUE ary;
+
+    if (rb_block_given_p()) {
+	if (wantarray) {
+#if 0 /* next major */
+	    rb_warn("given block not used");
+	    ary = rb_ary_new();
+#else
+	    rb_warning("passing a block to String#bytes is deprecated");
+	    wantarray = 0;
+#endif
+	}
+    }
+    else {
+	if (wantarray)
+	    ary = rb_ary_new2(RSTRING_LEN(str));
+	else
+	    RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_byte_size);
+    }
+
+    for (i=0; i<RSTRING_LEN(str); i++) {
+	if (wantarray)
+	    rb_ary_push(ary, INT2FIX(RSTRING_PTR(str)[i] & 0xff));
+	else
+	    rb_yield(INT2FIX(RSTRING_PTR(str)[i] & 0xff));
+    }
+    if (wantarray)
+	return ary;
+    else
+	return str;
+}
+
 /*
  *  call-seq:
- *     str.bytes {|fixnum| block }        -> str
- *     str.bytes                          -> an_enumerator
- *
  *     str.each_byte {|fixnum| block }    -> str
  *     str.each_byte                      -> an_enumerator
  *
- *  Passes each byte in <i>str</i> to the given block, or returns
- *  an enumerator if no block is given.
+ *  Passes each byte in <i>str</i> to the given block, or returns an
+ *  enumerator if no block is given.
  *
  *     "hello".each_byte {|c| print c, ' ' }
  *
@@ -6261,13 +6353,24 @@ rb_str_each_byte_size(VALUE str, VALUE args)
 static VALUE
 rb_str_each_byte(VALUE str)
 {
-    long i;
+    return rb_str_enumerate_bytes(str, 0);
+}
 
-    RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_byte_size);
-    for (i=0; i<RSTRING_LEN(str); i++) {
-	rb_yield(INT2FIX(RSTRING_PTR(str)[i] & 0xff));
-    }
-    return str;
+/*
+ *  call-seq:
+ *     str.bytes    -> an_array
+ *
+ *  Returns an array of bytes in <i>str</i>.  This is a shorthand for
+ *  <code>str.each_byte.to_a</code>.
+ *
+ *  If a block is given (a deprecated form), this method works the
+ *  same as <code>each_byte</code>.
+ */
+
+static VALUE
+rb_str_bytes(VALUE str)
+{
+    return rb_str_enumerate_bytes(str, 1);
 }
 
 static VALUE
@@ -6285,33 +6388,33 @@ rb_str_each_char_size(VALUE str)
     return LONG2FIX(len);
 }
 
-/*
- *  call-seq:
- *     str.chars {|cstr| block }        -> str
- *     str.chars                        -> an_enumerator
- *
- *     str.each_char {|cstr| block }    -> str
- *     str.each_char                    -> an_enumerator
- *
- *  Passes each character in <i>str</i> to the given block, or returns
- *  an enumerator if no block is given.
- *
- *     "hello".each_char {|c| print c, ' ' }
- *
- *  <em>produces:</em>
- *
- *     h e l l o
- */
-
 static VALUE
-rb_str_each_char(VALUE str)
+rb_str_enumerate_chars(VALUE str, int wantarray)
 {
     VALUE orig = str;
     long i, len, n;
     const char *ptr;
     rb_encoding *enc;
+    VALUE ary;
+
+    if (rb_block_given_p()) {
+	if (wantarray) {
+#if 0 /* next major */
+	    rb_warn("given block not used");
+	    ary = rb_ary_new();
+#else
+	    rb_warning("passing a block to String#chars is deprecated");
+	    wantarray = 0;
+#endif
+	}
+    }
+    else {
+	if (wantarray)
+	    ary = rb_ary_new();
+	else
+	    RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size);
+    }
 
-    RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size);
     str = rb_str_new4(str);
     ptr = RSTRING_PTR(str);
     len = RSTRING_LEN(str);
@@ -6321,63 +6424,159 @@ rb_str_each_char(VALUE str)
       case ENC_CODERANGE_7BIT:
 	for (i = 0; i < len; i += n) {
 	    n = rb_enc_fast_mbclen(ptr + i, ptr + len, enc);
-	    rb_yield(rb_str_subseq(str, i, n));
+	    if (wantarray)
+		rb_ary_push(ary, rb_str_subseq(str, i, n));
+	    else
+		rb_yield(rb_str_subseq(str, i, n));
 	}
 	break;
       default:
 	for (i = 0; i < len; i += n) {
 	    n = rb_enc_mbclen(ptr + i, ptr + len, enc);
-	    rb_yield(rb_str_subseq(str, i, n));
+	    if (wantarray)
+		rb_ary_push(ary, rb_str_subseq(str, i, n));
+	    else
+		rb_yield(rb_str_subseq(str, i, n));
 	}
     }
-    return orig;
+    if (wantarray)
+	return ary;
+    else
+	return orig;
 }
 
 /*
  *  call-seq:
- *     str.codepoints {|integer| block }        -> str
- *     str.codepoints                           -> an_enumerator
+ *     str.each_char {|cstr| block }    -> str
+ *     str.each_char                    -> an_enumerator
  *
- *     str.each_codepoint {|integer| block }    -> str
- *     str.each_codepoint                       -> an_enumerator
+ *  Passes each character in <i>str</i> to the given block, or returns
+ *  an enumerator if no block is given.
  *
- *  Passes the <code>Integer</code> ordinal of each character in <i>str</i>,
- *  also known as a <i>codepoint</i> when applied to Unicode strings to the
- *  given block.
+ *     "hello".each_char {|c| print c, ' ' }
  *
- *  If no block is given, an enumerator is returned instead.
+ *  <em>produces:</em>
  *
- *     "hello\u0639".each_codepoint {|c| print c, ' ' }
+ *     h e l l o
+ */
+
+static VALUE
+rb_str_each_char(VALUE str)
+{
+    return rb_str_enumerate_chars(str, 0);
+}
+
+/*
+ *  call-seq:
+ *     str.chars    -> an_array
  *
- *  <em>produces:</em>
+ *  Returns an array of characters in <i>str</i>.  This is a shorthand
+ *  for <code>str.each_char.to_a</code>.
  *
- *     104 101 108 108 111 1593
+ *  If a block is given (a deprecated form), this method works the
+ *  same as <code>each_char</code>.
  */
 
 static VALUE
-rb_str_each_codepoint(VALUE str)
+rb_str_chars(VALUE str)
+{
+    return rb_str_enumerate_chars(str, 1);
+}
+
+
+static VALUE
+rb_str_enumerate_codepoints(VALUE str, int wantarray)
 {
     VALUE orig = str;
     int n;
     unsigned int c;
     const char *ptr, *end;
     rb_encoding *enc;
+    VALUE ary;
+
+    if (single_byte_optimizable(str))
+	return rb_str_enumerate_bytes(str, wantarray);
+
+    if (rb_block_given_p()) {
+	if (wantarray) {
+#if 0 /* next major */
+	    rb_warn("given block not used");
+	    ary = rb_ary_new();
+#else
+	    rb_warning("passing a block to String#codepoints is deprecated");
+	    wantarray = 0;
+#endif
+	}
+    }
+    else {
+	if (wantarray)
+	    ary = rb_ary_new();
+	else
+	    RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size);
+    }
 
-    if (single_byte_optimizable(str)) return rb_str_each_byte(str);
-    RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size);
     str = rb_str_new4(str);
     ptr = RSTRING_PTR(str);
     end = RSTRING_END(str);
     enc = STR_ENC_GET(str);
     while (ptr < end) {
 	c = rb_enc_codepoint_len(ptr, end, &n, enc);
-	rb_yield(UINT2NUM(c));
+	if (wantarray)
+	    rb_ary_push(ary, UINT2NUM(c));
+	else
+	    rb_yield(UINT2NUM(c));
 	ptr += n;
     }
     RB_GC_GUARD(str);
-    return orig;
+    if (wantarray)
+	return ary;
+    else
+	return orig;
+}
+
+/*
+ *  call-seq:
+ *     str.each_codepoint {|integer| block }    -> str
+ *     str.each_codepoint                       -> an_enumerator
+ *
+ *  Passes the <code>Integer</code> ordinal of each character in
+ *  <i>str</i> (also known as a <i>codepoint</i> when applied to
+ *  Unicode strings) to the given block.
+ *
+ *  If no block is given, an enumerator is returned instead.
+ *
+ *     "hello\u0639".each_codepoint {|c| print c, ' ' }
+ *
+ *  <em>produces:</em>
+ *
+ *     104 101 108 108 111 1593
+ */
+
+static VALUE
+rb_str_each_codepoint(VALUE str)
+{
+    return rb_str_enumerate_codepoints(str, 0);
+}
+
+/*
+ *  call-seq:
+ *     str.codepoints   -> an_array
+ *
+ *  Returns an array of the <code>Integer</code> ordinals of the
+ *  characters in <i>str</i>.  This is a shorthand for
+ *  <code>str.each_codepoint.to_a</code>.
+ *
+ *  If a block is given (a deprecated form), this method works the
+ *  same as <code>each_codepoint</code>.
+ */
+
+static VALUE
+rb_str_codepoints(VALUE str)
+{
+    return rb_str_enumerate_codepoints(str, 1);
 }
 
+
 static long
 chopped_length(VALUE str)
 {
@@ -7994,10 +8193,10 @@ Init_String(void)
     rb_define_method(rb_cString, "hex", rb_str_hex, 0);
     rb_define_method(rb_cString, "oct", rb_str_oct, 0);
     rb_define_method(rb_cString, "split", rb_str_split_m, -1);
-    rb_define_method(rb_cString, "lines", rb_str_each_line, -1);
-    rb_define_method(rb_cString, "bytes", rb_str_each_byte, 0);
-    rb_define_method(rb_cString, "chars", rb_str_each_char, 0);
-    rb_define_method(rb_cString, "codepoints", rb_str_each_codepoint, 0);
+    rb_define_method(rb_cString, "lines", rb_str_lines, -1);
+    rb_define_method(rb_cString, "bytes", rb_str_bytes, 0);
+    rb_define_method(rb_cString, "chars", rb_str_chars, 0);
+    rb_define_method(rb_cString, "codepoints", rb_str_codepoints, 0);
     rb_define_method(rb_cString, "reverse", rb_str_reverse, 0);
     rb_define_method(rb_cString, "reverse!", rb_str_reverse_bang, 0);
     rb_define_method(rb_cString, "concat", rb_str_concat, 1);
-- 
cgit v1.2.3