aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--io.c7
-rw-r--r--string.c120
-rw-r--r--transcode.c101
3 files changed, 131 insertions, 97 deletions
diff --git a/io.c b/io.c
index efe37ca835..19becbd181 100644
--- a/io.c
+++ b/io.c
@@ -8943,6 +8943,13 @@ rb_io_make_open_file(VALUE obj)
* fd = IO.sysopen(path) # => 3
* IO.new(fd) # => #<IO:fd 3>
*
+ * The new \IO object does not inherit encoding
+ * (because the integer file descriptor does not have an encoding):
+ *
+ * fd = IO.sysopen('t.rus', 'rb')
+ * io = IO.new(fd)
+ * io.external_encoding # => #<Encoding:UTF-8> # Not ASCII-8BIT.
+ *
* Optional argument +mode+ (defaults to 'r') must specify a valid mode;
* see IO@Modes:
*
diff --git a/string.c b/string.c
index 986eee945c..0fdde85b17 100644
--- a/string.c
+++ b/string.c
@@ -6670,7 +6670,6 @@ rb_str_escape(VALUE str)
* and with special characters escaped:
*
* s = "foo\tbar\tbaz\n"
- * # => "foo\tbar\tbaz\n"
* s.inspect
* # => "\"foo\\tbar\\tbaz\\n\""
*
@@ -10963,9 +10962,22 @@ rb_str_force_encoding(VALUE str, VALUE enc)
/*
* call-seq:
- * str.b -> str
+ * b -> string
+ *
+ * Returns a copy of +self+ that has ASCII-8BIT encoding;
+ * the contents (bytes) of +self+ are not modified:
+ *
+ * s = "\x99"
+ * s.encoding # => #<Encoding:UTF-8>
+ * t = s.b # => "\x99"
+ * t.encoding # => #<Encoding:ASCII-8BIT>
+ *
+ * s = "\u4095"
+ * s.encoding # => #<Encoding:UTF-8>
+ * s.bytes # => [228, 130, 149]
+ * t = s.b # => "\xE4\x82\x95"
+ * t.encoding # => #<Encoding:ASCII-8BIT>
*
- * Returns a copied string whose encoding is ASCII-8BIT.
*/
static VALUE
@@ -11341,17 +11353,38 @@ enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl, int cr)
/*
* call-seq:
- * str.scrub -> new_str
- * str.scrub(repl) -> new_str
- * str.scrub{|bytes|} -> new_str
+ * scrub(replacement_string = default_replacement) -> string
+ * scrub{|bytes| ... } -> string
+ *
+ * Returns a copy of +self+ with each invalid byte sequence replaced
+ * by a replacement string.
+ *
+ * With no block given and no argument, replaces each invalid sequence
+ * with the default replacement string
+ * (<tt>"\uFFFD"</tt> for a Unicode encoding, <tt>'?'</tt> otherwise):
+ *
+ * "\uFFFD".bytes # => [239, 191, 189]
+ * s = "foo\x81\x81bar"
+ * s.bytes
+ * # => [102, 111, 111, 129, 129, 98, 97, 114]
+ * s.scrub.bytes
+ * # => [102, 111, 111, 239, 191, 189, 239, 191, 189, 98, 97, 114]
*
- * If the string is invalid byte sequence then replace invalid bytes with given replacement
- * character, else returns self.
- * If block is given, replace invalid bytes with returned value of the block.
+ * With no block given and argument +replacement_string+ given,
+ * replaces each invalid sequence with that string:
+ *
+ * "foo\x81\x81bar".scrub('xyzzy') # => "fooxyzzyxyzzybar"
+ *
+ * With a block given, replaces each invalid sequence with the value
+ * of the block:
+ *
+ * "foo\x81\x81bar".scrub {|bytes| p bytes; 'XYZZY' } # => "fooXYZZYXYZZYbar"
+ *
+ * Output:
+ *
+ * "\x81"
+ * "\x81"
*
- * "abc\u3042\x81".scrub #=> "abc\u3042\uFFFD"
- * "abc\u3042\x81".scrub("*") #=> "abc\u3042*"
- * "abc\u3042\xE3\x80".scrub{|bytes| '<'+bytes.unpack1('H*')+'>' } #=> "abc\u3042<e380>"
*/
static VALUE
str_scrub(int argc, VALUE *argv, VALUE str)
@@ -11363,17 +11396,12 @@ str_scrub(int argc, VALUE *argv, VALUE str)
/*
* call-seq:
- * str.scrub! -> str
- * str.scrub!(repl) -> str
- * str.scrub!{|bytes|} -> str
+ * scrub! -> self
+ * scrub!(replacement_string = default_replacement) -> self
+ * scrub!{|bytes| ... } -> self
*
- * If the string is invalid byte sequence then replace invalid bytes with given replacement
- * character, else returns self.
- * If block is given, replace invalid bytes with returned value of the block.
+ * Like String#scrub, except that any replacements are made in +self+.
*
- * "abc\u3042\x81".scrub! #=> "abc\u3042\uFFFD"
- * "abc\u3042\x81".scrub!("*") #=> "abc\u3042*"
- * "abc\u3042\xE3\x80".scrub!{|bytes| '<'+bytes.unpack1('H*')+'>' } #=> "abc\u3042<e380>"
*/
static VALUE
str_scrub_bang(int argc, VALUE *argv, VALUE str)
@@ -11405,25 +11433,36 @@ unicode_normalize_common(int argc, VALUE *argv, VALUE str, ID id)
/*
* call-seq:
- * str.unicode_normalize(form=:nfc)
+ * unicode_normalize(form = :nfc) -> string
*
- * Unicode Normalization---Returns a normalized form of +str+,
- * using Unicode normalizations NFC, NFD, NFKC, or NFKD.
- * The normalization form used is determined by +form+, which can
- * be any of the four values +:nfc+, +:nfd+, +:nfkc+, or +:nfkd+.
- * The default is +:nfc+.
+ * Returns a copy of +self+ with
+ * {Unicode normalization}[https://unicode.org/reports/tr15] applied.
*
- * If the string is not in a Unicode Encoding, then an Exception is raised.
- * In this context, 'Unicode Encoding' means any of UTF-8, UTF-16BE/LE,
- * and UTF-32BE/LE, as well as GB18030, UCS_2BE, and UCS_4BE.
- * Anything other than UTF-8 is implemented by converting to UTF-8,
- * which makes it slower than UTF-8.
+ * Argument +form+ must be one of the following symbols
+ * (see {Unicode normalization forms}[https://unicode.org/reports/tr15/#Norm_Forms]):
+ *
+ * - +:nfc+: Canonical decomposition, followed by canonical composition.
+ * - +:nfd+: Canonical decomposition.
+ * - +:nfkc+: Compatibility decomposition, followed by canonical composition.
+ * - +:nfkd+: Compatibility decomposition.
+ *
+ * +self+ must have encoding UTF-8 or one of the other supported encodings:
+ *
+ * UnicodeNormalize::UNICODE_ENCODINGS
+ * # =>
+ * [#<Encoding:UTF-16BE (autoload)>,
+ * #<Encoding:UTF-16LE>,
+ * #<Encoding:UTF-32BE (autoload)>,
+ * #<Encoding:UTF-32LE (autoload)>,
+ * #<Encoding:GB18030 (autoload)>,
+ * #<Encoding:UTF-16BE (autoload)>,
+ * #<Encoding:UTF-32BE (autoload)>]
+ *
+ * Examples:
+ *
+ * "a\u0300".unicode_normalize # => "\u00E0"
+ * "\u00E0".unicode_normalize(:nfd) # => "a\u0300"
*
- * "a\u0300".unicode_normalize #=> "\u00E0"
- * "a\u0300".unicode_normalize(:nfc) #=> "\u00E0"
- * "\u00E0".unicode_normalize(:nfd) #=> "a\u0300"
- * "\xE0".force_encoding('ISO-8859-1').unicode_normalize(:nfd)
- * #=> Encoding::CompatibilityError raised
*/
static VALUE
rb_str_unicode_normalize(int argc, VALUE *argv, VALUE str)
@@ -11433,10 +11472,11 @@ rb_str_unicode_normalize(int argc, VALUE *argv, VALUE str)
/*
* call-seq:
- * str.unicode_normalize!(form=:nfc)
+ * unicode_normalize!(form = :nfc) -> self
+ *
+ * Like String#unicode_normalize, except that the normalization
+ * is performed on +self+.
*
- * Destructive version of String#unicode_normalize, doing Unicode
- * normalization in place.
*/
static VALUE
rb_str_unicode_normalize_bang(int argc, VALUE *argv, VALUE str)
diff --git a/transcode.c b/transcode.c
index 9cc4d00f28..400ad13775 100644
--- a/transcode.c
+++ b/transcode.c
@@ -2801,16 +2801,11 @@ str_encode_associate(VALUE str, int encidx)
/*
* call-seq:
- * str.encode!(encoding, **options) -> str
- * str.encode!(dst_encoding, src_encoding, **options) -> str
+ * encode!(dst_encoding = Encoding.default_internal, **enc_opts) -> self
+ * encode!(dst_encoding, src_encoding, **enc_opts) -> self
+ *
+ * Like #encode, but applies encoding changes to +self+; returns +self+.
*
- * The first form transcodes the contents of <i>str</i> from
- * str.encoding to +encoding+.
- * The second form transcodes the contents of <i>str</i> from
- * src_encoding to dst_encoding.
- * The +options+ keyword arguments give details for conversion. See String#encode
- * for details.
- * Returns the string even if no changes were made.
*/
static VALUE
@@ -2837,58 +2832,50 @@ static VALUE encoded_dup(VALUE newstr, VALUE str, int encidx);
/*
* call-seq:
- * str.encode(encoding, **options) -> str
- * str.encode(dst_encoding, src_encoding, **options) -> str
- * str.encode(**options) -> str
+ * encode(dst_encoding = Encoding.default_internal, **enc_opts) -> string
+ * encode(dst_encoding, src_encoding, **enc_opts) -> string
+ *
+ * Returns a copy of +self+ transcoded as determined by +dst_encoding+.
+ * By default, raises an exception if +self+
+ * contains an invalid byte or a character not defined in +dst_encoding+;
+ * that behavior may be modified by encoding options; see below.
+ *
+ * With no arguments:
+ *
+ * - Uses the same encoding if <tt>Encoding.default_internal</tt> is +nil+
+ * (the default):
+ *
+ * Encoding.default_internal # => nil
+ * s = "Ruby\x99".force_encoding('Windows-1252')
+ * s.encoding # => #<Encoding:Windows-1252>
+ * s.bytes # => [82, 117, 98, 121, 153]
+ * t = s.encode # => "Ruby\x99"
+ * t.encoding # => #<Encoding:Windows-1252>
+ * t.bytes # => [82, 117, 98, 121, 153]
+ *
+ * - Otherwise, uses the encoding <tt>Encoding.default_internal</tt>:
+ *
+ * Encoding.default_internal = 'UTF-8'
+ * t = s.encode # => "Ruby™"
+ * t.encoding # => #<Encoding:UTF-8>
+ *
+ * With only argument +dst_encoding+ given, uses that encoding:
+ *
+ * s = "Ruby\x99".force_encoding('Windows-1252')
+ * s.encoding # => #<Encoding:Windows-1252>
+ * t = s.encode('UTF-8') # => "Ruby™"
+ * t.encoding # => #<Encoding:UTF-8>
*
- * The first form returns a copy of +str+ transcoded
- * to encoding +encoding+.
- * The second form returns a copy of +str+ transcoded
- * from src_encoding to dst_encoding.
- * The last form returns a copy of +str+ transcoded to
- * <tt>Encoding.default_internal</tt>.
+ * With arguments +dst_encoding+ and +src_encoding+ given,
+ * interprets +self+ using +src_encoding+, and encodes the new string using +dst_encoding+:
*
- * By default, the first and second form raise
- * Encoding::UndefinedConversionError for characters that are
- * undefined in the destination encoding, and
- * Encoding::InvalidByteSequenceError for invalid byte sequences
- * in the source encoding. The last form by default does not raise
- * exceptions but uses replacement strings.
+ * s = "Ruby\x99"
+ * t = s.encode('UTF-8', 'Windows-1252') # => "Ruby™"
+ * t.encoding # => #<Encoding:UTF-8>
*
- * The +options+ keyword arguments give details for conversion.
- * The arguments are:
+ * Optional keyword arguments +enc_opts+ specify encoding options;
+ * see {Encoding Options}[rdoc-ref:encoding.rdoc@Encoding+Options].
*
- * :invalid ::
- * If the value is +:replace+, #encode replaces invalid byte sequences in
- * +str+ with the replacement character. The default is to raise the
- * Encoding::InvalidByteSequenceError exception
- * :undef ::
- * If the value is +:replace+, #encode replaces characters which are
- * undefined in the destination encoding with the replacement character.
- * The default is to raise the Encoding::UndefinedConversionError.
- * :replace ::
- * Sets the replacement string to the given value. The default replacement
- * string is "\uFFFD" for Unicode encoding forms, and "?" otherwise.
- * :fallback ::
- * Sets the replacement string by the given object for undefined
- * character. The object should be a Hash, a Proc, a Method, or an
- * object which has [] method.
- * Its key is an undefined character encoded in the source encoding
- * of current transcoder. Its value can be any encoding until it
- * can be converted into the destination encoding of the transcoder.
- * :xml ::
- * The value must be +:text+ or +:attr+.
- * If the value is +:text+ #encode replaces undefined characters with their
- * (upper-case hexadecimal) numeric character references. '&', '<', and '>'
- * are converted to "&amp;", "&lt;", and "&gt;", respectively.
- * If the value is +:attr+, #encode also quotes the replacement result
- * (using '"'), and replaces '"' with "&quot;".
- * :cr_newline ::
- * Replaces LF ("\n") with CR ("\r") if value is true.
- * :crlf_newline ::
- * Replaces LF ("\n") with CRLF ("\r\n") if value is true.
- * :universal_newline ::
- * Replaces CRLF ("\r\n") and CR ("\r") with LF ("\n") if value is true.
*/
static VALUE