about summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r-- ChangeLog 8
-rw-r--r-- encoding.c 6
-rw-r--r-- string.c 61
-rw-r--r-- test/ruby/test_m17n.rb 4
4 files changed, 56 insertions, 23 deletions
diff --git a/ChangeLog b/ChangeLog
index 0161f2a77a..48f993766d 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,11 @@
+Wed Jan 15 14:03:47 2014 Nobuyoshi Nakada <nobu@ruby-lang.org>
+
+ * string.c (get_actual_encoding): get the actual encoding according
+ to the BOM if it exists.
+
+ * string.c (rb_str_inspect): use the actual encoding instead of the
+ pseudo encodings UTF-{16,32}. [ruby-core:59757] [Bug #8940]
+
Tue Jan 14 21:07:22 2014 Masaki Matsushita <glass.saga@gmail.com>
* ext/thread/thread.c (rb_szqueue_clear): notify SZQUEUE_WAITERS
diff --git a/encoding.c b/encoding.c
index ab03806dce..f102524379 100644
--- a/encoding.c
+++ b/encoding.c
@@ -598,6 +598,12 @@ rb_enc_from_index(int index)
return enc_table.list[index].enc;
}
+rb_encoding *
+rb_enc_get_from_index(int index)
+{
+ return must_encindex(index);
+}
+
int
rb_enc_registered(const char *name)
{
diff --git a/string.c b/string.c
index 900f900f16..50b050bb5d 100644
--- a/string.c
+++ b/string.c
@@ -123,6 +123,38 @@ VALUE rb_cSymbol;
#define STR_ENC_GET(str) rb_enc_from_index(ENCODING_GET(str))
+rb_encoding *rb_enc_get_from_index(int index);
+
+static rb_encoding *
+get_actual_encoding(const int encidx, VALUE str)
+{
+ const unsigned char *q;
+
+ switch (encidx) {
+ case ENCINDEX_UTF_16:
+ if (RSTRING_LEN(str) < 2) break;
+ q = (const unsigned char *)RSTRING_PTR(str);
+ if (q[0] == 0xFE && q[1] == 0xFF) {
+ return rb_enc_get_from_index(ENCINDEX_UTF_16BE);
+ }
+ if (q[0] == 0xFF && q[1] == 0xFE) {
+ return rb_enc_get_from_index(ENCINDEX_UTF_16LE);
+ }
+ return rb_ascii8bit_encoding();
+ case ENCINDEX_UTF_32:
+ if (RSTRING_LEN(str) < 4) break;
+ q = (const unsigned char *)RSTRING_PTR(str);
+ if (q[0] == 0 && q[1] == 0 && q[2] == 0xFE && q[3] == 0xFF) {
+ return rb_enc_get_from_index(ENCINDEX_UTF_32BE);
+ }
+ if (q[3] == 0 && q[2] == 0 && q[1] == 0xFE && q[0] == 0xFF) {
+ return rb_enc_get_from_index(ENCINDEX_UTF_32LE);
+ }
+ return rb_ascii8bit_encoding();
+ }
+ return rb_enc_from_index(encidx);
+}
+
static int fstring_cmp(VALUE a, VALUE b);
static st_table* frozen_strings;
@@ -4749,8 +4781,8 @@ rb_str_buf_cat_escaped_char(VALUE result, unsigned int c, int unicode_p)
VALUE
rb_str_inspect(VALUE str)
{
- rb_encoding *enc = STR_ENC_GET(str);
- int encidx = rb_enc_to_index(enc);
+ int encidx = ENCODING_GET(str);
+ rb_encoding *enc = rb_enc_from_index(encidx), *actenc;
const char *p, *pend, *prev;
char buf[CHAR_ESC_LEN + 1];
VALUE result = rb_str_buf_new(0);
@@ -4765,27 +4797,10 @@ rb_str_inspect(VALUE str)
p = RSTRING_PTR(str); pend = RSTRING_END(str);
prev = p;
- if (encidx == ENCINDEX_UTF_16 && p + 2 <= pend) {
- const unsigned char *q = (const unsigned char *)p;
- if (q[0] == 0xFE && q[1] == 0xFF)
- enc = rb_enc_from_index(ENCINDEX_UTF_16BE);
- else if (q[0] == 0xFF && q[1] == 0xFE)
- enc = rb_enc_from_index(ENCINDEX_UTF_16LE);
- else {
- enc = rb_ascii8bit_encoding();
- unicode_p = 0;
- }
- }
- else if (encidx == ENCINDEX_UTF_32 && p + 4 <= pend) {
- const unsigned char *q = (const unsigned char *)p;
- if (q[0] == 0 && q[1] == 0 && q[2] == 0xFE && q[3] == 0xFF)
- enc = rb_enc_from_index(ENCINDEX_UTF_32BE);
- else if (q[3] == 0 && q[2] == 0 && q[1] == 0xFE && q[0] == 0xFF)
- enc = rb_enc_from_index(ENCINDEX_UTF_32LE);
- else {
- enc = rb_ascii8bit_encoding();
- unicode_p = 0;
- }
+ actenc = get_actual_encoding(encidx, str);
+ if (actenc != enc) {
+ enc = actenc;
+ if (unicode_p) unicode_p = rb_enc_unicode_p(enc);
}
while (p < pend) {
unsigned int c, cc;
diff --git a/test/ruby/test_m17n.rb b/test/ruby/test_m17n.rb
index 4557cd6c3c..f5f23c09f1 100644
--- a/test/ruby/test_m17n.rb
+++ b/test/ruby/test_m17n.rb
@@ -228,14 +228,18 @@ class TestM17N < Test::Unit::TestCase
STR_WITHOUT_BOM = "\u3042".freeze
STR_WITH_BOM = "\uFEFF\u3042".freeze
+ bug8940 = '[ruby-core:59757] [Bug #8940]'
%w/UTF-16 UTF-32/.each do |enc|
%w/BE LE/.each do |endian|
+ bom = "\uFEFF".encode("#{enc}#{endian}").force_encoding(enc)
+
define_method("test_utf_16_32_inspect(#{enc}#{endian})") do
s = STR_WITHOUT_BOM.encode(enc + endian)
# When a UTF-16/32 string doesn't have a BOM,
# inspect as a dummy encoding string.
assert_equal(s.dup.force_encoding("ISO-2022-JP").inspect,
s.dup.force_encoding(enc).inspect)
+ assert_normal_exit("#{bom.b.dump}.force_encoding('#{enc}').inspect", bug8940)
end
define_method("test_utf_16_32_inspect(#{enc}#{endian}-BOM)") do