about | summary | refs | log | tree | commit | diff | stats
path: root/test/prism/encoding_test.rb
diff options
context:
space:
mode:
author Kevin Newton <kddnewton@gmail.com> 2023-12-04 12:51:22 -0500
committer Kevin Newton <kddnewton@gmail.com> 2023-12-06 14:23:38 -0500
commit 82f18baa21d0df59c30d8a6e60bf3e0991de1114 (patch)
tree d861044ddaf3d334fee10325f15eab9887ae546b /test/prism/encoding_test.rb
parent 9620ca678929f28dd8dab8e278e438a430a85022 (diff)
download ruby-82f18baa21d0df59c30d8a6e60bf3e0991de1114.tar.gz
[ruby/prism] Provide flags for changing encodings
https://github.com/ruby/prism/commit/e838eaff6f
Diffstat (limited to 'test/prism/encoding_test.rb')
-rw-r--r-- test/prism/encoding_test.rb 213
1 files changed, 133 insertions, 80 deletions
diff --git a/test/prism/encoding_test.rb b/test/prism/encoding_test.rb
index 94ba3a6c2a..e4678c6f82 100644
--- a/test/prism/encoding_test.rb
+++ b/test/prism/encoding_test.rb
@@ -7,90 +7,16 @@ require_relative "test_helper"
module Prism
class EncodingTest < TestCase
codepoints_1byte = 0...0x100
- codepoints_2bytes = 0...0x10000
-
encodings = {
- Encoding::ASCII => codepoints_1byte,
- Encoding::ASCII_8BIT => codepoints_1byte,
- Encoding::CP850 => codepoints_1byte,
- Encoding::CP852 => codepoints_1byte,
- Encoding::CP855 => codepoints_1byte,
- Encoding::GB1988 => codepoints_1byte,
- Encoding::IBM437 => codepoints_1byte,
- Encoding::IBM720 => codepoints_1byte,
- Encoding::IBM737 => codepoints_1byte,
- Encoding::IBM775 => codepoints_1byte,
- Encoding::IBM852 => codepoints_1byte,
- Encoding::IBM855 => codepoints_1byte,
- Encoding::IBM857 => codepoints_1byte,
- Encoding::IBM860 => codepoints_1byte,
- Encoding::IBM861 => codepoints_1byte,
- Encoding::IBM862 => codepoints_1byte,
- Encoding::IBM863 => codepoints_1byte,
- Encoding::IBM864 => codepoints_1byte,
- Encoding::IBM865 => codepoints_1byte,
- Encoding::IBM866 => codepoints_1byte,
- Encoding::IBM869 => codepoints_1byte,
- Encoding::ISO_8859_1 => codepoints_1byte,
- Encoding::ISO_8859_2 => codepoints_1byte,
- Encoding::ISO_8859_3 => codepoints_1byte,
- Encoding::ISO_8859_4 => codepoints_1byte,
- Encoding::ISO_8859_5 => codepoints_1byte,
- Encoding::ISO_8859_6 => codepoints_1byte,
- Encoding::ISO_8859_7 => codepoints_1byte,
- Encoding::ISO_8859_8 => codepoints_1byte,
- Encoding::ISO_8859_9 => codepoints_1byte,
- Encoding::ISO_8859_10 => codepoints_1byte,
- Encoding::ISO_8859_11 => codepoints_1byte,
- Encoding::ISO_8859_13 => codepoints_1byte,
- Encoding::ISO_8859_14 => codepoints_1byte,
- Encoding::ISO_8859_15 => codepoints_1byte,
- Encoding::ISO_8859_16 => codepoints_1byte,
- Encoding::KOI8_R => codepoints_1byte,
- Encoding::KOI8_U => codepoints_1byte,
- Encoding::MACCENTEURO => codepoints_1byte,
- Encoding::MACCROATIAN => codepoints_1byte,
- Encoding::MACCYRILLIC => codepoints_1byte,
- Encoding::MACGREEK => codepoints_1byte,
- Encoding::MACICELAND => codepoints_1byte,
- Encoding::MACROMAN => codepoints_1byte,
- Encoding::MACROMANIA => codepoints_1byte,
- Encoding::MACTHAI => codepoints_1byte,
- Encoding::MACTURKISH => codepoints_1byte,
- Encoding::MACUKRAINE => codepoints_1byte,
- Encoding::TIS_620 => codepoints_1byte,
- Encoding::Windows_1250 => codepoints_1byte,
- Encoding::Windows_1251 => codepoints_1byte,
- Encoding::Windows_1252 => codepoints_1byte,
- Encoding::Windows_1253 => codepoints_1byte,
- Encoding::Windows_1254 => codepoints_1byte,
- Encoding::Windows_1255 => codepoints_1byte,
- Encoding::Windows_1256 => codepoints_1byte,
- Encoding::Windows_1257 => codepoints_1byte,
- Encoding::Windows_1258 => codepoints_1byte,
- Encoding::Windows_874 => codepoints_1byte,
- Encoding::Big5 => codepoints_2bytes,
- Encoding::Big5_HKSCS => codepoints_2bytes,
- Encoding::Big5_UAO => codepoints_2bytes,
- Encoding::CP949 => codepoints_2bytes,
- Encoding::CP950 => codepoints_2bytes,
- Encoding::CP951 => codepoints_2bytes,
- Encoding::EUC_KR => codepoints_2bytes,
- Encoding::GBK => codepoints_2bytes,
- Encoding::GB12345 => codepoints_2bytes,
- Encoding::GB2312 => codepoints_2bytes,
- Encoding::MACJAPANESE => codepoints_2bytes,
- Encoding::Shift_JIS => codepoints_2bytes,
- Encoding::SJIS_DoCoMo => codepoints_2bytes,
- Encoding::SJIS_KDDI => codepoints_2bytes,
- Encoding::SJIS_SoftBank => codepoints_2bytes,
- Encoding::Windows_31J => codepoints_2bytes
+ Encoding::ASCII_8BIT => codepoints_1byte,
+ Encoding::US_ASCII => codepoints_1byte,
+ Encoding::Windows_1253 => codepoints_1byte
}
- # By default we don't test every codepoint in these encodings because they
- # are 3 and 4 byte representations so it can drastically slow down the test
- # suite.
+ # By default we don't test every codepoint in these encodings because it
+ # takes a very long time.
if ENV["PRISM_TEST_ALL_ENCODINGS"]
+ codepoints_2bytes = 0...0x10000
codepoints_unicode = (0...0x110000)
codepoints_eucjp = [
@@ -118,6 +44,78 @@ module Prism
]
encodings.merge!(
+ Encoding::CP850 => codepoints_1byte,
+ Encoding::CP852 => codepoints_1byte,
+ Encoding::CP855 => codepoints_1byte,
+ Encoding::GB1988 => codepoints_1byte,
+ Encoding::IBM437 => codepoints_1byte,
+ Encoding::IBM720 => codepoints_1byte,
+ Encoding::IBM737 => codepoints_1byte,
+ Encoding::IBM775 => codepoints_1byte,
+ Encoding::IBM852 => codepoints_1byte,
+ Encoding::IBM855 => codepoints_1byte,
+ Encoding::IBM857 => codepoints_1byte,
+ Encoding::IBM860 => codepoints_1byte,
+ Encoding::IBM861 => codepoints_1byte,
+ Encoding::IBM862 => codepoints_1byte,
+ Encoding::IBM863 => codepoints_1byte,
+ Encoding::IBM864 => codepoints_1byte,
+ Encoding::IBM865 => codepoints_1byte,
+ Encoding::IBM866 => codepoints_1byte,
+ Encoding::IBM869 => codepoints_1byte,
+ Encoding::ISO_8859_1 => codepoints_1byte,
+ Encoding::ISO_8859_2 => codepoints_1byte,
+ Encoding::ISO_8859_3 => codepoints_1byte,
+ Encoding::ISO_8859_4 => codepoints_1byte,
+ Encoding::ISO_8859_5 => codepoints_1byte,
+ Encoding::ISO_8859_6 => codepoints_1byte,
+ Encoding::ISO_8859_7 => codepoints_1byte,
+ Encoding::ISO_8859_8 => codepoints_1byte,
+ Encoding::ISO_8859_9 => codepoints_1byte,
+ Encoding::ISO_8859_10 => codepoints_1byte,
+ Encoding::ISO_8859_11 => codepoints_1byte,
+ Encoding::ISO_8859_13 => codepoints_1byte,
+ Encoding::ISO_8859_14 => codepoints_1byte,
+ Encoding::ISO_8859_15 => codepoints_1byte,
+ Encoding::ISO_8859_16 => codepoints_1byte,
+ Encoding::KOI8_R => codepoints_1byte,
+ Encoding::KOI8_U => codepoints_1byte,
+ Encoding::MACCENTEURO => codepoints_1byte,
+ Encoding::MACCROATIAN => codepoints_1byte,
+ Encoding::MACCYRILLIC => codepoints_1byte,
+ Encoding::MACGREEK => codepoints_1byte,
+ Encoding::MACICELAND => codepoints_1byte,
+ Encoding::MACROMAN => codepoints_1byte,
+ Encoding::MACROMANIA => codepoints_1byte,
+ Encoding::MACTHAI => codepoints_1byte,
+ Encoding::MACTURKISH => codepoints_1byte,
+ Encoding::MACUKRAINE => codepoints_1byte,
+ Encoding::TIS_620 => codepoints_1byte,
+ Encoding::Windows_1250 => codepoints_1byte,
+ Encoding::Windows_1251 => codepoints_1byte,
+ Encoding::Windows_1252 => codepoints_1byte,
+ Encoding::Windows_1254 => codepoints_1byte,
+ Encoding::Windows_1255 => codepoints_1byte,
+ Encoding::Windows_1256 => codepoints_1byte,
+ Encoding::Windows_1257 => codepoints_1byte,
+ Encoding::Windows_1258 => codepoints_1byte,
+ Encoding::Windows_874 => codepoints_1byte,
+ Encoding::Big5 => codepoints_2bytes,
+ Encoding::Big5_HKSCS => codepoints_2bytes,
+ Encoding::Big5_UAO => codepoints_2bytes,
+ Encoding::CP949 => codepoints_2bytes,
+ Encoding::CP950 => codepoints_2bytes,
+ Encoding::CP951 => codepoints_2bytes,
+ Encoding::EUC_KR => codepoints_2bytes,
+ Encoding::GBK => codepoints_2bytes,
+ Encoding::GB12345 => codepoints_2bytes,
+ Encoding::GB2312 => codepoints_2bytes,
+ Encoding::MACJAPANESE => codepoints_2bytes,
+ Encoding::Shift_JIS => codepoints_2bytes,
+ Encoding::SJIS_DoCoMo => codepoints_2bytes,
+ Encoding::SJIS_KDDI => codepoints_2bytes,
+ Encoding::SJIS_SoftBank => codepoints_2bytes,
+ Encoding::Windows_31J => codepoints_2bytes,
Encoding::UTF_8 => codepoints_unicode,
Encoding::UTF8_MAC => codepoints_unicode,
Encoding::UTF8_DoCoMo => codepoints_unicode,
@@ -136,6 +134,8 @@ module Prism
)
end
+ # These test that we're correctly parsing codepoints for each alias of each
+ # encoding that prism supports.
encodings.each do |encoding, range|
encoding.names.each do |name|
next if name == "locale"
@@ -146,6 +146,17 @@ module Prism
end
end
+ # These test that we're correctly setting the flags on strings for each
+ # encoding that prism supports.
+ escapes = ["\\x00", "\\x7F", "\\x80", "\\xFF", "\\u{00}", "\\u{7F}", "\\u{80}", "\\M-\\C-?"]
+ escapes = escapes.concat(escapes.product(escapes).map(&:join))
+
+ encodings.each_key do |encoding|
+ define_method(:"test_encoding_flags_#{encoding.name}") do
+ assert_encoding_flags(encoding, escapes)
+ end
+ end
+
def test_coding
result = Prism.parse("# coding: utf-8\n'string'")
actual = result.value.statements.body.first.unescaped.encoding
@@ -292,5 +303,47 @@ module Prism
refute Prism.parse(source).success?
end
end
+
+ def assert_encoding_flags(encoding, escapes)
+ escapes.each do |escaped|
+ source = "# encoding: #{encoding.name}\n\"#{escaped}\""
+
+ expected =
+ begin
+ eval(source).encoding
+ rescue SyntaxError => error
+ if error.message.include?("UTF-8 mixed within")
+ error.message[/: (.+?)\n/, 1]
+ else
+ raise
+ end
+ end
+
+ actual =
+ Prism.parse(source).then do |result|
+ if result.success?
+ string = result.value.statements.body.first
+
+ if string.forced_utf8_encoding?
+ Encoding::UTF_8
+ elsif string.forced_binary_encoding?
+ Encoding::ASCII_8BIT
+ else
+ encoding
+ end
+ else
+ error = result.errors.first
+
+ if error.message.include?("mixed")
+ error.message
+ else
+ raise error.message
+ end
+ end
+ end
+
+ assert_equal expected, actual
+ end
+ end
end
end