From 76cac4c05a7be61a94a709b8b850118ad0bfa684 Mon Sep 17 00:00:00 2001 From: Yusuke Endoh Date: Wed, 21 Oct 2020 13:29:19 +0900 Subject: [ruby/reline] Improve the performance of `get_mbchar_width` It is about three times faster to use one big regexp instead of sequential matching. https://github.com/ruby/reline/commit/e36f6c0707 --- lib/reline/unicode.rb | 40 +++++++++++++++++++++------------- lib/reline/unicode/east_asian_width.rb | 26 +++++++++++----------- 2 files changed, 38 insertions(+), 28 deletions(-) (limited to 'lib') diff --git a/lib/reline/unicode.rb b/lib/reline/unicode.rb index cd8c27e85b..df2f6719a4 100644 --- a/lib/reline/unicode.rb +++ b/lib/reline/unicode.rb @@ -72,20 +72,32 @@ class Reline::Unicode }.join end + require 'reline/unicode/east_asian_width' + + MBCharWidthRE = / + (?<width_2_1> + [#{ EscapedChars.map {|c| "\\x%02x" % c.ord }.join }] (?# ^ + char, such as ^M, ^H, ^[, ...) + ) + | (?<width_3>^\u{2E3B}) (?# THREE-EM DASH) + | (?<width_0>^\p{M}) + | (?<width_2_2> + #{ EastAsianWidth::TYPE_F } + | #{ EastAsianWidth::TYPE_W } + ) + | (?<width_1> + #{ EastAsianWidth::TYPE_H } + | #{ EastAsianWidth::TYPE_NA } + | #{ EastAsianWidth::TYPE_N } + ) + /x + def self.get_mbchar_width(mbchar) - case mbchar.encode(Encoding::UTF_8) - when *EscapedChars # ^ + char, such as ^M, ^H, ^[, ... 
- 2 - when /^\u{2E3B}/ # THREE-EM DASH - 3 - when /^\p{M}/ - 0 - when EastAsianWidth::TYPE_A - Reline.ambiguous_width - when EastAsianWidth::TYPE_F, EastAsianWidth::TYPE_W - 2 - when EastAsianWidth::TYPE_H, EastAsianWidth::TYPE_NA, EastAsianWidth::TYPE_N - 1 + m = mbchar.encode(Encoding::UTF_8).match(MBCharWidthRE) + case + when m[:width_2_1], m[:width_2_2] then 2 + when m[:width_3] then 3 + when m[:width_0] then 0 + when m[:width_1] then 1 else nil end @@ -591,5 +603,3 @@ class Reline::Unicode [byte_size, width] end end - -require 'reline/unicode/east_asian_width' diff --git a/lib/reline/unicode/east_asian_width.rb b/lib/reline/unicode/east_asian_width.rb index 7483c78936..89bc9d9435 100644 --- a/lib/reline/unicode/east_asian_width.rb +++ b/lib/reline/unicode/east_asian_width.rb @@ -1,16 +1,16 @@ class Reline::Unicode::EastAsianWidth # This is based on EastAsianWidth.txt - # http://www.unicode.org/Public/13.0.0/ucd/EastAsianWidth.txt + # EastAsianWidth.txt # Fullwidth - TYPE_F = /^([#{ %W( + TYPE_F = /^[#{ %W( \u{3000} \u{FF01}-\u{FF60} \u{FFE0}-\u{FFE6} - ).join }])/ + ).join }]/ # Halfwidth - TYPE_H = /^([#{ %W( + TYPE_H = /^[#{ %W( \u{20A9} \u{FF61}-\u{FFBE} \u{FFC2}-\u{FFC7} @@ -18,10 +18,10 @@ class Reline::Unicode::EastAsianWidth \u{FFD2}-\u{FFD7} \u{FFDA}-\u{FFDC} \u{FFE8}-\u{FFEE} - ).join }])/ + ).join }]/ # Wide - TYPE_W = /^([#{ %W( + TYPE_W = /^[#{ %W( \u{1100}-\u{115F} \u{231A}-\u{231B} \u{2329}-\u{232A} @@ -136,10 +136,10 @@ class Reline::Unicode::EastAsianWidth \u{1FAD0}-\u{1FAD6} \u{20000}-\u{2FFFD} \u{30000}-\u{3FFFD} - ).join }])/ + ).join }]/ # Narrow - TYPE_NA = /^([#{ %W( + TYPE_NA = /^[#{ %W( \u{0020}-\u{007E} \u{00A2}-\u{00A3} \u{00A5}-\u{00A6} @@ -147,10 +147,10 @@ class Reline::Unicode::EastAsianWidth \u{00AF} \u{27E6}-\u{27ED} \u{2985}-\u{2986} - ).join }])/ + ).join }]/ # Ambiguous - TYPE_A = /^([#{ %W( + TYPE_A = /^[#{ %W( \u{00A1} \u{00A4} \u{00A7}-\u{00A8} @@ -330,10 +330,10 @@ class Reline::Unicode::EastAsianWidth \u{E0100}-\u{E01EF} 
\u{F0000}-\u{FFFFD} \u{100000}-\u{10FFFD} - ).join }])/ + ).join }]/ # Neutral - TYPE_N = /^([#{ %W( + TYPE_N = /^[#{ %W( \u{0000}-\u{001F} \u{007F}-\u{00A0} \u{00A9} @@ -1160,5 +1160,5 @@ class Reline::Unicode::EastAsianWidth \u{1FBF0}-\u{1FBF9} \u{E0001} \u{E0020}-\u{E007F} - ).join }])/ + ).join }]/ end -- cgit v1.2.3