spec/rubyspec/core/string/unicode_normalize_spec.rb


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115

# -*- encoding: utf-8 -*-
require File.expand_path('../../../spec_helper', __FILE__)

# Examples taken from http://www.unicode.org/reports/tr15/#Norm_Forms

describe "String#unicode_normalize" do
  before :each do
    @accented_f = "\u1e9b\u0323"
    @angstrom = "\u212b"
    @ohm = "\u2126"
  end

  it "normalizes code points in the string according to the form that is specified" do
    @accented_f.unicode_normalize(:nfc).should == "\u1e9b\u0323"
    @accented_f.unicode_normalize(:nfd).should == "\u017f\u0323\u0307"
    @accented_f.unicode_normalize(:nfkc).should == "\u1e69"
    @accented_f.unicode_normalize(:nfkd).should == "\u0073\u0323\u0307"
  end

  it "defaults to the nfc normalization form if no forms are specified" do
    @accented_f.unicode_normalize.should == "\u1e9b\u0323"
    @angstrom.unicode_normalize.should == "\u00c5"
    @ohm.unicode_normalize.should == "\u03a9"
  end

  # http://unicode.org/faq/normalization.html#6
  context "returns normalized form of string by default" do
    it "03D3 (ϓ) GREEK UPSILON WITH ACUTE AND HOOK SYMBOL" do
      "\u03D3".unicode_normalize(:nfc).should == "\u03D3"
      "\u03D3".unicode_normalize(:nfd).should == "\u03D2\u0301"
      "\u03D3".unicode_normalize(:nfkc).should == "\u038E"
      "\u03D3".unicode_normalize(:nfkd).should == "\u03A5\u0301"
    end

    it "03D4 (ϔ) GREEK UPSILON WITH DIAERESIS AND HOOK SYMBOL" do
      "\u03D4".unicode_normalize(:nfc).should == "\u03D4"
      "\u03D4".unicode_normalize(:nfd).should == "\u03D2\u0308"
      "\u03D4".unicode_normalize(:nfkc).should == "\u03AB"
      "\u03D4".unicode_normalize(:nfkd).should == "\u03A5\u0308"
    end

    it "1E9B (ẛ) LATIN SMALL LETTER LONG S WITH DOT ABOVE" do
      "\u1E9B".unicode_normalize(:nfc).should == "\u1E9B"
      "\u1E9B".unicode_normalize(:nfd).should == "\u017F\u0307"
      "\u1E9B".unicode_normalize(:nfkc).should == "\u1E61"
      "\u1E9B".unicode_normalize(:nfkd).should == "\u0073\u0307"
    end
  end

  it "raises an Encoding::CompatibilityError if string is not in an unicode encoding" do
    lambda do
      [0xE0].pack('C').force_encoding("ISO-8859-1").unicode_normalize(:nfd)
    end.should raise_error(Encoding::CompatibilityError)
  end

  it "raises an ArgumentError if the specified form is invalid" do
    lambda {
      @angstrom.unicode_normalize(:invalid_form)
    }.should raise_error(ArgumentError)
  end
end

describe "String#unicode_normalize!" do
  it "normalizes code points and modifies the receiving string" do
    angstrom = "\u212b"
    angstrom.unicode_normalize!
    angstrom.should == "\u00c5"
    angstrom.should_not == "\u212b"
  end

  it "modifies original string (nfc)" do
    str = "a\u0300"
    str.unicode_normalize!(:nfc)

    str.should_not == "a\u0300"
    str.should == "à"
  end

  it "modifies self in place (nfd)" do
    str = "\u00E0"
    str.unicode_normalize!(:nfd)

    str.should_not == "\u00E0"
    str.should == "a\u0300"
  end

  it "modifies self in place (nfkc)" do
    str = "\u1E9B\u0323"
    str.unicode_normalize!(:nfkc)

    str.should_not == "\u1E9B\u0323"
    str.should == "\u1E69"
  end

  it "modifies self in place (nfkd)" do
    str = "\u1E9B\u0323"
    str.unicode_normalize!(:nfkd)

    str.should_not == "\u1E9B\u0323"
    str.should == "s\u0323\u0307"
  end

  it "raises an Encoding::CompatibilityError if the string is not in an unicode encoding" do
    lambda {
      [0xE0].pack('C').force_encoding("ISO-8859-1").unicode_normalize!
    }.should raise_error(Encoding::CompatibilityError)
  end

  it "raises an ArgumentError if the specified form is invalid" do
    ohm = "\u2126"
    lambda {
      ohm.unicode_normalize!(:invalid_form)
    }.should raise_error(ArgumentError)
  end
end