aboutsummaryrefslogtreecommitdiffstats
path: root/lib/bundler/similarity_detector.rb
diff options
context:
space:
mode:
Diffstat (limited to 'lib/bundler/similarity_detector.rb')
-rw-r--r--lib/bundler/similarity_detector.rb62
1 files changed, 62 insertions, 0 deletions
diff --git a/lib/bundler/similarity_detector.rb b/lib/bundler/similarity_detector.rb
new file mode 100644
index 0000000000..e9c1413ea3
--- /dev/null
+++ b/lib/bundler/similarity_detector.rb
@@ -0,0 +1,62 @@
+# frozen_string_literal: true
+module Bundler
+ class SimilarityDetector
+ SimilarityScore = Struct.new(:string, :distance)
+
+ # initialize with an array of words to be matched against
+ def initialize(corpus)
+ @corpus = corpus
+ end
+
+ # return an array of words similar to 'word' from the corpus
+ def similar_words(word, limit = 3)
+ words_by_similarity = @corpus.map {|w| SimilarityScore.new(w, levenshtein_distance(word, w)) }
+ words_by_similarity.select {|s| s.distance <= limit }.sort_by(&:distance).map(&:string)
+ end
+
+ # return the result of 'similar_words', concatenated into a list
+ # (eg "a, b, or c")
+ def similar_word_list(word, limit = 3)
+ words = similar_words(word, limit)
+ if words.length == 1
+ words[0]
+ elsif words.length > 1
+ [words[0..-2].join(", "), words[-1]].join(" or ")
+ end
+ end
+
+ protected
+
+ # http://www.informit.com/articles/article.aspx?p=683059&seqNum=36
+ def levenshtein_distance(this, that, ins = 2, del = 2, sub = 1)
+ # ins, del, sub are weighted costs
+ return nil if this.nil?
+ return nil if that.nil?
+ dm = [] # distance matrix
+
+ # Initialize first row values
+ dm[0] = (0..this.length).collect {|i| i * ins }
+ fill = [0] * (this.length - 1)
+
+ # Initialize first column values
+ (1..that.length).each do |i|
+ dm[i] = [i * del, fill.flatten]
+ end
+
+ # populate matrix
+ (1..that.length).each do |i|
+ (1..this.length).each do |j|
+ # critical comparison
+ dm[i][j] = [
+ dm[i - 1][j - 1] + (this[j - 1] == that[i - 1] ? 0 : sub),
+ dm[i][j - 1] + ins,
+ dm[i - 1][j] + del
+ ].min
+ end
+ end
+
+ # The last value in matrix is the Levenshtein distance between the strings
+ dm[that.length][this.length]
+ end
+ end
+end