上一篇看到 Trie 的数据结构, 想着用 Ruby 和 Go 大概实现一下对比看看, 顺便看看一下 Benchmark.

(挺没意义的一个事 = 。 =)

普通的 trie 是一个字符一个结点, 压缩 trie 的结点可能是一个字符串, 空间更省一些吧.

20k.txt 是两万个英文单词, github 地址 https://github.com/first20hours/google-10000-english/blob/master/20k.txt.

Trie in Ruby

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
# ruby 2.4
require 'benchmark'

class Node
  attr_reader :char, :children

  def initialize(c)
    @char = c
    @children = []
  end

  def build(c)
    child = find(c)
    if child.nil?
      child = Node.new(c)
      @children << child
    end
    return child
  end

  def find(c)
    @children.each do |child|
      return child if child.char == c
    end
    nil
  end
end

class Trie
  attr_reader :root

  def initialize
    @root = Node.new(nil)
  end

  def build(word)
    node = @root
    word.chars.each do |char|
      child = node.build(char)
      node = child
    end
  end

  def has?(word)
    node = @root
    word.chars.each do |char|
      found = node.find(char)
      return false if found.nil?
      node = found
    end

    return true
  end
end

class Trie2 < Hash
  def initialize
    super
  end

  def build(string)
    string.chars.inject(self) do |h, char|
      h[char] ||= { }
    end
  end

  def has?(string)
    tr = self
    string.chars.each do |char|
      return false if tr[char].nil?
      tr = tr[char]
    end
    return true
  end
end


t = Trie.new
File.readlines('./20k.txt').each do |line|
  t.build(line.gsub('\n', ''))
end

puts Benchmark.measure {
  200_000.times do
    t.has?('42082')
    t.has?('oops')
    t.has?('Supercalifragilisticexpialidocious')
  end
}

t2 = Trie2.new
File.readlines('./20k.txt').each do |line|
t2.build(line.gsub('\n', ''))
end

puts Benchmark.measure {
  200_000.times do
    t2.has?('42082')
    t2.has?('oops')
    t2.has?('Supercalifragilisticexpialidocious')
  end
}

# Benchmark
# 2.950000   0.010000   2.960000 (  2.978725)
# 1.320000   0.010000   1.330000 (  1.325868)

Trie in Golang

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
// go 1.11
package main

import (
    "bufio"
    "log"
    "os"
    "testing"
)

type Node struct {
    Char     rune
    Children []*Node
}

func NewNode(r rune) *Node {
    return &Node{Char: r}
}

func (n *Node) insert(r rune) *Node {
    child := n.get(r)
    if child == nil {
        child = NewNode(r)
        n.Children = append(n.Children, child)
    }

    return child
}

func (n *Node) get(r rune) *Node {
    for _, child := range n.Children {
        if child.Char == r {
            return child
        }
    }
    return nil
}

type Trie struct {
    Root *Node
}

func NewTrie() *Trie {
    var r rune
    trie := Trie{Root: NewNode(r)}
    return &trie
}

func (tr *Trie) Build(word string) {
    node := tr.Root
    runeArr := []rune(word)
    for _, char := range runeArr {
        child := node.insert(char)
        node = child
    }
}

func (tr *Trie) Has(word string) bool {
    node := tr.Root
    runeArr := []rune(word)
    for _, char := range runeArr {
        found := node.get(char)
        if found == nil {
            return false
        }
        node = found
    }
    return true
}

func BenchmarkTrieFind(b *testing.B) {
    var trie1 = NewTrie()
    file, err := os.Open("./20k.txt")
    if err != nil {
        log.Fatal(err)
    }
    defer file.Close()

    scanner := bufio.NewScanner(file)

    for scanner.Scan() {
        trie1.Build(scanner.Text())
    }

    b.ResetTimer()

    for i := 0; i < b.N; i++ {
        trie1.Has("42082")
        trie1.Has("oops")
        trie1.Has("Supercalifragilisticexpialidocious")
    }
}

// goos: darwin
// goarch: amd64
// BenchmarkTrieFind-8   	 5000000	       255 ns/op	     144 B/op	       1 allocs/op
// PASS
// ok  	_/Users/xguox/Desktop	1.591s
// Success: Benchmarks passed.

Ruby 的 Benchmark 是 200_000 次的总和, 单位是 s, 换成 go 实现下面的 ns 大概 6629 ns/op.