-
Notifications
You must be signed in to change notification settings - Fork 0
/
tokenizer.go
122 lines (104 loc) · 2.82 KB
/
tokenizer.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
package ergolas
import (
"fmt"
"regexp"
"strings"
)
// TokenType names the lexical category of a Token. It is the string type of
// Token.Type and of the Type field on each tokenizer rule.
type TokenType string
// Token is a single lexeme recognized by the tokenizer.
type Token struct {
// Type is the category of the rule whose regular expression matched.
Type TokenType
// Value is the exact source text that the rule matched.
Value string
// Location is the position associated with the token.
// NOTE(review): presumably a byte offset into the source, like
// TokenizeError.Location — confirm against Tokenize.
Location int
}
// computeLineColumn converts a byte offset into source into a 1-based line
// and a 1-based, byte-counted column.
//
// An offset that lands on a "\n" is reported as the last column of the line
// that newline terminates, and index == len(source) is accepted as the
// position just past the final character. A negative index resolves to line 1
// with column index+1. Any index greater than len(source) panics with
// "character index out of range".
func computeLineColumn(source string, index int) (line, column int) {
	if index > len(source) {
		panic("character index out of range")
	}

	// Only the text before the offset decides the position; a negative offset
	// has no text before it.
	end := 0
	if index > 0 {
		end = index
	}
	before := source[:end]

	// Each newline before the offset starts one more line.
	line = strings.Count(before, "\n") + 1
	// LastIndex yields -1 when the offset is on the first line, which makes the
	// column come out as index+1.
	column = index - strings.LastIndex(before, "\n")
	return line, column
}
// TokenizeError reports a tokenizing failure at a position in the source text.
type TokenizeError struct {
// Source points at the full source text; Error reads it to turn Location
// into a line and column.
Source *string
// Location is the offset into *Source at which the failure occurred
// (Tokenize passes its byte cursor here).
Location int
// Message is the description printed after the position.
Message string
}
// Error implements the error interface. It renders the failure as
// "[line:column] message", where the 1-based line and column are derived from
// Location by computeLineColumn.
//
// When Source is nil the position cannot be resolved, so only Message is
// returned rather than dereferencing a nil pointer.
func (e TokenizeError) Error() string {
	if e.Source == nil {
		return e.Message
	}
	line, col := computeLineColumn(*e.Source, e.Location)
	return fmt.Sprintf(`[%d:%d] %s`, line, col, e.Message)
}
// rule pairs a token type with the regular expression that recognizes it.
type rule struct {
// Type is the TokenType given to tokens produced by this rule.
Type TokenType
// Regex is matched against the remaining input by matchRules.
Regex *regexp.Regexp
// Ignore is returned by matchRules alongside the token; Tokenize does not
// append tokens for which it is true.
Ignore bool
}
// Token types assigned by the entries of the rules table. Each value is the
// type's name as a plain string.
var (
FloatToken TokenType = "Float"
IntegerToken TokenType = "Integer"
StringToken TokenType = "String"
QuoteToken TokenType = "Quote"
UnquoteToken TokenType = "Unquote"
LOperatorToken TokenType = "LOperator"
ROperatorToken TokenType = "ROperator"
PunctuationToken TokenType = "Punctuation"
IdentifierToken TokenType = "Identifier"
// Comment and Whitespace are the two types whose rules are marked Ignore.
CommentToken TokenType = "Comment"
WhitespaceToken TokenType = "Whitespace"
NewlineToken TokenType = "Newline"
)
// rules is the ordered tokenizer table. matchRules tries the entries from top
// to bottom and returns the first one that matches, so order is significant:
// Float precedes Integer so "1.5" is not cut at the dot, and ROperator
// precedes Quote so ":=" and "::" are not lexed as a lone ":". Every pattern
// is anchored with ^ and is applied to the remaining, not-yet-consumed input.
var rules = []rule{
{Type: FloatToken,
Regex: regexp.MustCompile(`^[0-9]+\.[0-9]+`)},
{Type: IntegerToken,
Regex: regexp.MustCompile(`^[0-9]+`)},
// A double-quoted string; a backslash followed by any non-newline character
// is consumed as a pair, which lets \" appear inside the string.
// NOTE(review): [^"] also accepts a bare backslash, so an input whose last
// quote is preceded by a backslash can still lex as a closed string —
// confirm whether [^"\\] was intended.
{Type: StringToken,
Regex: regexp.MustCompile(`^"(\\.|[^"])*"`)},
// NOTE(review): the pattern below only matches ":=" and "::". The other
// operators named in the inline comment ("<-", "->", "|>") consist solely of
// characters from the LOperator class, so they are produced as
// LOperatorToken — confirm that this is what the parser expects.
{Type: ROperatorToken, // The operators ":=", "::", "<-", "->" and "|>" are right associative
Regex: regexp.MustCompile(`^(\:\=|\:\:)`)},
{Type: QuoteToken,
Regex: regexp.MustCompile(`^:`)},
{Type: UnquoteToken,
Regex: regexp.MustCompile(`^\$`)},
// One or more operator characters form a single token.
{Type: LOperatorToken,
Regex: regexp.MustCompile(`^[\+\-\*\/\%\=\<\>\!\&\|\^]+`)},
// Exactly one punctuation character per token.
{Type: PunctuationToken,
Regex: regexp.MustCompile(`^[\.\,\;\(\)\[\]\{\}]`)},
// The first-character class admits "-" and "$", but input starting with
// either is already claimed by the LOperator and Unquote rules above, so
// here they can only appear after the first character.
{Type: IdentifierToken,
Regex: regexp.MustCompile(`^[a-zA-Z\-\_\$][a-zA-Z0-9\-\_\$]*`)},
// A newline together with all whitespace that follows it (including further
// newlines) is folded into one token.
{Type: NewlineToken,
Regex: regexp.MustCompile(`^\n\s*`)},
// From "#" to the end of the line; the terminating newline is not consumed.
{Type: CommentToken, Ignore: true,
Regex: regexp.MustCompile(`^#.*`)},
// Runs of spaces and tabs.
{Type: WhitespaceToken, Ignore: true,
Regex: regexp.MustCompile(`^[ \t]+`)},
}
// matchRules tries each entry of rules, in table order, against the start of
// source and returns a token for the first rule that yields a non-empty match,
// together with that rule's Ignore flag.
//
// The returned token carries only Type and Value. When no rule matches, the
// result is (nil, true).
func matchRules(source string) (*Token, bool) {
	for i := range rules {
		r := &rules[i]
		lexeme := r.Regex.FindString(source)
		if lexeme == "" {
			// FindString reports "no match" as the empty string.
			continue
		}
		tok := &Token{Type: r.Type, Value: lexeme}
		return tok, r.Ignore
	}
	return nil, true
}
// Tokenize splits source into tokens by repeatedly applying matchRules to the
// unconsumed remainder of the input.
//
// Tokens from rules flagged Ignore are consumed but not returned. Each
// returned token has Location set to the byte offset in source at which its
// Value starts, the same unit TokenizeError.Location uses.
//
// If no rule matches at some offset, Tokenize returns a nil slice and a
// TokenizeError pointing at that offset with the message
// "unexpected character". On success the slice is non-nil, even for empty
// input.
func Tokenize(source string) ([]Token, error) {
	cursor := 0
	tokens := []Token{}
	for cursor < len(source) {
		t, ignore := matchRules(source[cursor:])
		if t == nil {
			return nil, TokenizeError{&source, cursor, "unexpected character"}
		}
		// matchRules has no notion of position, so the offset is recorded here,
		// before the cursor moves past the token.
		t.Location = cursor
		cursor += len(t.Value)
		if !ignore {
			tokens = append(tokens, *t)
		}
	}
	return tokens, nil
}