From 80a6c500952589f2be5ca24f7a9a0ac065c3f11c Mon Sep 17 00:00:00 2001 From: Mikhail Kirillov Date: Wed, 15 Nov 2023 13:43:38 +0300 Subject: [PATCH] improve spaces check --- .github/workflows/ci.yaml | 2 +- go.mod | 4 ++-- go.sum | 8 ++++++-- runes/const.go | 6 ++++++ tokenizer.go | 13 ++++++++----- tokenizer_test.go | 2 ++ util.go | 13 +++++++++++++ vars.go | 2 ++ 8 files changed, 40 insertions(+), 10 deletions(-) create mode 100644 util.go diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index ac9618a..cc8065a 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -6,7 +6,7 @@ jobs: strategy: fail-fast: false matrix: - go-version: [1.15.x] + go-version: [1.20.x] platform: [ubuntu-latest] runs-on: ${{ matrix.platform }} steps: diff --git a/go.mod b/go.mod index 87767c2..fab91c8 100644 --- a/go.mod +++ b/go.mod @@ -1,5 +1,5 @@ module github.com/wmentor/tokens -go 1.15 +go 1.20 -require github.com/wmentor/tbuf v1.0.0 // indirect +require github.com/wmentor/tbuf v1.0.1 diff --git a/go.sum b/go.sum index 7a9bd10..2369632 100644 --- a/go.sum +++ b/go.sum @@ -1,2 +1,6 @@ -github.com/wmentor/tbuf v1.0.0 h1:KHfiIdOTWor7a/5dSoLovxgGuipLAAU1X4+U3jzdIZ4= -github.com/wmentor/tbuf v1.0.0/go.mod h1:YvYY3BMph/UVPSIMbQoraxgr7+7DCAvYSSJHZk2gsBQ= +github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= +github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk= +github.com/wmentor/tbuf v1.0.1 h1:IonHpWwR0Wyh3Jfu0AbGSqzVDzUZ1zU61ML5F1CdBno= +github.com/wmentor/tbuf v1.0.1/go.mod h1:1lO+hvrkqqjEcR74vrNfBL3jg0NnpGHDWHeFxRsk7js= +gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= diff --git a/runes/const.go b/runes/const.go index 96bdc09..5c26bc7 100644 --- a/runes/const.go +++ b/runes/const.go @@ -1,3 +1,5 @@ +// Copyright (c) 2023, Mikhail Kirillov + package runes const ( @@ -47,4 +49,8 @@ const ( TRADE rune = '™' UML rune = '¨' YEN rune = '¥' + ZWSP rune = '\u200B' + ZWNBSP rune = '\uFEFF' + ZWJ rune = '\u200D' + ZWNJ rune = '\u200C' ) diff --git a/tokenizer.go b/tokenizer.go index 91ba313..24d615d 100644 --- a/tokenizer.go +++ b/tokenizer.go @@ -1,3 +1,5 @@ +// Copyright (c) 2023, Mikhail Kirillov + package tokens import ( @@ -7,6 +9,7 @@ import ( "unicode" buffer "github.com/wmentor/tbuf" + "github.com/wmentor/tokens/runes" ) @@ -176,7 +179,7 @@ func (t *Tokenizer) onRune(r rune) { func (t *Tokenizer) state0(r rune) { switch { - case unicode.IsSpace(r): + case isSpace(r): t.mode = 0 case t.isAlNum(r): @@ -235,7 +238,7 @@ func (t *Tokenizer) state1(r rune) { case t.isAlNum(r): t.mkr1.WriteRune(r) - case unicode.IsSpace(r): + case isSpace(r): t.onToken(t.mkr1.String()) t.mode = 0 @@ -268,7 +271,7 @@ func (t *Tokenizer) state2(r rune) { t.onToken(string(r)) t.mode = 0 - case unicode.IsSpace(r): + case isSpace(r): t.onToken(t.mkr1.String()) t.onToken(string(t.prevRune)) t.mode = 0 @@ -305,7 +308,7 @@ func (t *Tokenizer) state4(r rune) { } func (t *Tokenizer) state5(r rune) { - if unicode.IsSpace(r) { + if isSpace(r) { t.onToken(t.mkr1.String()) t.mode = 0 } else { @@ -323,7 +326,7 @@ func (t *Tokenizer) state6(r rune) { t.mkr1.Reset() t.mkr1.WriteRune('#') - case unicode.IsSpace(r): + case isSpace(r): t.onToken(t.mkr1.String()) t.mode = 0 diff --git a/tokenizer_test.go b/tokenizer_test.go index 091703f..4804f76 100644 --- a/tokenizer_test.go +++ b/tokenizer_test.go @@ -1,3 +1,5 @@ +// Copyright (c) 2023, Mikhail Kirillov + package tokens_test import ( diff --git a/util.go b/util.go new file mode 100644 index 0000000..f701205 --- /dev/null +++ b/util.go @@ -0,0 +1,13 @@ +// Copyright (c) 2023, Mikhail Kirillov + +package tokens + +import ( + "unicode" + + "github.com/wmentor/tokens/runes" +) + +func isSpace(r rune) bool { + return unicode.IsSpace(r) || r == runes.ZWSP || r == runes.ZWNBSP || r == runes.ZWJ || r == runes.ZWNJ +} diff --git a/vars.go b/vars.go index 2b5583a..a9efaf3 100644 --- a/vars.go +++ b/vars.go @@ -1,3 +1,5 @@ +// Copyright (c) 2023, Mikhail Kirillov + package tokens import (