1 package uniseg
2
3 import "unicode/utf8"
4
5
// States of the word break parser. A state is what transitionWordBreakState
// receives from and hands back to its caller; wbTransitions uses the same
// values as the low half of its lookup key.
const (
	// wbAny is the generic state. It is also the state used when the
	// transition table is queried for "any state".
	wbAny = iota

	// wbCR, wbLF and wbNewline are entered after a CR, LF or Newline
	// property, respectively.
	wbCR
	wbLF
	wbNewline

	// wbWSegSpace is entered after a WSegSpace property.
	wbWSegSpace

	// wbHebrewLetter and wbALetter are entered after the corresponding
	// letter properties.
	wbHebrewLetter
	wbALetter

	// wbWB7 is entered by transitionWordBreakState when a letter is followed
	// by a mid-letter character (or single quote) and look-ahead finds
	// another letter after it.
	wbWB7

	// wbWB7c is entered by transitionWordBreakState when a Hebrew letter is
	// followed by a double quote and look-ahead finds another Hebrew letter.
	wbWB7c

	// wbNumeric is entered after a Numeric property.
	wbNumeric

	// wbWB11 is entered by transitionWordBreakState when a numeric is
	// followed by a mid-number character (or single quote) and look-ahead
	// finds another numeric after it.
	wbWB11

	// wbKatakana and wbExtendNumLet are entered after the corresponding
	// properties.
	wbKatakana
	wbExtendNumLet

	// wbOddRI and wbEvenRI track whether an odd or even number of
	// consecutive regional indicators has been consumed.
	wbOddRI
	wbEvenRI

	// wbZWJBit is a flag that transitionWordBreakState ORs onto a state after
	// a zero-width joiner. It must not overlap any of the state values above.
	wbZWJBit = 1 << 4
)
24
25
26
27
28
29 func wbTransitions(state, prop int) (newState int, wordBreak bool, rule int) {
30 switch uint64(state) | uint64(prop)<<32 {
31
32 case wbAny | prNewline<<32:
33 return wbNewline, true, 32
34 case wbAny | prCR<<32:
35 return wbCR, true, 32
36 case wbAny | prLF<<32:
37 return wbLF, true, 32
38
39
40 case wbNewline | prAny<<32:
41 return wbAny, true, 31
42 case wbCR | prAny<<32:
43 return wbAny, true, 31
44 case wbLF | prAny<<32:
45 return wbAny, true, 31
46
47
48 case wbCR | prLF<<32:
49 return wbLF, false, 30
50
51
52 case wbAny | prWSegSpace<<32:
53 return wbWSegSpace, true, 9990
54 case wbWSegSpace | prWSegSpace<<32:
55 return wbWSegSpace, false, 34
56
57
58 case wbAny | prALetter<<32:
59 return wbALetter, true, 9990
60 case wbAny | prHebrewLetter<<32:
61 return wbHebrewLetter, true, 9990
62 case wbALetter | prALetter<<32:
63 return wbALetter, false, 50
64 case wbALetter | prHebrewLetter<<32:
65 return wbHebrewLetter, false, 50
66 case wbHebrewLetter | prALetter<<32:
67 return wbALetter, false, 50
68 case wbHebrewLetter | prHebrewLetter<<32:
69 return wbHebrewLetter, false, 50
70
71
72 case wbWB7 | prALetter<<32:
73 return wbALetter, false, 70
74 case wbWB7 | prHebrewLetter<<32:
75 return wbHebrewLetter, false, 70
76
77
78 case wbHebrewLetter | prSingleQuote<<32:
79 return wbAny, false, 71
80
81
82 case wbWB7c | prHebrewLetter<<32:
83 return wbHebrewLetter, false, 73
84
85
86 case wbAny | prNumeric<<32:
87 return wbNumeric, true, 9990
88 case wbNumeric | prNumeric<<32:
89 return wbNumeric, false, 80
90
91
92 case wbALetter | prNumeric<<32:
93 return wbNumeric, false, 90
94 case wbHebrewLetter | prNumeric<<32:
95 return wbNumeric, false, 90
96
97
98 case wbNumeric | prALetter<<32:
99 return wbALetter, false, 100
100 case wbNumeric | prHebrewLetter<<32:
101 return wbHebrewLetter, false, 100
102
103
104 case wbWB11 | prNumeric<<32:
105 return wbNumeric, false, 110
106
107
108 case wbAny | prKatakana<<32:
109 return wbKatakana, true, 9990
110 case wbKatakana | prKatakana<<32:
111 return wbKatakana, false, 130
112
113
114 case wbAny | prExtendNumLet<<32:
115 return wbExtendNumLet, true, 9990
116 case wbALetter | prExtendNumLet<<32:
117 return wbExtendNumLet, false, 131
118 case wbHebrewLetter | prExtendNumLet<<32:
119 return wbExtendNumLet, false, 131
120 case wbNumeric | prExtendNumLet<<32:
121 return wbExtendNumLet, false, 131
122 case wbKatakana | prExtendNumLet<<32:
123 return wbExtendNumLet, false, 131
124 case wbExtendNumLet | prExtendNumLet<<32:
125 return wbExtendNumLet, false, 131
126
127
128 case wbExtendNumLet | prALetter<<32:
129 return wbALetter, false, 132
130 case wbExtendNumLet | prHebrewLetter<<32:
131 return wbHebrewLetter, false, 132
132 case wbExtendNumLet | prNumeric<<32:
133 return wbNumeric, false, 132
134 case wbExtendNumLet | prKatakana<<32:
135 return wbKatakana, false, 132
136
137 default:
138 return -1, false, -1
139 }
140 }
141
142
143
144
145
146
147 func transitionWordBreakState(state int, r rune, b []byte, str string) (newState int, wordBreak bool) {
148
149 nextProperty := property(workBreakCodePoints, r)
150
151
152 if nextProperty == prZWJ {
153
154 if state == wbNewline || state == wbCR || state == wbLF {
155 return wbAny | wbZWJBit, true
156 }
157 if state < 0 {
158 return wbAny | wbZWJBit, false
159 }
160 return state | wbZWJBit, false
161 } else if nextProperty == prExtend || nextProperty == prFormat {
162
163 if state == wbNewline || state == wbCR || state == wbLF {
164 return wbAny, true
165 }
166 if state == wbWSegSpace || state == wbAny|wbZWJBit {
167 return wbAny, false
168 }
169 if state < 0 {
170 return wbAny, false
171 }
172 return state, false
173 } else if nextProperty == prExtendedPictographic && state >= 0 && state&wbZWJBit != 0 {
174
175 return wbAny, false
176 }
177 if state >= 0 {
178 state = state &^ wbZWJBit
179 }
180
181
182 var rule int
183 newState, wordBreak, rule = wbTransitions(state, nextProperty)
184 if newState < 0 {
185
186 anyPropState, anyPropWordBreak, anyPropRule := wbTransitions(state, prAny)
187 anyStateState, anyStateWordBreak, anyStateRule := wbTransitions(wbAny, nextProperty)
188 if anyPropState >= 0 && anyStateState >= 0 {
189
190 newState, wordBreak, rule = anyStateState, anyStateWordBreak, anyStateRule
191 if anyPropRule < anyStateRule {
192 wordBreak, rule = anyPropWordBreak, anyPropRule
193 }
194 } else if anyPropState >= 0 {
195
196 newState, wordBreak, rule = anyPropState, anyPropWordBreak, anyPropRule
197
198
199
200
201 } else if anyStateState >= 0 {
202
203 newState, wordBreak, rule = anyStateState, anyStateWordBreak, anyStateRule
204 } else {
205
206 newState, wordBreak, rule = wbAny, true, 9990
207 }
208 }
209
210
211
212
213
214 farProperty := -1
215 if rule > 60 &&
216 (state == wbALetter || state == wbHebrewLetter || state == wbNumeric) &&
217 (nextProperty == prMidLetter || nextProperty == prMidNumLet || nextProperty == prSingleQuote ||
218 nextProperty == prDoubleQuote ||
219 nextProperty == prMidNum) {
220 for {
221 var (
222 r rune
223 length int
224 )
225 if b != nil {
226 r, length = utf8.DecodeRune(b)
227 b = b[length:]
228 } else {
229 r, length = utf8.DecodeRuneInString(str)
230 str = str[length:]
231 }
232 if r == utf8.RuneError {
233 break
234 }
235 prop := property(workBreakCodePoints, r)
236 if prop == prExtend || prop == prFormat || prop == prZWJ {
237 continue
238 }
239 farProperty = prop
240 break
241 }
242 }
243
244
245 if rule > 60 &&
246 (state == wbALetter || state == wbHebrewLetter) &&
247 (nextProperty == prMidLetter || nextProperty == prMidNumLet || nextProperty == prSingleQuote) &&
248 (farProperty == prALetter || farProperty == prHebrewLetter) {
249 return wbWB7, false
250 }
251
252
253 if rule > 72 &&
254 state == wbHebrewLetter &&
255 nextProperty == prDoubleQuote &&
256 farProperty == prHebrewLetter {
257 return wbWB7c, false
258 }
259
260
261 if rule > 120 &&
262 state == wbNumeric &&
263 (nextProperty == prMidNum || nextProperty == prMidNumLet || nextProperty == prSingleQuote) &&
264 farProperty == prNumeric {
265 return wbWB11, false
266 }
267
268
269 if newState == wbAny && nextProperty == prRegionalIndicator {
270 if state != wbOddRI && state != wbEvenRI {
271
272 return wbOddRI, true
273 }
274 if state == wbOddRI {
275
276 return wbEvenRI, false
277 }
278 return wbOddRI, true
279 }
280
281 return
282 }
283
View as plain text