1 package uniseg
2
3 import "unicode/utf8"
4
5
// The states of the sentence break parser. The numeric values are spelled out
// (they are the same values an iota sequence would produce) so that a state
// number seen while debugging can be mapped back to its name at a glance.
const (
	sbAny       = 0  // No special context; also the fallback state for lookups.
	sbCR        = 1  // A CR was just read; only a following LF continues the sentence.
	sbParaSep   = 2  // A paragraph separator was just read; a boundary follows.
	sbATerm     = 3  // An ATerm was just read.
	sbUpper     = 4  // An Upper was just read.
	sbLower     = 5  // A Lower was just read.
	sbSB7       = 6  // An ATerm directly preceded by Upper or Lower was read.
	sbSB8Close  = 7  // ATerm followed by one or more Close.
	sbSB8Sp     = 8  // ATerm, optional Close, followed by one or more Sp.
	sbSTerm     = 9  // An STerm was just read.
	sbSB8aClose = 10 // STerm followed by one or more Close.
	sbSB8aSp    = 11 // STerm, optional Close, followed by one or more Sp.
)
20
21
22
23
24
25 func sbTransitions(state, prop int) (newState int, sentenceBreak bool, rule int) {
26 switch uint64(state) | uint64(prop)<<32 {
27
28 case sbAny | prCR<<32:
29 return sbCR, false, 9990
30 case sbCR | prLF<<32:
31 return sbParaSep, false, 30
32
33
34 case sbAny | prSep<<32:
35 return sbParaSep, false, 9990
36 case sbAny | prLF<<32:
37 return sbParaSep, false, 9990
38 case sbParaSep | prAny<<32:
39 return sbAny, true, 40
40 case sbCR | prAny<<32:
41 return sbAny, true, 40
42
43
44 case sbAny | prATerm<<32:
45 return sbATerm, false, 9990
46 case sbATerm | prNumeric<<32:
47 return sbAny, false, 60
48 case sbSB7 | prNumeric<<32:
49 return sbAny, false, 60
50
51
52 case sbAny | prUpper<<32:
53 return sbUpper, false, 9990
54 case sbAny | prLower<<32:
55 return sbLower, false, 9990
56 case sbUpper | prATerm<<32:
57 return sbSB7, false, 70
58 case sbLower | prATerm<<32:
59 return sbSB7, false, 70
60 case sbSB7 | prUpper<<32:
61 return sbUpper, false, 70
62
63
64 case sbAny | prSTerm<<32:
65 return sbSTerm, false, 9990
66 case sbATerm | prSContinue<<32:
67 return sbAny, false, 81
68 case sbATerm | prATerm<<32:
69 return sbATerm, false, 81
70 case sbATerm | prSTerm<<32:
71 return sbSTerm, false, 81
72 case sbSB7 | prSContinue<<32:
73 return sbAny, false, 81
74 case sbSB7 | prATerm<<32:
75 return sbATerm, false, 81
76 case sbSB7 | prSTerm<<32:
77 return sbSTerm, false, 81
78 case sbSB8Close | prSContinue<<32:
79 return sbAny, false, 81
80 case sbSB8Close | prATerm<<32:
81 return sbATerm, false, 81
82 case sbSB8Close | prSTerm<<32:
83 return sbSTerm, false, 81
84 case sbSB8Sp | prSContinue<<32:
85 return sbAny, false, 81
86 case sbSB8Sp | prATerm<<32:
87 return sbATerm, false, 81
88 case sbSB8Sp | prSTerm<<32:
89 return sbSTerm, false, 81
90 case sbSTerm | prSContinue<<32:
91 return sbAny, false, 81
92 case sbSTerm | prATerm<<32:
93 return sbATerm, false, 81
94 case sbSTerm | prSTerm<<32:
95 return sbSTerm, false, 81
96 case sbSB8aClose | prSContinue<<32:
97 return sbAny, false, 81
98 case sbSB8aClose | prATerm<<32:
99 return sbATerm, false, 81
100 case sbSB8aClose | prSTerm<<32:
101 return sbSTerm, false, 81
102 case sbSB8aSp | prSContinue<<32:
103 return sbAny, false, 81
104 case sbSB8aSp | prATerm<<32:
105 return sbATerm, false, 81
106 case sbSB8aSp | prSTerm<<32:
107 return sbSTerm, false, 81
108
109
110 case sbATerm | prClose<<32:
111 return sbSB8Close, false, 90
112 case sbSB7 | prClose<<32:
113 return sbSB8Close, false, 90
114 case sbSB8Close | prClose<<32:
115 return sbSB8Close, false, 90
116 case sbATerm | prSp<<32:
117 return sbSB8Sp, false, 90
118 case sbSB7 | prSp<<32:
119 return sbSB8Sp, false, 90
120 case sbSB8Close | prSp<<32:
121 return sbSB8Sp, false, 90
122 case sbSTerm | prClose<<32:
123 return sbSB8aClose, false, 90
124 case sbSB8aClose | prClose<<32:
125 return sbSB8aClose, false, 90
126 case sbSTerm | prSp<<32:
127 return sbSB8aSp, false, 90
128 case sbSB8aClose | prSp<<32:
129 return sbSB8aSp, false, 90
130 case sbATerm | prSep<<32:
131 return sbParaSep, false, 90
132 case sbATerm | prCR<<32:
133 return sbParaSep, false, 90
134 case sbATerm | prLF<<32:
135 return sbParaSep, false, 90
136 case sbSB7 | prSep<<32:
137 return sbParaSep, false, 90
138 case sbSB7 | prCR<<32:
139 return sbParaSep, false, 90
140 case sbSB7 | prLF<<32:
141 return sbParaSep, false, 90
142 case sbSB8Close | prSep<<32:
143 return sbParaSep, false, 90
144 case sbSB8Close | prCR<<32:
145 return sbParaSep, false, 90
146 case sbSB8Close | prLF<<32:
147 return sbParaSep, false, 90
148 case sbSTerm | prSep<<32:
149 return sbParaSep, false, 90
150 case sbSTerm | prCR<<32:
151 return sbParaSep, false, 90
152 case sbSTerm | prLF<<32:
153 return sbParaSep, false, 90
154 case sbSB8aClose | prSep<<32:
155 return sbParaSep, false, 90
156 case sbSB8aClose | prCR<<32:
157 return sbParaSep, false, 90
158 case sbSB8aClose | prLF<<32:
159 return sbParaSep, false, 90
160
161
162 case sbSB8Sp | prSp<<32:
163 return sbSB8Sp, false, 100
164 case sbSB8aSp | prSp<<32:
165 return sbSB8aSp, false, 100
166 case sbSB8Sp | prSep<<32:
167 return sbParaSep, false, 100
168 case sbSB8Sp | prCR<<32:
169 return sbParaSep, false, 100
170 case sbSB8Sp | prLF<<32:
171 return sbParaSep, false, 100
172
173
174 case sbATerm | prAny<<32:
175 return sbAny, true, 110
176 case sbSB7 | prAny<<32:
177 return sbAny, true, 110
178 case sbSB8Close | prAny<<32:
179 return sbAny, true, 110
180 case sbSB8Sp | prAny<<32:
181 return sbAny, true, 110
182 case sbSTerm | prAny<<32:
183 return sbAny, true, 110
184 case sbSB8aClose | prAny<<32:
185 return sbAny, true, 110
186 case sbSB8aSp | prAny<<32:
187 return sbAny, true, 110
188
189
190 default:
191 return -1, false, -1
192 }
193 }
194
195
196
197
198
199
200
201 func transitionSentenceBreakState(state int, r rune, b []byte, str string) (newState int, sentenceBreak bool) {
202
203 nextProperty := property(sentenceBreakCodePoints, r)
204
205
206 if nextProperty == prExtend || nextProperty == prFormat {
207 if state == sbParaSep || state == sbCR {
208 return sbAny, true
209 }
210 if state < 0 {
211 return sbAny, true
212 }
213 return state, false
214 }
215
216
217 var rule int
218 newState, sentenceBreak, rule = sbTransitions(state, nextProperty)
219 if newState < 0 {
220
221 anyPropState, anyPropProp, anyPropRule := sbTransitions(state, prAny)
222 anyStateState, anyStateProp, anyStateRule := sbTransitions(sbAny, nextProperty)
223 if anyPropState >= 0 && anyStateState >= 0 {
224
225 newState, sentenceBreak, rule = anyStateState, anyStateProp, anyStateRule
226 if anyPropRule < anyStateRule {
227 sentenceBreak, rule = anyPropProp, anyPropRule
228 }
229 } else if anyPropState >= 0 {
230
231 newState, sentenceBreak, rule = anyPropState, anyPropProp, anyPropRule
232
233
234
235
236 } else if anyStateState >= 0 {
237
238 newState, sentenceBreak, rule = anyStateState, anyStateProp, anyStateRule
239 } else {
240
241 newState, sentenceBreak, rule = sbAny, false, 9990
242 }
243 }
244
245
246 if rule > 80 && (state == sbATerm || state == sbSB8Close || state == sbSB8Sp || state == sbSB7) {
247
248 var length int
249 for nextProperty != prOLetter &&
250 nextProperty != prUpper &&
251 nextProperty != prLower &&
252 nextProperty != prSep &&
253 nextProperty != prCR &&
254 nextProperty != prLF &&
255 nextProperty != prATerm &&
256 nextProperty != prSTerm {
257
258 if b != nil {
259 r, length = utf8.DecodeRune(b)
260 b = b[length:]
261 } else {
262 r, length = utf8.DecodeRuneInString(str)
263 str = str[length:]
264 }
265 if r == utf8.RuneError {
266 break
267 }
268 nextProperty = property(sentenceBreakCodePoints, r)
269 }
270 if nextProperty == prLower {
271 return sbLower, false
272 }
273 }
274
275 return
276 }
277
View as plain text