Платформа ЦРНП "Мирокод" для разработки проектов
https://git.mirocod.ru
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
268 lines
8.5 KiB
268 lines
8.5 KiB
package uniseg |
|
|
|
import "unicode/utf8" |
|
|
|
// The states of the grapheme cluster parser. Each state records what the
// parser has seen so far, which determines how the next code point's property
// is interpreted by grTransitions.
const (
	grAny                     = iota // Default state; nothing special is pending.
	grCR                             // The last code point was a CR.
	grControlLF                      // The last code point was an LF or a Control.
	grL                              // Inside a Hangul sequence, after an L.
	grLVV                            // Inside a Hangul sequence, after an LV or V.
	grLVTT                           // Inside a Hangul sequence, after an LVT or T.
	grPrepend                        // The last code point was a Prepend.
	grExtendedPictographic           // After an Extended_Pictographic, possibly followed by Extend.
	grExtendedPictographicZWJ        // After an Extended_Pictographic sequence followed by ZWJ.
	grRIOdd                          // An odd number of regional indicators has been seen.
	grRIEven                         // An even number of regional indicators has been seen.
)
|
|
|
// The grapheme cluster parser's breaking instructions. A transition carries
// one of these to say whether a cluster boundary lies between the previous
// code point and the one that was just examined.
const (
	grNoBoundary = iota // Do not break; the code point extends the current cluster.
	grBoundary          // Break; the code point starts a new cluster.
)
|
|
|
// The grapheme cluster parser's state transitions. Maps (state, property) to |
|
// (new state, breaking instruction, rule number). The breaking instruction |
|
// always refers to the boundary between the last and next code point. |
|
// |
|
// This map is queried as follows: |
|
// |
|
// 1. Find specific state + specific property. Stop if found. |
|
// 2. Find specific state + any property. |
|
// 3. Find any state + specific property. |
|
// 4. If only (2) or (3) (but not both) was found, stop. |
|
// 5. If both (2) and (3) were found, use state and breaking instruction from |
|
// the transition with the lower rule number, prefer (3) if rule numbers |
|
// are equal. Stop. |
|
// 6. Assume grAny and grBoundary. |
|
var grTransitions = map[[2]int][3]int{ |
|
// GB5 |
|
{grAny, prCR}: {grCR, grBoundary, 50}, |
|
{grAny, prLF}: {grControlLF, grBoundary, 50}, |
|
{grAny, prControl}: {grControlLF, grBoundary, 50}, |
|
|
|
// GB4 |
|
{grCR, prAny}: {grAny, grBoundary, 40}, |
|
{grControlLF, prAny}: {grAny, grBoundary, 40}, |
|
|
|
// GB3. |
|
{grCR, prLF}: {grAny, grNoBoundary, 30}, |
|
|
|
// GB6. |
|
{grAny, prL}: {grL, grBoundary, 9990}, |
|
{grL, prL}: {grL, grNoBoundary, 60}, |
|
{grL, prV}: {grLVV, grNoBoundary, 60}, |
|
{grL, prLV}: {grLVV, grNoBoundary, 60}, |
|
{grL, prLVT}: {grLVTT, grNoBoundary, 60}, |
|
|
|
// GB7. |
|
{grAny, prLV}: {grLVV, grBoundary, 9990}, |
|
{grAny, prV}: {grLVV, grBoundary, 9990}, |
|
{grLVV, prV}: {grLVV, grNoBoundary, 70}, |
|
{grLVV, prT}: {grLVTT, grNoBoundary, 70}, |
|
|
|
// GB8. |
|
{grAny, prLVT}: {grLVTT, grBoundary, 9990}, |
|
{grAny, prT}: {grLVTT, grBoundary, 9990}, |
|
{grLVTT, prT}: {grLVTT, grNoBoundary, 80}, |
|
|
|
// GB9. |
|
{grAny, prExtend}: {grAny, grNoBoundary, 90}, |
|
{grAny, prZWJ}: {grAny, grNoBoundary, 90}, |
|
|
|
// GB9a. |
|
{grAny, prSpacingMark}: {grAny, grNoBoundary, 91}, |
|
|
|
// GB9b. |
|
{grAny, prPreprend}: {grPrepend, grBoundary, 9990}, |
|
{grPrepend, prAny}: {grAny, grNoBoundary, 92}, |
|
|
|
// GB11. |
|
{grAny, prExtendedPictographic}: {grExtendedPictographic, grBoundary, 9990}, |
|
{grExtendedPictographic, prExtend}: {grExtendedPictographic, grNoBoundary, 110}, |
|
{grExtendedPictographic, prZWJ}: {grExtendedPictographicZWJ, grNoBoundary, 110}, |
|
{grExtendedPictographicZWJ, prExtendedPictographic}: {grExtendedPictographic, grNoBoundary, 110}, |
|
|
|
// GB12 / GB13. |
|
{grAny, prRegionalIndicator}: {grRIOdd, grBoundary, 9990}, |
|
{grRIOdd, prRegionalIndicator}: {grRIEven, grNoBoundary, 120}, |
|
{grRIEven, prRegionalIndicator}: {grRIOdd, grBoundary, 120}, |
|
} |
|
|
|
// Graphemes is an iterator over the extended grapheme clusters of a string as
// defined by Unicode Standard Annex #29. A grapheme cluster is what a user
// perceives as a single character, even though it may be made up of several
// code points (for example, the "woman kissing woman" emoji is 8 code points:
// woman + ZWJ + heavy black heart (2 code points) + ZWJ + kiss mark + ZWJ +
// woman). The rules of Annex #29 decide how code points are grouped.
type Graphemes struct {
	// codePoints holds the decoded code points being iterated over.
	codePoints []rune

	// indices holds, for each code point, its byte offset in the original
	// string, followed by one extra entry equal to len(original string).
	// Hence len(indices) == len(codePoints) + 1.
	indices []int

	// start is the index (into codePoints/indices) of the first code point of
	// the current cluster.
	start int

	// end is the index (into codePoints/indices) one past the last code point
	// of the current cluster. start == end means there is no current cluster:
	// either iteration has not begun (both are 0) or it has finished (both
	// are len(codePoints)).
	end int

	// pos is the index of the next code point the parser will examine.
	pos int

	// state is the current parser state (one of the gr* state constants).
	state int
}
|
|
|
// NewGraphemes returns a new grapheme cluster iterator. |
|
func NewGraphemes(s string) *Graphemes { |
|
l := utf8.RuneCountInString(s) |
|
codePoints := make([]rune, l) |
|
indices := make([]int, l+1) |
|
i := 0 |
|
for pos, r := range s { |
|
codePoints[i] = r |
|
indices[i] = pos |
|
i++ |
|
} |
|
indices[l] = len(s) |
|
g := &Graphemes{ |
|
codePoints: codePoints, |
|
indices: indices, |
|
} |
|
g.Next() // Parse ahead. |
|
return g |
|
} |
|
|
|
// Next advances the iterator by one grapheme cluster and returns false if no |
|
// clusters are left. This function must be called before the first cluster is |
|
// accessed. |
|
func (g *Graphemes) Next() bool { |
|
g.start = g.end |
|
|
|
// The state transition gives us a boundary instruction BEFORE the next code |
|
// point so we always need to stay ahead by one code point. |
|
|
|
// Parse the next code point. |
|
for g.pos <= len(g.codePoints) { |
|
// GB2. |
|
if g.pos == len(g.codePoints) { |
|
g.end = g.pos |
|
g.pos++ |
|
break |
|
} |
|
|
|
// Determine the property of the next character. |
|
nextProperty := property(g.codePoints[g.pos]) |
|
g.pos++ |
|
|
|
// Find the applicable transition. |
|
var boundary bool |
|
transition, ok := grTransitions[[2]int{g.state, nextProperty}] |
|
if ok { |
|
// We have a specific transition. We'll use it. |
|
g.state = transition[0] |
|
boundary = transition[1] == grBoundary |
|
} else { |
|
// No specific transition found. Try the less specific ones. |
|
transAnyProp, okAnyProp := grTransitions[[2]int{g.state, prAny}] |
|
transAnyState, okAnyState := grTransitions[[2]int{grAny, nextProperty}] |
|
if okAnyProp && okAnyState { |
|
// Both apply. We'll use a mix (see comments for grTransitions). |
|
g.state = transAnyState[0] |
|
boundary = transAnyState[1] == grBoundary |
|
if transAnyProp[2] < transAnyState[2] { |
|
g.state = transAnyProp[0] |
|
boundary = transAnyProp[1] == grBoundary |
|
} |
|
} else if okAnyProp { |
|
// We only have a specific state. |
|
g.state = transAnyProp[0] |
|
boundary = transAnyProp[1] == grBoundary |
|
// This branch will probably never be reached because okAnyState will |
|
// always be true given the current transition map. But we keep it here |
|
// for future modifications to the transition map where this may not be |
|
// true anymore. |
|
} else if okAnyState { |
|
// We only have a specific property. |
|
g.state = transAnyState[0] |
|
boundary = transAnyState[1] == grBoundary |
|
} else { |
|
// No known transition. GB999: Any x Any. |
|
g.state = grAny |
|
boundary = true |
|
} |
|
} |
|
|
|
// If we found a cluster boundary, let's stop here. The current cluster will |
|
// be the one that just ended. |
|
if g.pos-1 == 0 /* GB1 */ || boundary { |
|
g.end = g.pos - 1 |
|
break |
|
} |
|
} |
|
|
|
return g.start != g.end |
|
} |
|
|
|
// Runes returns a slice of runes (code points) which corresponds to the current |
|
// grapheme cluster. If the iterator is already past the end or Next() has not |
|
// yet been called, nil is returned. |
|
func (g *Graphemes) Runes() []rune { |
|
if g.start == g.end { |
|
return nil |
|
} |
|
return g.codePoints[g.start:g.end] |
|
} |
|
|
|
// Str returns a substring of the original string which corresponds to the |
|
// current grapheme cluster. If the iterator is already past the end or Next() |
|
// has not yet been called, an empty string is returned. |
|
func (g *Graphemes) Str() string { |
|
if g.start == g.end { |
|
return "" |
|
} |
|
return string(g.codePoints[g.start:g.end]) |
|
} |
|
|
|
// Bytes returns a byte slice which corresponds to the current grapheme cluster. |
|
// If the iterator is already past the end or Next() has not yet been called, |
|
// nil is returned. |
|
func (g *Graphemes) Bytes() []byte { |
|
if g.start == g.end { |
|
return nil |
|
} |
|
return []byte(string(g.codePoints[g.start:g.end])) |
|
} |
|
|
|
// Positions returns the interval of the current grapheme cluster as byte |
|
// positions into the original string. The first returned value "from" indexes |
|
// the first byte and the second returned value "to" indexes the first byte that |
|
// is not included anymore, i.e. str[from:to] is the current grapheme cluster of |
|
// the original string "str". If Next() has not yet been called, both values are |
|
// 0. If the iterator is already past the end, both values are 1. |
|
func (g *Graphemes) Positions() (int, int) { |
|
return g.indices[g.start], g.indices[g.end] |
|
} |
|
|
|
// Reset puts the iterator into its initial state such that the next call to |
|
// Next() sets it to the first grapheme cluster again. |
|
func (g *Graphemes) Reset() { |
|
g.start, g.end, g.pos, g.state = 0, 0, 0, grAny |
|
g.Next() // Parse ahead again. |
|
} |
|
|
|
// GraphemeClusterCount returns the number of user-perceived characters |
|
// (grapheme clusters) for the given string. To calculate this number, it |
|
// iterates through the string using the Graphemes iterator. |
|
func GraphemeClusterCount(s string) (n int) { |
|
g := NewGraphemes(s) |
|
for g.Next() { |
|
n++ |
|
} |
|
return |
|
}
|
|
|