Платформа ЦРНП "Мирокод" для разработки проектов
https://git.mirocod.ru
You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
179 lines
3.5 KiB
179 lines
3.5 KiB
package mahonia |
|
|
|
// decoding HTML entities |
|
|
|
import ( |
|
"sort" |
|
) |
|
|
|
// EntityDecoder returns a Decoder that decodes HTML character entities. |
|
// If there is no valid character entity at the current position, it returns INVALID_CHAR. |
|
// So it needs to be combined with another Decoder via FallbackDecoder. |
|
func EntityDecoder() Decoder { |
|
var leftover rune // leftover rune from two-rune entity |
|
return func(p []byte) (r rune, size int, status Status) { |
|
if leftover != 0 { |
|
r = leftover |
|
leftover = 0 |
|
return r, 0, SUCCESS |
|
} |
|
|
|
if len(p) == 0 { |
|
return 0, 0, NO_ROOM |
|
} |
|
|
|
if p[0] != '&' { |
|
return 0xfffd, 1, INVALID_CHAR |
|
} |
|
|
|
if len(p) < 3 { |
|
return 0, 1, NO_ROOM |
|
} |
|
|
|
r, size, status = 0xfffd, 1, INVALID_CHAR |
|
n := 1 // number of bytes read so far |
|
|
|
if p[n] == '#' { |
|
n++ |
|
c := p[n] |
|
hex := false |
|
if c == 'x' || c == 'X' { |
|
hex = true |
|
n++ |
|
} |
|
|
|
var x rune |
|
for n < len(p) { |
|
c = p[n] |
|
n++ |
|
if hex { |
|
if '0' <= c && c <= '9' { |
|
x = 16*x + rune(c) - '0' |
|
continue |
|
} else if 'a' <= c && c <= 'f' { |
|
x = 16*x + rune(c) - 'a' + 10 |
|
continue |
|
} else if 'A' <= c && c <= 'F' { |
|
x = 16*x + rune(c) - 'A' + 10 |
|
continue |
|
} |
|
} else if '0' <= c && c <= '9' { |
|
x = 10*x + rune(c) - '0' |
|
continue |
|
} |
|
if c != ';' { |
|
n-- |
|
} |
|
break |
|
} |
|
|
|
if n == len(p) && p[n-1] != ';' { |
|
return 0, 0, NO_ROOM |
|
} |
|
|
|
size = n |
|
if p[n-1] == ';' { |
|
n-- |
|
} |
|
if hex { |
|
n-- |
|
} |
|
n-- |
|
// Now n is the number of actual digits read. |
|
if n == 0 { |
|
return 0xfffd, 1, INVALID_CHAR |
|
} |
|
|
|
if 0x80 <= x && x <= 0x9F { |
|
// Replace characters from Windows-1252 with UTF-8 equivalents. |
|
x = replacementTable[x-0x80] |
|
} else if x == 0 || (0xD800 <= x && x <= 0xDFFF) || x > 0x10FFFF { |
|
// Replace invalid characters with the replacement character. |
|
return 0xfffd, size, INVALID_CHAR |
|
} |
|
|
|
r = x |
|
status = SUCCESS |
|
return |
|
} |
|
|
|
// Look for a named entity in EntityList. |
|
|
|
possible := entityList |
|
for len(possible) > 0 { |
|
if len(p) <= n { |
|
leftover = 0 |
|
return 0, 0, NO_ROOM |
|
} |
|
|
|
c := p[n] |
|
|
|
// Narrow down the selection in possible to those items that have c in the |
|
// appropriate byte. |
|
first := sort.Search(len(possible), func(i int) bool { |
|
e := possible[i].name |
|
if len(e) < n { |
|
return false |
|
} |
|
return e[n-1] >= c |
|
}) |
|
possible = possible[first:] |
|
last := sort.Search(len(possible), func(i int) bool { |
|
return possible[i].name[n-1] > c |
|
}) |
|
possible = possible[:last] |
|
|
|
n++ |
|
if len(possible) > 0 && len(possible[0].name) == n-1 { |
|
r, leftover = possible[0].r1, possible[0].r2 |
|
size = n |
|
status = SUCCESS |
|
// but don't return yet, since we need the longest match |
|
} |
|
} |
|
|
|
return |
|
} |
|
} |
|
|
|
// replacementTable maps the numeric character references 0x80 through 0x9F
// to the code points they stand for under Windows-1252; entry i is the
// replacement for 0x80+i. It permits compatibility with old numeric entities
// that assumed Windows-1252 encoding.
//
// The table is copied from /src/pkg/html/escape.go in the Go source. See
// http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#consume-a-character-reference
//
// Two further replacements from that specification are not in the table:
// 0x00 -> U+FFFD is handled programmatically, and 0x0D -> U+000D is a no-op.
var replacementTable = [...]rune{
	0x20AC, // 0x80
	0x0081, // 0x81
	0x201A, // 0x82
	0x0192, // 0x83
	0x201E, // 0x84
	0x2026, // 0x85
	0x2020, // 0x86
	0x2021, // 0x87
	0x02C6, // 0x88
	0x2030, // 0x89
	0x0160, // 0x8A
	0x2039, // 0x8B
	0x0152, // 0x8C
	0x008D, // 0x8D
	0x017D, // 0x8E
	0x008F, // 0x8F
	0x0090, // 0x90
	0x2018, // 0x91
	0x2019, // 0x92
	0x201C, // 0x93
	0x201D, // 0x94
	0x2022, // 0x95
	0x2013, // 0x96
	0x2014, // 0x97
	0x02DC, // 0x98
	0x2122, // 0x99
	0x0161, // 0x9A
	0x203A, // 0x9B
	0x0153, // 0x9C
	0x009D, // 0x9D
	0x017E, // 0x9E
	0x0178, // 0x9F
}
|
|
|