"))
+ test.Bytes(t, l.Text(), nil)
+ test.Bytes(t, l.AttrVal(), nil)
+
+ _, data = l.Next()
+ test.Bytes(t, data, []byte("text"))
+ test.Bytes(t, l.Text(), []byte("text"))
+ test.Bytes(t, l.AttrVal(), nil)
+
+ _, data = l.Next()
+ test.Bytes(t, data, []byte(""))
+ test.Bytes(t, l.Text(), []byte("comment"))
+ test.Bytes(t, l.AttrVal(), nil)
+
+ _, data = l.Next()
+ test.Bytes(t, data, []byte(""))
+ test.Bytes(t, l.Text(), []byte(" doctype"))
+ test.Bytes(t, l.AttrVal(), nil)
+
+ _, data = l.Next()
+ test.Bytes(t, data, []byte(""))
+ test.Bytes(t, l.Text(), []byte("cdata"))
+ test.Bytes(t, l.AttrVal(), nil)
+
+ _, data = l.Next()
+ test.Bytes(t, data, []byte(""))
+ test.Bytes(t, l.Text(), []byte("script"))
+ test.Bytes(t, l.AttrVal(), nil)
+
+ _, data = l.Next()
+ test.Bytes(t, data, []byte("
"))
+ test.Bytes(t, l.Text(), []byte("svg"))
+ test.Bytes(t, l.AttrVal(), nil)
+}
+
+func TestOffset(t *testing.T) {
+ l := NewLexer(bytes.NewBufferString(`
text
`))
+ test.T(t, l.Offset(), 0)
+ _, _ = l.Next()
+ test.T(t, l.Offset(), 4) //
+ _, _ = l.Next()
+ test.T(t, l.Offset(), 20) // text
+ _, _ = l.Next()
+ test.T(t, l.Offset(), 26) //
+}
+
////////////////////////////////////////////////////////////////
var J int
diff -Nru golang-github-tdewolff-parse-2.3.9/html/util.go golang-github-tdewolff-parse-2.4.2/html/util.go
--- golang-github-tdewolff-parse-2.3.9/html/util.go 2019-08-22 18:19:17.000000000 +0000
+++ golang-github-tdewolff-parse-2.4.2/html/util.go 2019-12-17 13:35:25.000000000 +0000
@@ -1,82 +1,23 @@
package html
-import "github.com/tdewolff/parse/v2"
-
var (
singleQuoteEntityBytes = []byte("&#39;")
doubleQuoteEntityBytes = []byte("&#34;")
)
-var charTable = [256]bool{
- // ASCII
- false, false, false, false, false, false, false, false,
- false, true, true, true, true, true, false, false, // tab, new line, vertical tab, form feed, carriage return
- false, false, false, false, false, false, false, false,
- false, false, false, false, false, false, false, false,
-
- true, false, true, false, false, false, true, true, // space, ", &, '
- false, false, false, false, false, false, false, false,
- false, false, false, false, false, false, false, false,
- false, false, false, false, true, true, true, false, // <, =, >
-
- false, false, false, false, false, false, false, false,
- false, false, false, false, false, false, false, false,
- false, false, false, false, false, false, false, false,
- false, false, false, false, false, false, false, false,
-
- true, false, false, false, false, false, false, false, // `
- false, false, false, false, false, false, false, false,
- false, false, false, false, false, false, false, false,
- false, false, false, false, false, false, false, false,
-
- // non-ASCII
- false, false, false, false, false, false, false, false,
- false, false, false, false, false, false, false, false,
- false, false, false, false, false, false, false, false,
- false, false, false, false, false, false, false, false,
-
- false, false, false, false, false, false, false, false,
- false, false, false, false, false, false, false, false,
- false, false, false, false, false, false, false, false,
- false, false, false, false, false, false, false, false,
-
- false, false, false, false, false, false, false, false,
- false, false, false, false, false, false, false, false,
- false, false, false, false, false, false, false, false,
- false, false, false, false, false, false, false, false,
-
- false, false, false, false, false, false, false, false,
- false, false, false, false, false, false, false, false,
- false, false, false, false, false, false, false, false,
- false, false, false, false, false, false, false, false,
-}
-
// EscapeAttrVal returns the escaped attribute value bytes without quotes.
func EscapeAttrVal(buf *[]byte, orig, b []byte, isXML bool) []byte {
singles := 0
doubles := 0
unquoted := true
entities := false
- for i, c := range b {
+ for _, c := range b {
if charTable[c] {
- if c == '&' {
- entities = true
- if quote, n := parse.QuoteEntity(b[i:]); n > 0 {
- if quote == '"' {
- unquoted = false
- doubles++
- } else {
- unquoted = false
- singles++
- }
- }
- } else {
- unquoted = false
- if c == '"' {
- doubles++
- } else if c == '\'' {
- singles++
- }
+ unquoted = false
+ if c == '"' {
+ doubles++
+ } else if c == '\'' {
+ singles++
}
}
}
@@ -106,18 +47,7 @@
j := 1
start := 0
for i, c := range b {
- if c == '&' {
- if entityQuote, n := parse.QuoteEntity(b[i:]); n > 0 {
- j += copy(t[j:], b[start:i])
- if entityQuote != quote {
- t[j] = entityQuote
- j++
- } else {
- j += copy(t[j:], escapedQuote)
- }
- start = i + n
- }
- } else if c == quote {
+ if c == quote {
j += copy(t[j:], b[start:i])
j += copy(t[j:], escapedQuote)
start = i + 1
@@ -127,3 +57,1147 @@
t[j] = quote
return t[:j+1]
}
+
+var charTable = [256]bool{
+ // ASCII
+ false, false, false, false, false, false, false, false,
+ false, true, true, false, true, true, false, false, // tab, line feed, form feed, carriage return
+ false, false, false, false, false, false, false, false,
+ false, false, false, false, false, false, false, false,
+
+ true, false, true, false, false, false, false, true, // space, ", '
+ false, false, false, false, false, false, false, false,
+ false, false, false, false, false, false, false, false,
+ false, false, false, false, true, true, true, false, // <, =, >
+
+ false, false, false, false, false, false, false, false,
+ false, false, false, false, false, false, false, false,
+ false, false, false, false, false, false, false, false,
+ false, false, false, false, false, false, false, false,
+
+ true, false, false, false, false, false, false, false, // `
+ false, false, false, false, false, false, false, false,
+ false, false, false, false, false, false, false, false,
+ false, false, false, false, false, false, false, false,
+
+ // non-ASCII
+ false, false, false, false, false, false, false, false,
+ false, false, false, false, false, false, false, false,
+ false, false, false, false, false, false, false, false,
+ false, false, false, false, false, false, false, false,
+
+ false, false, false, false, false, false, false, false,
+ false, false, false, false, false, false, false, false,
+ false, false, false, false, false, false, false, false,
+ false, false, false, false, false, false, false, false,
+
+ false, false, false, false, false, false, false, false,
+ false, false, false, false, false, false, false, false,
+ false, false, false, false, false, false, false, false,
+ false, false, false, false, false, false, false, false,
+
+ false, false, false, false, false, false, false, false,
+ false, false, false, false, false, false, false, false,
+ false, false, false, false, false, false, false, false,
+ false, false, false, false, false, false, false, false,
+}
+
+// EntitiesMap maps named character entity names (without the leading '&' and trailing ';') to the bytes of the characters they represent.
+var EntitiesMap = map[string][]byte{
+ "AElig": []byte("Æ"),
+ "AMP": []byte("&"),
+ "Aacute": []byte("Á"),
+ "Abreve": []byte("Ă"),
+ "Acirc": []byte("Â"),
+ "Agrave": []byte("À"),
+ "Alpha": []byte("Α"),
+ "Amacr": []byte("Ā"),
+ "Aogon": []byte("Ą"),
+ "ApplyFunction": []byte("⁡"),
+ "Aring": []byte("Å"),
+ "Assign": []byte("≔"),
+ "Atilde": []byte("Ã"),
+ "Backslash": []byte("∖"),
+ "Barwed": []byte("⌆"),
+ "Because": []byte("∵"),
+ "Bernoullis": []byte("ℬ"),
+ "Breve": []byte("˘"),
+ "Bumpeq": []byte("≎"),
+ "Cacute": []byte("Ć"),
+ "CapitalDifferentialD": []byte("ⅅ"),
+ "Cayleys": []byte("ℭ"),
+ "Ccaron": []byte("Č"),
+ "Ccedil": []byte("Ç"),
+ "Ccirc": []byte("Ĉ"),
+ "Cconint": []byte("∰"),
+ "Cedilla": []byte("¸"),
+ "CenterDot": []byte("·"),
+ "CircleDot": []byte("⊙"),
+ "CircleMinus": []byte("⊖"),
+ "CirclePlus": []byte("⊕"),
+ "CircleTimes": []byte("⊗"),
+ "ClockwiseContourIntegral": []byte("∲"),
+ "CloseCurlyDoubleQuote": []byte("”"),
+ "CloseCurlyQuote": []byte("’"),
+ "Congruent": []byte("≡"),
+ "Conint": []byte("∯"),
+ "ContourIntegral": []byte("∮"),
+ "Coproduct": []byte("∐"),
+ "CounterClockwiseContourIntegral": []byte("∳"),
+ "CupCap": []byte("≍"),
+ "DDotrahd": []byte("⤑"),
+ "Dagger": []byte("‡"),
+ "Dcaron": []byte("Ď"),
+ "Delta": []byte("Δ"),
+ "DiacriticalAcute": []byte("´"),
+ "DiacriticalDot": []byte("˙"),
+ "DiacriticalDoubleAcute": []byte("˝"),
+ "DiacriticalGrave": []byte("`"),
+ "DiacriticalTilde": []byte("˜"),
+ "Diamond": []byte("⋄"),
+ "DifferentialD": []byte("ⅆ"),
+ "DotDot": []byte("⃜"),
+ "DotEqual": []byte("≐"),
+ "DoubleContourIntegral": []byte("∯"),
+ "DoubleDot": []byte("¨"),
+ "DoubleDownArrow": []byte("⇓"),
+ "DoubleLeftArrow": []byte("⇐"),
+ "DoubleLeftRightArrow": []byte("⇔"),
+ "DoubleLeftTee": []byte("⫤"),
+ "DoubleLongLeftArrow": []byte("⟸"),
+ "DoubleLongLeftRightArrow": []byte("⟺"),
+ "DoubleLongRightArrow": []byte("⟹"),
+ "DoubleRightArrow": []byte("⇒"),
+ "DoubleRightTee": []byte("⊨"),
+ "DoubleUpArrow": []byte("⇑"),
+ "DoubleUpDownArrow": []byte("⇕"),
+ "DoubleVerticalBar": []byte("∥"),
+ "DownArrow": []byte("↓"),
+ "DownArrowBar": []byte("⤓"),
+ "DownArrowUpArrow": []byte("⇵"),
+ "DownBreve": []byte("̑"),
+ "DownLeftRightVector": []byte("⥐"),
+ "DownLeftTeeVector": []byte("⥞"),
+ "DownLeftVector": []byte("↽"),
+ "DownLeftVectorBar": []byte("⥖"),
+ "DownRightTeeVector": []byte("⥟"),
+ "DownRightVector": []byte("⇁"),
+ "DownRightVectorBar": []byte("⥗"),
+ "DownTee": []byte("⊤"),
+ "DownTeeArrow": []byte("↧"),
+ "Downarrow": []byte("⇓"),
+ "Dstrok": []byte("Đ"),
+ "Eacute": []byte("É"),
+ "Ecaron": []byte("Ě"),
+ "Ecirc": []byte("Ê"),
+ "Egrave": []byte("È"),
+ "Element": []byte("∈"),
+ "Emacr": []byte("Ē"),
+ "EmptySmallSquare": []byte("◻"),
+ "EmptyVerySmallSquare": []byte("▫"),
+ "Eogon": []byte("Ę"),
+ "Epsilon": []byte("Ε"),
+ "EqualTilde": []byte("≂"),
+ "Equilibrium": []byte("⇌"),
+ "Exists": []byte("∃"),
+ "ExponentialE": []byte("ⅇ"),
+ "FilledSmallSquare": []byte("◼"),
+ "FilledVerySmallSquare": []byte("▪"),
+ "ForAll": []byte("∀"),
+ "Fouriertrf": []byte("ℱ"),
+ "GT": []byte(">"),
+ "Gamma": []byte("Γ"),
+ "Gammad": []byte("Ϝ"),
+ "Gbreve": []byte("Ğ"),
+ "Gcedil": []byte("Ģ"),
+ "Gcirc": []byte("Ĝ"),
+ "GreaterEqual": []byte("≥"),
+ "GreaterEqualLess": []byte("⋛"),
+ "GreaterFullEqual": []byte("≧"),
+ "GreaterGreater": []byte("⪢"),
+ "GreaterLess": []byte("≷"),
+ "GreaterSlantEqual": []byte("⩾"),
+ "GreaterTilde": []byte("≳"),
+ "HARDcy": []byte("Ъ"),
+ "Hacek": []byte("ˇ"),
+ "Hat": []byte("^"),
+ "Hcirc": []byte("Ĥ"),
+ "HilbertSpace": []byte("ℋ"),
+ "HorizontalLine": []byte("─"),
+ "Hstrok": []byte("Ħ"),
+ "HumpDownHump": []byte("≎"),
+ "HumpEqual": []byte("≏"),
+ "IJlig": []byte("IJ"),
+ "Iacute": []byte("Í"),
+ "Icirc": []byte("Î"),
+ "Ifr": []byte("ℑ"),
+ "Igrave": []byte("Ì"),
+ "Imacr": []byte("Ī"),
+ "ImaginaryI": []byte("ⅈ"),
+ "Implies": []byte("⇒"),
+ "Integral": []byte("∫"),
+ "Intersection": []byte("⋂"),
+ "InvisibleComma": []byte("⁣"),
+ "InvisibleTimes": []byte("⁢"),
+ "Iogon": []byte("Į"),
+ "Itilde": []byte("Ĩ"),
+ "Jcirc": []byte("Ĵ"),
+ "Jsercy": []byte("Ј"),
+ "Kappa": []byte("Κ"),
+ "Kcedil": []byte("Ķ"),
+ "LT": []byte("<"),
+ "Lacute": []byte("Ĺ"),
+ "Lambda": []byte("Λ"),
+ "Laplacetrf": []byte("ℒ"),
+ "Lcaron": []byte("Ľ"),
+ "Lcedil": []byte("Ļ"),
+ "LeftAngleBracket": []byte("〈"),
+ "LeftArrow": []byte("←"),
+ "LeftArrowBar": []byte("⇤"),
+ "LeftArrowRightArrow": []byte("⇆"),
+ "LeftCeiling": []byte("⌈"),
+ "LeftDoubleBracket": []byte("⟦"),
+ "LeftDownTeeVector": []byte("⥡"),
+ "LeftDownVector": []byte("⇃"),
+ "LeftDownVectorBar": []byte("⥙"),
+ "LeftFloor": []byte("⌊"),
+ "LeftRightArrow": []byte("↔"),
+ "LeftRightVector": []byte("⥎"),
+ "LeftTee": []byte("⊣"),
+ "LeftTeeArrow": []byte("↤"),
+ "LeftTeeVector": []byte("⥚"),
+ "LeftTriangle": []byte("⊲"),
+ "LeftTriangleBar": []byte("⧏"),
+ "LeftTriangleEqual": []byte("⊴"),
+ "LeftUpDownVector": []byte("⥑"),
+ "LeftUpTeeVector": []byte("⥠"),
+ "LeftUpVector": []byte("↿"),
+ "LeftUpVectorBar": []byte("⥘"),
+ "LeftVector": []byte("↼"),
+ "LeftVectorBar": []byte("⥒"),
+ "Leftarrow": []byte("⇐"),
+ "Leftrightarrow": []byte("⇔"),
+ "LessEqualGreater": []byte("⋚"),
+ "LessFullEqual": []byte("≦"),
+ "LessGreater": []byte("≶"),
+ "LessLess": []byte("⪡"),
+ "LessSlantEqual": []byte("⩽"),
+ "LessTilde": []byte("≲"),
+ "Lleftarrow": []byte("⇚"),
+ "Lmidot": []byte("Ŀ"),
+ "LongLeftArrow": []byte("⟵"),
+ "LongLeftRightArrow": []byte("⟷"),
+ "LongRightArrow": []byte("⟶"),
+ "Longleftarrow": []byte("⟸"),
+ "Longleftrightarrow": []byte("⟺"),
+ "Longrightarrow": []byte("⟹"),
+ "LowerLeftArrow": []byte("↙"),
+ "LowerRightArrow": []byte("↘"),
+ "Lstrok": []byte("Ł"),
+ "MediumSpace": []byte(" "),
+ "Mellintrf": []byte("ℳ"),
+ "MinusPlus": []byte("∓"),
+ "Nacute": []byte("Ń"),
+ "Ncaron": []byte("Ň"),
+ "Ncedil": []byte("Ņ"),
+ "NegativeMediumSpace": []byte(""),
+ "NegativeThickSpace": []byte(""),
+ "NegativeThinSpace": []byte(""),
+ "NegativeVeryThinSpace": []byte(""),
+ "NestedGreaterGreater": []byte("≫"),
+ "NestedLessLess": []byte("≪"),
+ "NewLine": []byte("\n"),
+ "NoBreak": []byte(""),
+ "NonBreakingSpace": []byte(" "),
+ "NotCongruent": []byte("≢"),
+ "NotCupCap": []byte("≭"),
+ "NotDoubleVerticalBar": []byte("∦"),
+ "NotElement": []byte("∉"),
+ "NotEqual": []byte("≠"),
+ "NotExists": []byte("∄"),
+ "NotGreater": []byte("≯"),
+ "NotGreaterEqual": []byte("≱"),
+ "NotGreaterLess": []byte("≹"),
+ "NotGreaterTilde": []byte("≵"),
+ "NotLeftTriangle": []byte("⋪"),
+ "NotLeftTriangleEqual": []byte("⋬"),
+ "NotLess": []byte("≮"),
+ "NotLessEqual": []byte("≰"),
+ "NotLessGreater": []byte("≸"),
+ "NotLessTilde": []byte("≴"),
+ "NotPrecedes": []byte("⊀"),
+ "NotPrecedesSlantEqual": []byte("⋠"),
+ "NotReverseElement": []byte("∌"),
+ "NotRightTriangle": []byte("⋫"),
+ "NotRightTriangleEqual": []byte("⋭"),
+ "NotSquareSubsetEqual": []byte("⋢"),
+ "NotSquareSupersetEqual": []byte("⋣"),
+ "NotSubsetEqual": []byte("⊈"),
+ "NotSucceeds": []byte("⊁"),
+ "NotSucceedsSlantEqual": []byte("⋡"),
+ "NotSupersetEqual": []byte("⊉"),
+ "NotTilde": []byte("≁"),
+ "NotTildeEqual": []byte("≄"),
+ "NotTildeFullEqual": []byte("≇"),
+ "NotTildeTilde": []byte("≉"),
+ "NotVerticalBar": []byte("∤"),
+ "Ntilde": []byte("Ñ"),
+ "OElig": []byte("Œ"),
+ "Oacute": []byte("Ó"),
+ "Ocirc": []byte("Ô"),
+ "Odblac": []byte("Ő"),
+ "Ograve": []byte("Ò"),
+ "Omacr": []byte("Ō"),
+ "Omega": []byte("Ω"),
+ "Omicron": []byte("Ο"),
+ "OpenCurlyDoubleQuote": []byte("“"),
+ "OpenCurlyQuote": []byte("‘"),
+ "Oslash": []byte("Ø"),
+ "Otilde": []byte("Õ"),
+ "OverBar": []byte("‾"),
+ "OverBrace": []byte("⏞"),
+ "OverBracket": []byte("⎴"),
+ "OverParenthesis": []byte("⏜"),
+ "PartialD": []byte("∂"),
+ "PlusMinus": []byte("±"),
+ "Poincareplane": []byte("ℌ"),
+ "Precedes": []byte("≺"),
+ "PrecedesEqual": []byte("⪯"),
+ "PrecedesSlantEqual": []byte("≼"),
+ "PrecedesTilde": []byte("≾"),
+ "Product": []byte("∏"),
+ "Proportion": []byte("∷"),
+ "Proportional": []byte("∝"),
+ "QUOT": []byte("\""),
+ "Racute": []byte("Ŕ"),
+ "Rcaron": []byte("Ř"),
+ "Rcedil": []byte("Ŗ"),
+ "ReverseElement": []byte("∋"),
+ "ReverseEquilibrium": []byte("⇋"),
+ "ReverseUpEquilibrium": []byte("⥯"),
+ "Rfr": []byte("ℜ"),
+ "RightAngleBracket": []byte("〉"),
+ "RightArrow": []byte("→"),
+ "RightArrowBar": []byte("⇥"),
+ "RightArrowLeftArrow": []byte("⇄"),
+ "RightCeiling": []byte("⌉"),
+ "RightDoubleBracket": []byte("⟧"),
+ "RightDownTeeVector": []byte("⥝"),
+ "RightDownVector": []byte("⇂"),
+ "RightDownVectorBar": []byte("⥕"),
+ "RightFloor": []byte("⌋"),
+ "RightTee": []byte("⊢"),
+ "RightTeeArrow": []byte("↦"),
+ "RightTeeVector": []byte("⥛"),
+ "RightTriangle": []byte("⊳"),
+ "RightTriangleBar": []byte("⧐"),
+ "RightTriangleEqual": []byte("⊵"),
+ "RightUpDownVector": []byte("⥏"),
+ "RightUpTeeVector": []byte("⥜"),
+ "RightUpVector": []byte("↾"),
+ "RightUpVectorBar": []byte("⥔"),
+ "RightVector": []byte("⇀"),
+ "RightVectorBar": []byte("⥓"),
+ "Rightarrow": []byte("⇒"),
+ "RoundImplies": []byte("⥰"),
+ "Rrightarrow": []byte("⇛"),
+ "RuleDelayed": []byte("⧴"),
+ "SHCHcy": []byte("Щ"),
+ "SOFTcy": []byte("Ь"),
+ "Sacute": []byte("Ś"),
+ "Scaron": []byte("Š"),
+ "Scedil": []byte("Ş"),
+ "Scirc": []byte("Ŝ"),
+ "ShortDownArrow": []byte("↓"),
+ "ShortLeftArrow": []byte("←"),
+ "ShortRightArrow": []byte("→"),
+ "ShortUpArrow": []byte("↑"),
+ "Sigma": []byte("Σ"),
+ "SmallCircle": []byte("∘"),
+ "Square": []byte("□"),
+ "SquareIntersection": []byte("⊓"),
+ "SquareSubset": []byte("⊏"),
+ "SquareSubsetEqual": []byte("⊑"),
+ "SquareSuperset": []byte("⊐"),
+ "SquareSupersetEqual": []byte("⊒"),
+ "SquareUnion": []byte("⊔"),
+ "Subset": []byte("⋐"),
+ "SubsetEqual": []byte("⊆"),
+ "Succeeds": []byte("≻"),
+ "SucceedsEqual": []byte("⪰"),
+ "SucceedsSlantEqual": []byte("≽"),
+ "SucceedsTilde": []byte("≿"),
+ "SuchThat": []byte("∋"),
+ "Superset": []byte("⊃"),
+ "SupersetEqual": []byte("⊇"),
+ "Supset": []byte("⋑"),
+ "THORN": []byte("Þ"),
+ "Tab": []byte(" "),
+ "Tcaron": []byte("Ť"),
+ "Tcedil": []byte("Ţ"),
+ "Therefore": []byte("∴"),
+ "Theta": []byte("Θ"),
+ "ThinSpace": []byte(" "),
+ "Tilde": []byte("∼"),
+ "TildeEqual": []byte("≃"),
+ "TildeFullEqual": []byte("≅"),
+ "TildeTilde": []byte("≈"),
+ "TripleDot": []byte("⃛"),
+ "Tstrok": []byte("Ŧ"),
+ "Uacute": []byte("Ú"),
+ "Uarrocir": []byte("⥉"),
+ "Ubreve": []byte("Ŭ"),
+ "Ucirc": []byte("Û"),
+ "Udblac": []byte("Ű"),
+ "Ugrave": []byte("Ù"),
+ "Umacr": []byte("Ū"),
+ "UnderBar": []byte("_"),
+ "UnderBrace": []byte("⏟"),
+ "UnderBracket": []byte("⎵"),
+ "UnderParenthesis": []byte("⏝"),
+ "Union": []byte("⋃"),
+ "UnionPlus": []byte("⊎"),
+ "Uogon": []byte("Ų"),
+ "UpArrow": []byte("↑"),
+ "UpArrowBar": []byte("⤒"),
+ "UpArrowDownArrow": []byte("⇅"),
+ "UpDownArrow": []byte("↕"),
+ "UpEquilibrium": []byte("⥮"),
+ "UpTee": []byte("⊥"),
+ "UpTeeArrow": []byte("↥"),
+ "Uparrow": []byte("⇑"),
+ "Updownarrow": []byte("⇕"),
+ "UpperLeftArrow": []byte("↖"),
+ "UpperRightArrow": []byte("↗"),
+ "Upsilon": []byte("Υ"),
+ "Uring": []byte("Ů"),
+ "Utilde": []byte("Ũ"),
+ "Verbar": []byte("‖"),
+ "VerticalBar": []byte("∣"),
+ "VerticalLine": []byte("|"),
+ "VerticalSeparator": []byte("❘"),
+ "VerticalTilde": []byte("≀"),
+ "VeryThinSpace": []byte(" "),
+ "Vvdash": []byte("⊪"),
+ "Wcirc": []byte("Ŵ"),
+ "Yacute": []byte("Ý"),
+ "Ycirc": []byte("Ŷ"),
+ "Zacute": []byte("Ź"),
+ "Zcaron": []byte("Ž"),
+ "ZeroWidthSpace": []byte(""),
+ "aacute": []byte("á"),
+ "abreve": []byte("ă"),
+ "acirc": []byte("â"),
+ "acute": []byte("´"),
+ "aelig": []byte("æ"),
+ "agrave": []byte("à"),
+ "alefsym": []byte("ℵ"),
+ "alpha": []byte("α"),
+ "amacr": []byte("ā"),
+ "amp": []byte("&"),
+ "andslope": []byte("⩘"),
+ "angle": []byte("∠"),
+ "angmsd": []byte("∡"),
+ "angmsdaa": []byte("⦨"),
+ "angmsdab": []byte("⦩"),
+ "angmsdac": []byte("⦪"),
+ "angmsdad": []byte("⦫"),
+ "angmsdae": []byte("⦬"),
+ "angmsdaf": []byte("⦭"),
+ "angmsdag": []byte("⦮"),
+ "angmsdah": []byte("⦯"),
+ "angrtvb": []byte("⊾"),
+ "angrtvbd": []byte("⦝"),
+ "angsph": []byte("∢"),
+ "angst": []byte("Å"),
+ "angzarr": []byte("⍼"),
+ "aogon": []byte("ą"),
+ "apos": []byte("'"),
+ "approx": []byte("≈"),
+ "approxeq": []byte("≊"),
+ "aring": []byte("å"),
+ "ast": []byte("*"),
+ "asymp": []byte("≈"),
+ "asympeq": []byte("≍"),
+ "atilde": []byte("ã"),
+ "awconint": []byte("∳"),
+ "backcong": []byte("≌"),
+ "backepsilon": []byte("϶"),
+ "backprime": []byte("‵"),
+ "backsim": []byte("∽"),
+ "backsimeq": []byte("⋍"),
+ "barvee": []byte("⊽"),
+ "barwed": []byte("⌅"),
+ "barwedge": []byte("⌅"),
+ "bbrktbrk": []byte("⎶"),
+ "becaus": []byte("∵"),
+ "because": []byte("∵"),
+ "bemptyv": []byte("⦰"),
+ "bernou": []byte("ℬ"),
+ "between": []byte("≬"),
+ "bigcap": []byte("⋂"),
+ "bigcirc": []byte("◯"),
+ "bigcup": []byte("⋃"),
+ "bigodot": []byte("⨀"),
+ "bigoplus": []byte("⨁"),
+ "bigotimes": []byte("⨂"),
+ "bigsqcup": []byte("⨆"),
+ "bigstar": []byte("★"),
+ "bigtriangledown": []byte("▽"),
+ "bigtriangleup": []byte("△"),
+ "biguplus": []byte("⨄"),
+ "bigvee": []byte("⋁"),
+ "bigwedge": []byte("⋀"),
+ "bkarow": []byte("⤍"),
+ "blacklozenge": []byte("⧫"),
+ "blacksquare": []byte("▪"),
+ "blacktriangle": []byte("▴"),
+ "blacktriangledown": []byte("▾"),
+ "blacktriangleleft": []byte("◂"),
+ "blacktriangleright": []byte("▸"),
+ "bottom": []byte("⊥"),
+ "bowtie": []byte("⋈"),
+ "boxminus": []byte("⊟"),
+ "boxplus": []byte("⊞"),
+ "boxtimes": []byte("⊠"),
+ "bprime": []byte("‵"),
+ "breve": []byte("˘"),
+ "brvbar": []byte("¦"),
+ "bsol": []byte("\\"),
+ "bsolhsub": []byte("⟈"),
+ "bullet": []byte("•"),
+ "bumpeq": []byte("≏"),
+ "cacute": []byte("ć"),
+ "capbrcup": []byte("⩉"),
+ "caron": []byte("ˇ"),
+ "ccaron": []byte("č"),
+ "ccedil": []byte("ç"),
+ "ccirc": []byte("ĉ"),
+ "ccupssm": []byte("⩐"),
+ "cedil": []byte("¸"),
+ "cemptyv": []byte("⦲"),
+ "centerdot": []byte("·"),
+ "checkmark": []byte("✓"),
+ "circeq": []byte("≗"),
+ "circlearrowleft": []byte("↺"),
+ "circlearrowright": []byte("↻"),
+ "circledR": []byte("®"),
+ "circledS": []byte("Ⓢ"),
+ "circledast": []byte("⊛"),
+ "circledcirc": []byte("⊚"),
+ "circleddash": []byte("⊝"),
+ "cirfnint": []byte("⨐"),
+ "cirscir": []byte("⧂"),
+ "clubsuit": []byte("♣"),
+ "colon": []byte(":"),
+ "colone": []byte("≔"),
+ "coloneq": []byte("≔"),
+ "comma": []byte(","),
+ "commat": []byte("@"),
+ "compfn": []byte("∘"),
+ "complement": []byte("∁"),
+ "complexes": []byte("ℂ"),
+ "congdot": []byte("⩭"),
+ "conint": []byte("∮"),
+ "coprod": []byte("∐"),
+ "copysr": []byte("℗"),
+ "cudarrl": []byte("⤸"),
+ "cudarrr": []byte("⤵"),
+ "cularr": []byte("↶"),
+ "cularrp": []byte("⤽"),
+ "cupbrcap": []byte("⩈"),
+ "cupdot": []byte("⊍"),
+ "curarr": []byte("↷"),
+ "curarrm": []byte("⤼"),
+ "curlyeqprec": []byte("⋞"),
+ "curlyeqsucc": []byte("⋟"),
+ "curlyvee": []byte("⋎"),
+ "curlywedge": []byte("⋏"),
+ "curren": []byte("¤"),
+ "curvearrowleft": []byte("↶"),
+ "curvearrowright": []byte("↷"),
+ "cwconint": []byte("∲"),
+ "cylcty": []byte("⌭"),
+ "dagger": []byte("†"),
+ "daleth": []byte("ℸ"),
+ "dbkarow": []byte("⤏"),
+ "dblac": []byte("˝"),
+ "dcaron": []byte("ď"),
+ "ddagger": []byte("‡"),
+ "ddotseq": []byte("⩷"),
+ "delta": []byte("δ"),
+ "demptyv": []byte("⦱"),
+ "diamond": []byte("⋄"),
+ "diamondsuit": []byte("♦"),
+ "digamma": []byte("ϝ"),
+ "divide": []byte("÷"),
+ "divideontimes": []byte("⋇"),
+ "divonx": []byte("⋇"),
+ "dlcorn": []byte("⌞"),
+ "dlcrop": []byte("⌍"),
+ "dollar": []byte("$"),
+ "doteqdot": []byte("≑"),
+ "dotminus": []byte("∸"),
+ "dotplus": []byte("∔"),
+ "dotsquare": []byte("⊡"),
+ "doublebarwedge": []byte("⌆"),
+ "downarrow": []byte("↓"),
+ "downdownarrows": []byte("⇊"),
+ "downharpoonleft": []byte("⇃"),
+ "downharpoonright": []byte("⇂"),
+ "drbkarow": []byte("⤐"),
+ "drcorn": []byte("⌟"),
+ "drcrop": []byte("⌌"),
+ "dstrok": []byte("đ"),
+ "dwangle": []byte("⦦"),
+ "dzigrarr": []byte("⟿"),
+ "eacute": []byte("é"),
+ "ecaron": []byte("ě"),
+ "ecirc": []byte("ê"),
+ "ecolon": []byte("≕"),
+ "egrave": []byte("è"),
+ "elinters": []byte("⏧"),
+ "emacr": []byte("ē"),
+ "emptyset": []byte("∅"),
+ "emptyv": []byte("∅"),
+ "emsp13": []byte(" "),
+ "emsp14": []byte(" "),
+ "eogon": []byte("ę"),
+ "epsilon": []byte("ε"),
+ "eqcirc": []byte("≖"),
+ "eqcolon": []byte("≕"),
+ "eqsim": []byte("≂"),
+ "eqslantgtr": []byte("⪖"),
+ "eqslantless": []byte("⪕"),
+ "equals": []byte("="),
+ "equest": []byte("≟"),
+ "equivDD": []byte("⩸"),
+ "eqvparsl": []byte("⧥"),
+ "excl": []byte("!"),
+ "expectation": []byte("ℰ"),
+ "exponentiale": []byte("ⅇ"),
+ "fallingdotseq": []byte("≒"),
+ "female": []byte("♀"),
+ "forall": []byte("∀"),
+ "fpartint": []byte("⨍"),
+ "frac12": []byte("½"),
+ "frac13": []byte("⅓"),
+ "frac14": []byte("¼"),
+ "frac15": []byte("⅕"),
+ "frac16": []byte("⅙"),
+ "frac18": []byte("⅛"),
+ "frac23": []byte("⅔"),
+ "frac25": []byte("⅖"),
+ "frac34": []byte("¾"),
+ "frac35": []byte("⅗"),
+ "frac38": []byte("⅜"),
+ "frac45": []byte("⅘"),
+ "frac56": []byte("⅚"),
+ "frac58": []byte("⅝"),
+ "frac78": []byte("⅞"),
+ "gacute": []byte("ǵ"),
+ "gamma": []byte("γ"),
+ "gammad": []byte("ϝ"),
+ "gbreve": []byte("ğ"),
+ "gcirc": []byte("ĝ"),
+ "geq": []byte("≥"),
+ "geqq": []byte("≧"),
+ "geqslant": []byte("⩾"),
+ "gesdoto": []byte("⪂"),
+ "gesdotol": []byte("⪄"),
+ "ggg": []byte("⋙"),
+ "gnapprox": []byte("⪊"),
+ "gneq": []byte("⪈"),
+ "gneqq": []byte("≩"),
+ "grave": []byte("`"),
+ "gt": []byte(">"),
+ "gtquest": []byte("⩼"),
+ "gtrapprox": []byte("⪆"),
+ "gtrdot": []byte("⋗"),
+ "gtreqless": []byte("⋛"),
+ "gtreqqless": []byte("⪌"),
+ "gtrless": []byte("≷"),
+ "gtrsim": []byte("≳"),
+ "hArr": []byte("⇔"),
+ "hairsp": []byte(" "),
+ "hamilt": []byte("ℋ"),
+ "hardcy": []byte("ъ"),
+ "harrcir": []byte("⥈"),
+ "hcirc": []byte("ĥ"),
+ "hearts": []byte("♥"),
+ "heartsuit": []byte("♥"),
+ "hellip": []byte("…"),
+ "hercon": []byte("⊹"),
+ "hksearow": []byte("⤥"),
+ "hkswarow": []byte("⤦"),
+ "homtht": []byte("∻"),
+ "hookleftarrow": []byte("↩"),
+ "hookrightarrow": []byte("↪"),
+ "horbar": []byte("―"),
+ "hslash": []byte("ℏ"),
+ "hstrok": []byte("ħ"),
+ "hybull": []byte("⁃"),
+ "hyphen": []byte("‐"),
+ "iacute": []byte("í"),
+ "icirc": []byte("î"),
+ "iexcl": []byte("¡"),
+ "igrave": []byte("ì"),
+ "iiiint": []byte("⨌"),
+ "iiint": []byte("∭"),
+ "ijlig": []byte("ij"),
+ "imacr": []byte("ī"),
+ "image": []byte("ℑ"),
+ "imagline": []byte("ℐ"),
+ "imagpart": []byte("ℑ"),
+ "imath": []byte("ı"),
+ "imped": []byte("Ƶ"),
+ "incare": []byte("℅"),
+ "infintie": []byte("⧝"),
+ "inodot": []byte("ı"),
+ "intcal": []byte("⊺"),
+ "integers": []byte("ℤ"),
+ "intercal": []byte("⊺"),
+ "intlarhk": []byte("⨗"),
+ "intprod": []byte("⨼"),
+ "iogon": []byte("į"),
+ "iquest": []byte("¿"),
+ "isin": []byte("∈"),
+ "isindot": []byte("⋵"),
+ "isinsv": []byte("⋳"),
+ "isinv": []byte("∈"),
+ "itilde": []byte("ĩ"),
+ "jcirc": []byte("ĵ"),
+ "jmath": []byte("ȷ"),
+ "jsercy": []byte("ј"),
+ "kappa": []byte("κ"),
+ "kappav": []byte("ϰ"),
+ "kcedil": []byte("ķ"),
+ "kgreen": []byte("ĸ"),
+ "lacute": []byte("ĺ"),
+ "laemptyv": []byte("⦴"),
+ "lagran": []byte("ℒ"),
+ "lambda": []byte("λ"),
+ "langle": []byte("〈"),
+ "laquo": []byte("«"),
+ "larrbfs": []byte("⤟"),
+ "larrhk": []byte("↩"),
+ "larrlp": []byte("↫"),
+ "larrsim": []byte("⥳"),
+ "larrtl": []byte("↢"),
+ "lbrace": []byte("{"),
+ "lbrack": []byte("["),
+ "lbrksld": []byte("⦏"),
+ "lbrkslu": []byte("⦍"),
+ "lcaron": []byte("ľ"),
+ "lcedil": []byte("ļ"),
+ "lcub": []byte("{"),
+ "ldquor": []byte("„"),
+ "ldrdhar": []byte("⥧"),
+ "ldrushar": []byte("⥋"),
+ "leftarrow": []byte("←"),
+ "leftarrowtail": []byte("↢"),
+ "leftharpoondown": []byte("↽"),
+ "leftharpoonup": []byte("↼"),
+ "leftleftarrows": []byte("⇇"),
+ "leftrightarrow": []byte("↔"),
+ "leftrightarrows": []byte("⇆"),
+ "leftrightharpoons": []byte("⇋"),
+ "leftrightsquigarrow": []byte("↭"),
+ "leftthreetimes": []byte("⋋"),
+ "leq": []byte("≤"),
+ "leqq": []byte("≦"),
+ "leqslant": []byte("⩽"),
+ "lesdoto": []byte("⪁"),
+ "lesdotor": []byte("⪃"),
+ "lessapprox": []byte("⪅"),
+ "lessdot": []byte("⋖"),
+ "lesseqgtr": []byte("⋚"),
+ "lesseqqgtr": []byte("⪋"),
+ "lessgtr": []byte("≶"),
+ "lesssim": []byte("≲"),
+ "lfloor": []byte("⌊"),
+ "llcorner": []byte("⌞"),
+ "lmidot": []byte("ŀ"),
+ "lmoust": []byte("⎰"),
+ "lmoustache": []byte("⎰"),
+ "lnapprox": []byte("⪉"),
+ "lneq": []byte("⪇"),
+ "lneqq": []byte("≨"),
+ "longleftarrow": []byte("⟵"),
+ "longleftrightarrow": []byte("⟷"),
+ "longmapsto": []byte("⟼"),
+ "longrightarrow": []byte("⟶"),
+ "looparrowleft": []byte("↫"),
+ "looparrowright": []byte("↬"),
+ "lotimes": []byte("⨴"),
+ "lowast": []byte("∗"),
+ "lowbar": []byte("_"),
+ "lozenge": []byte("◊"),
+ "lpar": []byte("("),
+ "lrcorner": []byte("⌟"),
+ "lsaquo": []byte("‹"),
+ "lsqb": []byte("["),
+ "lsquor": []byte("‚"),
+ "lstrok": []byte("ł"),
+ "lt": []byte("<"),
+ "lthree": []byte("⋋"),
+ "ltimes": []byte("⋉"),
+ "ltquest": []byte("⩻"),
+ "lurdshar": []byte("⥊"),
+ "luruhar": []byte("⥦"),
+ "maltese": []byte("✠"),
+ "mapsto": []byte("↦"),
+ "mapstodown": []byte("↧"),
+ "mapstoleft": []byte("↤"),
+ "mapstoup": []byte("↥"),
+ "marker": []byte("▮"),
+ "measuredangle": []byte("∡"),
+ "micro": []byte("µ"),
+ "midast": []byte("*"),
+ "middot": []byte("·"),
+ "minusb": []byte("⊟"),
+ "minusd": []byte("∸"),
+ "minusdu": []byte("⨪"),
+ "mnplus": []byte("∓"),
+ "models": []byte("⊧"),
+ "mstpos": []byte("∾"),
+ "multimap": []byte("⊸"),
+ "nLeftarrow": []byte("⇍"),
+ "nLeftrightarrow": []byte("⇎"),
+ "nRightarrow": []byte("⇏"),
+ "nVDash": []byte("⊯"),
+ "nVdash": []byte("⊮"),
+ "nabla": []byte("∇"),
+ "nacute": []byte("ń"),
+ "napos": []byte("ʼn"),
+ "napprox": []byte("≉"),
+ "natural": []byte("♮"),
+ "naturals": []byte("ℕ"),
+ "ncaron": []byte("ň"),
+ "ncedil": []byte("ņ"),
+ "nearrow": []byte("↗"),
+ "nequiv": []byte("≢"),
+ "nesear": []byte("⤨"),
+ "nexist": []byte("∄"),
+ "nexists": []byte("∄"),
+ "ngeq": []byte("≱"),
+ "ngtr": []byte("≯"),
+ "niv": []byte("∋"),
+ "nleftarrow": []byte("↚"),
+ "nleftrightarrow": []byte("↮"),
+ "nleq": []byte("≰"),
+ "nless": []byte("≮"),
+ "nltrie": []byte("⋬"),
+ "notinva": []byte("∉"),
+ "notinvb": []byte("⋷"),
+ "notinvc": []byte("⋶"),
+ "notniva": []byte("∌"),
+ "notnivb": []byte("⋾"),
+ "notnivc": []byte("⋽"),
+ "nparallel": []byte("∦"),
+ "npolint": []byte("⨔"),
+ "nprcue": []byte("⋠"),
+ "nprec": []byte("⊀"),
+ "nrightarrow": []byte("↛"),
+ "nrtrie": []byte("⋭"),
+ "nsccue": []byte("⋡"),
+ "nshortmid": []byte("∤"),
+ "nshortparallel": []byte("∦"),
+ "nsimeq": []byte("≄"),
+ "nsmid": []byte("∤"),
+ "nspar": []byte("∦"),
+ "nsqsube": []byte("⋢"),
+ "nsqsupe": []byte("⋣"),
+ "nsubseteq": []byte("⊈"),
+ "nsucc": []byte("⊁"),
+ "nsupseteq": []byte("⊉"),
+ "ntilde": []byte("ñ"),
+ "ntriangleleft": []byte("⋪"),
+ "ntrianglelefteq": []byte("⋬"),
+ "ntriangleright": []byte("⋫"),
+ "ntrianglerighteq": []byte("⋭"),
+ "num": []byte("#"),
+ "numero": []byte("№"),
+ "nvDash": []byte("⊭"),
+ "nvdash": []byte("⊬"),
+ "nvinfin": []byte("⧞"),
+ "nwarrow": []byte("↖"),
+ "oacute": []byte("ó"),
+ "ocirc": []byte("ô"),
+ "odblac": []byte("ő"),
+ "oelig": []byte("œ"),
+ "ograve": []byte("ò"),
+ "olcross": []byte("⦻"),
+ "omacr": []byte("ō"),
+ "omega": []byte("ω"),
+ "omicron": []byte("ο"),
+ "ominus": []byte("⊖"),
+ "order": []byte("ℴ"),
+ "orderof": []byte("ℴ"),
+ "origof": []byte("⊶"),
+ "orslope": []byte("⩗"),
+ "oslash": []byte("ø"),
+ "otilde": []byte("õ"),
+ "otimes": []byte("⊗"),
+ "otimesas": []byte("⨶"),
+ "parallel": []byte("∥"),
+ "percnt": []byte("%"),
+ "period": []byte("."),
+ "permil": []byte("‰"),
+ "perp": []byte("⊥"),
+ "pertenk": []byte("‱"),
+ "phmmat": []byte("ℳ"),
+ "pitchfork": []byte("⋔"),
+ "planck": []byte("ℏ"),
+ "planckh": []byte("ℎ"),
+ "plankv": []byte("ℏ"),
+ "plus": []byte("+"),
+ "plusacir": []byte("⨣"),
+ "pluscir": []byte("⨢"),
+ "plusdo": []byte("∔"),
+ "plusmn": []byte("±"),
+ "plussim": []byte("⨦"),
+ "plustwo": []byte("⨧"),
+ "pointint": []byte("⨕"),
+ "pound": []byte("£"),
+ "prec": []byte("≺"),
+ "precapprox": []byte("⪷"),
+ "preccurlyeq": []byte("≼"),
+ "preceq": []byte("⪯"),
+ "precnapprox": []byte("⪹"),
+ "precneqq": []byte("⪵"),
+ "precnsim": []byte("⋨"),
+ "precsim": []byte("≾"),
+ "primes": []byte("ℙ"),
+ "prnsim": []byte("⋨"),
+ "profalar": []byte("⌮"),
+ "profline": []byte("⌒"),
+ "profsurf": []byte("⌓"),
+ "propto": []byte("∝"),
+ "prurel": []byte("⊰"),
+ "puncsp": []byte(" "),
+ "qprime": []byte("⁗"),
+ "quaternions": []byte("ℍ"),
+ "quatint": []byte("⨖"),
+ "quest": []byte("?"),
+ "questeq": []byte("≟"),
+ "quot": []byte("\""),
+ "racute": []byte("ŕ"),
+ "radic": []byte("√"),
+ "raemptyv": []byte("⦳"),
+ "rangle": []byte("〉"),
+ "raquo": []byte("»"),
+ "rarrbfs": []byte("⤠"),
+ "rarrhk": []byte("↪"),
+ "rarrlp": []byte("↬"),
+ "rarrsim": []byte("⥴"),
+ "rarrtl": []byte("↣"),
+ "rationals": []byte("ℚ"),
+ "rbrace": []byte("}"),
+ "rbrack": []byte("]"),
+ "rbrksld": []byte("⦎"),
+ "rbrkslu": []byte("⦐"),
+ "rcaron": []byte("ř"),
+ "rcedil": []byte("ŗ"),
+ "rcub": []byte("}"),
+ "rdldhar": []byte("⥩"),
+ "rdquor": []byte("”"),
+ "real": []byte("ℜ"),
+ "realine": []byte("ℛ"),
+ "realpart": []byte("ℜ"),
+ "reals": []byte("ℝ"),
+ "rfloor": []byte("⌋"),
+ "rightarrow": []byte("→"),
+ "rightarrowtail": []byte("↣"),
+ "rightharpoondown": []byte("⇁"),
+ "rightharpoonup": []byte("⇀"),
+ "rightleftarrows": []byte("⇄"),
+ "rightleftharpoons": []byte("⇌"),
+ "rightrightarrows": []byte("⇉"),
+ "rightsquigarrow": []byte("↝"),
+ "rightthreetimes": []byte("⋌"),
+ "risingdotseq": []byte("≓"),
+ "rmoust": []byte("⎱"),
+ "rmoustache": []byte("⎱"),
+ "rotimes": []byte("⨵"),
+ "rpar": []byte(")"),
+ "rppolint": []byte("⨒"),
+ "rsaquo": []byte("›"),
+ "rsqb": []byte("]"),
+ "rsquor": []byte("’"),
+ "rthree": []byte("⋌"),
+ "rtimes": []byte("⋊"),
+ "rtriltri": []byte("⧎"),
+ "ruluhar": []byte("⥨"),
+ "sacute": []byte("ś"),
+ "scaron": []byte("š"),
+ "scedil": []byte("ş"),
+ "scirc": []byte("ŝ"),
+ "scnsim": []byte("⋩"),
+ "scpolint": []byte("⨓"),
+ "searrow": []byte("↘"),
+ "semi": []byte(";"),
+ "seswar": []byte("⤩"),
+ "setminus": []byte("∖"),
+ "sfrown": []byte("⌢"),
+ "shchcy": []byte("щ"),
+ "shortmid": []byte("∣"),
+ "shortparallel": []byte("∥"),
+ "sigma": []byte("σ"),
+ "sigmaf": []byte("ς"),
+ "sigmav": []byte("ς"),
+ "simeq": []byte("≃"),
+ "simplus": []byte("⨤"),
+ "simrarr": []byte("⥲"),
+ "slarr": []byte("←"),
+ "smallsetminus": []byte("∖"),
+ "smeparsl": []byte("⧤"),
+ "smid": []byte("∣"),
+ "softcy": []byte("ь"),
+ "sol": []byte("/"),
+ "solbar": []byte("⌿"),
+ "spades": []byte("♠"),
+ "spadesuit": []byte("♠"),
+ "spar": []byte("∥"),
+ "sqsube": []byte("⊑"),
+ "sqsubset": []byte("⊏"),
+ "sqsubseteq": []byte("⊑"),
+ "sqsupe": []byte("⊒"),
+ "sqsupset": []byte("⊐"),
+ "sqsupseteq": []byte("⊒"),
+ "square": []byte("□"),
+ "squarf": []byte("▪"),
+ "srarr": []byte("→"),
+ "ssetmn": []byte("∖"),
+ "ssmile": []byte("⌣"),
+ "sstarf": []byte("⋆"),
+ "straightepsilon": []byte("ϵ"),
+ "straightphi": []byte("ϕ"),
+ "strns": []byte("¯"),
+ "subedot": []byte("⫃"),
+ "submult": []byte("⫁"),
+ "subplus": []byte("⪿"),
+ "subrarr": []byte("⥹"),
+ "subset": []byte("⊂"),
+ "subseteq": []byte("⊆"),
+ "subseteqq": []byte("⫅"),
+ "subsetneq": []byte("⊊"),
+ "subsetneqq": []byte("⫋"),
+ "succ": []byte("≻"),
+ "succapprox": []byte("⪸"),
+ "succcurlyeq": []byte("≽"),
+ "succeq": []byte("⪰"),
+ "succnapprox": []byte("⪺"),
+ "succneqq": []byte("⪶"),
+ "succnsim": []byte("⋩"),
+ "succsim": []byte("≿"),
+ "supdsub": []byte("⫘"),
+ "supedot": []byte("⫄"),
+ "suphsol": []byte("⟉"),
+ "suphsub": []byte("⫗"),
+ "suplarr": []byte("⥻"),
+ "supmult": []byte("⫂"),
+ "supplus": []byte("⫀"),
+ "supset": []byte("⊃"),
+ "supseteq": []byte("⊇"),
+ "supseteqq": []byte("⫆"),
+ "supsetneq": []byte("⊋"),
+ "supsetneqq": []byte("⫌"),
+ "swarrow": []byte("↙"),
+ "szlig": []byte("ß"),
+ "target": []byte("⌖"),
+ "tcaron": []byte("ť"),
+ "tcedil": []byte("ţ"),
+ "telrec": []byte("⌕"),
+ "there4": []byte("∴"),
+ "therefore": []byte("∴"),
+ "theta": []byte("θ"),
+ "thetasym": []byte("ϑ"),
+ "thetav": []byte("ϑ"),
+ "thickapprox": []byte("≈"),
+ "thicksim": []byte("∼"),
+ "thinsp": []byte(" "),
+ "thkap": []byte("≈"),
+ "thksim": []byte("∼"),
+ "thorn": []byte("þ"),
+ "tilde": []byte("˜"),
+ "times": []byte("×"),
+ "timesb": []byte("⊠"),
+ "timesbar": []byte("⨱"),
+ "topbot": []byte("⌶"),
+ "topfork": []byte("⫚"),
+ "tprime": []byte("‴"),
+ "triangle": []byte("▵"),
+ "triangledown": []byte("▿"),
+ "triangleleft": []byte("◃"),
+ "trianglelefteq": []byte("⊴"),
+ "triangleq": []byte("≜"),
+ "triangleright": []byte("▹"),
+ "trianglerighteq": []byte("⊵"),
+ "tridot": []byte("◬"),
+ "triminus": []byte("⨺"),
+ "triplus": []byte("⨹"),
+ "tritime": []byte("⨻"),
+ "trpezium": []byte("⏢"),
+ "tstrok": []byte("ŧ"),
+ "twoheadleftarrow": []byte("↞"),
+ "twoheadrightarrow": []byte("↠"),
+ "uacute": []byte("ú"),
+ "ubreve": []byte("ŭ"),
+ "ucirc": []byte("û"),
+ "udblac": []byte("ű"),
+ "ugrave": []byte("ù"),
+ "ulcorn": []byte("⌜"),
+ "ulcorner": []byte("⌜"),
+ "ulcrop": []byte("⌏"),
+ "umacr": []byte("ū"),
+ "uogon": []byte("ų"),
+ "uparrow": []byte("↑"),
+ "updownarrow": []byte("↕"),
+ "upharpoonleft": []byte("↿"),
+ "upharpoonright": []byte("↾"),
+ "upsih": []byte("ϒ"),
+ "upsilon": []byte("υ"),
+ "upuparrows": []byte("⇈"),
+ "urcorn": []byte("⌝"),
+ "urcorner": []byte("⌝"),
+ "urcrop": []byte("⌎"),
+ "uring": []byte("ů"),
+ "utilde": []byte("ũ"),
+ "uwangle": []byte("⦧"),
+ "varepsilon": []byte("ϵ"),
+ "varkappa": []byte("ϰ"),
+ "varnothing": []byte("∅"),
+ "varphi": []byte("ϕ"),
+ "varpi": []byte("ϖ"),
+ "varpropto": []byte("∝"),
+ "varrho": []byte("ϱ"),
+ "varsigma": []byte("ς"),
+ "vartheta": []byte("ϑ"),
+ "vartriangleleft": []byte("⊲"),
+ "vartriangleright": []byte("⊳"),
+ "vee": []byte("∨"),
+ "veebar": []byte("⊻"),
+ "vellip": []byte("⋮"),
+ "verbar": []byte("|"),
+ "vert": []byte("|"),
+ "vprop": []byte("∝"),
+ "vzigzag": []byte("⦚"),
+ "wcirc": []byte("ŵ"),
+ "wedge": []byte("∧"),
+ "wedgeq": []byte("≙"),
+ "weierp": []byte("℘"),
+ "wreath": []byte("≀"),
+ "xvee": []byte("⋁"),
+ "xwedge": []byte("⋀"),
+ "yacute": []byte("ý"),
+ "ycirc": []byte("ŷ"),
+ "zacute": []byte("ź"),
+ "zcaron": []byte("ž"),
+ "zeetrf": []byte("ℨ"),
+ "zigrarr": []byte("⇝"),
+}
+
+var TextRevEntitiesMap = map[byte][]byte{
+ '<': []byte("<"),
+}
diff -Nru golang-github-tdewolff-parse-2.3.9/html/util_test.go golang-github-tdewolff-parse-2.4.2/html/util_test.go
--- golang-github-tdewolff-parse-2.3.9/html/util_test.go 2019-08-22 18:19:17.000000000 +0000
+++ golang-github-tdewolff-parse-2.4.2/html/util_test.go 2019-12-17 13:35:25.000000000 +0000
@@ -13,20 +13,17 @@
}{
{`xyz`, `xyz`},
{``, ``},
- {`x&z`, `x&z`},
{`x/z`, `x/z`},
{`x'z`, `"x'z"`},
{`x"z`, `'x"z'`},
{`'x"z'`, `'x"z'`},
- {`'x'"'z'`, `"x'"'z"`},
- {`"x"'"z"`, `'x"'"z'`},
- {`"x'z"`, `"x'z"`},
- {`'x"z'`, `'x"z'`},
- {`'x">'`, `'x">'`},
- {`You're encouraged to log in; however, it's not mandatory. [o]`, `"You're encouraged to log in; however, it's not mandatory. [o]"`},
+ {`'x'"'z'`, `"x'"'z"`},
+ {`"x"'"z"`, `'x"'"z'`},
+ {`"x'z"`, `"x'z"`},
+ {`'x'z'`, `"x'z"`},
{`a'b=""`, `'a'b=""'`},
{`x
= 0xC0 {
- if r, n := l.PeekRune(0); r == '\u2028' || r == '\u2029' {
- nNewline = n
+ var r rune
+ if r, n = l.PeekRune(0); r == '\u2028' || r == '\u2029' {
+ newline = true
}
- } else {
- l.Move(1)
}
- if nNewline > 0 {
- if offset < l.Pos()+nNewline {
- // move onto offset position, let next iteration handle it
- l.Move(offset - l.Pos())
- continue
- }
- l.Move(nNewline)
+ if 1 < n && offset < l.Pos()+n {
+ // move onto offset position, let next iteration handle it
+ l.Move(offset - l.Pos())
+ continue
+ }
+ l.Move(n)
+
+ if newline {
line++
offset -= l.Pos()
l.Skip()
diff -Nru golang-github-tdewolff-parse-2.3.9/position_test.go golang-github-tdewolff-parse-2.4.2/position_test.go
--- golang-github-tdewolff-parse-2.3.9/position_test.go 2019-08-22 18:19:17.000000000 +0000
+++ golang-github-tdewolff-parse-2.4.2/position_test.go 2019-12-17 13:35:25.000000000 +0000
@@ -35,6 +35,7 @@
{1, "x\u2028x", 1, 2},
{2, "x\u2028x", 1, 3},
{3, "x\u2028x", 1, 4},
+ {2, "x\u2318x", 1, 3},
}
for _, tt := range newlineTests {
t.Run(fmt.Sprint(tt.buf, " ", tt.offset), func(t *testing.T) {
diff -Nru golang-github-tdewolff-parse-2.3.9/strconv/float.go golang-github-tdewolff-parse-2.4.2/strconv/float.go
--- golang-github-tdewolff-parse-2.3.9/strconv/float.go 2019-08-22 18:19:17.000000000 +0000
+++ golang-github-tdewolff-parse-2.4.2/strconv/float.go 2019-12-17 13:35:25.000000000 +0000
@@ -1,6 +1,8 @@
package strconv
-import "math"
+import (
+ "math"
+)
var float64pow10 = []float64{
1e0, 1e1, 1e2, 1e3, 1e4, 1e5, 1e6, 1e7, 1e8, 1e9,
@@ -83,8 +85,7 @@
return f * math.Pow10(int(expExp)), i
}
-const log2 = 0.301029995
-const int64maxlen = 18
+const log2 = 0.3010299956639812
func float64exp(f float64) int {
exp2 := 0
@@ -100,11 +101,10 @@
return int(exp10)
}
+// AppendFloat appends a float to `b` with precision `prec`. It returns the new slice and whether successful or not. Precision is the number of decimals to display, thus prec + 1 == number of significant digits.
func AppendFloat(b []byte, f float64, prec int) ([]byte, bool) {
if math.IsNaN(f) || math.IsInf(f, 0) {
return b, false
- } else if prec >= int64maxlen {
- return b, false
}
neg := false
@@ -112,8 +112,8 @@
f = -f
neg = true
}
- if prec == -1 {
- prec = int64maxlen - 1
+ if prec < 0 || 17 < prec {
+ prec = 17 // maximum number of significant digits in double
}
prec -= float64exp(f) // number of digits in front of the dot
f *= math.Pow10(prec)
diff -Nru golang-github-tdewolff-parse-2.3.9/strconv/float_test.go golang-github-tdewolff-parse-2.4.2/strconv/float_test.go
--- golang-github-tdewolff-parse-2.3.9/strconv/float_test.go 2019-08-22 18:19:17.000000000 +0000
+++ golang-github-tdewolff-parse-2.4.2/strconv/float_test.go 2019-12-17 13:35:25.000000000 +0000
@@ -28,9 +28,11 @@
// {"4.9406564584124e-308", 4.9406564584124e-308)
}
for _, tt := range floatTests {
- f, n := ParseFloat([]byte(tt.f))
- test.That(t, n == len(tt.f), "parsed", n, "characters instead for", tt.f)
- test.That(t, f == tt.expected, "return", tt.expected, "for", tt.f)
+ t.Run(fmt.Sprint(tt.f), func(t *testing.T) {
+ f, n := ParseFloat([]byte(tt.f))
+ test.T(t, n, len(tt.f))
+ test.T(t, f, tt.expected)
+ })
}
}
@@ -73,12 +75,18 @@
{math.NaN(), 0, ""},
{math.Inf(1), 0, ""},
{math.Inf(-1), 0, ""},
- {0, 19, ""},
- {.000923361977200859392, -1, "9.23361977200859392e-4"},
+ {0, 19, "0"},
+ {0.000923361977200859392, -1, "9.23361977200859392e-4"},
+ {1234, 2, "1.23e3"},
+ {12345, 2, "1.23e4"},
+ {12.345, 2, "12.3"},
+ {12.345, 3, "12.34"},
}
for _, tt := range floatTests {
- f, _ := AppendFloat([]byte{}, tt.f, tt.prec)
- test.String(t, string(f), tt.expected, "for", tt.f)
+ t.Run(fmt.Sprint(tt.f), func(t *testing.T) {
+ f, _ := AppendFloat([]byte{}, tt.f, tt.prec)
+ test.String(t, string(f), tt.expected)
+ })
}
b := make([]byte, 0, 22)
diff -Nru golang-github-tdewolff-parse-2.3.9/strconv/int_test.go golang-github-tdewolff-parse-2.4.2/strconv/int_test.go
--- golang-github-tdewolff-parse-2.3.9/strconv/int_test.go 2019-08-22 18:19:17.000000000 +0000
+++ golang-github-tdewolff-parse-2.4.2/strconv/int_test.go 2019-12-17 13:35:25.000000000 +0000
@@ -1,6 +1,7 @@
package strconv
import (
+ "fmt"
"math"
"math/rand"
"testing"
@@ -27,14 +28,16 @@
{"a", 0},
}
for _, tt := range intTests {
- i, _ := ParseInt([]byte(tt.i))
- test.That(t, i == tt.expected, "return", tt.expected, "for", tt.i)
+ t.Run(fmt.Sprint(tt.i), func(t *testing.T) {
+ i, _ := ParseInt([]byte(tt.i))
+ test.T(t, i, tt.expected)
+ })
}
}
func TestLenInt(t *testing.T) {
lenIntTests := []struct {
- number int64
+ i int64
expected int
}{
{0, 1},
@@ -64,7 +67,9 @@
{1000000000000000000, 19},
}
for _, tt := range lenIntTests {
- test.That(t, LenInt(tt.number) == tt.expected, "return", tt.expected, "for", tt.number)
+ t.Run(fmt.Sprint(tt.i), func(t *testing.T) {
+ test.T(t, LenInt(tt.i), tt.expected)
+ })
}
}
diff -Nru golang-github-tdewolff-parse-2.3.9/strconv/price_test.go golang-github-tdewolff-parse-2.4.2/strconv/price_test.go
--- golang-github-tdewolff-parse-2.3.9/strconv/price_test.go 2019-08-22 18:19:17.000000000 +0000
+++ golang-github-tdewolff-parse-2.4.2/strconv/price_test.go 2019-12-17 13:35:25.000000000 +0000
@@ -1,6 +1,7 @@
package strconv
import (
+ "fmt"
"testing"
"github.com/tdewolff/test"
@@ -26,10 +27,11 @@
}
for _, tt := range priceTests {
- price := AppendPrice(make([]byte, 0, 4), tt.price, tt.dec, ',', '.')
- test.String(t, string(price), tt.expected, "for", tt.price)
+ t.Run(fmt.Sprint(tt.price), func(t *testing.T) {
+ price := AppendPrice(make([]byte, 0, 4), tt.price, tt.dec, ',', '.')
+ test.String(t, string(price), tt.expected, "for", tt.price)
+ })
}
// coverage
-
}
diff -Nru golang-github-tdewolff-parse-2.3.9/.travis.yml golang-github-tdewolff-parse-2.4.2/.travis.yml
--- golang-github-tdewolff-parse-2.3.9/.travis.yml 2019-08-22 18:19:17.000000000 +0000
+++ golang-github-tdewolff-parse-2.4.2/.travis.yml 2019-12-17 13:35:25.000000000 +0000
@@ -1,6 +1,6 @@
language: go
go:
- - 1.12.x
+ - 1.13.x
env:
- GO111MODULE=on
before_install:
diff -Nru golang-github-tdewolff-parse-2.3.9/util.go golang-github-tdewolff-parse-2.4.2/util.go
--- golang-github-tdewolff-parse-2.3.9/util.go 2019-08-22 18:19:17.000000000 +0000
+++ golang-github-tdewolff-parse-2.4.2/util.go 2019-12-17 13:35:25.000000000 +0000
@@ -1,5 +1,10 @@
package parse
+import (
+ "bytes"
+ "strconv"
+)
+
// Copy returns a copy of the given byte slice.
func Copy(src []byte) (dst []byte) {
dst = make([]byte, len(src))
@@ -161,37 +166,260 @@
// ReplaceMultipleWhitespace replaces character series of space, \n, \t, \f, \r into a single space or newline (when the serie contained a \n or \r).
func ReplaceMultipleWhitespace(b []byte) []byte {
- j := 0
- prevWS := false
- hasNewline := false
- for i, c := range b {
- if IsWhitespace(c) {
- prevWS = true
- if IsNewline(c) {
- hasNewline = true
+ j, k := 0, 0 // j is write position, k is start of next text section
+ for i := 0; i < len(b); i++ {
+ if IsWhitespace(b[i]) {
+ start := i
+ newline := IsNewline(b[i])
+ i++
+ for ; i < len(b) && IsWhitespace(b[i]); i++ {
+ if IsNewline(b[i]) {
+ newline = true
+ }
}
- } else {
- if prevWS {
- prevWS = false
- if hasNewline {
- hasNewline = false
- b[j] = '\n'
+ if newline {
+ b[start] = '\n'
+ } else {
+ b[start] = ' '
+ }
+ if 1 < i-start { // more than one whitespace
+ if j == 0 {
+ j = start + 1
} else {
- b[j] = ' '
+ j += copy(b[j:], b[k:start+1])
}
- j++
+ k = i
}
- b[j] = b[i]
- j++
}
}
- if prevWS {
- if hasNewline {
- b[j] = '\n'
+ if j == 0 {
+ return b
+ } else if j == 1 { // only if starts with whitespace
+ b[k-1] = b[0]
+ return b[k-1:]
+ } else if k < len(b) {
+ j += copy(b[j:], b[k:])
+ }
+ return b[:j]
+}
+
+// replaceEntities will replace in b at index i, assuming that b[i] == '&' and that i+3= '0' && b[j] <= '9' || b[j] >= 'a' && b[j] <= 'f' || b[j] >= 'A' && b[j] <= 'F'); j++ {
+ if b[j] <= '9' {
+ c = c<<4 + int(b[j]-'0')
+ } else if b[j] <= 'F' {
+ c = c<<4 + int(b[j]-'A') + 10
+ } else if b[j] <= 'f' {
+ c = c<<4 + int(b[j]-'a') + 10
+ }
+ }
+ if j <= i+3 || 10000 <= c {
+ return b, j - 1
+ }
+ if c < 128 {
+ r = []byte{byte(c)}
+ } else {
+ r = append(r, '&', '#')
+ r = strconv.AppendInt(r, int64(c), 10)
+ r = append(r, ';')
+ }
} else {
- b[j] = ' '
+ c := 0
+ for ; j < len(b) && c < 128 && b[j] >= '0' && b[j] <= '9'; j++ {
+ c = c*10 + int(b[j]-'0')
+ }
+ if j <= i+2 || 128 <= c {
+ return b, j - 1
+ }
+ r = []byte{byte(c)}
}
- j++
+ } else {
+ for ; j < len(b) && j-i-1 <= MaxEntityLength && b[j] != ';'; j++ {
+ }
+ if j <= i+1 || len(b) <= j {
+ return b, j - 1
+ }
+
+ var ok bool
+ r, ok = entitiesMap[string(b[i+1:j])]
+ if !ok {
+ return b, j
+ }
+ }
+
+ // j is at semicolon
+ n := j + 1 - i
+ if j < len(b) && b[j] == ';' && 2 < n {
+ if len(r) == 1 {
+ if q, ok := revEntitiesMap[r[0]]; ok {
+ if len(q) == len(b[i:j+1]) && bytes.Equal(q, b[i:j+1]) {
+ return b, j
+ }
+ r = q
+ } else if r[0] == '&' {
+ // check if for example &amp; is followed by something that could potentially be an entity
+ k := j + 1
+ if k < len(b) && b[k] == '#' {
+ k++
+ }
+ for ; k < len(b) && k-j <= MaxEntityLength && (b[k] >= '0' && b[k] <= '9' || b[k] >= 'a' && b[k] <= 'z' || b[k] >= 'A' && b[k] <= 'Z'); k++ {
+ }
+ if k < len(b) && b[k] == ';' {
+ return b, k
+ }
+ }
+ }
+
+ copy(b[i:], r)
+ copy(b[i+len(r):], b[j+1:])
+ b = b[:len(b)-n+len(r)]
+ return b, i + len(r) - 1
+ }
+ return b, i
+}
+
+// ReplaceEntities replaces all occurrences of entities (such as &quot;) with their respective unencoded bytes.
+func ReplaceEntities(b []byte, entitiesMap map[string][]byte, revEntitiesMap map[byte][]byte) []byte {
+ for i := 0; i < len(b); i++ {
+ if b[i] == '&' && i+3 < len(b) {
+ b, i = replaceEntities(b, i, entitiesMap, revEntitiesMap)
+ }
+ }
+ return b
+}
+
+// ReplaceMultipleWhitespaceAndEntities is a combination of ReplaceMultipleWhitespace and ReplaceEntities. It is faster than executing both sequentially.
+func ReplaceMultipleWhitespaceAndEntities(b []byte, entitiesMap map[string][]byte, revEntitiesMap map[byte][]byte) []byte {
+ j, k := 0, 0 // j is write position, k is start of next text section
+ for i := 0; i < len(b); i++ {
+ if IsWhitespace(b[i]) {
+ start := i
+ newline := IsNewline(b[i])
+ i++
+ for ; i < len(b) && IsWhitespace(b[i]); i++ {
+ if IsNewline(b[i]) {
+ newline = true
+ }
+ }
+ if newline {
+ b[start] = '\n'
+ } else {
+ b[start] = ' '
+ }
+ if 1 < i-start { // more than one whitespace
+ if j == 0 {
+ j = start + 1
+ } else {
+ j += copy(b[j:], b[k:start+1])
+ }
+ k = i
+ }
+ }
+ if i+3 < len(b) && b[i] == '&' {
+ b, i = replaceEntities(b, i, entitiesMap, revEntitiesMap)
+ }
+ }
+ if j == 0 {
+ return b
+ } else if j == 1 { // only if starts with whitespace
+ b[k-1] = b[0]
+ return b[k-1:]
+ } else if k < len(b) {
+ j += copy(b[j:], b[k:])
}
return b[:j]
}
+
+func DecodeURL(b []byte) []byte {
+ for i := 0; i < len(b); i++ {
+ if b[i] == '%' && i+2 < len(b) {
+ j := i + 1
+ c := 0
+ for ; j < i+3 && (b[j] >= '0' && b[j] <= '9' || b[j] >= 'a' && b[j] <= 'z' || b[j] >= 'A' && b[j] <= 'Z'); j++ {
+ if b[j] <= '9' {
+ c = c<<4 + int(b[j]-'0')
+ } else if b[j] <= 'F' {
+ c = c<<4 + int(b[j]-'A') + 10
+ } else if b[j] <= 'f' {
+ c = c<<4 + int(b[j]-'a') + 10
+ }
+ }
+ if j == i+3 && c < 128 {
+ b[i] = byte(c)
+ b = append(b[:i+1], b[i+3:]...)
+ }
+ } else if b[i] == '+' {
+ b[i] = ' '
+ }
+ }
+ return b
+}
+
+var URLEncodingTable = [256]bool{
+ // ASCII
+ true, true, true, true, true, true, true, true,
+ true, true, true, true, true, true, true, true,
+ true, true, true, true, true, true, true, true,
+ true, true, true, true, true, true, true, true,
+
+ false, false, true, true, true, true, true, false, // space, !, '
+ false, false, false, true, true, false, false, true, // (, ), *, -, .
+ false, false, false, false, false, false, false, false, // 0, 1, 2, 3, 4, 5, 6, 7
+ false, false, true, true, true, true, true, true, // 8, 9
+
+ true, false, false, false, false, false, false, false, // A, B, C, D, E, F, G
+ false, false, false, false, false, false, false, false, // H, I, J, K, L, M, N, O
+ false, false, false, false, false, false, false, false, // P, Q, R, S, T, U, V, W
+ false, false, false, true, true, true, true, false, // X, Y, Z, _
+
+ true, false, false, false, false, false, false, false, // a, b, c, d, e, f, g
+ false, false, false, false, false, false, false, false, // h, i, j, k, l, m, n, o
+ false, false, false, false, false, false, false, false, // p, q, r, s, t, u, v, w
+ false, false, false, true, true, true, false, true, // x, y, z, ~
+
+ // non-ASCII
+ true, true, true, true, true, true, true, true,
+ true, true, true, true, true, true, true, true,
+ true, true, true, true, true, true, true, true,
+ true, true, true, true, true, true, true, true,
+
+ true, true, true, true, true, true, true, true,
+ true, true, true, true, true, true, true, true,
+ true, true, true, true, true, true, true, true,
+ true, true, true, true, true, true, true, true,
+
+ true, true, true, true, true, true, true, true,
+ true, true, true, true, true, true, true, true,
+ true, true, true, true, true, true, true, true,
+ true, true, true, true, true, true, true, true,
+
+ true, true, true, true, true, true, true, true,
+ true, true, true, true, true, true, true, true,
+ true, true, true, true, true, true, true, true,
+ true, true, true, true, true, true, true, true,
+}
+
+func EncodeURL(b []byte, table [256]bool) []byte {
+ for i := 0; i < len(b); i++ {
+ c := b[i]
+ if table[c] {
+ b = append(b, 0, 0)
+ copy(b[i+3:], b[i+1:])
+ b[i+0] = '%'
+ b[i+1] = "0123456789ABCDEF"[c>>4]
+ b[i+2] = "0123456789ABCDEF"[c&15]
+ } else if c == ' ' {
+ b[i] = '+'
+ }
+ }
+ return b
+}
diff -Nru golang-github-tdewolff-parse-2.3.9/util_test.go golang-github-tdewolff-parse-2.4.2/util_test.go
--- golang-github-tdewolff-parse-2.3.9/util_test.go 2019-08-22 18:19:17.000000000 +0000
+++ golang-github-tdewolff-parse-2.4.2/util_test.go 2019-12-17 13:35:25.000000000 +0000
@@ -3,13 +3,14 @@
import (
"bytes"
"math/rand"
+ "net/url"
"regexp"
"testing"
"github.com/tdewolff/test"
)
-func helperRand(n, m int, chars []byte) [][]byte {
+func helperRandChars(n, m int, chars string) [][]byte {
r := make([][]byte, n)
for i := range r {
for j := 0; j < m; j++ {
@@ -19,12 +20,28 @@
return r
}
+func helperRandStrings(n, m int, ss []string) [][]byte {
+ r := make([][]byte, n)
+ for i := range r {
+ for j := 0; j < m; j++ {
+ r[i] = append(r[i], []byte(ss[rand.Intn(len(ss))])...)
+ }
+ }
+ return r
+}
+
////////////////////////////////////////////////////////////////
var wsSlices [][]byte
+var entitySlices [][]byte
+var encodedUrlSlices [][]byte
+var urlSlices [][]byte
func init() {
- wsSlices = helperRand(100, 20, []byte("abcdefg \n\r\f\t"))
+ wsSlices = helperRandChars(10000, 50, "abcdefg \n\r\f\t")
+ entitySlices = helperRandStrings(100, 5, []string{""", "'", "'", " ", " ", "test"})
+ encodedUrlSlices = helperRandStrings(100, 5, []string{"%20", "%3D", "test"})
+ urlSlices = helperRandStrings(100, 5, []string{"~", "\"", "<", "test"})
}
func TestCopy(t *testing.T) {
@@ -55,21 +72,191 @@
test.That(t, !IsAllWhitespace([]byte("\t \r\n\fx")))
}
+func TestTrim(t *testing.T) {
+ test.Bytes(t, TrimWhitespace([]byte("a")), []byte("a"))
+ test.Bytes(t, TrimWhitespace([]byte(" a")), []byte("a"))
+ test.Bytes(t, TrimWhitespace([]byte("a ")), []byte("a"))
+ test.Bytes(t, TrimWhitespace([]byte(" ")), []byte(""))
+}
+
func TestReplaceMultipleWhitespace(t *testing.T) {
+ test.Bytes(t, ReplaceMultipleWhitespace([]byte(" a")), []byte(" a"))
+ test.Bytes(t, ReplaceMultipleWhitespace([]byte("a ")), []byte("a "))
+ test.Bytes(t, ReplaceMultipleWhitespace([]byte("a b ")), []byte("a b "))
+ test.Bytes(t, ReplaceMultipleWhitespace([]byte(" a b ")), []byte(" a b "))
+ test.Bytes(t, ReplaceMultipleWhitespace([]byte(" a b ")), []byte(" a b "))
+ test.Bytes(t, ReplaceMultipleWhitespace([]byte(" a b ")), []byte(" a b "))
+ test.Bytes(t, ReplaceMultipleWhitespace([]byte(" a")), []byte(" a"))
+ test.Bytes(t, ReplaceMultipleWhitespace([]byte("a b")), []byte("a b"))
+}
+
+func TestReplaceMultipleWhitespaceRandom(t *testing.T) {
wsRegexp := regexp.MustCompile("[ \t\f]+")
wsNewlinesRegexp := regexp.MustCompile("[ ]*[\r\n][ \r\n]*")
for _, e := range wsSlices {
reference := wsRegexp.ReplaceAll(e, []byte(" "))
reference = wsNewlinesRegexp.ReplaceAll(reference, []byte("\n"))
- test.Bytes(t, ReplaceMultipleWhitespace(e), reference, "must remove all multiple whitespace but keep newlines")
+ test.Bytes(t, ReplaceMultipleWhitespace(Copy(e)), reference, "in '"+string(e)+"'")
}
}
-func TestTrim(t *testing.T) {
- test.Bytes(t, TrimWhitespace([]byte("a")), []byte("a"))
- test.Bytes(t, TrimWhitespace([]byte(" a")), []byte("a"))
- test.Bytes(t, TrimWhitespace([]byte("a ")), []byte("a"))
- test.Bytes(t, TrimWhitespace([]byte(" ")), []byte(""))
+func TestReplaceEntities(t *testing.T) {
+ entitiesMap := map[string][]byte{
+ "varphi": []byte("ϕ"),
+ "varpi": []byte("ϖ"),
+ "quot": []byte("\""),
+ "apos": []byte("'"),
+ "amp": []byte("&"),
+ }
+ revEntitiesMap := map[byte][]byte{
+ '\'': []byte("'"),
+ }
+ var entityTests = []struct {
+ entity string
+ expected string
+ }{
+ {""", `"`},
+ {"'", `'`},
+ {""", `"`},
+ {"'", `'`},
+ {" ", ` `},
+ {""", `"`},
+ {"'", `'`},
+ {"⏧", `⏧`},
+ {"⏧", `⏧`},
+ {"⏧", `⏧`},
+ {"⏧", `⏧`},
+ {"✏", `✏`},
+ {"✐", `✐`},
+ {"'"", `'"`},
+ {""", `"`},
+ {""", `"`},
+ {"&apos", `&apos`},
+ {"&", `&`},
+ {"'", `'`},
+ {"&", `&`},
+ {""", `"`},
+ {"&a mp;", `&a mp;`},
+ {"´", `´`},
+ {"∳", `∳`},
+ {"&CounterClockwiseContourIntegralL;", `&CounterClockwiseContourIntegralL;`},
+ {"ϕ", "ϕ"},
+ {"ϖ", "ϖ"},
+ {"&varnone;", "&varnone;"},
+ }
+ for _, tt := range entityTests {
+ t.Run(tt.entity, func(t *testing.T) {
+ b := ReplaceEntities([]byte(tt.entity), entitiesMap, revEntitiesMap)
+ test.T(t, string(b), tt.expected, "in '"+tt.entity+"'")
+ })
+ }
+}
+
+func TestReplaceEntitiesRandom(t *testing.T) {
+ entitiesMap := map[string][]byte{
+ "quot": []byte("\""),
+ "apos": []byte("'"),
+ }
+ revEntitiesMap := map[byte][]byte{
+ '\'': []byte("'"),
+ }
+
+ quotRegexp := regexp.MustCompile(""")
+ aposRegexp := regexp.MustCompile("('|')")
+ for _, e := range entitySlices {
+ reference := quotRegexp.ReplaceAll(e, []byte("\""))
+ reference = aposRegexp.ReplaceAll(reference, []byte("'"))
+ test.Bytes(t, ReplaceEntities(Copy(e), entitiesMap, revEntitiesMap), reference, "in '"+string(e)+"'")
+ }
+}
+
+func TestReplaceMultipleWhitespaceAndEntities(t *testing.T) {
+ entitiesMap := map[string][]byte{
+ "varphi": []byte("ϕ"),
+ }
+ var entityTests = []struct {
+ entity string
+ expected string
+ }{
+ {" ϕ " \n ", " ϕ \"\n"},
+ }
+ for _, tt := range entityTests {
+ t.Run(tt.entity, func(t *testing.T) {
+ b := ReplaceMultipleWhitespaceAndEntities([]byte(tt.entity), entitiesMap, nil)
+ test.T(t, string(b), tt.expected, "in '"+tt.entity+"'")
+ })
+ }
+}
+
+func TestReplaceMultipleWhitespaceAndEntitiesRandom(t *testing.T) {
+ entitiesMap := map[string][]byte{
+ "quot": []byte("\""),
+ "apos": []byte("'"),
+ }
+ revEntitiesMap := map[byte][]byte{
+ '\'': []byte("'"),
+ }
+
+ wsRegexp := regexp.MustCompile("[ ]+")
+ quotRegexp := regexp.MustCompile(""")
+ aposRegexp := regexp.MustCompile("('|')")
+ for _, e := range entitySlices {
+ reference := wsRegexp.ReplaceAll(e, []byte(" "))
+ reference = quotRegexp.ReplaceAll(reference, []byte("\""))
+ reference = aposRegexp.ReplaceAll(reference, []byte("'"))
+ test.Bytes(t, ReplaceMultipleWhitespaceAndEntities(Copy(e), entitiesMap, revEntitiesMap), reference, "in '"+string(e)+"'")
+ }
+}
+
+func TestDecodeURL(t *testing.T) {
+ var urlTests = []struct {
+ url string
+ expected string
+ }{
+ {"%20%3F%7E", " ?~"},
+ {"%80", "%80"},
+ {"%2B%2b", "++"},
+ {"%' ", "%' "},
+ {"a+b", "a b"},
+ }
+ for _, tt := range urlTests {
+ t.Run(tt.url, func(t *testing.T) {
+ b := DecodeURL([]byte(tt.url))
+ test.T(t, string(b), tt.expected, "in '"+tt.url+"'")
+ })
+ }
+}
+
+func TestDecodeURLRandom(t *testing.T) {
+ for _, e := range encodedUrlSlices {
+ reference, _ := url.QueryUnescape(string(e))
+ test.Bytes(t, DecodeURL(Copy(e)), []byte(reference), "in '"+string(e)+"'")
+ }
+}
+
+func TestEncodeURL(t *testing.T) {
+ var urlTests = []struct {
+ url string
+ expected string
+ }{
+ {"AZaz09-_.!~*'()", "AZaz09-_.!~*'()"},
+ {"<>", "%3C%3E"},
+ {"\u2318", "%E2%8C%98"},
+ {"a b", "a+b"},
+ }
+ for _, tt := range urlTests {
+ t.Run(tt.url, func(t *testing.T) {
+ b := EncodeURL([]byte(tt.url), URLEncodingTable)
+ test.T(t, string(b), tt.expected, "in '"+tt.url+"'")
+ })
+ }
+}
+
+func TestEncodeURLRandom(t *testing.T) {
+ for _, e := range urlSlices {
+ reference := url.QueryEscape(string(e))
+ test.Bytes(t, EncodeURL(Copy(e), URLEncodingTable), []byte(reference), "in '"+string(e)+"'")
+ }
}
////////////////////////////////////////////////////////////////
@@ -90,7 +277,7 @@
}
}
-func BenchmarkReplace(b *testing.B) {
+func BenchmarkReplaceMultipleWhitespace(b *testing.B) {
for i := 0; i < b.N; i++ {
for _, e := range wsSlices {
ReplaceMultipleWhitespace(e)
diff -Nru golang-github-tdewolff-parse-2.3.9/xml/lex.go golang-github-tdewolff-parse-2.4.2/xml/lex.go
--- golang-github-tdewolff-parse-2.3.9/xml/lex.go 2019-08-22 18:19:17.000000000 +0000
+++ golang-github-tdewolff-parse-2.4.2/xml/lex.go 2019-12-17 13:35:25.000000000 +0000
@@ -92,6 +92,21 @@
l.r.Restore()
}
+// Offset returns the current position in the input stream.
+func (l *Lexer) Offset() int {
+ return l.r.Offset()
+}
+
+// Text returns the textual representation of a token. This excludes delimiters and additional leading/trailing characters.
+func (l *Lexer) Text() []byte {
+ return l.text
+}
+
+// AttrVal returns the attribute value when an AttributeToken was returned from Next.
+func (l *Lexer) AttrVal() []byte {
+ return l.attrVal
+}
+
// Next returns the next Token. It returns ErrorToken when an error was encountered. Using Err() one can retrieve the error message.
func (l *Lexer) Next() (TokenType, []byte) {
l.text = nil
@@ -107,25 +122,22 @@
}
if c == 0 {
if l.r.Err() == nil {
- l.err = parse.NewErrorLexer("unexpected null character", l.r)
+ l.err = parse.NewErrorLexer(l.r, "XML parse error: unexpected NULL character")
}
return ErrorToken, nil
} else if c != '>' && (c != '/' && c != '?' || l.r.Peek(1) != '>') {
return AttributeToken, l.shiftAttribute()
}
- start := l.r.Pos()
+ l.r.Skip()
l.inTag = false
if c == '/' {
l.r.Move(2)
- l.text = l.r.Lexeme()[start:]
return StartTagCloseVoidToken, l.r.Shift()
} else if c == '?' {
l.r.Move(2)
- l.text = l.r.Lexeme()[start:]
return StartTagClosePIToken, l.r.Shift()
} else {
l.r.Move(1)
- l.text = l.r.Lexeme()[start:]
return StartTagCloseToken, l.r.Shift()
}
}
@@ -134,7 +146,8 @@
c = l.r.Peek(0)
if c == '<' {
if l.r.Pos() > 0 {
- return TextToken, l.r.Shift()
+ l.text = l.r.Shift()
+ return TextToken, l.text
}
c = l.r.Peek(1)
if c == '/' {
@@ -163,10 +176,11 @@
return StartTagToken, l.shiftStartTag()
} else if c == 0 {
if l.r.Pos() > 0 {
- return TextToken, l.r.Shift()
+ l.text = l.r.Shift()
+ return TextToken, l.text
}
if l.r.Err() == nil {
- l.err = parse.NewErrorLexer("unexpected null character", l.r)
+ l.err = parse.NewErrorLexer(l.r, "XML parse error: unexpected NULL character")
}
return ErrorToken, nil
}
@@ -174,16 +188,6 @@
}
}
-// Text returns the textual representation of a token. This excludes delimiters and additional leading/trailing characters.
-func (l *Lexer) Text() []byte {
- return l.text
-}
-
-// AttrVal returns the attribute value when an AttributeToken was returned from Next.
-func (l *Lexer) AttrVal() []byte {
- return l.attrVal
-}
-
////////////////////////////////////////////////////////////////
// The following functions follow the specifications at http://www.w3.org/html/wg/drafts/html/master/syntax.html
diff -Nru golang-github-tdewolff-parse-2.3.9/xml/lex_test.go golang-github-tdewolff-parse-2.4.2/xml/lex_test.go
--- golang-github-tdewolff-parse-2.3.9/xml/lex_test.go 2019-08-22 18:19:17.000000000 +0000
+++ golang-github-tdewolff-parse-2.4.2/xml/lex_test.go 2019-12-17 13:35:25.000000000 +0000
@@ -160,7 +160,20 @@
col int
}{
{"a\x00b", 2},
- {"", 3},
+ {"<\x00 b='5'>", 2},
+ {"", 3},
+ {"", 4},
+ {"", 5},
+ {"", 6},
+ {"", 7},
+ {"", 3},
+ {" \x00>", 4},
+ {" a\x00", 5},
+ {"text`))
+ _, data := l.Next()
+ test.Bytes(t, data, []byte(""))
+ test.Bytes(t, l.Text(), nil)
+ test.Bytes(t, l.AttrVal(), nil)
+
+ _, data = l.Next()
+ test.Bytes(t, data, []byte("text"))
+ test.Bytes(t, l.Text(), []byte("text"))
+ test.Bytes(t, l.AttrVal(), nil)
+
+ _, data = l.Next()
+ test.Bytes(t, data, []byte(""))
+ test.Bytes(t, l.Text(), []byte("comment"))
+ test.Bytes(t, l.AttrVal(), nil)
+
+ _, data = l.Next()
+ test.Bytes(t, data, []byte(""))
+ test.Bytes(t, l.Text(), []byte(" doctype"))
+ test.Bytes(t, l.AttrVal(), nil)
+
+ _, data = l.Next()
+ test.Bytes(t, data, []byte(""))
+ test.Bytes(t, l.Text(), []byte("cdata"))
+ test.Bytes(t, l.AttrVal(), nil)
+}
+
+func TestOffset(t *testing.T) {
+ l := NewLexer(bytes.NewBufferString(`text
`))
+ test.T(t, l.Offset(), 0)
+ _, _ = l.Next()
+ test.T(t, l.Offset(), 4) //
+ _, _ = l.Next()
+ test.T(t, l.Offset(), 20) // text
+ _, _ = l.Next()
+ test.T(t, l.Offset(), 26) //
+}
+
////////////////////////////////////////////////////////////////
func ExampleNewLexer() {
diff -Nru golang-github-tdewolff-parse-2.3.9/xml/util.go golang-github-tdewolff-parse-2.4.2/xml/util.go
--- golang-github-tdewolff-parse-2.3.9/xml/util.go 2019-08-22 18:19:17.000000000 +0000
+++ golang-github-tdewolff-parse-2.4.2/xml/util.go 2019-12-17 13:35:25.000000000 +0000
@@ -1,7 +1,5 @@
package xml
-import "github.com/tdewolff/parse/v2"
-
var (
ltEntityBytes = []byte("<")
ampEntityBytes = []byte("&")
@@ -9,20 +7,19 @@
doubleQuoteEntityBytes = []byte(""")
)
+// Entities are all named character entities.
+var EntitiesMap = map[string][]byte{
+ "apos": []byte("'"),
+ "gt": []byte(">"),
+ "quot": []byte("\""),
+}
+
// EscapeAttrVal returns the escape attribute value bytes without quotes.
func EscapeAttrVal(buf *[]byte, b []byte) []byte {
singles := 0
doubles := 0
- for i, c := range b {
- if c == '&' {
- if quote, n := parse.QuoteEntity(b[i:]); n > 0 {
- if quote == '"' {
- doubles++
- } else {
- singles++
- }
- }
- } else if c == '"' {
+ for _, c := range b {
+ if c == '"' {
doubles++
} else if c == '\'' {
singles++
@@ -49,18 +46,7 @@
j := 1
start := 0
for i, c := range b {
- if c == '&' {
- if entityQuote, n := parse.QuoteEntity(b[i:]); n > 0 {
- j += copy(t[j:], b[start:i])
- if entityQuote != quote {
- t[j] = entityQuote
- j++
- } else {
- j += copy(t[j:], escapedQuote)
- }
- start = i + n
- }
- } else if c == quote {
+ if c == quote {
j += copy(t[j:], b[start:i])
j += copy(t[j:], escapedQuote)
start = i + 1
diff -Nru golang-github-tdewolff-parse-2.3.9/xml/util_test.go golang-github-tdewolff-parse-2.4.2/xml/util_test.go
--- golang-github-tdewolff-parse-2.3.9/xml/util_test.go 2019-08-22 18:19:17.000000000 +0000
+++ golang-github-tdewolff-parse-2.4.2/xml/util_test.go 2019-12-17 13:35:25.000000000 +0000
@@ -11,15 +11,14 @@
attrVal string
expected string
}{
- {"xyz", "\"xyz\""},
- {"", "\"\""},
- {"x&z", "\"x&z\""},
- {"x'z", "\"x'z\""},
- {"x\"z", "'x\"z'"},
- {"a'b=\"\"", "'a'b=\"\"'"},
- {"'x'\"'z'", "\"x'"'z\""},
- {"\"x"'"z\"", "'x\"'\"z'"},
- {"a'b=\"\"", "'a'b=\"\"'"},
+ {`xyz`, `"xyz"`},
+ {``, `""`},
+ {`x'z`, `"x'z"`},
+ {`x"z`, `'x"z'`},
+ {`a'b=""`, `'a'b=""'`},
+ {`'x'"'z'`, `"x'"'z"`},
+ {`"x"'"z"`, `'x"'"z'`},
+ {`a'b=""`, `'a'b=""'`},
}
var buf []byte
for _, tt := range attrValTests {
@@ -47,6 +46,7 @@
{"", ""},
{"", " a "},
{"", ""},
+ {"", " a ]]> b "},
}
var buf []byte
for _, tt := range CDATAValTests {