text`)) + _, data := l.Next() + test.Bytes(t, data, []byte("")) + test.Bytes(t, l.Text(), nil) + test.Bytes(t, l.AttrVal(), nil) + + _, data = l.Next() + test.Bytes(t, data, []byte("text")) + test.Bytes(t, l.Text(), []byte("text")) + test.Bytes(t, l.AttrVal(), nil) + + _, data = l.Next() + test.Bytes(t, data, []byte("")) + test.Bytes(t, l.Text(), []byte("comment")) + test.Bytes(t, l.AttrVal(), nil) + + _, data = l.Next() + test.Bytes(t, data, []byte("")) + test.Bytes(t, l.Text(), []byte(" doctype")) + test.Bytes(t, l.AttrVal(), nil) + + _, data = l.Next() + test.Bytes(t, data, []byte("")) + test.Bytes(t, l.Text(), []byte("cdata")) + test.Bytes(t, l.AttrVal(), nil) + + _, data = l.Next() + test.Bytes(t, data, []byte("")) + test.Bytes(t, l.Text(), nil) + test.Bytes(t, l.AttrVal(), nil) + + _, data = l.Next() + test.Bytes(t, data, []byte("js")) + test.Bytes(t, l.Text(), nil) + test.Bytes(t, l.AttrVal(), nil) + + _, data = l.Next() + test.Bytes(t, data, []byte("")) + test.Bytes(t, l.Text(), []byte("script")) + test.Bytes(t, l.AttrVal(), nil) + + _, data = l.Next() + test.Bytes(t, data, []byte("")) + test.Bytes(t, l.Text(), []byte("svg")) + test.Bytes(t, l.AttrVal(), nil) +} + +func TestOffset(t *testing.T) { + l := NewLexer(bytes.NewBufferString(`

text

`)) + test.T(t, l.Offset(), 0) + _, _ = l.Next() + test.T(t, l.Offset(), 4) //

+ _, _ = l.Next() + test.T(t, l.Offset(), 20) // text + _, _ = l.Next() + test.T(t, l.Offset(), 26) //

+} + //////////////////////////////////////////////////////////////// var J int diff -Nru golang-github-tdewolff-parse-2.3.9/html/util.go golang-github-tdewolff-parse-2.4.2/html/util.go --- golang-github-tdewolff-parse-2.3.9/html/util.go 2019-08-22 18:19:17.000000000 +0000 +++ golang-github-tdewolff-parse-2.4.2/html/util.go 2019-12-17 13:35:25.000000000 +0000 @@ -1,82 +1,23 @@ package html -import "github.com/tdewolff/parse/v2" - var ( singleQuoteEntityBytes = []byte("'") doubleQuoteEntityBytes = []byte(""") ) -var charTable = [256]bool{ - // ASCII - false, false, false, false, false, false, false, false, - false, true, true, true, true, true, false, false, // tab, new line, vertical tab, form feed, carriage return - false, false, false, false, false, false, false, false, - false, false, false, false, false, false, false, false, - - true, false, true, false, false, false, true, true, // space, ", &, ' - false, false, false, false, false, false, false, false, - false, false, false, false, false, false, false, false, - false, false, false, false, true, true, true, false, // <, =, > - - false, false, false, false, false, false, false, false, - false, false, false, false, false, false, false, false, - false, false, false, false, false, false, false, false, - false, false, false, false, false, false, false, false, - - true, false, false, false, false, false, false, false, // ` - false, false, false, false, false, false, false, false, - false, false, false, false, false, false, false, false, - false, false, false, false, false, false, false, false, - - // non-ASCII - false, false, false, false, false, false, false, false, - false, false, false, false, false, false, false, false, - false, false, false, false, false, false, false, false, - false, false, false, false, false, false, false, false, - - false, false, false, false, false, false, false, false, - false, false, false, false, false, false, false, false, - false, false, false, false, false, false, false, false, - false, false, false, false, false, false, false, false, - - false, false, false, false, false, false, false, false, - false, false, false, false, false, false, false, false, - false, false, false, false, false, false, false, false, - false, false, false, false, false, false, false, false, - - false, false, false, false, false, false, false, false, - false, false, false, false, false, false, false, false, - false, false, false, false, false, false, false, false, - false, false, false, false, false, false, false, false, -} - // EscapeAttrVal returns the escaped attribute value bytes without quotes. func EscapeAttrVal(buf *[]byte, orig, b []byte, isXML bool) []byte { singles := 0 doubles := 0 unquoted := true entities := false - for i, c := range b { + for _, c := range b { if charTable[c] { - if c == '&' { - entities = true - if quote, n := parse.QuoteEntity(b[i:]); n > 0 { - if quote == '"' { - unquoted = false - doubles++ - } else { - unquoted = false - singles++ - } - } - } else { - unquoted = false - if c == '"' { - doubles++ - } else if c == '\'' { - singles++ - } + unquoted = false + if c == '"' { + doubles++ + } else if c == '\'' { + singles++ } } } @@ -106,18 +47,7 @@ j := 1 start := 0 for i, c := range b { - if c == '&' { - if entityQuote, n := parse.QuoteEntity(b[i:]); n > 0 { - j += copy(t[j:], b[start:i]) - if entityQuote != quote { - t[j] = entityQuote - j++ - } else { - j += copy(t[j:], escapedQuote) - } - start = i + n - } - } else if c == quote { + if c == quote { j += copy(t[j:], b[start:i]) j += copy(t[j:], escapedQuote) start = i + 1 @@ -127,3 +57,1147 @@ t[j] = quote return t[:j+1] } + +var charTable = [256]bool{ + // ASCII + false, false, false, false, false, false, false, false, + false, true, true, false, true, true, false, false, // tab, line feed, form feed, carriage return + false, false, false, false, false, false, false, false, + false, false, false, false, false, false, false, false, + + true, false, true, false, false, false, false, true, // space, "), ' + false, false, false, false, false, false, false, false, + false, false, false, false, false, false, false, false, + false, false, false, false, true, true, true, false, // <, =, > + + false, false, false, false, false, false, false, false, + false, false, false, false, false, false, false, false, + false, false, false, false, false, false, false, false, + false, false, false, false, false, false, false, false, + + true, false, false, false, false, false, false, false, // ` + false, false, false, false, false, false, false, false, + false, false, false, false, false, false, false, false, + false, false, false, false, false, false, false, false, + + // non-ASCII + false, false, false, false, false, false, false, false, + false, false, false, false, false, false, false, false, + false, false, false, false, false, false, false, false, + false, false, false, false, false, false, false, false, + + false, false, false, false, false, false, false, false, + false, false, false, false, false, false, false, false, + false, false, false, false, false, false, false, false, + false, false, false, false, false, false, false, false, + + false, false, false, false, false, false, false, false, + false, false, false, false, false, false, false, false, + false, false, false, false, false, false, false, false, + false, false, false, false, false, false, false, false, + + false, false, false, false, false, false, false, false, + false, false, false, false, false, false, false, false, + false, false, false, false, false, false, false, false, + false, false, false, false, false, false, false, false, +} + +// Entities are all named character entities. +var EntitiesMap = map[string][]byte{ + "AElig": []byte("Æ"), + "AMP": []byte("&"), + "Aacute": []byte("Á"), + "Abreve": []byte("Ă"), + "Acirc": []byte("Â"), + "Agrave": []byte("À"), + "Alpha": []byte("Α"), + "Amacr": []byte("Ā"), + "Aogon": []byte("Ą"), + "ApplyFunction": []byte("⁡"), + "Aring": []byte("Å"), + "Assign": []byte("≔"), + "Atilde": []byte("Ã"), + "Backslash": []byte("∖"), + "Barwed": []byte("⌆"), + "Because": []byte("∵"), + "Bernoullis": []byte("ℬ"), + "Breve": []byte("˘"), + "Bumpeq": []byte("≎"), + "Cacute": []byte("Ć"), + "CapitalDifferentialD": []byte("ⅅ"), + "Cayleys": []byte("ℭ"), + "Ccaron": []byte("Č"), + "Ccedil": []byte("Ç"), + "Ccirc": []byte("Ĉ"), + "Cconint": []byte("∰"), + "Cedilla": []byte("¸"), + "CenterDot": []byte("·"), + "CircleDot": []byte("⊙"), + "CircleMinus": []byte("⊖"), + "CirclePlus": []byte("⊕"), + "CircleTimes": []byte("⊗"), + "ClockwiseContourIntegral": []byte("∲"), + "CloseCurlyDoubleQuote": []byte("”"), + "CloseCurlyQuote": []byte("’"), + "Congruent": []byte("≡"), + "Conint": []byte("∯"), + "ContourIntegral": []byte("∮"), + "Coproduct": []byte("∐"), + "CounterClockwiseContourIntegral": []byte("∳"), + "CupCap": []byte("≍"), + "DDotrahd": []byte("⤑"), + "Dagger": []byte("‡"), + "Dcaron": []byte("Ď"), + "Delta": []byte("Δ"), + "DiacriticalAcute": []byte("´"), + "DiacriticalDot": []byte("˙"), + "DiacriticalDoubleAcute": []byte("˝"), + "DiacriticalGrave": []byte("`"), + "DiacriticalTilde": []byte("˜"), + "Diamond": []byte("⋄"), + "DifferentialD": []byte("ⅆ"), + "DotDot": []byte("⃜"), + "DotEqual": []byte("≐"), + "DoubleContourIntegral": []byte("∯"), + "DoubleDot": []byte("¨"), + "DoubleDownArrow": []byte("⇓"), + "DoubleLeftArrow": []byte("⇐"), + "DoubleLeftRightArrow": []byte("⇔"), + "DoubleLeftTee": []byte("⫤"), + "DoubleLongLeftArrow": []byte("⟸"), + "DoubleLongLeftRightArrow": []byte("⟺"), + "DoubleLongRightArrow": []byte("⟹"), + "DoubleRightArrow": []byte("⇒"), + "DoubleRightTee": []byte("⊨"), + "DoubleUpArrow": []byte("⇑"), + "DoubleUpDownArrow": []byte("⇕"), + "DoubleVerticalBar": []byte("∥"), + "DownArrow": []byte("↓"), + "DownArrowBar": []byte("⤓"), + "DownArrowUpArrow": []byte("⇵"), + "DownBreve": []byte("̑"), + "DownLeftRightVector": []byte("⥐"), + "DownLeftTeeVector": []byte("⥞"), + "DownLeftVector": []byte("↽"), + "DownLeftVectorBar": []byte("⥖"), + "DownRightTeeVector": []byte("⥟"), + "DownRightVector": []byte("⇁"), + "DownRightVectorBar": []byte("⥗"), + "DownTee": []byte("⊤"), + "DownTeeArrow": []byte("↧"), + "Downarrow": []byte("⇓"), + "Dstrok": []byte("Đ"), + "Eacute": []byte("É"), + "Ecaron": []byte("Ě"), + "Ecirc": []byte("Ê"), + "Egrave": []byte("È"), + "Element": []byte("∈"), + "Emacr": []byte("Ē"), + "EmptySmallSquare": []byte("◻"), + "EmptyVerySmallSquare": []byte("▫"), + "Eogon": []byte("Ę"), + "Epsilon": []byte("Ε"), + "EqualTilde": []byte("≂"), + "Equilibrium": []byte("⇌"), + "Exists": []byte("∃"), + "ExponentialE": []byte("ⅇ"), + "FilledSmallSquare": []byte("◼"), + "FilledVerySmallSquare": []byte("▪"), + "ForAll": []byte("∀"), + "Fouriertrf": []byte("ℱ"), + "GT": []byte(">"), + "Gamma": []byte("Γ"), + "Gammad": []byte("Ϝ"), + "Gbreve": []byte("Ğ"), + "Gcedil": []byte("Ģ"), + "Gcirc": []byte("Ĝ"), + "GreaterEqual": []byte("≥"), + "GreaterEqualLess": []byte("⋛"), + "GreaterFullEqual": []byte("≧"), + "GreaterGreater": []byte("⪢"), + "GreaterLess": []byte("≷"), + "GreaterSlantEqual": []byte("⩾"), + "GreaterTilde": []byte("≳"), + "HARDcy": []byte("Ъ"), + "Hacek": []byte("ˇ"), + "Hat": []byte("^"), + "Hcirc": []byte("Ĥ"), + "HilbertSpace": []byte("ℋ"), + "HorizontalLine": []byte("─"), + "Hstrok": []byte("Ħ"), + "HumpDownHump": []byte("≎"), + "HumpEqual": []byte("≏"), + "IJlig": []byte("Ĳ"), + "Iacute": []byte("Í"), + "Icirc": []byte("Î"), + "Ifr": []byte("ℑ"), + "Igrave": []byte("Ì"), + "Imacr": []byte("Ī"), + "ImaginaryI": []byte("ⅈ"), + "Implies": []byte("⇒"), + "Integral": []byte("∫"), + "Intersection": []byte("⋂"), + "InvisibleComma": []byte("⁣"), + "InvisibleTimes": []byte("⁢"), + "Iogon": []byte("Į"), + "Itilde": []byte("Ĩ"), + "Jcirc": []byte("Ĵ"), + "Jsercy": []byte("Ј"), + "Kappa": []byte("Κ"), + "Kcedil": []byte("Ķ"), + "LT": []byte("<"), + "Lacute": []byte("Ĺ"), + "Lambda": []byte("Λ"), + "Laplacetrf": []byte("ℒ"), + "Lcaron": []byte("Ľ"), + "Lcedil": []byte("Ļ"), + "LeftAngleBracket": []byte("⟨"), + "LeftArrow": []byte("←"), + "LeftArrowBar": []byte("⇤"), + "LeftArrowRightArrow": []byte("⇆"), + "LeftCeiling": []byte("⌈"), + "LeftDoubleBracket": []byte("⟦"), + "LeftDownTeeVector": []byte("⥡"), + "LeftDownVector": []byte("⇃"), + "LeftDownVectorBar": []byte("⥙"), + "LeftFloor": []byte("⌊"), + "LeftRightArrow": []byte("↔"), + "LeftRightVector": []byte("⥎"), + "LeftTee": []byte("⊣"), + "LeftTeeArrow": []byte("↤"), + "LeftTeeVector": []byte("⥚"), + "LeftTriangle": []byte("⊲"), + "LeftTriangleBar": []byte("⧏"), + "LeftTriangleEqual": []byte("⊴"), + "LeftUpDownVector": []byte("⥑"), + "LeftUpTeeVector": []byte("⥠"), + "LeftUpVector": []byte("↿"), + "LeftUpVectorBar": []byte("⥘"), + "LeftVector": []byte("↼"), + "LeftVectorBar": []byte("⥒"), + "Leftarrow": []byte("⇐"), + "Leftrightarrow": []byte("⇔"), + "LessEqualGreater": []byte("⋚"), + "LessFullEqual": []byte("≦"), + "LessGreater": []byte("≶"), + "LessLess": []byte("⪡"), + "LessSlantEqual": []byte("⩽"), + "LessTilde": []byte("≲"), + "Lleftarrow": []byte("⇚"), + "Lmidot": []byte("Ŀ"), + "LongLeftArrow": []byte("⟵"), + "LongLeftRightArrow": []byte("⟷"), + "LongRightArrow": []byte("⟶"), + "Longleftarrow": []byte("⟸"), + "Longleftrightarrow": []byte("⟺"), + "Longrightarrow": []byte("⟹"), + "LowerLeftArrow": []byte("↙"), + "LowerRightArrow": []byte("↘"), + "Lstrok": []byte("Ł"), + "MediumSpace": []byte(" "), + "Mellintrf": []byte("ℳ"), + "MinusPlus": []byte("∓"), + "Nacute": []byte("Ń"), + "Ncaron": []byte("Ň"), + "Ncedil": []byte("Ņ"), + "NegativeMediumSpace": []byte(""), + "NegativeThickSpace": []byte(""), + "NegativeThinSpace": []byte(""), + "NegativeVeryThinSpace": []byte(""), + "NestedGreaterGreater": []byte("≫"), + "NestedLessLess": []byte("≪"), + "NewLine": []byte("\n"), + "NoBreak": []byte("⁠"), + "NonBreakingSpace": []byte(" "), + "NotCongruent": []byte("≢"), + "NotCupCap": []byte("≭"), + "NotDoubleVerticalBar": []byte("∦"), + "NotElement": []byte("∉"), + "NotEqual": []byte("≠"), + "NotExists": []byte("∄"), + "NotGreater": []byte("≯"), + "NotGreaterEqual": []byte("≱"), + "NotGreaterLess": []byte("≹"), + "NotGreaterTilde": []byte("≵"), + "NotLeftTriangle": []byte("⋪"), + "NotLeftTriangleEqual": []byte("⋬"), + "NotLess": []byte("≮"), + "NotLessEqual": []byte("≰"), + "NotLessGreater": []byte("≸"), + "NotLessTilde": []byte("≴"), + "NotPrecedes": []byte("⊀"), + "NotPrecedesSlantEqual": []byte("⋠"), + "NotReverseElement": []byte("∌"), + "NotRightTriangle": []byte("⋫"), + "NotRightTriangleEqual": []byte("⋭"), + "NotSquareSubsetEqual": []byte("⋢"), + "NotSquareSupersetEqual": []byte("⋣"), + "NotSubsetEqual": []byte("⊈"), + "NotSucceeds": []byte("⊁"), + "NotSucceedsSlantEqual": []byte("⋡"), + "NotSupersetEqual": []byte("⊉"), + "NotTilde": []byte("≁"), + "NotTildeEqual": []byte("≄"), + "NotTildeFullEqual": []byte("≇"), + "NotTildeTilde": []byte("≉"), + "NotVerticalBar": []byte("∤"), + "Ntilde": []byte("Ñ"), + "OElig": []byte("Œ"), + "Oacute": []byte("Ó"), + "Ocirc": []byte("Ô"), + "Odblac": []byte("Ő"), + "Ograve": []byte("Ò"), + "Omacr": []byte("Ō"), + "Omega": []byte("Ω"), + "Omicron": []byte("Ο"), + "OpenCurlyDoubleQuote": []byte("“"), + "OpenCurlyQuote": []byte("‘"), + "Oslash": []byte("Ø"), + "Otilde": []byte("Õ"), + "OverBar": []byte("‾"), + "OverBrace": []byte("⏞"), + "OverBracket": []byte("⎴"), + "OverParenthesis": []byte("⏜"), + "PartialD": []byte("∂"), + "PlusMinus": []byte("±"), + "Poincareplane": []byte("ℌ"), + "Precedes": []byte("≺"), + "PrecedesEqual": []byte("⪯"), + "PrecedesSlantEqual": []byte("≼"), + "PrecedesTilde": []byte("≾"), + "Product": []byte("∏"), + "Proportion": []byte("∷"), + "Proportional": []byte("∝"), + "QUOT": []byte("\""), + "Racute": []byte("Ŕ"), + "Rcaron": []byte("Ř"), + "Rcedil": []byte("Ŗ"), + "ReverseElement": []byte("∋"), + "ReverseEquilibrium": []byte("⇋"), + "ReverseUpEquilibrium": []byte("⥯"), + "Rfr": []byte("ℜ"), + "RightAngleBracket": []byte("⟩"), + "RightArrow": []byte("→"), + "RightArrowBar": []byte("⇥"), + "RightArrowLeftArrow": []byte("⇄"), + "RightCeiling": []byte("⌉"), + "RightDoubleBracket": []byte("⟧"), + "RightDownTeeVector": []byte("⥝"), + "RightDownVector": []byte("⇂"), + "RightDownVectorBar": []byte("⥕"), + "RightFloor": []byte("⌋"), + "RightTee": []byte("⊢"), + "RightTeeArrow": []byte("↦"), + "RightTeeVector": []byte("⥛"), + "RightTriangle": []byte("⊳"), + "RightTriangleBar": []byte("⧐"), + "RightTriangleEqual": []byte("⊵"), + "RightUpDownVector": []byte("⥏"), + "RightUpTeeVector": []byte("⥜"), + "RightUpVector": []byte("↾"), + "RightUpVectorBar": []byte("⥔"), + "RightVector": []byte("⇀"), + "RightVectorBar": []byte("⥓"), + "Rightarrow": []byte("⇒"), + "RoundImplies": []byte("⥰"), + "Rrightarrow": []byte("⇛"), + "RuleDelayed": []byte("⧴"), + "SHCHcy": []byte("Щ"), + "SOFTcy": []byte("Ь"), + "Sacute": []byte("Ś"), + "Scaron": []byte("Š"), + "Scedil": []byte("Ş"), + "Scirc": []byte("Ŝ"), + "ShortDownArrow": []byte("↓"), + "ShortLeftArrow": []byte("←"), + "ShortRightArrow": []byte("→"), + "ShortUpArrow": []byte("↑"), + "Sigma": []byte("Σ"), + "SmallCircle": []byte("∘"), + "Square": []byte("□"), + "SquareIntersection": []byte("⊓"), + "SquareSubset": []byte("⊏"), + "SquareSubsetEqual": []byte("⊑"), + "SquareSuperset": []byte("⊐"), + "SquareSupersetEqual": []byte("⊒"), + "SquareUnion": []byte("⊔"), + "Subset": []byte("⋐"), + "SubsetEqual": []byte("⊆"), + "Succeeds": []byte("≻"), + "SucceedsEqual": []byte("⪰"), + "SucceedsSlantEqual": []byte("≽"), + "SucceedsTilde": []byte("≿"), + "SuchThat": []byte("∋"), + "Superset": []byte("⊃"), + "SupersetEqual": []byte("⊇"), + "Supset": []byte("⋑"), + "THORN": []byte("Þ"), + "Tab": []byte(" "), + "Tcaron": []byte("Ť"), + "Tcedil": []byte("Ţ"), + "Therefore": []byte("∴"), + "Theta": []byte("Θ"), + "ThinSpace": []byte(" "), + "Tilde": []byte("∼"), + "TildeEqual": []byte("≃"), + "TildeFullEqual": []byte("≅"), + "TildeTilde": []byte("≈"), + "TripleDot": []byte("⃛"), + "Tstrok": []byte("Ŧ"), + "Uacute": []byte("Ú"), + "Uarrocir": []byte("⥉"), + "Ubreve": []byte("Ŭ"), + "Ucirc": []byte("Û"), + "Udblac": []byte("Ű"), + "Ugrave": []byte("Ù"), + "Umacr": []byte("Ū"), + "UnderBar": []byte("_"), + "UnderBrace": []byte("⏟"), + "UnderBracket": []byte("⎵"), + "UnderParenthesis": []byte("⏝"), + "Union": []byte("⋃"), + "UnionPlus": []byte("⊎"), + "Uogon": []byte("Ų"), + "UpArrow": []byte("↑"), + "UpArrowBar": []byte("⤒"), + "UpArrowDownArrow": []byte("⇅"), + "UpDownArrow": []byte("↕"), + "UpEquilibrium": []byte("⥮"), + "UpTee": []byte("⊥"), + "UpTeeArrow": []byte("↥"), + "Uparrow": []byte("⇑"), + "Updownarrow": []byte("⇕"), + "UpperLeftArrow": []byte("↖"), + "UpperRightArrow": []byte("↗"), + "Upsilon": []byte("Υ"), + "Uring": []byte("Ů"), + "Utilde": []byte("Ũ"), + "Verbar": []byte("‖"), + "VerticalBar": []byte("∣"), + "VerticalLine": []byte("|"), + "VerticalSeparator": []byte("❘"), + "VerticalTilde": []byte("≀"), + "VeryThinSpace": []byte(" "), + "Vvdash": []byte("⊪"), + "Wcirc": []byte("Ŵ"), + "Yacute": []byte("Ý"), + "Ycirc": []byte("Ŷ"), + "Zacute": []byte("Ź"), + "Zcaron": []byte("Ž"), + "ZeroWidthSpace": []byte(""), + "aacute": []byte("á"), + "abreve": []byte("ă"), + "acirc": []byte("â"), + "acute": []byte("´"), + "aelig": []byte("æ"), + "agrave": []byte("à"), + "alefsym": []byte("ℵ"), + "alpha": []byte("α"), + "amacr": []byte("ā"), + "amp": []byte("&"), + "andslope": []byte("⩘"), + "angle": []byte("∠"), + "angmsd": []byte("∡"), + "angmsdaa": []byte("⦨"), + "angmsdab": []byte("⦩"), + "angmsdac": []byte("⦪"), + "angmsdad": []byte("⦫"), + "angmsdae": []byte("⦬"), + "angmsdaf": []byte("⦭"), + "angmsdag": []byte("⦮"), + "angmsdah": []byte("⦯"), + "angrtvb": []byte("⊾"), + "angrtvbd": []byte("⦝"), + "angsph": []byte("∢"), + "angst": []byte("Å"), + "angzarr": []byte("⍼"), + "aogon": []byte("ą"), + "apos": []byte("'"), + "approx": []byte("≈"), + "approxeq": []byte("≊"), + "aring": []byte("å"), + "ast": []byte("*"), + "asymp": []byte("≈"), + "asympeq": []byte("≍"), + "atilde": []byte("ã"), + "awconint": []byte("∳"), + "backcong": []byte("≌"), + "backepsilon": []byte("϶"), + "backprime": []byte("‵"), + "backsim": []byte("∽"), + "backsimeq": []byte("⋍"), + "barvee": []byte("⊽"), + "barwed": []byte("⌅"), + "barwedge": []byte("⌅"), + "bbrktbrk": []byte("⎶"), + "becaus": []byte("∵"), + "because": []byte("∵"), + "bemptyv": []byte("⦰"), + "bernou": []byte("ℬ"), + "between": []byte("≬"), + "bigcap": []byte("⋂"), + "bigcirc": []byte("◯"), + "bigcup": []byte("⋃"), + "bigodot": []byte("⨀"), + "bigoplus": []byte("⨁"), + "bigotimes": []byte("⨂"), + "bigsqcup": []byte("⨆"), + "bigstar": []byte("★"), + "bigtriangledown": []byte("▽"), + "bigtriangleup": []byte("△"), + "biguplus": []byte("⨄"), + "bigvee": []byte("⋁"), + "bigwedge": []byte("⋀"), + "bkarow": []byte("⤍"), + "blacklozenge": []byte("⧫"), + "blacksquare": []byte("▪"), + "blacktriangle": []byte("▴"), + "blacktriangledown": []byte("▾"), + "blacktriangleleft": []byte("◂"), + "blacktriangleright": []byte("▸"), + "bottom": []byte("⊥"), + "bowtie": []byte("⋈"), + "boxminus": []byte("⊟"), + "boxplus": []byte("⊞"), + "boxtimes": []byte("⊠"), + "bprime": []byte("‵"), + "breve": []byte("˘"), + "brvbar": []byte("¦"), + "bsol": []byte("\\"), + "bsolhsub": []byte("⟈"), + "bullet": []byte("•"), + "bumpeq": []byte("≏"), + "cacute": []byte("ć"), + "capbrcup": []byte("⩉"), + "caron": []byte("ˇ"), + "ccaron": []byte("č"), + "ccedil": []byte("ç"), + "ccirc": []byte("ĉ"), + "ccupssm": []byte("⩐"), + "cedil": []byte("¸"), + "cemptyv": []byte("⦲"), + "centerdot": []byte("·"), + "checkmark": []byte("✓"), + "circeq": []byte("≗"), + "circlearrowleft": []byte("↺"), + "circlearrowright": []byte("↻"), + "circledR": []byte("®"), + "circledS": []byte("Ⓢ"), + "circledast": []byte("⊛"), + "circledcirc": []byte("⊚"), + "circleddash": []byte("⊝"), + "cirfnint": []byte("⨐"), + "cirscir": []byte("⧂"), + "clubsuit": []byte("♣"), + "colon": []byte(":"), + "colone": []byte("≔"), + "coloneq": []byte("≔"), + "comma": []byte(","), + "commat": []byte("@"), + "compfn": []byte("∘"), + "complement": []byte("∁"), + "complexes": []byte("ℂ"), + "congdot": []byte("⩭"), + "conint": []byte("∮"), + "coprod": []byte("∐"), + "copysr": []byte("℗"), + "cudarrl": []byte("⤸"), + "cudarrr": []byte("⤵"), + "cularr": []byte("↶"), + "cularrp": []byte("⤽"), + "cupbrcap": []byte("⩈"), + "cupdot": []byte("⊍"), + "curarr": []byte("↷"), + "curarrm": []byte("⤼"), + "curlyeqprec": []byte("⋞"), + "curlyeqsucc": []byte("⋟"), + "curlyvee": []byte("⋎"), + "curlywedge": []byte("⋏"), + "curren": []byte("¤"), + "curvearrowleft": []byte("↶"), + "curvearrowright": []byte("↷"), + "cwconint": []byte("∲"), + "cylcty": []byte("⌭"), + "dagger": []byte("†"), + "daleth": []byte("ℸ"), + "dbkarow": []byte("⤏"), + "dblac": []byte("˝"), + "dcaron": []byte("ď"), + "ddagger": []byte("‡"), + "ddotseq": []byte("⩷"), + "delta": []byte("δ"), + "demptyv": []byte("⦱"), + "diamond": []byte("⋄"), + "diamondsuit": []byte("♦"), + "digamma": []byte("ϝ"), + "divide": []byte("÷"), + "divideontimes": []byte("⋇"), + "divonx": []byte("⋇"), + "dlcorn": []byte("⌞"), + "dlcrop": []byte("⌍"), + "dollar": []byte("$"), + "doteqdot": []byte("≑"), + "dotminus": []byte("∸"), + "dotplus": []byte("∔"), + "dotsquare": []byte("⊡"), + "doublebarwedge": []byte("⌆"), + "downarrow": []byte("↓"), + "downdownarrows": []byte("⇊"), + "downharpoonleft": []byte("⇃"), + "downharpoonright": []byte("⇂"), + "drbkarow": []byte("⤐"), + "drcorn": []byte("⌟"), + "drcrop": []byte("⌌"), + "dstrok": []byte("đ"), + "dwangle": []byte("⦦"), + "dzigrarr": []byte("⟿"), + "eacute": []byte("é"), + "ecaron": []byte("ě"), + "ecirc": []byte("ê"), + "ecolon": []byte("≕"), + "egrave": []byte("è"), + "elinters": []byte("⏧"), + "emacr": []byte("ē"), + "emptyset": []byte("∅"), + "emptyv": []byte("∅"), + "emsp13": []byte(" "), + "emsp14": []byte(" "), + "eogon": []byte("ę"), + "epsilon": []byte("ε"), + "eqcirc": []byte("≖"), + "eqcolon": []byte("≕"), + "eqsim": []byte("≂"), + "eqslantgtr": []byte("⪖"), + "eqslantless": []byte("⪕"), + "equals": []byte("="), + "equest": []byte("≟"), + "equivDD": []byte("⩸"), + "eqvparsl": []byte("⧥"), + "excl": []byte("!"), + "expectation": []byte("ℰ"), + "exponentiale": []byte("ⅇ"), + "fallingdotseq": []byte("≒"), + "female": []byte("♀"), + "forall": []byte("∀"), + "fpartint": []byte("⨍"), + "frac12": []byte("½"), + "frac13": []byte("⅓"), + "frac14": []byte("¼"), + "frac15": []byte("⅕"), + "frac16": []byte("⅙"), + "frac18": []byte("⅛"), + "frac23": []byte("⅔"), + "frac25": []byte("⅖"), + "frac34": []byte("¾"), + "frac35": []byte("⅗"), + "frac38": []byte("⅜"), + "frac45": []byte("⅘"), + "frac56": []byte("⅚"), + "frac58": []byte("⅝"), + "frac78": []byte("⅞"), + "gacute": []byte("ǵ"), + "gamma": []byte("γ"), + "gammad": []byte("ϝ"), + "gbreve": []byte("ğ"), + "gcirc": []byte("ĝ"), + "geq": []byte("≥"), + "geqq": []byte("≧"), + "geqslant": []byte("⩾"), + "gesdoto": []byte("⪂"), + "gesdotol": []byte("⪄"), + "ggg": []byte("⋙"), + "gnapprox": []byte("⪊"), + "gneq": []byte("⪈"), + "gneqq": []byte("≩"), + "grave": []byte("`"), + "gt": []byte(">"), + "gtquest": []byte("⩼"), + "gtrapprox": []byte("⪆"), + "gtrdot": []byte("⋗"), + "gtreqless": []byte("⋛"), + "gtreqqless": []byte("⪌"), + "gtrless": []byte("≷"), + "gtrsim": []byte("≳"), + "hArr": []byte("⇔"), + "hairsp": []byte(" "), + "hamilt": []byte("ℋ"), + "hardcy": []byte("ъ"), + "harrcir": []byte("⥈"), + "hcirc": []byte("ĥ"), + "hearts": []byte("♥"), + "heartsuit": []byte("♥"), + "hellip": []byte("…"), + "hercon": []byte("⊹"), + "hksearow": []byte("⤥"), + "hkswarow": []byte("⤦"), + "homtht": []byte("∻"), + "hookleftarrow": []byte("↩"), + "hookrightarrow": []byte("↪"), + "horbar": []byte("―"), + "hslash": []byte("ℏ"), + "hstrok": []byte("ħ"), + "hybull": []byte("⁃"), + "hyphen": []byte("‐"), + "iacute": []byte("í"), + "icirc": []byte("î"), + "iexcl": []byte("¡"), + "igrave": []byte("ì"), + "iiiint": []byte("⨌"), + "iiint": []byte("∭"), + "ijlig": []byte("ĳ"), + "imacr": []byte("ī"), + "image": []byte("ℑ"), + "imagline": []byte("ℐ"), + "imagpart": []byte("ℑ"), + "imath": []byte("ı"), + "imped": []byte("Ƶ"), + "incare": []byte("℅"), + "infintie": []byte("⧝"), + "inodot": []byte("ı"), + "intcal": []byte("⊺"), + "integers": []byte("ℤ"), + "intercal": []byte("⊺"), + "intlarhk": []byte("⨗"), + "intprod": []byte("⨼"), + "iogon": []byte("į"), + "iquest": []byte("¿"), + "isin": []byte("∈"), + "isindot": []byte("⋵"), + "isinsv": []byte("⋳"), + "isinv": []byte("∈"), + "itilde": []byte("ĩ"), + "jcirc": []byte("ĵ"), + "jmath": []byte("ȷ"), + "jsercy": []byte("ј"), + "kappa": []byte("κ"), + "kappav": []byte("ϰ"), + "kcedil": []byte("ķ"), + "kgreen": []byte("ĸ"), + "lacute": []byte("ĺ"), + "laemptyv": []byte("⦴"), + "lagran": []byte("ℒ"), + "lambda": []byte("λ"), + "langle": []byte("⟨"), + "laquo": []byte("«"), + "larrbfs": []byte("⤟"), + "larrhk": []byte("↩"), + "larrlp": []byte("↫"), + "larrsim": []byte("⥳"), + "larrtl": []byte("↢"), + "lbrace": []byte("{"), + "lbrack": []byte("["), + "lbrksld": []byte("⦏"), + "lbrkslu": []byte("⦍"), + "lcaron": []byte("ľ"), + "lcedil": []byte("ļ"), + "lcub": []byte("{"), + "ldquor": []byte("„"), + "ldrdhar": []byte("⥧"), + "ldrushar": []byte("⥋"), + "leftarrow": []byte("←"), + "leftarrowtail": []byte("↢"), + "leftharpoondown": []byte("↽"), + "leftharpoonup": []byte("↼"), + "leftleftarrows": []byte("⇇"), + "leftrightarrow": []byte("↔"), + "leftrightarrows": []byte("⇆"), + "leftrightharpoons": []byte("⇋"), + "leftrightsquigarrow": []byte("↭"), + "leftthreetimes": []byte("⋋"), + "leq": []byte("≤"), + "leqq": []byte("≦"), + "leqslant": []byte("⩽"), + "lesdoto": []byte("⪁"), + "lesdotor": []byte("⪃"), + "lessapprox": []byte("⪅"), + "lessdot": []byte("⋖"), + "lesseqgtr": []byte("⋚"), + "lesseqqgtr": []byte("⪋"), + "lessgtr": []byte("≶"), + "lesssim": []byte("≲"), + "lfloor": []byte("⌊"), + "llcorner": []byte("⌞"), + "lmidot": []byte("ŀ"), + "lmoust": []byte("⎰"), + "lmoustache": []byte("⎰"), + "lnapprox": []byte("⪉"), + "lneq": []byte("⪇"), + "lneqq": []byte("≨"), + "longleftarrow": []byte("⟵"), + "longleftrightarrow": []byte("⟷"), + "longmapsto": []byte("⟼"), + "longrightarrow": []byte("⟶"), + "looparrowleft": []byte("↫"), + "looparrowright": []byte("↬"), + "lotimes": []byte("⨴"), + "lowast": []byte("∗"), + "lowbar": []byte("_"), + "lozenge": []byte("◊"), + "lpar": []byte("("), + "lrcorner": []byte("⌟"), + "lsaquo": []byte("‹"), + "lsqb": []byte("["), + "lsquor": []byte("‚"), + "lstrok": []byte("ł"), + "lt": []byte("<"), + "lthree": []byte("⋋"), + "ltimes": []byte("⋉"), + "ltquest": []byte("⩻"), + "lurdshar": []byte("⥊"), + "luruhar": []byte("⥦"), + "maltese": []byte("✠"), + "mapsto": []byte("↦"), + "mapstodown": []byte("↧"), + "mapstoleft": []byte("↤"), + "mapstoup": []byte("↥"), + "marker": []byte("▮"), + "measuredangle": []byte("∡"), + "micro": []byte("µ"), + "midast": []byte("*"), + "middot": []byte("·"), + "minusb": []byte("⊟"), + "minusd": []byte("∸"), + "minusdu": []byte("⨪"), + "mnplus": []byte("∓"), + "models": []byte("⊧"), + "mstpos": []byte("∾"), + "multimap": []byte("⊸"), + "nLeftarrow": []byte("⇍"), + "nLeftrightarrow": []byte("⇎"), + "nRightarrow": []byte("⇏"), + "nVDash": []byte("⊯"), + "nVdash": []byte("⊮"), + "nabla": []byte("∇"), + "nacute": []byte("ń"), + "napos": []byte("ŉ"), + "napprox": []byte("≉"), + "natural": []byte("♮"), + "naturals": []byte("ℕ"), + "ncaron": []byte("ň"), + "ncedil": []byte("ņ"), + "nearrow": []byte("↗"), + "nequiv": []byte("≢"), + "nesear": []byte("⤨"), + "nexist": []byte("∄"), + "nexists": []byte("∄"), + "ngeq": []byte("≱"), + "ngtr": []byte("≯"), + "niv": []byte("∋"), + "nleftarrow": []byte("↚"), + "nleftrightarrow": []byte("↮"), + "nleq": []byte("≰"), + "nless": []byte("≮"), + "nltrie": []byte("⋬"), + "notinva": []byte("∉"), + "notinvb": []byte("⋷"), + "notinvc": []byte("⋶"), + "notniva": []byte("∌"), + "notnivb": []byte("⋾"), + "notnivc": []byte("⋽"), + "nparallel": []byte("∦"), + "npolint": []byte("⨔"), + "nprcue": []byte("⋠"), + "nprec": []byte("⊀"), + "nrightarrow": []byte("↛"), + "nrtrie": []byte("⋭"), + "nsccue": []byte("⋡"), + "nshortmid": []byte("∤"), + "nshortparallel": []byte("∦"), + "nsimeq": []byte("≄"), + "nsmid": []byte("∤"), + "nspar": []byte("∦"), + "nsqsube": []byte("⋢"), + "nsqsupe": []byte("⋣"), + "nsubseteq": []byte("⊈"), + "nsucc": []byte("⊁"), + "nsupseteq": []byte("⊉"), + "ntilde": []byte("ñ"), + "ntriangleleft": []byte("⋪"), + "ntrianglelefteq": []byte("⋬"), + "ntriangleright": []byte("⋫"), + "ntrianglerighteq": []byte("⋭"), + "num": []byte("#"), + "numero": []byte("№"), + "nvDash": []byte("⊭"), + "nvdash": []byte("⊬"), + "nvinfin": []byte("⧞"), + "nwarrow": []byte("↖"), + "oacute": []byte("ó"), + "ocirc": []byte("ô"), + "odblac": []byte("ő"), + "oelig": []byte("œ"), + "ograve": []byte("ò"), + "olcross": []byte("⦻"), + "omacr": []byte("ō"), + "omega": []byte("ω"), + "omicron": []byte("ο"), + "ominus": []byte("⊖"), + "order": []byte("ℴ"), + "orderof": []byte("ℴ"), + "origof": []byte("⊶"), + "orslope": []byte("⩗"), + "oslash": []byte("ø"), + "otilde": []byte("õ"), + "otimes": []byte("⊗"), + "otimesas": []byte("⨶"), + "parallel": []byte("∥"), + "percnt": []byte("%"), + "period": []byte("."), + "permil": []byte("‰"), + "perp": []byte("⊥"), + "pertenk": []byte("‱"), + "phmmat": []byte("ℳ"), + "pitchfork": []byte("⋔"), + "planck": []byte("ℏ"), + "planckh": []byte("ℎ"), + "plankv": []byte("ℏ"), + "plus": []byte("+"), + "plusacir": []byte("⨣"), + "pluscir": []byte("⨢"), + "plusdo": []byte("∔"), + "plusmn": []byte("±"), + "plussim": []byte("⨦"), + "plustwo": []byte("⨧"), + "pointint": []byte("⨕"), + "pound": []byte("£"), + "prec": []byte("≺"), + "precapprox": []byte("⪷"), + "preccurlyeq": []byte("≼"), + "preceq": []byte("⪯"), + "precnapprox": []byte("⪹"), + "precneqq": []byte("⪵"), + "precnsim": []byte("⋨"), + "precsim": []byte("≾"), + "primes": []byte("ℙ"), + "prnsim": []byte("⋨"), + "profalar": []byte("⌮"), + "profline": []byte("⌒"), + "profsurf": []byte("⌓"), + "propto": []byte("∝"), + "prurel": []byte("⊰"), + "puncsp": []byte(" "), + "qprime": []byte("⁗"), + "quaternions": []byte("ℍ"), + "quatint": []byte("⨖"), + "quest": []byte("?"), + "questeq": []byte("≟"), + "quot": []byte("\""), + "racute": []byte("ŕ"), + "radic": []byte("√"), + "raemptyv": []byte("⦳"), + "rangle": []byte("⟩"), + "raquo": []byte("»"), + "rarrbfs": []byte("⤠"), + "rarrhk": []byte("↪"), + "rarrlp": []byte("↬"), + "rarrsim": []byte("⥴"), + "rarrtl": []byte("↣"), + "rationals": []byte("ℚ"), + "rbrace": []byte("}"), + "rbrack": []byte("]"), + "rbrksld": []byte("⦎"), + "rbrkslu": []byte("⦐"), + "rcaron": []byte("ř"), + "rcedil": []byte("ŗ"), + "rcub": []byte("}"), + "rdldhar": []byte("⥩"), + "rdquor": []byte("”"), + "real": []byte("ℜ"), + "realine": []byte("ℛ"), + "realpart": []byte("ℜ"), + "reals": []byte("ℝ"), + "rfloor": []byte("⌋"), + "rightarrow": []byte("→"), + "rightarrowtail": []byte("↣"), + "rightharpoondown": []byte("⇁"), + "rightharpoonup": []byte("⇀"), + "rightleftarrows": []byte("⇄"), + "rightleftharpoons": []byte("⇌"), + "rightrightarrows": []byte("⇉"), + "rightsquigarrow": []byte("↝"), + "rightthreetimes": []byte("⋌"), + "risingdotseq": []byte("≓"), + "rmoust": []byte("⎱"), + "rmoustache": []byte("⎱"), + "rotimes": []byte("⨵"), + "rpar": []byte(")"), + "rppolint": []byte("⨒"), + "rsaquo": []byte("›"), + "rsqb": []byte("]"), + "rsquor": []byte("’"), + "rthree": []byte("⋌"), + "rtimes": []byte("⋊"), + "rtriltri": []byte("⧎"), + "ruluhar": []byte("⥨"), + "sacute": []byte("ś"), + "scaron": []byte("š"), + "scedil": []byte("ş"), + "scirc": []byte("ŝ"), + "scnsim": []byte("⋩"), + "scpolint": []byte("⨓"), + "searrow": []byte("↘"), + "semi": []byte(";"), + "seswar": []byte("⤩"), + "setminus": []byte("∖"), + "sfrown": []byte("⌢"), + "shchcy": []byte("щ"), + "shortmid": []byte("∣"), + "shortparallel": []byte("∥"), + "sigma": []byte("σ"), + "sigmaf": []byte("ς"), + "sigmav": []byte("ς"), + "simeq": []byte("≃"), + "simplus": []byte("⨤"), + "simrarr": []byte("⥲"), + "slarr": []byte("←"), + "smallsetminus": []byte("∖"), + "smeparsl": []byte("⧤"), + "smid": []byte("∣"), + "softcy": []byte("ь"), + "sol": []byte("/"), + "solbar": []byte("⌿"), + "spades": []byte("♠"), + "spadesuit": []byte("♠"), + "spar": []byte("∥"), + "sqsube": []byte("⊑"), + "sqsubset": []byte("⊏"), + "sqsubseteq": []byte("⊑"), + "sqsupe": []byte("⊒"), + "sqsupset": []byte("⊐"), + "sqsupseteq": []byte("⊒"), + "square": []byte("□"), + "squarf": []byte("▪"), + "srarr": []byte("→"), + "ssetmn": []byte("∖"), + "ssmile": []byte("⌣"), + "sstarf": []byte("⋆"), + "straightepsilon": []byte("ϵ"), + "straightphi": []byte("ϕ"), + "strns": []byte("¯"), + "subedot": []byte("⫃"), + "submult": []byte("⫁"), + "subplus": []byte("⪿"), + "subrarr": []byte("⥹"), + "subset": []byte("⊂"), + "subseteq": []byte("⊆"), + "subseteqq": []byte("⫅"), + "subsetneq": []byte("⊊"), + "subsetneqq": []byte("⫋"), + "succ": []byte("≻"), + "succapprox": []byte("⪸"), + "succcurlyeq": []byte("≽"), + "succeq": []byte("⪰"), + "succnapprox": []byte("⪺"), + "succneqq": []byte("⪶"), + "succnsim": []byte("⋩"), + "succsim": []byte("≿"), + "supdsub": []byte("⫘"), + "supedot": []byte("⫄"), + "suphsol": []byte("⟉"), + "suphsub": []byte("⫗"), + "suplarr": []byte("⥻"), + "supmult": []byte("⫂"), + "supplus": []byte("⫀"), + "supset": []byte("⊃"), + "supseteq": []byte("⊇"), + "supseteqq": []byte("⫆"), + "supsetneq": []byte("⊋"), + "supsetneqq": []byte("⫌"), + "swarrow": []byte("↙"), + "szlig": []byte("ß"), + "target": []byte("⌖"), + "tcaron": []byte("ť"), + "tcedil": []byte("ţ"), + "telrec": []byte("⌕"), + "there4": []byte("∴"), + "therefore": []byte("∴"), + "theta": []byte("θ"), + "thetasym": []byte("ϑ"), + "thetav": []byte("ϑ"), + "thickapprox": []byte("≈"), + "thicksim": []byte("∼"), + "thinsp": []byte(" "), + "thkap": []byte("≈"), + "thksim": []byte("∼"), + "thorn": []byte("þ"), + "tilde": []byte("˜"), + "times": []byte("×"), + "timesb": []byte("⊠"), + "timesbar": []byte("⨱"), + "topbot": []byte("⌶"), + "topfork": []byte("⫚"), + "tprime": []byte("‴"), + "triangle": []byte("▵"), + "triangledown": []byte("▿"), + "triangleleft": []byte("◃"), + "trianglelefteq": []byte("⊴"), + "triangleq": []byte("≜"), + "triangleright": []byte("▹"), + "trianglerighteq": []byte("⊵"), + "tridot": []byte("◬"), + "triminus": []byte("⨺"), + "triplus": []byte("⨹"), + "tritime": []byte("⨻"), + "trpezium": []byte("⏢"), + "tstrok": []byte("ŧ"), + "twoheadleftarrow": []byte("↞"), + "twoheadrightarrow": []byte("↠"), + "uacute": []byte("ú"), + "ubreve": []byte("ŭ"), + "ucirc": []byte("û"), + "udblac": []byte("ű"), + "ugrave": []byte("ù"), + "ulcorn": []byte("⌜"), + "ulcorner": []byte("⌜"), + "ulcrop": []byte("⌏"), + "umacr": []byte("ū"), + "uogon": []byte("ų"), + "uparrow": []byte("↑"), + "updownarrow": []byte("↕"), + "upharpoonleft": []byte("↿"), + "upharpoonright": []byte("↾"), + "upsih": []byte("ϒ"), + "upsilon": []byte("υ"), + "upuparrows": []byte("⇈"), + "urcorn": []byte("⌝"), + "urcorner": []byte("⌝"), + "urcrop": []byte("⌎"), + "uring": []byte("ů"), + "utilde": []byte("ũ"), + "uwangle": []byte("⦧"), + "varepsilon": []byte("ϵ"), + "varkappa": []byte("ϰ"), + "varnothing": []byte("∅"), + "varphi": []byte("ϕ"), + "varpi": []byte("ϖ"), + "varpropto": []byte("∝"), + "varrho": []byte("ϱ"), + "varsigma": []byte("ς"), + "vartheta": []byte("ϑ"), + "vartriangleleft": []byte("⊲"), + "vartriangleright": []byte("⊳"), + "vee": []byte("∨"), + "veebar": []byte("⊻"), + "vellip": []byte("⋮"), + "verbar": []byte("|"), + "vert": []byte("|"), + "vprop": []byte("∝"), + "vzigzag": []byte("⦚"), + "wcirc": []byte("ŵ"), + "wedge": []byte("∧"), + "wedgeq": []byte("≙"), + "weierp": []byte("℘"), + "wreath": []byte("≀"), + "xvee": []byte("⋁"), + "xwedge": []byte("⋀"), + "yacute": []byte("ý"), + "ycirc": []byte("ŷ"), + "zacute": []byte("ź"), + "zcaron": []byte("ž"), + "zeetrf": []byte("ℨ"), + "zigrarr": []byte("⇝"), +} + +var TextRevEntitiesMap = map[byte][]byte{ + '<': []byte("<"), +} diff -Nru golang-github-tdewolff-parse-2.3.9/html/util_test.go golang-github-tdewolff-parse-2.4.2/html/util_test.go --- golang-github-tdewolff-parse-2.3.9/html/util_test.go 2019-08-22 18:19:17.000000000 +0000 +++ golang-github-tdewolff-parse-2.4.2/html/util_test.go 2019-12-17 13:35:25.000000000 +0000 @@ -13,20 +13,17 @@ }{ {`xyz`, `xyz`}, {``, ``}, - {`x&z`, `x&z`}, {`x/z`, `x/z`}, {`x'z`, `"x'z"`}, {`x"z`, `'x"z'`}, {`'x"z'`, `'x"z'`}, - {`'x'"'z'`, `"x'"'z"`}, - {`"x"'"z"`, `'x"'"z'`}, - {`"x'z"`, `"x'z"`}, - {`'x"z'`, `'x"z'`}, - {`'x">'`, `'x">'`}, - {`You're encouraged to log in; however, it's not mandatory. [o]`, `"You're encouraged to log in; however, it's not mandatory. [o]"`}, + {`'x'"'z'`, `"x'"'z"`}, + {`"x"'"z"`, `'x"'"z'`}, + {`"x'z"`, `"x'z"`}, + {`'x'z'`, `"x'z"`}, {`a'b=""`, `'a'b=""'`}, {`x= 0xC0 { - if r, n := l.PeekRune(0); r == '\u2028' || r == '\u2029' { - nNewline = n + var r rune + if r, n = l.PeekRune(0); r == '\u2028' || r == '\u2029' { + newline = true } - } else { - l.Move(1) } - if nNewline > 0 { - if offset < l.Pos()+nNewline { - // move onto offset position, let next iteration handle it - l.Move(offset - l.Pos()) - continue - } - l.Move(nNewline) + if 1 < n && offset < l.Pos()+n { + // move onto offset position, let next iteration handle it + l.Move(offset - l.Pos()) + continue + } + l.Move(n) + + if newline { line++ offset -= l.Pos() l.Skip() diff -Nru golang-github-tdewolff-parse-2.3.9/position_test.go golang-github-tdewolff-parse-2.4.2/position_test.go --- golang-github-tdewolff-parse-2.3.9/position_test.go 2019-08-22 18:19:17.000000000 +0000 +++ golang-github-tdewolff-parse-2.4.2/position_test.go 2019-12-17 13:35:25.000000000 +0000 @@ -35,6 +35,7 @@ {1, "x\u2028x", 1, 2}, {2, "x\u2028x", 1, 3}, {3, "x\u2028x", 1, 4}, + {2, "x\u2318x", 1, 3}, } for _, tt := range newlineTests { t.Run(fmt.Sprint(tt.buf, " ", tt.offset), func(t *testing.T) { diff -Nru golang-github-tdewolff-parse-2.3.9/strconv/float.go golang-github-tdewolff-parse-2.4.2/strconv/float.go --- golang-github-tdewolff-parse-2.3.9/strconv/float.go 2019-08-22 18:19:17.000000000 +0000 +++ golang-github-tdewolff-parse-2.4.2/strconv/float.go 2019-12-17 13:35:25.000000000 +0000 @@ -1,6 +1,8 @@ package strconv -import "math" +import ( + "math" +) var float64pow10 = []float64{ 1e0, 1e1, 1e2, 1e3, 1e4, 1e5, 1e6, 1e7, 1e8, 1e9, @@ -83,8 +85,7 @@ return f * math.Pow10(int(expExp)), i } -const log2 = 0.301029995 -const int64maxlen = 18 +const log2 = 0.3010299956639812 func float64exp(f float64) int { exp2 := 0 @@ -100,11 +101,10 @@ return int(exp10) } +// AppendFloat appends a float to `b` with precision `prec`. It returns the new slice and whether succesful or not. Precision is the number of decimals to display, thus prec + 1 == number of significant digits. func AppendFloat(b []byte, f float64, prec int) ([]byte, bool) { if math.IsNaN(f) || math.IsInf(f, 0) { return b, false - } else if prec >= int64maxlen { - return b, false } neg := false @@ -112,8 +112,8 @@ f = -f neg = true } - if prec == -1 { - prec = int64maxlen - 1 + if prec < 0 || 17 < prec { + prec = 17 // maximum number of significant digits in double } prec -= float64exp(f) // number of digits in front of the dot f *= math.Pow10(prec) diff -Nru golang-github-tdewolff-parse-2.3.9/strconv/float_test.go golang-github-tdewolff-parse-2.4.2/strconv/float_test.go --- golang-github-tdewolff-parse-2.3.9/strconv/float_test.go 2019-08-22 18:19:17.000000000 +0000 +++ golang-github-tdewolff-parse-2.4.2/strconv/float_test.go 2019-12-17 13:35:25.000000000 +0000 @@ -28,9 +28,11 @@ // {"4.9406564584124e-308", 4.9406564584124e-308) } for _, tt := range floatTests { - f, n := ParseFloat([]byte(tt.f)) - test.That(t, n == len(tt.f), "parsed", n, "characters instead for", tt.f) - test.That(t, f == tt.expected, "return", tt.expected, "for", tt.f) + t.Run(fmt.Sprint(tt.f), func(t *testing.T) { + f, n := ParseFloat([]byte(tt.f)) + test.T(t, n, len(tt.f)) + test.T(t, f, tt.expected) + }) } } @@ -73,12 +75,18 @@ {math.NaN(), 0, ""}, {math.Inf(1), 0, ""}, {math.Inf(-1), 0, ""}, - {0, 19, ""}, - {.000923361977200859392, -1, "9.23361977200859392e-4"}, + {0, 19, "0"}, + {0.000923361977200859392, -1, "9.23361977200859392e-4"}, + {1234, 2, "1.23e3"}, + {12345, 2, "1.23e4"}, + {12.345, 2, "12.3"}, + {12.345, 3, "12.34"}, } for _, tt := range floatTests { - f, _ := AppendFloat([]byte{}, tt.f, tt.prec) - test.String(t, string(f), tt.expected, "for", tt.f) + t.Run(fmt.Sprint(tt.f), func(t *testing.T) { + f, _ := AppendFloat([]byte{}, tt.f, tt.prec) + test.String(t, string(f), tt.expected) + }) } b := make([]byte, 0, 22) diff -Nru golang-github-tdewolff-parse-2.3.9/strconv/int_test.go golang-github-tdewolff-parse-2.4.2/strconv/int_test.go --- golang-github-tdewolff-parse-2.3.9/strconv/int_test.go 2019-08-22 18:19:17.000000000 +0000 +++ golang-github-tdewolff-parse-2.4.2/strconv/int_test.go 2019-12-17 13:35:25.000000000 +0000 @@ -1,6 +1,7 @@ package strconv import ( + "fmt" "math" "math/rand" "testing" @@ -27,14 +28,16 @@ {"a", 0}, } for _, tt := range intTests { - i, _ := ParseInt([]byte(tt.i)) - test.That(t, i == tt.expected, "return", tt.expected, "for", tt.i) + t.Run(fmt.Sprint(tt.i), func(t *testing.T) { + i, _ := ParseInt([]byte(tt.i)) + test.T(t, i, tt.expected) + }) } } func TestLenInt(t *testing.T) { lenIntTests := []struct { - number int64 + i int64 expected int }{ {0, 1}, @@ -64,7 +67,9 @@ {1000000000000000000, 19}, } for _, tt := range lenIntTests { - test.That(t, LenInt(tt.number) == tt.expected, "return", tt.expected, "for", tt.number) + t.Run(fmt.Sprint(tt.i), func(t *testing.T) { + test.T(t, LenInt(tt.i), tt.expected) + }) } } diff -Nru golang-github-tdewolff-parse-2.3.9/strconv/price_test.go golang-github-tdewolff-parse-2.4.2/strconv/price_test.go --- golang-github-tdewolff-parse-2.3.9/strconv/price_test.go 2019-08-22 18:19:17.000000000 +0000 +++ golang-github-tdewolff-parse-2.4.2/strconv/price_test.go 2019-12-17 13:35:25.000000000 +0000 @@ -1,6 +1,7 @@ package strconv import ( + "fmt" "testing" "github.com/tdewolff/test" @@ -26,10 +27,11 @@ } for _, tt := range priceTests { - price := AppendPrice(make([]byte, 0, 4), tt.price, tt.dec, ',', '.') - test.String(t, string(price), tt.expected, "for", tt.price) + t.Run(fmt.Sprint(tt.price), func(t *testing.T) { + price := AppendPrice(make([]byte, 0, 4), tt.price, tt.dec, ',', '.') + test.String(t, string(price), tt.expected, "for", tt.price) + }) } // coverage - } diff -Nru golang-github-tdewolff-parse-2.3.9/.travis.yml golang-github-tdewolff-parse-2.4.2/.travis.yml --- golang-github-tdewolff-parse-2.3.9/.travis.yml 2019-08-22 18:19:17.000000000 +0000 +++ golang-github-tdewolff-parse-2.4.2/.travis.yml 2019-12-17 13:35:25.000000000 +0000 @@ -1,6 +1,6 @@ language: go go: - - 1.12.x + - 1.13.x env: - GO111MODULE=on before_install: diff -Nru golang-github-tdewolff-parse-2.3.9/util.go golang-github-tdewolff-parse-2.4.2/util.go --- golang-github-tdewolff-parse-2.3.9/util.go 2019-08-22 18:19:17.000000000 +0000 +++ golang-github-tdewolff-parse-2.4.2/util.go 2019-12-17 13:35:25.000000000 +0000 @@ -1,5 +1,10 @@ package parse +import ( + "bytes" + "strconv" +) + // Copy returns a copy of the given byte slice. func Copy(src []byte) (dst []byte) { dst = make([]byte, len(src)) @@ -161,37 +166,260 @@ // ReplaceMultipleWhitespace replaces character series of space, \n, \t, \f, \r into a single space or newline (when the serie contained a \n or \r). func ReplaceMultipleWhitespace(b []byte) []byte { - j := 0 - prevWS := false - hasNewline := false - for i, c := range b { - if IsWhitespace(c) { - prevWS = true - if IsNewline(c) { - hasNewline = true + j, k := 0, 0 // j is write position, k is start of next text section + for i := 0; i < len(b); i++ { + if IsWhitespace(b[i]) { + start := i + newline := IsNewline(b[i]) + i++ + for ; i < len(b) && IsWhitespace(b[i]); i++ { + if IsNewline(b[i]) { + newline = true + } } - } else { - if prevWS { - prevWS = false - if hasNewline { - hasNewline = false - b[j] = '\n' + if newline { + b[start] = '\n' + } else { + b[start] = ' ' + } + if 1 < i-start { // more than one whitespace + if j == 0 { + j = start + 1 } else { - b[j] = ' ' + j += copy(b[j:], b[k:start+1]) } - j++ + k = i } - b[j] = b[i] - j++ } } - if prevWS { - if hasNewline { - b[j] = '\n' + if j == 0 { + return b + } else if j == 1 { // only if starts with whitespace + b[k-1] = b[0] + return b[k-1:] + } else if k < len(b) { + j += copy(b[j:], b[k:]) + } + return b[:j] +} + +// replaceEntities will replace in b at index i, assuming that b[i] == '&' and that i+3= '0' && b[j] <= '9' || b[j] >= 'a' && b[j] <= 'f' || b[j] >= 'A' && b[j] <= 'F'); j++ { + if b[j] <= '9' { + c = c<<4 + int(b[j]-'0') + } else if b[j] <= 'F' { + c = c<<4 + int(b[j]-'A') + 10 + } else if b[j] <= 'f' { + c = c<<4 + int(b[j]-'a') + 10 + } + } + if j <= i+3 || 10000 <= c { + return b, j - 1 + } + if c < 128 { + r = []byte{byte(c)} + } else { + r = append(r, '&', '#') + r = strconv.AppendInt(r, int64(c), 10) + r = append(r, ';') + } } else { - b[j] = ' ' + c := 0 + for ; j < len(b) && c < 128 && b[j] >= '0' && b[j] <= '9'; j++ { + c = c*10 + int(b[j]-'0') + } + if j <= i+2 || 128 <= c { + return b, j - 1 + } + r = []byte{byte(c)} } - j++ + } else { + for ; j < len(b) && j-i-1 <= MaxEntityLength && b[j] != ';'; j++ { + } + if j <= i+1 || len(b) <= j { + return b, j - 1 + } + + var ok bool + r, ok = entitiesMap[string(b[i+1:j])] + if !ok { + return b, j + } + } + + // j is at semicolon + n := j + 1 - i + if j < len(b) && b[j] == ';' && 2 < n { + if len(r) == 1 { + if q, ok := revEntitiesMap[r[0]]; ok { + if len(q) == len(b[i:j+1]) && bytes.Equal(q, b[i:j+1]) { + return b, j + } + r = q + } else if r[0] == '&' { + // check if for example & is followed by something that could potentially be an entity + k := j + 1 + if k < len(b) && b[k] == '#' { + k++ + } + for ; k < len(b) && k-j <= MaxEntityLength && (b[k] >= '0' && b[k] <= '9' || b[k] >= 'a' && b[k] <= 'z' || b[k] >= 'A' && b[k] <= 'Z'); k++ { + } + if k < len(b) && b[k] == ';' { + return b, k + } + } + } + + copy(b[i:], r) + copy(b[i+len(r):], b[j+1:]) + b = b[:len(b)-n+len(r)] + return b, i + len(r) - 1 + } + return b, i +} + +// ReplaceEntities replaces all occurrences of entites (such as ") to their respective unencoded bytes. +func ReplaceEntities(b []byte, entitiesMap map[string][]byte, revEntitiesMap map[byte][]byte) []byte { + for i := 0; i < len(b); i++ { + if b[i] == '&' && i+3 < len(b) { + b, i = replaceEntities(b, i, entitiesMap, revEntitiesMap) + } + } + return b +} + +// ReplaceMultipleWhitespaceAndEntities is a combination of ReplaceMultipleWhitespace and ReplaceEntities. It is faster than executing both sequentially. +func ReplaceMultipleWhitespaceAndEntities(b []byte, entitiesMap map[string][]byte, revEntitiesMap map[byte][]byte) []byte { + j, k := 0, 0 // j is write position, k is start of next text section + for i := 0; i < len(b); i++ { + if IsWhitespace(b[i]) { + start := i + newline := IsNewline(b[i]) + i++ + for ; i < len(b) && IsWhitespace(b[i]); i++ { + if IsNewline(b[i]) { + newline = true + } + } + if newline { + b[start] = '\n' + } else { + b[start] = ' ' + } + if 1 < i-start { // more than one whitespace + if j == 0 { + j = start + 1 + } else { + j += copy(b[j:], b[k:start+1]) + } + k = i + } + } + if i+3 < len(b) && b[i] == '&' { + b, i = replaceEntities(b, i, entitiesMap, revEntitiesMap) + } + } + if j == 0 { + return b + } else if j == 1 { // only if starts with whitespace + b[k-1] = b[0] + return b[k-1:] + } else if k < len(b) { + j += copy(b[j:], b[k:]) } return b[:j] } + +func DecodeURL(b []byte) []byte { + for i := 0; i < len(b); i++ { + if b[i] == '%' && i+2 < len(b) { + j := i + 1 + c := 0 + for ; j < i+3 && (b[j] >= '0' && b[j] <= '9' || b[j] >= 'a' && b[j] <= 'z' || b[j] >= 'A' && b[j] <= 'Z'); j++ { + if b[j] <= '9' { + c = c<<4 + int(b[j]-'0') + } else if b[j] <= 'F' { + c = c<<4 + int(b[j]-'A') + 10 + } else if b[j] <= 'f' { + c = c<<4 + int(b[j]-'a') + 10 + } + } + if j == i+3 && c < 128 { + b[i] = byte(c) + b = append(b[:i+1], b[i+3:]...) + } + } else if b[i] == '+' { + b[i] = ' ' + } + } + return b +} + +var URLEncodingTable = [256]bool{ + // ASCII + true, true, true, true, true, true, true, true, + true, true, true, true, true, true, true, true, + true, true, true, true, true, true, true, true, + true, true, true, true, true, true, true, true, + + false, false, true, true, true, true, true, false, // space, !, ' + false, false, false, true, true, false, false, true, // (, ), *, -, . + false, false, false, false, false, false, false, false, // 0, 1, 2, 3, 4, 5, 6, 7 + false, false, true, true, true, true, true, true, // 8, 9 + + true, false, false, false, false, false, false, false, // A, B, C, D, E, F, G + false, false, false, false, false, false, false, false, // H, I, J, K, L, M, N, O + false, false, false, false, false, false, false, false, // P, Q, R, S, T, U, V, W + false, false, false, true, true, true, true, false, // X, Y, Z, _ + + true, false, false, false, false, false, false, false, // a, b, c, d, e, f, g + false, false, false, false, false, false, false, false, // h, i, j, k, l, m, n, o + false, false, false, false, false, false, false, false, // p, q, r, s, t, u, v, w + false, false, false, true, true, true, false, true, // x, y, z, ~ + + // non-ASCII + true, true, true, true, true, true, true, true, + true, true, true, true, true, true, true, true, + true, true, true, true, true, true, true, true, + true, true, true, true, true, true, true, true, + + true, true, true, true, true, true, true, true, + true, true, true, true, true, true, true, true, + true, true, true, true, true, true, true, true, + true, true, true, true, true, true, true, true, + + true, true, true, true, true, true, true, true, + true, true, true, true, true, true, true, true, + true, true, true, true, true, true, true, true, + true, true, true, true, true, true, true, true, + + true, true, true, true, true, true, true, true, + true, true, true, true, true, true, true, true, + true, true, true, true, true, true, true, true, + true, true, true, true, true, true, true, true, +} + +func EncodeURL(b []byte, table [256]bool) []byte { + for i := 0; i < len(b); i++ { + c := b[i] + if table[c] { + b = append(b, 0, 0) + copy(b[i+3:], b[i+1:]) + b[i+0] = '%' + b[i+1] = "0123456789ABCDEF"[c>>4] + b[i+2] = "0123456789ABCDEF"[c&15] + } else if c == ' ' { + b[i] = '+' + } + } + return b +} diff -Nru golang-github-tdewolff-parse-2.3.9/util_test.go golang-github-tdewolff-parse-2.4.2/util_test.go --- golang-github-tdewolff-parse-2.3.9/util_test.go 2019-08-22 18:19:17.000000000 +0000 +++ golang-github-tdewolff-parse-2.4.2/util_test.go 2019-12-17 13:35:25.000000000 +0000 @@ -3,13 +3,14 @@ import ( "bytes" "math/rand" + "net/url" "regexp" "testing" "github.com/tdewolff/test" ) -func helperRand(n, m int, chars []byte) [][]byte { +func helperRandChars(n, m int, chars string) [][]byte { r := make([][]byte, n) for i := range r { for j := 0; j < m; j++ { @@ -19,12 +20,28 @@ return r } +func helperRandStrings(n, m int, ss []string) [][]byte { + r := make([][]byte, n) + for i := range r { + for j := 0; j < m; j++ { + r[i] = append(r[i], []byte(ss[rand.Intn(len(ss))])...) + } + } + return r +} + //////////////////////////////////////////////////////////////// var wsSlices [][]byte +var entitySlices [][]byte +var encodedUrlSlices [][]byte +var urlSlices [][]byte func init() { - wsSlices = helperRand(100, 20, []byte("abcdefg \n\r\f\t")) + wsSlices = helperRandChars(10000, 50, "abcdefg \n\r\f\t") + entitySlices = helperRandStrings(100, 5, []string{""", "'", "'", " ", " ", "test"}) + encodedUrlSlices = helperRandStrings(100, 5, []string{"%20", "%3D", "test"}) + urlSlices = helperRandStrings(100, 5, []string{"~", "\"", "<", "test"}) } func TestCopy(t *testing.T) { @@ -55,21 +72,191 @@ test.That(t, !IsAllWhitespace([]byte("\t \r\n\fx"))) } +func TestTrim(t *testing.T) { + test.Bytes(t, TrimWhitespace([]byte("a")), []byte("a")) + test.Bytes(t, TrimWhitespace([]byte(" a")), []byte("a")) + test.Bytes(t, TrimWhitespace([]byte("a ")), []byte("a")) + test.Bytes(t, TrimWhitespace([]byte(" ")), []byte("")) +} + func TestReplaceMultipleWhitespace(t *testing.T) { + test.Bytes(t, ReplaceMultipleWhitespace([]byte(" a")), []byte(" a")) + test.Bytes(t, ReplaceMultipleWhitespace([]byte("a ")), []byte("a ")) + test.Bytes(t, ReplaceMultipleWhitespace([]byte("a b ")), []byte("a b ")) + test.Bytes(t, ReplaceMultipleWhitespace([]byte(" a b ")), []byte(" a b ")) + test.Bytes(t, ReplaceMultipleWhitespace([]byte(" a b ")), []byte(" a b ")) + test.Bytes(t, ReplaceMultipleWhitespace([]byte(" a b ")), []byte(" a b ")) + test.Bytes(t, ReplaceMultipleWhitespace([]byte(" a")), []byte(" a")) + test.Bytes(t, ReplaceMultipleWhitespace([]byte("a b")), []byte("a b")) +} + +func TestReplaceMultipleWhitespaceRandom(t *testing.T) { wsRegexp := regexp.MustCompile("[ \t\f]+") wsNewlinesRegexp := regexp.MustCompile("[ ]*[\r\n][ \r\n]*") for _, e := range wsSlices { reference := wsRegexp.ReplaceAll(e, []byte(" ")) reference = wsNewlinesRegexp.ReplaceAll(reference, []byte("\n")) - test.Bytes(t, ReplaceMultipleWhitespace(e), reference, "must remove all multiple whitespace but keep newlines") + test.Bytes(t, ReplaceMultipleWhitespace(Copy(e)), reference, "in '"+string(e)+"'") } } -func TestTrim(t *testing.T) { - test.Bytes(t, TrimWhitespace([]byte("a")), []byte("a")) - test.Bytes(t, TrimWhitespace([]byte(" a")), []byte("a")) - test.Bytes(t, TrimWhitespace([]byte("a ")), []byte("a")) - test.Bytes(t, TrimWhitespace([]byte(" ")), []byte("")) +func TestReplaceEntities(t *testing.T) { + entitiesMap := map[string][]byte{ + "varphi": []byte("ϕ"), + "varpi": []byte("ϖ"), + "quot": []byte("\""), + "apos": []byte("'"), + "amp": []byte("&"), + } + revEntitiesMap := map[byte][]byte{ + '\'': []byte("'"), + } + var entityTests = []struct { + entity string + expected string + }{ + {""", `"`}, + {"'", `'`}, + {""", `"`}, + {"'", `'`}, + {" ", ` `}, + {""", `"`}, + {"'", `'`}, + {"⏧", `⏧`}, + {"⏧", `⏧`}, + {"⏧", `⏧`}, + {"⏧", `⏧`}, + {"✏", `✏`}, + {"✐", `✐`}, + {"'"", `'"`}, + {""", `"`}, + {""", `"`}, + {"&apos", `&apos`}, + {"&", `&`}, + {"'", `'`}, + {"&", `&`}, + {""", `"`}, + {"&a mp;", `&a mp;`}, + {"&DiacriticalAcute;", `&DiacriticalAcute;`}, + {"&CounterClockwiseContourIntegral;", `&CounterClockwiseContourIntegral;`}, + {"&CounterClockwiseContourIntegralL;", `&CounterClockwiseContourIntegralL;`}, + {"ϕ", "ϕ"}, + {"ϖ", "ϖ"}, + {"&varnone;", "&varnone;"}, + } + for _, tt := range entityTests { + t.Run(tt.entity, func(t *testing.T) { + b := ReplaceEntities([]byte(tt.entity), entitiesMap, revEntitiesMap) + test.T(t, string(b), tt.expected, "in '"+tt.entity+"'") + }) + } +} + +func TestReplaceEntitiesRandom(t *testing.T) { + entitiesMap := map[string][]byte{ + "quot": []byte("\""), + "apos": []byte("'"), + } + revEntitiesMap := map[byte][]byte{ + '\'': []byte("'"), + } + + quotRegexp := regexp.MustCompile(""") + aposRegexp := regexp.MustCompile("('|')") + for _, e := range entitySlices { + reference := quotRegexp.ReplaceAll(e, []byte("\"")) + reference = aposRegexp.ReplaceAll(reference, []byte("'")) + test.Bytes(t, ReplaceEntities(Copy(e), entitiesMap, revEntitiesMap), reference, "in '"+string(e)+"'") + } +} + +func TestReplaceMultipleWhitespaceAndEntities(t *testing.T) { + entitiesMap := map[string][]byte{ + "varphi": []byte("ϕ"), + } + var entityTests = []struct { + entity string + expected string + }{ + {" ϕ " \n ", " ϕ \"\n"}, + } + for _, tt := range entityTests { + t.Run(tt.entity, func(t *testing.T) { + b := ReplaceMultipleWhitespaceAndEntities([]byte(tt.entity), entitiesMap, nil) + test.T(t, string(b), tt.expected, "in '"+tt.entity+"'") + }) + } +} + +func TestReplaceMultipleWhitespaceAndEntitiesRandom(t *testing.T) { + entitiesMap := map[string][]byte{ + "quot": []byte("\""), + "apos": []byte("'"), + } + revEntitiesMap := map[byte][]byte{ + '\'': []byte("'"), + } + + wsRegexp := regexp.MustCompile("[ ]+") + quotRegexp := regexp.MustCompile(""") + aposRegexp := regexp.MustCompile("('|')") + for _, e := range entitySlices { + reference := wsRegexp.ReplaceAll(e, []byte(" ")) + reference = quotRegexp.ReplaceAll(reference, []byte("\"")) + reference = aposRegexp.ReplaceAll(reference, []byte("'")) + test.Bytes(t, ReplaceMultipleWhitespaceAndEntities(Copy(e), entitiesMap, revEntitiesMap), reference, "in '"+string(e)+"'") + } +} + +func TestDecodeURL(t *testing.T) { + var urlTests = []struct { + url string + expected string + }{ + {"%20%3F%7E", " ?~"}, + {"%80", "%80"}, + {"%2B%2b", "++"}, + {"%' ", "%' "}, + {"a+b", "a b"}, + } + for _, tt := range urlTests { + t.Run(tt.url, func(t *testing.T) { + b := DecodeURL([]byte(tt.url)) + test.T(t, string(b), tt.expected, "in '"+tt.url+"'") + }) + } +} + +func TestDecodeURLRandom(t *testing.T) { + for _, e := range encodedUrlSlices { + reference, _ := url.QueryUnescape(string(e)) + test.Bytes(t, DecodeURL(Copy(e)), []byte(reference), "in '"+string(e)+"'") + } +} + +func TestEncodeURL(t *testing.T) { + var urlTests = []struct { + url string + expected string + }{ + {"AZaz09-_.!~*'()", "AZaz09-_.!~*'()"}, + {"<>", "%3C%3E"}, + {"\u2318", "%E2%8C%98"}, + {"a b", "a+b"}, + } + for _, tt := range urlTests { + t.Run(tt.url, func(t *testing.T) { + b := EncodeURL([]byte(tt.url), URLEncodingTable) + test.T(t, string(b), tt.expected, "in '"+tt.url+"'") + }) + } +} + +func TestEncodeURLRandom(t *testing.T) { + for _, e := range urlSlices { + reference := url.QueryEscape(string(e)) + test.Bytes(t, EncodeURL(Copy(e), URLEncodingTable), []byte(reference), "in '"+string(e)+"'") + } } //////////////////////////////////////////////////////////////// @@ -90,7 +277,7 @@ } } -func BenchmarkReplace(b *testing.B) { +func BenchmarkReplaceMultipleWhitespace(b *testing.B) { for i := 0; i < b.N; i++ { for _, e := range wsSlices { ReplaceMultipleWhitespace(e) diff -Nru golang-github-tdewolff-parse-2.3.9/xml/lex.go golang-github-tdewolff-parse-2.4.2/xml/lex.go --- golang-github-tdewolff-parse-2.3.9/xml/lex.go 2019-08-22 18:19:17.000000000 +0000 +++ golang-github-tdewolff-parse-2.4.2/xml/lex.go 2019-12-17 13:35:25.000000000 +0000 @@ -92,6 +92,21 @@ l.r.Restore() } +// Offset returns the current position in the input stream. +func (l *Lexer) Offset() int { + return l.r.Offset() +} + +// Text returns the textual representation of a token. This excludes delimiters and additional leading/trailing characters. +func (l *Lexer) Text() []byte { + return l.text +} + +// AttrVal returns the attribute value when an AttributeToken was returned from Next. +func (l *Lexer) AttrVal() []byte { + return l.attrVal +} + // Next returns the next Token. It returns ErrorToken when an error was encountered. Using Err() one can retrieve the error message. func (l *Lexer) Next() (TokenType, []byte) { l.text = nil @@ -107,25 +122,22 @@ } if c == 0 { if l.r.Err() == nil { - l.err = parse.NewErrorLexer("unexpected null character", l.r) + l.err = parse.NewErrorLexer(l.r, "XML parse error: unexpected NULL character") } return ErrorToken, nil } else if c != '>' && (c != '/' && c != '?' || l.r.Peek(1) != '>') { return AttributeToken, l.shiftAttribute() } - start := l.r.Pos() + l.r.Skip() l.inTag = false if c == '/' { l.r.Move(2) - l.text = l.r.Lexeme()[start:] return StartTagCloseVoidToken, l.r.Shift() } else if c == '?' { l.r.Move(2) - l.text = l.r.Lexeme()[start:] return StartTagClosePIToken, l.r.Shift() } else { l.r.Move(1) - l.text = l.r.Lexeme()[start:] return StartTagCloseToken, l.r.Shift() } } @@ -134,7 +146,8 @@ c = l.r.Peek(0) if c == '<' { if l.r.Pos() > 0 { - return TextToken, l.r.Shift() + l.text = l.r.Shift() + return TextToken, l.text } c = l.r.Peek(1) if c == '/' { @@ -163,10 +176,11 @@ return StartTagToken, l.shiftStartTag() } else if c == 0 { if l.r.Pos() > 0 { - return TextToken, l.r.Shift() + l.text = l.r.Shift() + return TextToken, l.text } if l.r.Err() == nil { - l.err = parse.NewErrorLexer("unexpected null character", l.r) + l.err = parse.NewErrorLexer(l.r, "XML parse error: unexpected NULL character") } return ErrorToken, nil } @@ -174,16 +188,6 @@ } } -// Text returns the textual representation of a token. This excludes delimiters and additional leading/trailing characters. -func (l *Lexer) Text() []byte { - return l.text -} - -// AttrVal returns the attribute value when an AttributeToken was returned from Next. -func (l *Lexer) AttrVal() []byte { - return l.attrVal -} - //////////////////////////////////////////////////////////////// // The following functions follow the specifications at http://www.w3.org/html/wg/drafts/html/master/syntax.html diff -Nru golang-github-tdewolff-parse-2.3.9/xml/lex_test.go golang-github-tdewolff-parse-2.4.2/xml/lex_test.go --- golang-github-tdewolff-parse-2.3.9/xml/lex_test.go 2019-08-22 18:19:17.000000000 +0000 +++ golang-github-tdewolff-parse-2.4.2/xml/lex_test.go 2019-12-17 13:35:25.000000000 +0000 @@ -160,7 +160,20 @@ col int }{ {"a\x00b", 2}, - {"", 3}, + {"<\x00 b='5'>", 2}, + {"", 3}, + {"", 4}, + {"", 5}, + {"", 6}, + {"", 7}, + {"", 3}, + {"", 4}, + {"text`)) + _, data := l.Next() + test.Bytes(t, data, []byte("")) + test.Bytes(t, l.Text(), nil) + test.Bytes(t, l.AttrVal(), nil) + + _, data = l.Next() + test.Bytes(t, data, []byte("text")) + test.Bytes(t, l.Text(), []byte("text")) + test.Bytes(t, l.AttrVal(), nil) + + _, data = l.Next() + test.Bytes(t, data, []byte("")) + test.Bytes(t, l.Text(), []byte("comment")) + test.Bytes(t, l.AttrVal(), nil) + + _, data = l.Next() + test.Bytes(t, data, []byte("")) + test.Bytes(t, l.Text(), []byte(" doctype")) + test.Bytes(t, l.AttrVal(), nil) + + _, data = l.Next() + test.Bytes(t, data, []byte("")) + test.Bytes(t, l.Text(), []byte("cdata")) + test.Bytes(t, l.AttrVal(), nil) +} + +func TestOffset(t *testing.T) { + l := NewLexer(bytes.NewBufferString(`

text

`)) + test.T(t, l.Offset(), 0) + _, _ = l.Next() + test.T(t, l.Offset(), 4) //

+ _, _ = l.Next() + test.T(t, l.Offset(), 20) // text + _, _ = l.Next() + test.T(t, l.Offset(), 26) //

+} + //////////////////////////////////////////////////////////////// func ExampleNewLexer() { diff -Nru golang-github-tdewolff-parse-2.3.9/xml/util.go golang-github-tdewolff-parse-2.4.2/xml/util.go --- golang-github-tdewolff-parse-2.3.9/xml/util.go 2019-08-22 18:19:17.000000000 +0000 +++ golang-github-tdewolff-parse-2.4.2/xml/util.go 2019-12-17 13:35:25.000000000 +0000 @@ -1,7 +1,5 @@ package xml -import "github.com/tdewolff/parse/v2" - var ( ltEntityBytes = []byte("<") ampEntityBytes = []byte("&") @@ -9,20 +7,19 @@ doubleQuoteEntityBytes = []byte(""") ) +// Entities are all named character entities. +var EntitiesMap = map[string][]byte{ + "apos": []byte("'"), + "gt": []byte(">"), + "quot": []byte("\""), +} + // EscapeAttrVal returns the escape attribute value bytes without quotes. func EscapeAttrVal(buf *[]byte, b []byte) []byte { singles := 0 doubles := 0 - for i, c := range b { - if c == '&' { - if quote, n := parse.QuoteEntity(b[i:]); n > 0 { - if quote == '"' { - doubles++ - } else { - singles++ - } - } - } else if c == '"' { + for _, c := range b { + if c == '"' { doubles++ } else if c == '\'' { singles++ @@ -49,18 +46,7 @@ j := 1 start := 0 for i, c := range b { - if c == '&' { - if entityQuote, n := parse.QuoteEntity(b[i:]); n > 0 { - j += copy(t[j:], b[start:i]) - if entityQuote != quote { - t[j] = entityQuote - j++ - } else { - j += copy(t[j:], escapedQuote) - } - start = i + n - } - } else if c == quote { + if c == quote { j += copy(t[j:], b[start:i]) j += copy(t[j:], escapedQuote) start = i + 1 diff -Nru golang-github-tdewolff-parse-2.3.9/xml/util_test.go golang-github-tdewolff-parse-2.4.2/xml/util_test.go --- golang-github-tdewolff-parse-2.3.9/xml/util_test.go 2019-08-22 18:19:17.000000000 +0000 +++ golang-github-tdewolff-parse-2.4.2/xml/util_test.go 2019-12-17 13:35:25.000000000 +0000 @@ -11,15 +11,14 @@ attrVal string expected string }{ - {"xyz", "\"xyz\""}, - {"", "\"\""}, - {"x&z", "\"x&z\""}, - {"x'z", "\"x'z\""}, - {"x\"z", "'x\"z'"}, - {"a'b=\"\"", "'a'b=\"\"'"}, - {"'x'\"'z'", "\"x'"'z\""}, - {"\"x"'"z\"", "'x\"'\"z'"}, - {"a'b=\"\"", "'a'b=\"\"'"}, + {`xyz`, `"xyz"`}, + {``, `""`}, + {`x'z`, `"x'z"`}, + {`x"z`, `'x"z'`}, + {`a'b=""`, `'a'b=""'`}, + {`'x'"'z'`, `"x'"'z"`}, + {`"x"'"z"`, `'x"'"z'`}, + {`a'b=""`, `'a'b=""'`}, } var buf []byte for _, tt := range attrValTests { @@ -47,6 +46,7 @@ {"", ""}, {"", " a "}, {"", ""}, + {"", " a ]]> b "}, } var buf []byte for _, tt := range CDATAValTests {