Diffstat (limited to 'Godeps/_workspace/src/golang.org/x/net/html/token_test.go')
-rw-r--r--  Godeps/_workspace/src/golang.org/x/net/html/token_test.go  748
1 file changed, 748 insertions, 0 deletions
diff --git a/Godeps/_workspace/src/golang.org/x/net/html/token_test.go b/Godeps/_workspace/src/golang.org/x/net/html/token_test.go
new file mode 100644
index 000000000..20221c328
--- /dev/null
+++ b/Godeps/_workspace/src/golang.org/x/net/html/token_test.go
@@ -0,0 +1,748 @@
+// Copyright 2010 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package html
+
+import (
+	"bytes"
+	"io"
+	"io/ioutil"
+	"reflect"
+	"runtime"
+	"strings"
+	"testing"
+)
+
+type tokenTest struct {
+	// A short description of the test case.
+	desc string
+	// The HTML to parse.
+	html string
+	// The string representations of the expected tokens, joined by '$'.
+	golden string
+}
+
+var tokenTests = []tokenTest{
+	{
+		"empty",
+		"",
+		"",
+	},
+	// A single text node. The tokenizer should not break text nodes on whitespace,
+	// nor should it normalize whitespace within a text node.
+	{
+		"text",
+		"foo  bar",
+		"foo  bar",
+	},
+	// An entity.
+	{
+		"entity",
+		"one &lt; two",
+		"one &lt; two",
+	},
+	// A start, self-closing and end tag. The tokenizer does not care if the start
+	// and end tokens don't match; that is the job of the parser.
+	{
+		"tags",
+		"<a>b<c/>d</e>",
+		"<a>$b$<c/>$d$</e>",
+	},
+	// Angle brackets that aren't a tag.
+	{
+		"not a tag #0",
+		"<",
+		"&lt;",
+	},
+	{
+		"not a tag #1",
+		"</",
+		"&lt;/",
+	},
+	{
+		"not a tag #2",
+		"</>",
+		"<!---->",
+	},
+	{
+		"not a tag #3",
+		"a</>b",
+		"a$<!---->$b",
+	},
+	{
+		"not a tag #4",
+		"</ >",
+		"<!-- -->",
+	},
+	{
+		"not a tag #5",
+		"</.",
+		"<!--.-->",
+	},
+	{
+		"not a tag #6",
+		"</.>",
+		"<!--.-->",
+	},
+	{
+		"not a tag #7",
+		"a < b",
+		"a &lt; b",
+	},
+	{
+		"not a tag #8",
+		"<.>",
+		"&lt;.&gt;",
+	},
+	{
+		"not a tag #9",
+		"a<<<b>>>c",
+		"a&lt;&lt;$<b>$&gt;&gt;c",
+	},
+	{
+		"not a tag #10",
+		"if x<0 and y < 0 then x*y>0",
+		"if x&lt;0 and y &lt; 0 then x*y&gt;0",
+	},
+	{
+		"not a tag #11",
+		"<<p>",
+		"&lt;$<p>",
+	},
+	// EOF in a tag name.
+	{
+		"tag name eof #0",
+		"<a",
+		"",
+	},
+	{
+		"tag name eof #1",
+		"<a ",
+		"",
+	},
+	{
+		"tag name eof #2",
+		"a<b",
+		"a",
+	},
+	{
+		"tag name eof #3",
+		"<a><b",
+		"<a>",
+	},
+	{
+		"tag name eof #4",
+		`<a x`,
+		``,
+	},
+	// Some malformed tags that are missing a '>'.
+	{
+		"malformed tag #0",
+		`<p</p>`,
+		`<p< p="">`,
+	},
+	{
+		"malformed tag #1",
+		`<p </p>`,
+		`<p <="" p="">`,
+	},
+	{
+		"malformed tag #2",
+		`<p id`,
+		``,
+	},
+	{
+		"malformed tag #3",
+		`<p id=`,
+		``,
+	},
+	{
+		"malformed tag #4",
+		`<p id=>`,
+		`<p id="">`,
+	},
+	{
+		"malformed tag #5",
+		`<p id=0`,
+		``,
+	},
+	{
+		"malformed tag #6",
+		`<p id=0</p>`,
+		`<p id="0</p">`,
+	},
+	{
+		"malformed tag #7",
+		`<p id="0</p>`,
+		``,
+	},
+	{
+		"malformed tag #8",
+		`<p id="0"</p>`,
+		`<p id="0" <="" p="">`,
+	},
+	{
+		"malformed tag #9",
+		`<p></p id`,
+		`<p>`,
+	},
+	// Raw text and RCDATA.
+ { + "basic raw text", + "<script><a></b></script>", + "<script>$<a></b>$</script>", + }, + { + "unfinished script end tag", + "<SCRIPT>a</SCR", + "<script>$a</SCR", + }, + { + "broken script end tag", + "<SCRIPT>a</SCR ipt>", + "<script>$a</SCR ipt>", + }, + { + "EOF in script end tag", + "<SCRIPT>a</SCRipt", + "<script>$a</SCRipt", + }, + { + "scriptx end tag", + "<SCRIPT>a</SCRiptx", + "<script>$a</SCRiptx", + }, + { + "' ' completes script end tag", + "<SCRIPT>a</SCRipt ", + "<script>$a", + }, + { + "'>' completes script end tag", + "<SCRIPT>a</SCRipt>", + "<script>$a$</script>", + }, + { + "self-closing script end tag", + "<SCRIPT>a</SCRipt/>", + "<script>$a$</script>", + }, + { + "nested script tag", + "<SCRIPT>a</SCRipt<script>", + "<script>$a</SCRipt<script>", + }, + { + "script end tag after unfinished", + "<SCRIPT>a</SCRipt</script>", + "<script>$a</SCRipt$</script>", + }, + { + "script/style mismatched tags", + "<script>a</style>", + "<script>$a</style>", + }, + { + "style element with entity", + "<style>'", + "<style>$&apos;", + }, + { + "textarea with tag", + "<textarea><div></textarea>", + "<textarea>$<div>$</textarea>", + }, + { + "title with tag and entity", + "<title><b>K&R C</b></title>", + "<title>$<b>K&R C</b>$</title>", + }, + // DOCTYPE tests. + { + "Proper DOCTYPE", + "<!DOCTYPE html>", + "<!DOCTYPE html>", + }, + { + "DOCTYPE with no space", + "<!doctypehtml>", + "<!DOCTYPE html>", + }, + { + "DOCTYPE with two spaces", + "<!doctype html>", + "<!DOCTYPE html>", + }, + { + "looks like DOCTYPE but isn't", + "<!DOCUMENT html>", + "<!--DOCUMENT html-->", + }, + { + "DOCTYPE at EOF", + "<!DOCtype", + "<!DOCTYPE >", + }, + // XML processing instructions. + { + "XML processing instruction", + "<?xml?>", + "<!--?xml?-->", + }, + // Comments. + { + "comment0", + "abc<b><!-- skipme --></b>def", + "abc$<b>$<!-- skipme -->$</b>$def", + }, + { + "comment1", + "a<!-->z", + "a$<!---->$z", + }, + { + "comment2", + "a<!--->z", + "a$<!---->$z", + }, + { + "comment3", + "a<!--x>-->z", + "a$<!--x>-->$z", + }, + { + "comment4", + "a<!--x->-->z", + "a$<!--x->-->$z", + }, + { + "comment5", + "a<!>z", + "a$<!---->$z", + }, + { + "comment6", + "a<!->z", + "a$<!----->$z", + }, + { + "comment7", + "a<!---<>z", + "a$<!---<>z-->", + }, + { + "comment8", + "a<!--z", + "a$<!--z-->", + }, + { + "comment9", + "a<!--z-", + "a$<!--z-->", + }, + { + "comment10", + "a<!--z--", + "a$<!--z-->", + }, + { + "comment11", + "a<!--z---", + "a$<!--z--->", + }, + { + "comment12", + "a<!--z----", + "a$<!--z---->", + }, + { + "comment13", + "a<!--x--!>z", + "a$<!--x-->$z", + }, + // An attribute with a backslash. + { + "backslash", + `<p id="a\"b">`, + `<p id="a\" b"="">`, + }, + // Entities, tag name and attribute key lower-casing, and whitespace + // normalization within a tag. + { + "tricky", + "<p \t\n iD=\"a"B\" foo=\"bar\"><EM>te<&;xt</em></p>", + `<p id="a"B" foo="bar">$<em>$te<&;xt$</em>$</p>`, + }, + // A nonexistent entity. Tokenizing and converting back to a string should + // escape the "&" to become "&". 
+ { + "noSuchEntity", + `<a b="c&noSuchEntity;d"><&alsoDoesntExist;&`, + `<a b="c&noSuchEntity;d">$<&alsoDoesntExist;&`, + }, + { + "entity without semicolon", + `¬it;∉<a b="q=z&=5¬ice=hello¬=world">`, + `¬it;∉$<a b="q=z&amp=5&notice=hello¬=world">`, + }, + { + "entity with digits", + "½", + "½", + }, + // Attribute tests: + // http://dev.w3.org/html5/pf-summary/Overview.html#attributes + { + "Empty attribute", + `<input disabled FOO>`, + `<input disabled="" foo="">`, + }, + { + "Empty attribute, whitespace", + `<input disabled FOO >`, + `<input disabled="" foo="">`, + }, + { + "Unquoted attribute value", + `<input value=yes FOO=BAR>`, + `<input value="yes" foo="BAR">`, + }, + { + "Unquoted attribute value, spaces", + `<input value = yes FOO = BAR>`, + `<input value="yes" foo="BAR">`, + }, + { + "Unquoted attribute value, trailing space", + `<input value=yes FOO=BAR >`, + `<input value="yes" foo="BAR">`, + }, + { + "Single-quoted attribute value", + `<input value='yes' FOO='BAR'>`, + `<input value="yes" foo="BAR">`, + }, + { + "Single-quoted attribute value, trailing space", + `<input value='yes' FOO='BAR' >`, + `<input value="yes" foo="BAR">`, + }, + { + "Double-quoted attribute value", + `<input value="I'm an attribute" FOO="BAR">`, + `<input value="I'm an attribute" foo="BAR">`, + }, + { + "Attribute name characters", + `<meta http-equiv="content-type">`, + `<meta http-equiv="content-type">`, + }, + { + "Mixed attributes", + `a<P V="0 1" w='2' X=3 y>z`, + `a$<p v="0 1" w="2" x="3" y="">$z`, + }, + { + "Attributes with a solitary single quote", + `<p id=can't><p id=won't>`, + `<p id="can't">$<p id="won't">`, + }, +} + +func TestTokenizer(t *testing.T) { +loop: + for _, tt := range tokenTests { + z := NewTokenizer(strings.NewReader(tt.html)) + if tt.golden != "" { + for i, s := range strings.Split(tt.golden, "$") { + if z.Next() == ErrorToken { + t.Errorf("%s token %d: want %q got error %v", tt.desc, i, s, z.Err()) + continue loop + } + actual := z.Token().String() + if s != actual { + t.Errorf("%s token %d: want %q got %q", tt.desc, i, s, actual) + continue loop + } + } + } + z.Next() + if z.Err() != io.EOF { + t.Errorf("%s: want EOF got %q", tt.desc, z.Err()) + } + } +} + +func TestMaxBuffer(t *testing.T) { + // Exceeding the maximum buffer size generates ErrBufferExceeded. + z := NewTokenizer(strings.NewReader("<" + strings.Repeat("t", 10))) + z.SetMaxBuf(5) + tt := z.Next() + if got, want := tt, ErrorToken; got != want { + t.Fatalf("token type: got: %v want: %v", got, want) + } + if got, want := z.Err(), ErrBufferExceeded; got != want { + t.Errorf("error type: got: %v want: %v", got, want) + } + if got, want := string(z.Raw()), "<tttt"; got != want { + t.Fatalf("buffered before overflow: got: %q want: %q", got, want) + } +} + +func TestMaxBufferReconstruction(t *testing.T) { + // Exceeding the maximum buffer size at any point while tokenizing permits + // reconstructing the original input. +tests: + for _, test := range tokenTests { + for maxBuf := 1; ; maxBuf++ { + r := strings.NewReader(test.html) + z := NewTokenizer(r) + z.SetMaxBuf(maxBuf) + var tokenized bytes.Buffer + for { + tt := z.Next() + tokenized.Write(z.Raw()) + if tt == ErrorToken { + if err := z.Err(); err != io.EOF && err != ErrBufferExceeded { + t.Errorf("%s: unexpected error: %v", test.desc, err) + } + break + } + } + // Anything tokenized along with untokenized input or data left in the reader. 
+			assembled, err := ioutil.ReadAll(io.MultiReader(&tokenized, bytes.NewReader(z.Buffered()), r))
+			if err != nil {
+				t.Errorf("%s: ReadAll: %v", test.desc, err)
+				continue tests
+			}
+			if got, want := string(assembled), test.html; got != want {
+				t.Errorf("%s: reassembled html:\n got: %q\nwant: %q", test.desc, got, want)
+				continue tests
+			}
+			// EOF indicates that we completed tokenization and hence found the max
+			// maxBuf that generates ErrBufferExceeded, so continue to the next test.
+			if z.Err() == io.EOF {
+				break
+			}
+		} // buffer sizes
+	} // tests
+}
+
+func TestPassthrough(t *testing.T) {
+	// Accumulating the raw output for each parse event should reconstruct the
+	// original input.
+	for _, test := range tokenTests {
+		z := NewTokenizer(strings.NewReader(test.html))
+		var parsed bytes.Buffer
+		for {
+			tt := z.Next()
+			parsed.Write(z.Raw())
+			if tt == ErrorToken {
+				break
+			}
+		}
+		if got, want := parsed.String(), test.html; got != want {
+			t.Errorf("%s: parsed output:\n got: %q\nwant: %q", test.desc, got, want)
+		}
+	}
+}
+
+func TestBufAPI(t *testing.T) {
+	s := "0<a>1</a>2<b>3<a>4<a>5</a>6</b>7</a>8<a/>9"
+	z := NewTokenizer(bytes.NewBufferString(s))
+	var result bytes.Buffer
+	depth := 0
+loop:
+	for {
+		tt := z.Next()
+		switch tt {
+		case ErrorToken:
+			if z.Err() != io.EOF {
+				t.Error(z.Err())
+			}
+			break loop
+		case TextToken:
+			if depth > 0 {
+				result.Write(z.Text())
+			}
+		case StartTagToken, EndTagToken:
+			tn, _ := z.TagName()
+			if len(tn) == 1 && tn[0] == 'a' {
+				if tt == StartTagToken {
+					depth++
+				} else {
+					depth--
+				}
+			}
+		}
+	}
+	u := "14567"
+	v := string(result.Bytes())
+	if u != v {
+		t.Errorf("TestBufAPI: want %q got %q", u, v)
+	}
+}
+
+func TestConvertNewlines(t *testing.T) {
+	testCases := map[string]string{
+		"Mac\rDOS\r\nUnix\n":    "Mac\nDOS\nUnix\n",
+		"Unix\nMac\rDOS\r\n":    "Unix\nMac\nDOS\n",
+		"DOS\r\nDOS\r\nDOS\r\n": "DOS\nDOS\nDOS\n",
+		"":                      "",
+		"\n":                    "\n",
+		"\n\r":                  "\n\n",
+		"\r":                    "\n",
+		"\r\n":                  "\n",
+		"\r\n\n":                "\n\n",
+		"\r\n\r":                "\n\n",
+		"\r\n\r\n":              "\n\n",
+		"\r\r":                  "\n\n",
+		"\r\r\n":                "\n\n",
+		"\r\r\n\n":              "\n\n\n",
+		"\r\r\r\n":              "\n\n\n",
+		"\r \n":                 "\n \n",
+		"xyz":                   "xyz",
+	}
+	for in, want := range testCases {
+		if got := string(convertNewlines([]byte(in))); got != want {
+			t.Errorf("input %q: got %q, want %q", in, got, want)
+		}
+	}
+}
+
+func TestReaderEdgeCases(t *testing.T) {
+	const s = "<p>An io.Reader can return (0, nil) or (n, io.EOF).</p>"
+	testCases := []io.Reader{
+		&zeroOneByteReader{s: s},
+		&eofStringsReader{s: s},
+		&stuckReader{},
+	}
+	for i, tc := range testCases {
+		got := []TokenType{}
+		z := NewTokenizer(tc)
+		for {
+			tt := z.Next()
+			if tt == ErrorToken {
+				break
+			}
+			got = append(got, tt)
+		}
+		if err := z.Err(); err != nil && err != io.EOF {
+			if err != io.ErrNoProgress {
+				t.Errorf("i=%d: %v", i, err)
+			}
+			continue
+		}
+		want := []TokenType{
+			StartTagToken,
+			TextToken,
+			EndTagToken,
+		}
+		if !reflect.DeepEqual(got, want) {
+			t.Errorf("i=%d: got %v, want %v", i, got, want)
+			continue
+		}
+	}
+}
+
+// zeroOneByteReader is like a strings.Reader that alternates between
+// returning 0 bytes and 1 byte at a time.
+type zeroOneByteReader struct {
+	s string
+	n int
+}
+
+func (r *zeroOneByteReader) Read(p []byte) (int, error) {
+	if len(p) == 0 {
+		return 0, nil
+	}
+	if len(r.s) == 0 {
+		return 0, io.EOF
+	}
+	r.n++
+	if r.n%2 != 0 {
+		return 0, nil
+	}
+	p[0], r.s = r.s[0], r.s[1:]
+	return 1, nil
+}
+
+// eofStringsReader is like a strings.Reader but can return an (n, err) where
+// n > 0 && err != nil.
+type eofStringsReader struct {
+	s string
+}
+
+func (r *eofStringsReader) Read(p []byte) (int, error) {
+	n := copy(p, r.s)
+	r.s = r.s[n:]
+	if r.s != "" {
+		return n, nil
+	}
+	return n, io.EOF
+}
+
+// stuckReader is an io.Reader that always returns no data and no error.
+type stuckReader struct{}
+
+func (*stuckReader) Read(p []byte) (int, error) {
+	return 0, nil
+}
+
+const (
+	rawLevel = iota
+	lowLevel
+	highLevel
+)
+
+func benchmarkTokenizer(b *testing.B, level int) {
+	buf, err := ioutil.ReadFile("testdata/go1.html")
+	if err != nil {
+		b.Fatalf("could not read testdata/go1.html: %v", err)
+	}
+	b.SetBytes(int64(len(buf)))
+	runtime.GC()
+	b.ReportAllocs()
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		z := NewTokenizer(bytes.NewBuffer(buf))
+		for {
+			tt := z.Next()
+			if tt == ErrorToken {
+				if err := z.Err(); err != nil && err != io.EOF {
+					b.Fatalf("tokenizer error: %v", err)
+				}
+				break
+			}
+			switch level {
+			case rawLevel:
+				// Calling z.Raw just returns the raw bytes of the token. It
+				// does not unescape &lt; to <, or lower-case tag names and
+				// attribute keys.
+				z.Raw()
+			case lowLevel:
+				// Calling z.Text, z.TagName and z.TagAttr returns []byte values
+				// whose contents may change on the next call to z.Next.
+				switch tt {
+				case TextToken, CommentToken, DoctypeToken:
+					z.Text()
+				case StartTagToken, SelfClosingTagToken:
+					_, more := z.TagName()
+					for more {
+						_, _, more = z.TagAttr()
+					}
+				case EndTagToken:
+					z.TagName()
+				}
+			case highLevel:
+				// Calling z.Token converts []byte values to strings whose
+				// validity extends beyond the next call to z.Next.
+				z.Token()
+			}
+		}
+	}
+}
+
+func BenchmarkRawLevelTokenizer(b *testing.B)  { benchmarkTokenizer(b, rawLevel) }
+func BenchmarkLowLevelTokenizer(b *testing.B)  { benchmarkTokenizer(b, lowLevel) }
+func BenchmarkHighLevelTokenizer(b *testing.B) { benchmarkTokenizer(b, highLevel) }
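For context on the API these benchmarks compare: z.Token() copies token data into freshly allocated strings, while z.Text(), z.TagName() and z.TagAttr() return slices into the tokenizer's internal buffer that are invalidated by the next call to z.Next. Below is a minimal sketch of the high-level loop, importing the canonical golang.org/x/net/html path rather than this vendored Godeps copy; the input string is purely illustrative.

package main

import (
	"fmt"
	"io"
	"strings"

	"golang.org/x/net/html"
)

func main() {
	z := html.NewTokenizer(strings.NewReader("<p>Hello, <b>world</b>!</p>"))
	for {
		tt := z.Next()
		if tt == html.ErrorToken {
			// io.EOF marks the normal end of input; anything else is a real error.
			if err := z.Err(); err != io.EOF {
				fmt.Println("tokenize error:", err)
			}
			return
		}
		// Token() allocates, so its result stays valid after the next call to
		// Next -- the allocation cost the three benchmark levels above measure.
		fmt.Printf("%v: %s\n", tt, z.Token().String())
	}
}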