Hook up Balancer as a Tidy implementation.
[lhc/web/wiklou.git] / tests / phpunit / includes / tidy / BalancerTest.php
1 <?php
2
/**
 * Feeds the test cases stored in html5lib-tests.json through
 * MediaWiki\Tidy\Balancer and compares the result with the expected
 * no-quirks <body> serialization recorded for each case.
 */
class BalancerTest extends MediaWikiTestCase {
	/** @var MediaWiki\Tidy\Balancer Instance under test; created in setUp(). */
	private $balancer;

	/**
	 * Anything that needs to happen before your tests should go here.
	 */
	protected function setUp() {
		// Be sure to call the parent setup and teardown functions.
		// This makes sure that all the various cleanup and restorations
		// happen as they should (including the restoration for setMwGlobals).
		parent::setUp();
		$this->balancer = new MediaWiki\Tidy\Balancer( [
			'strict' => false, /* not strict */
			'allowedHtmlElements' => null, /* no sanitization */
			'tidyCompat' => false, /* standard parser */
		] );
	}

	/**
	 * Any cleanup you need to do should go here.
	 */
	protected function tearDown() {
		parent::tearDown();
	}

	/**
	 * Balance one input string and compare it with the expected markup.
	 *
	 * @covers MediaWiki\Tidy\Balancer::balance
	 * @dataProvider provideBalancerTests
	 *
	 * @param string $description Assertion message (the fixture file name)
	 * @param string $input Markup handed to the balancer
	 * @param string $expected Markup the balancer is expected to return
	 */
	public function testBalancer( $description, $input, $expected ) {
		$output = $this->balancer->balance( $input );
		$this->assertEquals( $expected, $output, $description );
	}

	/**
	 * Build the data sets for testBalancer() from html5lib-tests.json,
	 * dropping the cases which use constructs the balancer doesn't support.
	 *
	 * @return array[] List of [ description, input, expected ] triples
	 * @throws RuntimeException If the fixture file cannot be read or decoded
	 */
	public static function provideBalancerTests() {
		// Get the tests from html5lib-tests.json
		$path = __DIR__ . '/html5lib-tests.json';
		$contents = file_get_contents( $path );
		if ( $contents === false ) {
			// Fail loudly instead of silently providing zero test cases.
			throw new RuntimeException( "Unable to read test fixture: $path" );
		}
		$json = json_decode( $contents, true );
		if ( !is_array( $json ) ) {
			throw new RuntimeException(
				"Unable to decode test fixture: $path (JSON error code " .
				json_last_error() . ')'
			);
		}

		// Munge this slightly into the format phpunit expects
		// for providers, and filter out HTML constructs which
		// the balancer doesn't support.
		$tests = [];
		$start = '<html><head></head><body>';
		$end = '</body></html>';
		// Loop-invariant lengths of the wrapper markup.
		$startLen = strlen( $start );
		$endLen = strlen( $end );
		foreach ( $json as $filename => $cases ) {
			foreach ( $cases as $case ) {
				// The full-document serialization is only used to detect
				// cases that put something outside a plain <body>.
				$fullHtml = $case['document']['html'];
				if (
					substr( $fullHtml, 0, $startLen ) !== $start ||
					substr( $fullHtml, -$endLen ) !== $end
				) {
					// Skip tests which involve stuff in the <head> or
					// weird doctypes.
					continue;
				}
				// We used to strip $start and $end from the full document,
				// but now we use a different field in the test case,
				// which reports how domino would parse this case in a
				// no-quirks <body> context. (The original test case may
				// have had a different context, or relied on quirks mode.)
				$html = $case['document']['noQuirksBodyHtml'];
				// Normalize case of SVG attributes.
				$html = str_replace( 'foreignObject', 'foreignobject', $html );
				// The Sanitizer sorts attributes.
				$html = preg_replace( '/(size="[^"]+") (id="[^"]+")/', '$2 $1', $html );

				if ( isset( $case['document']['props']['comment'] ) ) {
					// Skip tests which include HTML comments, which
					// the balancer requires to have been stripped.
					continue;
				}
				if ( strpos( $case['data'], '<![CDATA[' ) !== false ) {
					// Skip tests involving <![CDATA[ ]]> quoting.
					continue;
				}
				if ( stripos( $case['data'], '<!DOCTYPE' ) !== false ) {
					// Skip tests involving doctypes.
					continue;
				}
				if ( preg_match( ',</?(html|head|body|frame|plaintext)>|<rdar:,i', $case['data'] ) ) {
					// Skip tests involving some literal tags, which are
					// unsupported but don't show up in the expected output.
					continue;
				}
				if (
					isset( $case['document']['props']['tags']['form'] ) ||
					isset( $case['document']['props']['tags']['iframe'] ) ||
					isset( $case['document']['props']['tags']['noembed'] ) ||
					isset( $case['document']['props']['tags']['noscript'] ) ||
					isset( $case['document']['props']['tags']['script'] ) ||
					isset( $case['document']['props']['tags']['select'] ) ||
					isset( $case['document']['props']['tags']['svg script'] ) ||
					isset( $case['document']['props']['tags']['svg title'] ) ||
					isset( $case['document']['props']['tags']['textarea'] ) ||
					isset( $case['document']['props']['tags']['title'] ) ||
					isset( $case['document']['props']['tags']['xmp'] )
				) {
					// Skip tests with unsupported tags which *do* show
					// up in the expected output.
					continue;
				}
				if (
					$filename === 'entities01.dat' ||
					$filename === 'entities02.dat' ||
					preg_match( '/&([a-z]+|#x[0-9A-F]+);/i', $case['data'] ) ||
					preg_match( '/^(&|&#|&#X|&#x|&#45|&x-test|&AMP)$/', $case['data'] )
				) {
					// Skip tests involving entity encoding.
					continue;
				}
				if (
					isset( $case['document']['props']['tagWithLt'] ) ||
					isset( $case['document']['props']['attrWithFunnyChar'] ) ||
					preg_match( ':^(</b test|<di|<foo bar=qux/>)$:', $case['data'] ) ||
					preg_match( ':</p<p>:', $case['data'] )
				) {
					// Skip tests with funny tag or attribute names,
					// which are really tests of the HTML tokenizer, not
					// the tree builder.
					continue;
				}
				if (
					stripos( $case['data'], 'encoding=" text/html "' ) !== false
				) {
					// The Sanitizer normalizes whitespace in attribute
					// values, which makes this test case invalid.
					continue;
				}
				if ( $filename === 'plain-text-unsafe.dat' ) {
					// Skip tests with ASCII null, etc.
					continue;
				}
				$tests[] = [
					$filename, # use better description?
					$case['data'],
					$html
				];
			}
		}
		return $tests;
	}
}