diff --git a/components/DataLiberation/CSS/class-cssprocessor.php b/components/DataLiberation/CSS/class-cssprocessor.php index e7f50adb..f4dda307 100644 --- a/components/DataLiberation/CSS/class-cssprocessor.php +++ b/components/DataLiberation/CSS/class-cssprocessor.php @@ -632,9 +632,10 @@ public function get_normalized_token(): ?string { return null; } - return $this->decode_string_or_url( + return $this->decode_range( $this->token_starts_at, - $this->token_length + $this->token_length, + self::TOKEN_STRING === $this->token_type ); } @@ -680,29 +681,39 @@ public function get_token_value() { switch ( $this->token_type ) { case self::TOKEN_HASH: // Hash value starts after the # character. - $this->token_value = $this->decode_string_or_url( $this->token_starts_at + 1, $this->token_length - 1 ); + $this->token_value = $this->decode_range( $this->token_starts_at + 1, $this->token_length - 1 ); break; case self::TOKEN_AT_KEYWORD: // At-keyword value starts after the @ character. - $this->token_value = $this->decode_string_or_url( $this->token_starts_at + 1, $this->token_length - 1 ); + $this->token_value = $this->decode_range( $this->token_starts_at + 1, $this->token_length - 1 ); break; case self::TOKEN_FUNCTION: // Function name is everything except the final (. - $this->token_value = $this->decode_string_or_url( $this->token_starts_at, $this->token_length - 1 ); + $this->token_value = $this->decode_range( $this->token_starts_at, $this->token_length - 1 ); break; case self::TOKEN_IDENT: // Identifier is the entire token. - $this->token_value = $this->decode_string_or_url( $this->token_starts_at, $this->token_length ); + $this->token_value = $this->decode_range( $this->token_starts_at, $this->token_length ); break; case self::TOKEN_STRING: + if ( null !== $this->token_value_starts_at && null !== $this->token_value_length ) { + $this->token_value = $this->decode_range( + $this->token_value_starts_at, + $this->token_value_length, + true + ); + } else { + $this->token_value = null; + } + break; + case self::TOKEN_URL: - // Decode and cache the string/URL value. if ( null !== $this->token_value_starts_at && null !== $this->token_value_length ) { - $this->token_value = $this->decode_string_or_url( + $this->token_value = $this->decode_range( $this->token_value_starts_at, $this->token_value_length ); @@ -713,7 +724,7 @@ public function get_token_value() { case self::TOKEN_DELIM: // Delim value is the single code point. - $this->token_value = $this->decode_string_or_url( $this->token_starts_at, $this->token_length ); + $this->token_value = $this->decode_range( $this->token_starts_at, $this->token_length ); break; case self::TOKEN_NUMBER: @@ -1183,7 +1194,7 @@ private function consume_numeric(): bool { // Consume an ident sequence. Set the 's unit to the returned value. $unit_starts_at = $this->at; $this->consume_ident_sequence(); - $this->token_unit = $this->decode_string_or_url( $unit_starts_at, $this->at - $unit_starts_at ); + $this->token_unit = $this->decode_range( $unit_starts_at, $this->at - $unit_starts_at ); $this->token_type = self::TOKEN_DIMENSION; $this->token_length = $this->at - $this->token_starts_at; return true; @@ -1218,7 +1229,7 @@ private function consume_ident_like(): bool { // Consume an ident sequence, and let string be the result. $ident_start = $this->at; $decoded = $this->consume_ident_sequence(); - $string = $decoded ?? $this->decode_string_or_url( $ident_start, $this->at - $ident_start ); + $string = $decoded ?? $this->decode_range( $ident_start, $this->at - $ident_start ); // If string's value is an ASCII case-insensitive match for "url", // and the next input code point is U+0028 LEFT PARENTHESIS ((). @@ -1537,19 +1548,27 @@ private function consume_ident_start_codepoint( $at ): int { } /** - * Decodes a string or URL value with escape sequences and normalization. - * - * Fast path: If the slice contains no special characters, returns the raw - * substring with almost zero allocations. - * - * Slow path: Builds the decoded string by optionally processing escapes and - * normalizing line endings and null bytes. - * - * @param int $start Start byte offset. - * @param int $length Length of the substring to decode. - * @return string Decoded/normalized string. + * Decodes and normalizes ident-like or string CSS values from a byte range. + * + * For example: + * ┌──────────────┬────────┐ + * │ Input │ Output │ + * ├──────────────┼────────┤ + * │ 'xyz' │ 'xyz' │ + * │ '\x\y\z' │ 'xyz' │ + * │ 'x\79z' │ 'xyz' │ + * │ 'x\000079 z' │ 'xyz' │ + * │ 'a\r\nb' │ 'a\nb' │ + * │ 'a\0b' │ 'a�b' │ + * └──────────────┴────────┘ + * + * @param int $start Start byte offset. + * @param int $length Length of the substring to decode. + * @param bool $string_escapes Optional, default false. When true, apply additional escape + * rules that apply only to string tokens (CSS §4.3.5). + * @return string Decoded and normalized string. */ - private function decode_string_or_url( int $start, int $length ): string { + private function decode_range( int $start, int $length, bool $string_escapes = false ): string { // Fast path: check if any processing is needed. $slice = wp_scrub_utf8( substr( $this->css, $start, $length ) ); $special_chars = "\\\r\f\x00"; @@ -1579,8 +1598,39 @@ private function decode_string_or_url( int $start, int $length ): string { $char = $this->css[ $at ]; - // Handle escapes (if enabled). + // Handle escapes. if ( '\\' === $char ) { + /* + * String tokens have special escape rules per §4.3.5: + * - 0x5C (backslash) at EOF: consume the backslash, produce no value. + * - 0x5C (backslash) followed by 0x0A (LF), 0x0C (FF), or 0x0D (CR): + * consume both characters as a line continuation, produce no value. + * - 0x5C (backslash) followed by 0x0D 0x0A (CRLF): + * consume all three characters as a line continuation, produce no value. + * These must be checked before the general escape path. + */ + if ( $string_escapes ) { + if ( $at + 1 >= $end ) { + // 0x5C at EOF: consume the backslash and stop. + ++$at; + continue; + } + $next = $this->css[ $at + 1 ]; + if ( "\n" === $next || "\f" === $next ) { + // 0x5C followed by 0x0A (LF) or 0x0C (FF): line continuation. + $at += 2; + continue; + } + if ( "\r" === $next ) { + // 0x5C followed by 0x0D (CR): line continuation; 0x0D 0x0A counts as one newline. + $at += 2; + if ( $at < $end && "\n" === $this->css[ $at ] ) { + ++$at; + } + continue; + } + } + if ( $this->is_valid_escape( $at ) ) { ++$at; $decoded .= $this->decode_escape_at( $at, $bytes_consumed ); diff --git a/components/DataLiberation/Tests/CSSProcessorTest.php b/components/DataLiberation/Tests/CSSProcessorTest.php index a6f7f7f3..d606f691 100644 --- a/components/DataLiberation/Tests/CSSProcessorTest.php +++ b/components/DataLiberation/Tests/CSSProcessorTest.php @@ -5,6 +5,8 @@ /** * Comprehensive CSS processor tests based on the CSS Syntax Level 3 specification. + * + * @group css */ class CSSProcessorTest extends TestCase { @@ -25,7 +27,7 @@ public function test_processor_matches_spec( string $css, array $expected_tokens * @see https://github.com/romainmenke/css-processor-tests/ * @return array */ - static public function corpus_provider(): array { + public static function corpus_provider(): array { return json_decode(file_get_contents(__DIR__ . '/css-test-cases.json'), true); } @@ -35,7 +37,7 @@ static public function corpus_provider(): array { * @param CSSProcessor $processor The CSS processor. * @return array Array of tokens with type, raw, startIndex, endIndex, structured. */ - static public function collect_tokens( CSSProcessor $processor, $keys = null ): array { + public static function collect_tokens( CSSProcessor $processor, $keys = null ): array { $tokens = array(); while ( $processor->next_token() ) { @@ -1542,6 +1544,144 @@ public function test_ident_start_codepoint_bounds_check(): void { $this->assertSame( $expected_tokens, $actual_tokens ); } + /** + * Tests that backslash-newline in a string token contributes nothing to the value. + * + * CSS spec §4.3.5 consume-string-token: + * > U+005C REVERSE SOLIDUS (\) + * > Otherwise, if the next input code point is a newline, consume it. + * + * The backslash and newline are both consumed and produce no value. + * + * @see https://www.w3.org/TR/css-syntax-3/#consume-string-token + * @see https://github.com/WordPress/php-toolkit/issues/222 + * + * @dataProvider data_string_backslash_newline + */ + public function test_string_backslash_newline( string $css, string $expected_value ): void { + $processor = CSSProcessor::create( $css ); + $this->assertTrue( $processor->next_token() ); + $this->assertSame( CSSProcessor::TOKEN_STRING, $processor->get_token_type() ); + $this->assertSame( $expected_value, $processor->get_token_value() ); + } + + public static function data_string_backslash_newline(): array { + return array( + 'backslash-LF' => array( "'str\\\ning'", 'string' ), + 'backslash-FF' => array( "'str\\\fing'", 'string' ), + 'backslash-CR' => array( "'str\\\ring'", 'string' ), + 'backslash-CRLF' => array( "'str\\\r\ning'", 'string' ), + ); + } + + /** + * Tests that backslash-EOF in a string token contributes nothing to the value. + * + * CSS spec §4.3.5 consume-string-token: + * > U+005C REVERSE SOLIDUS (\) + * > If the next input code point is EOF, do nothing. + * + * The trailing backslash is consumed and produces no value. + * + * @see https://www.w3.org/TR/css-syntax-3/#consume-string-token + * @see https://github.com/WordPress/php-toolkit/issues/223 + */ + public function test_string_backslash_eof(): void { + $processor = CSSProcessor::create( "'string\\" ); + $this->assertTrue( $processor->next_token() ); + $this->assertSame( CSSProcessor::TOKEN_STRING, $processor->get_token_type() ); + $this->assertSame( 'string', $processor->get_token_value() ); + } + + /** + * Tests that backslash-newline in an unquoted URL produces a bad-url token. + * + * In unquoted URLs, the backslash-newline check goes through is_valid_escape() + * which returns false for newlines, triggering consume_remnants_of_bad_url(). + * + * @see https://www.w3.org/TR/css-syntax-3/#consume-url-token + * + * @dataProvider data_url_backslash_newline + */ + public function test_url_backslash_newline( string $css ): void { + $processor = CSSProcessor::create( $css ); + + $found_bad_url = false; + while ( $processor->next_token() ) { + if ( CSSProcessor::TOKEN_BAD_URL === $processor->get_token_type() ) { + $found_bad_url = true; + break; + } + } + + $this->assertTrue( $found_bad_url, 'Expected a BAD_URL token but none was found.' ); + } + + public static function data_url_backslash_newline(): array { + return array( + 'backslash-LF' => array( "url(ab\\\ncd)" ), + 'backslash-FF' => array( "url(ab\\\fcd)" ), + 'backslash-CR' => array( "url(ab\\\rcd)" ), + 'backslash-CRLF' => array( "url(ab\\\r\ncd)" ), + ); + } + + /** + * Tests that backslash-EOF in an unquoted URL produces U+FFFD in the value. + * + * In unquoted URLs, is_valid_escape() returns true for backslash-EOF, + * and consuming the escaped code point at EOF produces U+FFFD per spec. + * + * @see https://www.w3.org/TR/css-syntax-3/#consume-url-token + */ + public function test_url_backslash_eof(): void { + $processor = CSSProcessor::create( "url(string\\" ); + $this->assertTrue( $processor->next_token() ); + $this->assertSame( CSSProcessor::TOKEN_URL, $processor->get_token_type() ); + $this->assertSame( "string\u{FFFD}", $processor->get_token_value() ); + } + + /** + * Tests that backslash-newline stops an ident sequence. + * + * In idents, is_valid_escape() returns false for backslash-newline, + * so the ident stops before the backslash. + * + * @see https://www.w3.org/TR/css-syntax-3/#consume-name + * + * @dataProvider data_ident_backslash_newline + */ + public function test_ident_backslash_newline( string $css ): void { + $processor = CSSProcessor::create( $css ); + $this->assertTrue( $processor->next_token() ); + $this->assertSame( CSSProcessor::TOKEN_IDENT, $processor->get_token_type() ); + $this->assertSame( 'abc', $processor->get_token_value() ); + } + + public static function data_ident_backslash_newline(): array { + return array( + 'backslash-LF' => array( "abc\\\n" ), + 'backslash-FF' => array( "abc\\\f" ), + 'backslash-CR' => array( "abc\\\r" ), + 'backslash-CRLF' => array( "abc\\\r\n" ), + ); + } + + /** + * Tests that backslash-EOF in an ident produces U+FFFD in the value. + * + * In idents, is_valid_escape() returns true for backslash-EOF, + * and consuming the escaped code point at EOF produces U+FFFD per spec. + * + * @see https://www.w3.org/TR/css-syntax-3/#consume-name + */ + public function test_ident_backslash_eof(): void { + $processor = CSSProcessor::create( "abc\\" ); + $this->assertTrue( $processor->next_token() ); + $this->assertSame( CSSProcessor::TOKEN_IDENT, $processor->get_token_type() ); + $this->assertSame( "abc\u{FFFD}", $processor->get_token_value() ); + } + /** * Tests that bad-string-token returns null for get_token_value(). * diff --git a/components/DataLiberation/Tests/CSSUrlProcessorTest.php b/components/DataLiberation/Tests/CSSUrlProcessorTest.php index 9fb9eb6a..050ce3b8 100644 --- a/components/DataLiberation/Tests/CSSUrlProcessorTest.php +++ b/components/DataLiberation/Tests/CSSUrlProcessorTest.php @@ -3,6 +3,9 @@ use PHPUnit\Framework\TestCase; use WordPress\DataLiberation\URL\CSSURLProcessor; +/** + * @group css + */ class CSSURLProcessorTest extends TestCase { /** diff --git a/components/DataLiberation/Tests/css-test-cases.json b/components/DataLiberation/Tests/css-test-cases.json index c1ead99f..542814a3 100644 --- a/components/DataLiberation/Tests/css-test-cases.json +++ b/components/DataLiberation/Tests/css-test-cases.json @@ -273,8 +273,8 @@ "raw": "\"foo\\\n\"", "startIndex": 0, "endIndex": 7, - "normalized": "\"foo\\\n\"", - "value": "foo\\\n" + "normalized": "\"foo\"", + "value": "foo" }, { "type": "whitespace-token", @@ -331,8 +331,8 @@ "raw": "\"foo\\\r\n\"", "startIndex": 0, "endIndex": 8, - "normalized": "\"foo\\\n\"", - "value": "foo\\\n" + "normalized": "\"foo\"", + "value": "foo" }, { "type": "whitespace-token", @@ -1497,8 +1497,8 @@ "raw": "\"lQh{R5M!QyOWE}oC2{2K TIa9}zb2oXWREY]0aj5J\\\r\nBJ5CO-16W5H7noF 19䀹41H3e8Z9%tg[O5AHEY24xh'9\"", "startIndex": 11, "endIndex": 102, - "normalized": "\"lQh{R5M!QyOWE}oC2{2K TIa9}zb2oXWREY]0aj5J\\\nBJ5CO-16W5H7noF 19䀹41H3e8Z9%tg[O5AHEY24xh'9\"", - "value": "lQh{R5M!QyOWE}oC2{2K TIa9}zb2oXWREY]0aj5J\\\nBJ5CO-16W5H7noF 19䀹41H3e8Z9%tg[O5AHEY24xh'9" + "normalized": "\"lQh{R5M!QyOWE}oC2{2K TIa9}zb2oXWREY]0aj5JBJ5CO-16W5H7noF 19䀹41H3e8Z9%tg[O5AHEY24xh'9\"", + "value": "lQh{R5M!QyOWE}oC2{2K TIa9}zb2oXWREY]0aj5JBJ5CO-16W5H7noF 19䀹41H3e8Z9%tg[O5AHEY24xh'9" }, { "type": "string-token", @@ -1721,8 +1721,8 @@ "raw": "'E{z\u0000U\u001fEG2}2Verb>nj3TVk3mu7wX1J\b.H\u000bi1Ga8f5 dserqydJ3\"xj398xy.W\" uHQbv7Bw1NtF;N3PwNY7Vx00BF o\"4CXzvP\"{594 6r}8QQKNQw135i1\\\r\nrey\thg7[5%rBK8RUC64Lu␌17O{E\\90873u}1O3vx4gHTC55Q9i4\"V3Vx4\"7r(34L]F\"ns2pPf\"V7b)EOBGH8rdC7\"\u000eVJ4OQ[ 9jtoMdINgS7o�206vo72kTcKkZR9wl30G'", "startIndex": 3, "endIndex": 263, - "normalized": "'E{z�U\u001fEG2}2Verb>nj3TVk3mu7wX1J\b.H\u000bi1Ga8f5 dserqydJ3\"xj398xy.W\" uHQbv7Bw1NtF;N3PwNY7Vx00BF o\"4CXzvP\"{594 6r}8QQKNQw135i1\\\nrey\thg7[5%rBK8RUC64Lu␌17O{E򐡳u}1O3vx4gHTC55Q9i4\"V3Vx4\"7r(34L]F\"ns2pPf\"V7b)EOBGH8rdC7\"\u000eVJ4OQ[ 9jtoMdINgS7o�206vo72kTcKkZR9wl30G'", - "value": "E{z�U\u001fEG2}2Verb>nj3TVk3mu7wX1J\b.H\u000bi1Ga8f5 dserqydJ3\"xj398xy.W\" uHQbv7Bw1NtF;N3PwNY7Vx00BF o\"4CXzvP\"{594 6r}8QQKNQw135i1\\\nrey\thg7[5%rBK8RUC64Lu␌17O{E򐡳u}1O3vx4gHTC55Q9i4\"V3Vx4\"7r(34L]F\"ns2pPf\"V7b)EOBGH8rdC7\"\u000eVJ4OQ[ 9jtoMdINgS7o�206vo72kTcKkZR9wl30G" + "normalized": "'E{z�U\u001fEG2}2Verb>nj3TVk3mu7wX1J\b.H\u000bi1Ga8f5 dserqydJ3\"xj398xy.W\" uHQbv7Bw1NtF;N3PwNY7Vx00BF o\"4CXzvP\"{594 6r}8QQKNQw135i1rey\thg7[5%rBK8RUC64Lu␌17O{E򐡳u}1O3vx4gHTC55Q9i4\"V3Vx4\"7r(34L]F\"ns2pPf\"V7b)EOBGH8rdC7\"\u000eVJ4OQ[ 9jtoMdINgS7o�206vo72kTcKkZR9wl30G'", + "value": "E{z�U\u001fEG2}2Verb>nj3TVk3mu7wX1J\b.H\u000bi1Ga8f5 dserqydJ3\"xj398xy.W\" uHQbv7Bw1NtF;N3PwNY7Vx00BF o\"4CXzvP\"{594 6r}8QQKNQw135i1rey\thg7[5%rBK8RUC64Lu␌17O{E򐡳u}1O3vx4gHTC55Q9i4\"V3Vx4\"7r(34L]F\"ns2pPf\"V7b)EOBGH8rdC7\"\u000eVJ4OQ[ 9jtoMdINgS7o�206vo72kTcKkZR9wl30G" }, { "type": "ident-token", @@ -1893,8 +1893,8 @@ "raw": "'\\\r\n{X