Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
98 changes: 74 additions & 24 deletions components/DataLiberation/CSS/class-cssprocessor.php
Original file line number Diff line number Diff line change
Expand Up @@ -632,9 +632,10 @@ public function get_normalized_token(): ?string {
return null;
}

return $this->decode_string_or_url(
return $this->decode_range(
$this->token_starts_at,
$this->token_length
$this->token_length,
self::TOKEN_STRING === $this->token_type
);
}

Expand Down Expand Up @@ -680,29 +681,39 @@ public function get_token_value() {
switch ( $this->token_type ) {
case self::TOKEN_HASH:
// Hash value starts after the # character.
$this->token_value = $this->decode_string_or_url( $this->token_starts_at + 1, $this->token_length - 1 );
$this->token_value = $this->decode_range( $this->token_starts_at + 1, $this->token_length - 1 );
break;

case self::TOKEN_AT_KEYWORD:
// At-keyword value starts after the @ character.
$this->token_value = $this->decode_string_or_url( $this->token_starts_at + 1, $this->token_length - 1 );
$this->token_value = $this->decode_range( $this->token_starts_at + 1, $this->token_length - 1 );
break;

case self::TOKEN_FUNCTION:
// Function name is everything except the final (.
$this->token_value = $this->decode_string_or_url( $this->token_starts_at, $this->token_length - 1 );
$this->token_value = $this->decode_range( $this->token_starts_at, $this->token_length - 1 );
break;

case self::TOKEN_IDENT:
// Identifier is the entire token.
$this->token_value = $this->decode_string_or_url( $this->token_starts_at, $this->token_length );
$this->token_value = $this->decode_range( $this->token_starts_at, $this->token_length );
break;

case self::TOKEN_STRING:
if ( null !== $this->token_value_starts_at && null !== $this->token_value_length ) {
$this->token_value = $this->decode_range(
$this->token_value_starts_at,
$this->token_value_length,
true
);
} else {
$this->token_value = null;
}
break;

case self::TOKEN_URL:
// Decode and cache the string/URL value.
if ( null !== $this->token_value_starts_at && null !== $this->token_value_length ) {
$this->token_value = $this->decode_string_or_url(
$this->token_value = $this->decode_range(
$this->token_value_starts_at,
$this->token_value_length
);
Expand All @@ -713,7 +724,7 @@ public function get_token_value() {

case self::TOKEN_DELIM:
// Delim value is the single code point.
$this->token_value = $this->decode_string_or_url( $this->token_starts_at, $this->token_length );
$this->token_value = $this->decode_range( $this->token_starts_at, $this->token_length );
break;

case self::TOKEN_NUMBER:
Expand Down Expand Up @@ -1183,7 +1194,7 @@ private function consume_numeric(): bool {
// Consume an ident sequence. Set the <dimension-token>'s unit to the returned value.
$unit_starts_at = $this->at;
$this->consume_ident_sequence();
$this->token_unit = $this->decode_string_or_url( $unit_starts_at, $this->at - $unit_starts_at );
$this->token_unit = $this->decode_range( $unit_starts_at, $this->at - $unit_starts_at );
$this->token_type = self::TOKEN_DIMENSION;
$this->token_length = $this->at - $this->token_starts_at;
return true;
Expand Down Expand Up @@ -1218,7 +1229,7 @@ private function consume_ident_like(): bool {
// Consume an ident sequence, and let string be the result.
$ident_start = $this->at;
$decoded = $this->consume_ident_sequence();
$string = $decoded ?? $this->decode_string_or_url( $ident_start, $this->at - $ident_start );
$string = $decoded ?? $this->decode_range( $ident_start, $this->at - $ident_start );

// If string's value is an ASCII case-insensitive match for "url",
// and the next input code point is U+0028 LEFT PARENTHESIS (().
Expand Down Expand Up @@ -1537,19 +1548,27 @@ private function consume_ident_start_codepoint( $at ): int {
}

/**
* Decodes a string or URL value with escape sequences and normalization.
*
* Fast path: If the slice contains no special characters, returns the raw
* substring with almost zero allocations.
*
* Slow path: Builds the decoded string by optionally processing escapes and
* normalizing line endings and null bytes.
*
* @param int $start Start byte offset.
* @param int $length Length of the substring to decode.
* @return string Decoded/normalized string.
* Decodes and normalizes ident-like or string CSS values from a byte range.
*
* For example:
* ┌──────────────┬────────┐
* │ Input │ Output │
* ├──────────────┼────────┤
* │ 'xyz' │ 'xyz' │
* │ '\x\y\z' │ 'xyz' │
* │ 'x\79z' │ 'xyz' │
* │ 'x\000079 z' │ 'xyz' │
* │ 'a\r\nb' │ 'a\nb' │
* │ 'a\0b' │ 'a�b' │
* └──────────────┴────────┘
*
* @param int $start Start byte offset.
* @param int $length Length of the substring to decode.
* @param bool $string_escapes Optional, default false. When true, apply additional escape
* rules that apply only to string tokens (CSS §4.3.5).
* @return string Decoded and normalized string.
*/
private function decode_string_or_url( int $start, int $length ): string {
private function decode_range( int $start, int $length, bool $string_escapes = false ): string {
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nice rename @sirreal!

// Fast path: check if any processing is needed.
$slice = wp_scrub_utf8( substr( $this->css, $start, $length ) );
$special_chars = "\\\r\f\x00";
Expand Down Expand Up @@ -1579,8 +1598,39 @@ private function decode_string_or_url( int $start, int $length ): string {

$char = $this->css[ $at ];

// Handle escapes (if enabled).
// Handle escapes.
if ( '\\' === $char ) {
/*
* String tokens have special escape rules per §4.3.5:
* - 0x5C (backslash) at EOF: consume the backslash, produce no value.
* - 0x5C (backslash) followed by 0x0A (LF), 0x0C (FF), or 0x0D (CR):
* consume both characters as a line continuation, produce no value.
* - 0x5C (backslash) followed by 0x0D 0x0A (CRLF):
* consume all three characters as a line continuation, produce no value.
* These must be checked before the general escape path.
*/
if ( $string_escapes ) {
if ( $at + 1 >= $end ) {
// 0x5C at EOF: consume the backslash and stop.
++$at;
continue;
}
$next = $this->css[ $at + 1 ];
if ( "\n" === $next || "\f" === $next ) {
// 0x5C followed by 0x0A (LF) or 0x0C (FF): line continuation.
$at += 2;
continue;
}
if ( "\r" === $next ) {
// 0x5C followed by 0x0D (CR): line continuation; 0x0D 0x0A counts as one newline.
$at += 2;
if ( $at < $end && "\n" === $this->css[ $at ] ) {
++$at;
}
continue;
}
}

if ( $this->is_valid_escape( $at ) ) {
++$at;
$decoded .= $this->decode_escape_at( $at, $bytes_consumed );
Comment on lines 1634 to 1636
Copy link

Copilot AI Apr 8, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In the decode_range() slow path, both the normal segments appended via substr($this->css, ...) earlier in the loop and the decode_escape_at() return value here are not passed through wp_scrub_utf8(). This makes output inconsistent with the fast path (which scrubs) and can leak invalid UTF-8 if the range contains escapes/CR/FF/NUL plus invalid bytes. Consider scrubbing the appended segments (including the decoded escape output) and adding a regression test that combines an invalid byte with a backslash escape.

Copilot uses AI. Check for mistakes.
Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a good spot, but not new in this PR: #228

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nice. It would be great to explore a fix+test in another pr cc @copilot

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copilot opened #229 already

Expand Down
144 changes: 142 additions & 2 deletions components/DataLiberation/Tests/CSSProcessorTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@

/**
* Comprehensive CSS processor tests based on the CSS Syntax Level 3 specification.
*
* @group css
*/
class CSSProcessorTest extends TestCase {

Expand All @@ -25,7 +27,7 @@ public function test_processor_matches_spec( string $css, array $expected_tokens
* @see https://github.com/romainmenke/css-processor-tests/
* @return array
*/
static public function corpus_provider(): array {
public static function corpus_provider(): array {
return json_decode(file_get_contents(__DIR__ . '/css-test-cases.json'), true);
}

Expand All @@ -35,7 +37,7 @@ static public function corpus_provider(): array {
* @param CSSProcessor $processor The CSS processor.
* @return array Array of tokens with type, raw, startIndex, endIndex, structured.
*/
static public function collect_tokens( CSSProcessor $processor, $keys = null ): array {
public static function collect_tokens( CSSProcessor $processor, $keys = null ): array {
$tokens = array();

while ( $processor->next_token() ) {
Expand Down Expand Up @@ -1542,6 +1544,144 @@ public function test_ident_start_codepoint_bounds_check(): void {
$this->assertSame( $expected_tokens, $actual_tokens );
}

/**
* Tests that backslash-newline in a string token contributes nothing to the value.
*
* CSS spec §4.3.5 consume-string-token:
* > U+005C REVERSE SOLIDUS (\)
* > Otherwise, if the next input code point is a newline, consume it.
*
* The backslash and newline are both consumed and produce no value.
*
* @see https://www.w3.org/TR/css-syntax-3/#consume-string-token
* @see https://github.com/WordPress/php-toolkit/issues/222
*
* @dataProvider data_string_backslash_newline
*/
public function test_string_backslash_newline( string $css, string $expected_value ): void {
$processor = CSSProcessor::create( $css );
$this->assertTrue( $processor->next_token() );
$this->assertSame( CSSProcessor::TOKEN_STRING, $processor->get_token_type() );
$this->assertSame( $expected_value, $processor->get_token_value() );
}

public static function data_string_backslash_newline(): array {
return array(
'backslash-LF' => array( "'str\\\ning'", 'string' ),
'backslash-FF' => array( "'str\\\fing'", 'string' ),
'backslash-CR' => array( "'str\\\ring'", 'string' ),
'backslash-CRLF' => array( "'str\\\r\ning'", 'string' ),
);
}

/**
* Tests that backslash-EOF in a string token contributes nothing to the value.
*
* CSS spec §4.3.5 consume-string-token:
* > U+005C REVERSE SOLIDUS (\)
* > If the next input code point is EOF, do nothing.
*
* The trailing backslash is consumed and produces no value.
*
* @see https://www.w3.org/TR/css-syntax-3/#consume-string-token
* @see https://github.com/WordPress/php-toolkit/issues/223
*/
public function test_string_backslash_eof(): void {
$processor = CSSProcessor::create( "'string\\" );
$this->assertTrue( $processor->next_token() );
$this->assertSame( CSSProcessor::TOKEN_STRING, $processor->get_token_type() );
$this->assertSame( 'string', $processor->get_token_value() );
}

/**
* Tests that backslash-newline in an unquoted URL produces a bad-url token.
*
* In unquoted URLs, the backslash-newline check goes through is_valid_escape()
* which returns false for newlines, triggering consume_remnants_of_bad_url().
*
* @see https://www.w3.org/TR/css-syntax-3/#consume-url-token
*
* @dataProvider data_url_backslash_newline
*/
public function test_url_backslash_newline( string $css ): void {
$processor = CSSProcessor::create( $css );

$found_bad_url = false;
while ( $processor->next_token() ) {
if ( CSSProcessor::TOKEN_BAD_URL === $processor->get_token_type() ) {
$found_bad_url = true;
break;
}
}

$this->assertTrue( $found_bad_url, 'Expected a BAD_URL token but none was found.' );
}

public static function data_url_backslash_newline(): array {
return array(
'backslash-LF' => array( "url(ab\\\ncd)" ),
'backslash-FF' => array( "url(ab\\\fcd)" ),
'backslash-CR' => array( "url(ab\\\rcd)" ),
'backslash-CRLF' => array( "url(ab\\\r\ncd)" ),
);
}

/**
* Tests that backslash-EOF in an unquoted URL produces U+FFFD in the value.
*
* In unquoted URLs, is_valid_escape() returns true for backslash-EOF,
* and consuming the escaped code point at EOF produces U+FFFD per spec.
*
* @see https://www.w3.org/TR/css-syntax-3/#consume-url-token
*/
public function test_url_backslash_eof(): void {
$processor = CSSProcessor::create( "url(string\\" );
$this->assertTrue( $processor->next_token() );
$this->assertSame( CSSProcessor::TOKEN_URL, $processor->get_token_type() );
$this->assertSame( "string\u{FFFD}", $processor->get_token_value() );
}

/**
* Tests that backslash-newline stops an ident sequence.
*
* In idents, is_valid_escape() returns false for backslash-newline,
* so the ident stops before the backslash.
*
* @see https://www.w3.org/TR/css-syntax-3/#consume-name
*
* @dataProvider data_ident_backslash_newline
*/
public function test_ident_backslash_newline( string $css ): void {
$processor = CSSProcessor::create( $css );
$this->assertTrue( $processor->next_token() );
$this->assertSame( CSSProcessor::TOKEN_IDENT, $processor->get_token_type() );
$this->assertSame( 'abc', $processor->get_token_value() );
}

public static function data_ident_backslash_newline(): array {
return array(
'backslash-LF' => array( "abc\\\n" ),
'backslash-FF' => array( "abc\\\f" ),
'backslash-CR' => array( "abc\\\r" ),
'backslash-CRLF' => array( "abc\\\r\n" ),
);
}

/**
* Tests that backslash-EOF in an ident produces U+FFFD in the value.
*
* In idents, is_valid_escape() returns true for backslash-EOF,
* and consuming the escaped code point at EOF produces U+FFFD per spec.
*
* @see https://www.w3.org/TR/css-syntax-3/#consume-name
*/
public function test_ident_backslash_eof(): void {
$processor = CSSProcessor::create( "abc\\" );
$this->assertTrue( $processor->next_token() );
$this->assertSame( CSSProcessor::TOKEN_IDENT, $processor->get_token_type() );
$this->assertSame( "abc\u{FFFD}", $processor->get_token_value() );
}

/**
* Tests that bad-string-token returns null for get_token_value().
*
Expand Down
3 changes: 3 additions & 0 deletions components/DataLiberation/Tests/CSSUrlProcessorTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,9 @@
use PHPUnit\Framework\TestCase;
use WordPress\DataLiberation\URL\CSSURLProcessor;

/**
* @group css
*/
class CSSURLProcessorTest extends TestCase {

/**
Expand Down
Loading