Skip to content
Open
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
77 changes: 60 additions & 17 deletions components/DataLiberation/CSS/class-cssprocessor.php
Original file line number Diff line number Diff line change
Expand Up @@ -632,9 +632,10 @@ public function get_normalized_token(): ?string {
return null;
}

return $this->decode_string_or_url(
return $this->decode_escapes(
$this->token_starts_at,
$this->token_length
$this->token_length,
self::TOKEN_STRING === $this->token_type || self::TOKEN_BAD_STRING === $this->token_type
);
}

Expand Down Expand Up @@ -680,42 +681,53 @@ public function get_token_value() {
switch ( $this->token_type ) {
case self::TOKEN_HASH:
// Hash value starts after the # character.
$this->token_value = $this->decode_string_or_url( $this->token_starts_at + 1, $this->token_length - 1 );
$this->token_value = $this->decode_escapes( $this->token_starts_at + 1, $this->token_length - 1 );
break;

case self::TOKEN_AT_KEYWORD:
// At-keyword value starts after the @ character.
$this->token_value = $this->decode_string_or_url( $this->token_starts_at + 1, $this->token_length - 1 );
$this->token_value = $this->decode_escapes( $this->token_starts_at + 1, $this->token_length - 1 );
break;

case self::TOKEN_FUNCTION:
// Function name is everything except the final (.
$this->token_value = $this->decode_string_or_url( $this->token_starts_at, $this->token_length - 1 );
$this->token_value = $this->decode_escapes( $this->token_starts_at, $this->token_length - 1 );
break;

case self::TOKEN_IDENT:
// Identifier is the entire token.
$this->token_value = $this->decode_string_or_url( $this->token_starts_at, $this->token_length );
$this->token_value = $this->decode_escapes( $this->token_starts_at, $this->token_length );
break;

case self::TOKEN_STRING:
case self::TOKEN_BAD_STRING:
// Decode and cache the string value.
if ( null !== $this->token_value_starts_at && null !== $this->token_value_length ) {
$this->token_value = $this->decode_escapes(
$this->token_value_starts_at,
$this->token_value_length,
true
);
} else {
$this->token_value = null;
}
break;

case self::TOKEN_URL:
// Decode and cache the string/URL value.
// Decode and cache the URL value.
if ( null !== $this->token_value_starts_at && null !== $this->token_value_length ) {
$this->token_value = $this->decode_string_or_url(
$this->token_value = $this->decode_escapes(
$this->token_value_starts_at,
$this->token_value_length
);
$this->token_value = $this->token_value;
} else {
$this->token_value = null;
}
break;

case self::TOKEN_DELIM:
// Delim value is the single code point.
$this->token_value = $this->decode_string_or_url( $this->token_starts_at, $this->token_length );
$this->token_value = $this->decode_escapes( $this->token_starts_at, $this->token_length );
break;

case self::TOKEN_NUMBER:
Expand Down Expand Up @@ -1185,7 +1197,7 @@ private function consume_numeric(): bool {
// Consume an ident sequence. Set the <dimension-token>'s unit to the returned value.
$unit_starts_at = $this->at;
$this->consume_ident_sequence();
$this->token_unit = $this->decode_string_or_url( $unit_starts_at, $this->at - $unit_starts_at );
$this->token_unit = $this->decode_escapes( $unit_starts_at, $this->at - $unit_starts_at );
$this->token_type = self::TOKEN_DIMENSION;
$this->token_length = $this->at - $this->token_starts_at;
return true;
Expand Down Expand Up @@ -1220,7 +1232,7 @@ private function consume_ident_like(): bool {
// Consume an ident sequence, and let string be the result.
$ident_start = $this->at;
$decoded = $this->consume_ident_sequence();
$string = $decoded ?? $this->decode_string_or_url( $ident_start, $this->at - $ident_start );
$string = $decoded ?? $this->decode_escapes( $ident_start, $this->at - $ident_start );

// If string's value is an ASCII case-insensitive match for "url",
// and the next input code point is U+0028 LEFT PARENTHESIS (().
Expand Down Expand Up @@ -1547,11 +1559,15 @@ private function consume_ident_start_codepoint( $at ): int {
* Slow path: Builds the decoded string by optionally processing escapes and
* normalizing line endings and null bytes.
*
* @param int $start Start byte offset.
* @param int $length Length of the substring to decode.
* @return string Decoded/normalized string.
* @param int $start Start byte offset.
* @param int $length Length of the substring to decode.
* @param bool $string_escapes Optional, default false. When true, apply special CSS string
* token escape rules:
* - \-newline is consumed as a line continuation (ignored).
* - \-EOF is silently discarded.
Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@copilot This should be consistent with the comment in the body. Below that was changed to use HEX, so lets do that here as well.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done in b0d082e — updated the $string_escapes docblock to use 0x5C (backslash) followed by 0x0A (LF), 0x0C (FF), or 0x0D (CR) notation, consistent with the method body.

* @return string Decoded and normalized string.
*/
private function decode_string_or_url( int $start, int $length ): string {
private function decode_escapes( int $start, int $length, bool $string_escapes = false ): string {
// Fast path: check if any processing is needed.
$slice = wp_scrub_utf8( substr( $this->css, $start, $length ) );
$special_chars = "\\\r\f\x00";
Expand Down Expand Up @@ -1581,8 +1597,35 @@ private function decode_string_or_url( int $start, int $length ): string {

$char = $this->css[ $at ];

// Handle escapes (if enabled).
// Handle escapes.
if ( '\\' === $char ) {
/*
* String tokens have special escape rules per §4.3.5:
* - \-EOF: do nothing (consume the backslash, produce no value).
* - \-newline: consume both (line continuation, produce no value).
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@copilot Let's express that as 0x byte sequences with annotations what do they mean. This notation confuses me.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Updated in 96b91af — the comment now uses 0x byte sequences with annotations, e.g. 0x5C (backslash) followed by 0x0A (LF), 0x0C (FF), or 0x0D (CR).

* These must be checked before the general escape path.
*/
if ( $string_escapes ) {
if ( $at + 1 >= $end ) {
// \-EOF: consume the backslash and stop.
++$at;
continue;
}
$next = $this->css[ $at + 1 ];
if ( "\n" === $next || "\f" === $next ) {
$at += 2;
continue;
}
if ( "\r" === $next ) {
$at += 2;
// \r\n counts as one newline.
if ( $at < $end && "\n" === $this->css[ $at ] ) {
++$at;
}
continue;
}
}

if ( $this->is_valid_escape( $at ) ) {
++$at;
$decoded .= $this->decode_escape_at( $at, $bytes_consumed );
Comment on lines 1634 to 1636
Copy link

Copilot AI Apr 8, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In the decode_range() slow path, both the normal segments appended via substr($this->css, ...) earlier in the loop and the decode_escape_at() return value here are not passed through wp_scrub_utf8(). This makes output inconsistent with the fast path (which scrubs) and can leak invalid UTF-8 if the range contains escapes/CR/FF/NUL plus invalid bytes. Consider scrubbing the appended segments (including the decoded escape output) and adding a regression test that combines an invalid byte with a backslash escape.

Copilot uses AI. Check for mistakes.
Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a good spot, but not new in this PR: #228

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nice. It would be great to explore a fix+test in another pr cc @copilot

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copilot opened #229 already

Expand Down
144 changes: 142 additions & 2 deletions components/DataLiberation/Tests/CSSProcessorTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@

/**
* Comprehensive CSS processor tests based on the CSS Syntax Level 3 specification.
*
* @group css
*/
class CSSProcessorTest extends TestCase {

Expand All @@ -25,7 +27,7 @@ public function test_processor_matches_spec( string $css, array $expected_tokens
* @see https://github.com/romainmenke/css-processor-tests/
* @return array
*/
static public function corpus_provider(): array {
public static function corpus_provider(): array {
return json_decode(file_get_contents(__DIR__ . '/css-test-cases.json'), true);
}

Expand All @@ -35,7 +37,7 @@ static public function corpus_provider(): array {
* @param CSSProcessor $processor The CSS processor.
* @return array Array of tokens with type, raw, startIndex, endIndex, structured.
*/
static public function collect_tokens( CSSProcessor $processor, $keys = null ): array {
public static function collect_tokens( CSSProcessor $processor, $keys = null ): array {
$tokens = array();

while ( $processor->next_token() ) {
Expand Down Expand Up @@ -1541,4 +1543,142 @@ public function test_ident_start_codepoint_bounds_check(): void {
);
$this->assertSame( $expected_tokens, $actual_tokens );
}

/**
* Tests that backslash-newline in a string token contributes nothing to the value.
*
* CSS spec §4.3.5 consume-string-token:
* > U+005C REVERSE SOLIDUS (\)
* > Otherwise, if the next input code point is a newline, consume it.
*
* The backslash and newline are both consumed and produce no value.
*
* @see https://www.w3.org/TR/css-syntax-3/#consume-string-token
* @see https://github.com/WordPress/php-toolkit/issues/222
*
* @dataProvider data_string_backslash_newline
*/
public function test_string_backslash_newline( string $css, string $expected_value ): void {
$processor = CSSProcessor::create( $css );
$this->assertTrue( $processor->next_token() );
$this->assertSame( CSSProcessor::TOKEN_STRING, $processor->get_token_type() );
$this->assertSame( $expected_value, $processor->get_token_value() );
}

public static function data_string_backslash_newline(): array {
return array(
'backslash-LF' => array( "'str\\\ning'", 'string' ),
'backslash-FF' => array( "'str\\\fing'", 'string' ),
'backslash-CR' => array( "'str\\\ring'", 'string' ),
'backslash-CRLF' => array( "'str\\\r\ning'", 'string' ),
);
}

/**
* Tests that backslash-EOF in a string token contributes nothing to the value.
*
* CSS spec §4.3.5 consume-string-token:
* > U+005C REVERSE SOLIDUS (\)
* > If the next input code point is EOF, do nothing.
*
* The trailing backslash is consumed and produces no value.
*
* @see https://www.w3.org/TR/css-syntax-3/#consume-string-token
* @see https://github.com/WordPress/php-toolkit/issues/223
*/
public function test_string_backslash_eof(): void {
$processor = CSSProcessor::create( "'string\\" );
$this->assertTrue( $processor->next_token() );
$this->assertSame( CSSProcessor::TOKEN_STRING, $processor->get_token_type() );
$this->assertSame( 'string', $processor->get_token_value() );
}

/**
* Tests that backslash-newline in an unquoted URL produces a bad-url token.
*
* In unquoted URLs, the backslash-newline check goes through is_valid_escape()
* which returns false for newlines, triggering consume_remnants_of_bad_url().
*
* @see https://www.w3.org/TR/css-syntax-3/#consume-url-token
*
* @dataProvider data_url_backslash_newline
*/
public function test_url_backslash_newline( string $css ): void {
$processor = CSSProcessor::create( $css );

$found_bad_url = false;
while ( $processor->next_token() ) {
if ( CSSProcessor::TOKEN_BAD_URL === $processor->get_token_type() ) {
$found_bad_url = true;
break;
}
}

$this->assertTrue( $found_bad_url, 'Expected a BAD_URL token but none was found.' );
}

public static function data_url_backslash_newline(): array {
return array(
'backslash-LF' => array( "url(ab\\\ncd)" ),
'backslash-FF' => array( "url(ab\\\fcd)" ),
'backslash-CR' => array( "url(ab\\\rcd)" ),
'backslash-CRLF' => array( "url(ab\\\r\ncd)" ),
);
}

/**
* Tests that backslash-EOF in an unquoted URL produces U+FFFD in the value.
*
* In unquoted URLs, is_valid_escape() returns true for backslash-EOF,
* and consuming the escaped code point at EOF produces U+FFFD per spec.
*
* @see https://www.w3.org/TR/css-syntax-3/#consume-url-token
*/
public function test_url_backslash_eof(): void {
$processor = CSSProcessor::create( "url(string\\" );
$this->assertTrue( $processor->next_token() );
$this->assertSame( CSSProcessor::TOKEN_URL, $processor->get_token_type() );
$this->assertSame( "string\u{FFFD}", $processor->get_token_value() );
}

/**
* Tests that backslash-newline stops an ident sequence.
*
* In idents, is_valid_escape() returns false for backslash-newline,
* so the ident stops before the backslash.
*
* @see https://www.w3.org/TR/css-syntax-3/#consume-name
*
* @dataProvider data_ident_backslash_newline
*/
public function test_ident_backslash_newline( string $css ): void {
$processor = CSSProcessor::create( $css );
$this->assertTrue( $processor->next_token() );
$this->assertSame( CSSProcessor::TOKEN_IDENT, $processor->get_token_type() );
$this->assertSame( 'abc', $processor->get_token_value() );
}

public static function data_ident_backslash_newline(): array {
return array(
'backslash-LF' => array( "abc\\\n" ),
'backslash-FF' => array( "abc\\\f" ),
'backslash-CR' => array( "abc\\\r" ),
'backslash-CRLF' => array( "abc\\\r\n" ),
);
}

/**
* Tests that backslash-EOF in an ident produces U+FFFD in the value.
*
* In idents, is_valid_escape() returns true for backslash-EOF,
* and consuming the escaped code point at EOF produces U+FFFD per spec.
*
* @see https://www.w3.org/TR/css-syntax-3/#consume-name
*/
public function test_ident_backslash_eof(): void {
$processor = CSSProcessor::create( "abc\\" );
$this->assertTrue( $processor->next_token() );
$this->assertSame( CSSProcessor::TOKEN_IDENT, $processor->get_token_type() );
$this->assertSame( "abc\u{FFFD}", $processor->get_token_value() );
}
}
3 changes: 3 additions & 0 deletions components/DataLiberation/Tests/CSSUrlProcessorTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,9 @@
use PHPUnit\Framework\TestCase;
use WordPress\DataLiberation\URL\CSSURLProcessor;

/**
* @group css
*/
class CSSURLProcessorTest extends TestCase {

/**
Expand Down
Loading
Loading