WordPress · Copilot · Apr 8, 2026 · Apr 8, 2026 · Apr 8, 2026 · adamziel
diff --git a/components/DataLiberation/CSS/class-cssprocessor.php b/components/DataLiberation/CSS/class-cssprocessor.php
@@ -1571,7 +1571,7 @@ private function decode_string_or_url( int $start, int $length ): string {
 			if ( $normal_len > 0 ) {
 				// Clamp to not exceed the end boundary.
 				$normal_len = min( $normal_len, $end - $at );
-				$decoded   .= substr( $this->css, $at, $normal_len );
+				$decoded   .= wp_scrub_utf8( substr( $this->css, $at, $normal_len ) );
 				$at        += $normal_len;
 			}
 
@@ -1585,7 +1585,7 @@ private function decode_string_or_url( int $start, int $length ): string {
 			if ( '\\' === $char ) {
 				if ( $this->is_valid_escape( $at ) ) {
 					++$at;
-					$decoded .= $this->decode_escape_at( $at, $bytes_consumed );
+					$decoded .= wp_scrub_utf8( $this->decode_escape_at( $at, $bytes_consumed ) );
 					$at      += $bytes_consumed;
 					continue;
 				}

diff --git a/components/DataLiberation/Tests/CSSProcessorTest.php b/components/DataLiberation/Tests/CSSProcessorTest.php
@@ -146,6 +146,68 @@ public function test_invalid_utf8_with_two_single_byte_invalid_sequences(): void
 		$this->assertSame( $expected, $actual_tokens );
 	}
 
+	/**
+	 * In the slow path of decode_string_or_url() (triggered by a backslash escape), normal
+	 * text segments must still have invalid UTF-8 bytes replaced with U+FFFD, just
+	 * as the fast path does via wp_scrub_utf8().
+	 */
+	public function test_invalid_utf8_in_normal_segment_combined_with_escape(): void {
+		// The ident token contains an invalid UTF-8 byte (0xF1) in the "normal"
+		// segment before a CSS hex escape (\41 = U+0041 = 'A'; the following 'n'
+		// is not a hex digit, so it ends the escape). The backslash triggers the
+		// slow path, which previously skipped wp_scrub_utf8() on the normal segment.
+		$css = ".test\xF1\\41name";
+
+		$expected = array(
+			array(
+				'type'  => CSSProcessor::TOKEN_DELIM,
+				'raw'   => '.',
+				'value' => '.',
+			),
+			array(
+				'type'  => CSSProcessor::TOKEN_IDENT,
+				// raw keeps the original bytes: the invalid 0xF1 and the undecoded escape.
+				'raw'   => "test\xF1\\41name",
+				// value must have 0xF1 replaced with U+FFFD and \41 decoded to 'A'.
+				'value' => "test\u{FFFD}Aname",
+			),
+		);
+
+		$processor = CSSProcessor::create( $css );
+		$actual_tokens = $this->collect_tokens( $processor, array( 'type', 'raw', 'value' ) );
+		$this->assertSame( $expected, $actual_tokens );
+	}
+
+	/**
+	 * When an invalid UTF-8 byte is the character directly after a backslash
+	 * (i.e. it is the escaped character itself), decode_escape_at() returns the
+	 * raw byte. The caller must scrub it to U+FFFD.
+	 */
+	public function test_invalid_utf8_as_escaped_character(): void {
+		// The input bytes are `.`, `a`, `\`, 0xF1, `b`: a delim followed by an ident.
+		// The backslash directly before the invalid byte makes 0xF1 the escaped
+		// character, so the ident value is `a`, the scrubbed escape, then `b`.
+		$css = ".a\\\xF1b";
+
+		$expected = array(
+			array(
+				'type'  => CSSProcessor::TOKEN_DELIM,
+				'raw'   => '.',
+				'value' => '.',
+			),
+			array(
+				'type'  => CSSProcessor::TOKEN_IDENT,
+				'raw'   => "a\\\xF1b",
+				// The escaped 0xF1 must be replaced with U+FFFD.
+				'value' => "a\u{FFFD}b",
+			),
+		);
+
+		$processor = CSSProcessor::create( $css );
+		$actual_tokens = $this->collect_tokens( $processor, array( 'type', 'raw', 'value' ) );
+		$this->assertSame( $expected, $actual_tokens );
+	}
+
 	/**
 	 * Legacy test to ensure basic tokenization still works.
 	 */