WordPress · ellatrix · Apr 7, 2026 · Apr 8, 2026 · Apr 8, 2026 · Apr 8, 2026
diff --git a/packages/editor/src/components/post-revisions-preview/block-diff.js b/packages/editor/src/components/post-revisions-preview/block-diff.js
@@ -45,8 +45,19 @@ function stringifyValue( value ) {
 }
 
 /**
- * Calculate text similarity using word diff (semantically meaningful).
- * Returns ratio of unchanged words to total words.
+ * Calculate text similarity using word-set overlap.
+ *
+ * Uses a variant of the Jaccard index (https://en.wikipedia.org/wiki/Jaccard_index)
+ * related to the overlap coefficient (https://en.wikipedia.org/wiki/Overlap_coefficient):
+ * the shared-word count is divided by the larger word count rather than by the union
+ * (Jaccard) or the smaller set (overlap coefficient). This ensures that a small edit
+ * to a long paragraph scores high — the few changed words don't dilute the score.
+ *
+ * This replaces the previous diffWords-based similarity which was O(n*m) per pair.
+ * The word-set approach is O(n) where n is the number of words.
+ *
+ * Words are extracted using Intl.Segmenter for proper multilingual support
+ * (CJK, Thai, etc.) rather than splitting on whitespace.
  *
  * @param {string} text1 First text to compare.
  * @param {string} text2 Second text to compare.
@@ -60,17 +71,45 @@ function textSimilarity( text1, text2 ) {
 		return 0;
 	}
 
-	const changes = diffWords( text1, text2 );
-	const unchanged = changes
-		.filter( ( c ) => ! c.added && ! c.removed )
-		.reduce( ( sum, c ) => sum + c.value.length, 0 );
-	const total = Math.max( text1.length, text2.length );
-	return total > 0 ? unchanged / total : 0;
+	const segmenter = new Intl.Segmenter( undefined, {
+		granularity: 'word',
+	} );
+	const getWords = ( text ) =>
+		[ ...segmenter.segment( text ) ]
+			.filter( ( s ) => s.isWordLike )
+			.map( ( s ) => s.segment );
+	const words1 = getWords( text1 );
+	const words2 = getWords( text2 );
+
+	if ( words1.length === 0 && words2.length === 0 ) {
+		return 1;
+	}
+
+	const set1 = new Set( words1 );
+	let intersection = 0;
+	for ( const word of words2 ) {
+		if ( set1.has( word ) ) {
+			intersection++;
+		}
+	}
+
+	const total = Math.max( words1.length, words2.length );
+	return total > 0 ? intersection / total : 0;
 }
 
 /**
  * Post-process diff result to pair similar removed/added blocks as modifications.
- * This catches modifications that LCS missed due to content changes.
+ *
+ * After LCS diffing, a block whose content changed appears as a separate "removed"
+ * and "added" entry (since the full block signature differs). This function detects
+ * such pairs and merges them into a single "modified" block with inline diff.
+ *
+ * Two pairing strategies are used:
+ * 1. When exactly one block of a given type was removed and one was added,
+ *    they are paired directly — no ambiguity, no similarity check needed.
+ * 2. When multiple candidates exist, textSimilarity (word overlap) is used
+ *    to find the best match. The score must exceed 0.5 (shared words relative
+ *    to the longer text), preventing unrelated paragraphs from being merged.
  *
  * @param {Array} blocks Raw blocks with diff status.
  * @return {Array} Blocks with similar pairs converted to modifications.
@@ -96,38 +135,77 @@ function pairSimilarBlocks( blocks ) {
 
 	const pairedRemoved = new Set(); // Indices of removed blocks that were paired.
 	const modifications = new Map(); // Map from added block index to modified block.
-	const SIMILARITY_THRESHOLD = 0.3;
+	const SIMILARITY_THRESHOLD = 0.5;
+
+	// Group candidates by block name for efficient lookup.
+	const addedByName = new Map();
+	for ( const add of added ) {
+		const name = add.block.blockName;
+		if ( ! addedByName.has( name ) ) {
+			addedByName.set( name, [] );
+		}
+		addedByName.get( name ).push( add );
+	}
+	const removedByName = new Map();
+	for ( const rem of removed ) {
+		const name = rem.block.blockName;
+		if ( ! removedByName.has( name ) ) {
+			removedByName.set( name, [] );
+		}
+		removedByName.get( name ).push( rem );
+	}
 
 	// For each removed block, find best matching added block.
 	for ( const rem of removed ) {
-		let bestMatch = null;
-		let bestScore = 0;
+		const candidates = addedByName.get( rem.block.blockName ) || [];
+		const sameNameRemoved = removedByName.get( rem.block.blockName ) || [];
+		const unpaired = candidates.filter(
+			( add ) => ! modifications.has( add.index )
+		);
 
-		for ( const add of added ) {
-			if ( modifications.has( add.index ) ) {
-				continue;
-			}
-			if ( add.block.blockName !== rem.block.blockName ) {
-				continue;
-			}
+		if ( unpaired.length === 0 ) {
+			continue;
+		}
 
-			const score = textSimilarity(
-				rem.block.innerHTML || '',
-				add.block.innerHTML || ''
-			);
-			// If content is identical (score=1), only pair if attrs differ.
-			// Otherwise identical blocks are just position swaps, not modifications.
+		let bestMatch = null;
+
+		// If there's exactly one removed and one added of this type,
+		// pair them directly — no ambiguity, no similarity check needed.
+		if ( sameNameRemoved.length === 1 && unpaired.length === 1 ) {
+			const add = unpaired[ 0 ];
 			const attrsMatch =
 				JSON.stringify( rem.block.attrs ) ===
 				JSON.stringify( add.block.attrs );
-			if (
-				score > bestScore &&
-				score > SIMILARITY_THRESHOLD &&
-				( score < 1 || ! attrsMatch )
-			) {
-				bestScore = score;
+			// Only skip pairing if both content and attrs are identical
+			// (position swap, not a modification).
+			const contentMatch =
+				( rem.block.innerHTML || '' ) === ( add.block.innerHTML || '' );
+			if ( ! contentMatch || ! attrsMatch ) {
 				bestMatch = add;
 			}
+		} else {
+			// Multiple candidates — use similarity to find best match.
+			let bestScore = 0;
+			for ( const add of unpaired ) {
+				const score = textSimilarity(
+					rem.block.innerHTML || '',
+					add.block.innerHTML || ''
+				);
+				// Skip identical blocks (score=1 with same attrs) — those
+				// are position swaps, not modifications. They should show
+				// as separate removed + added, not as a no-op "modified".
+				const attrsMatch =
+					JSON.stringify( rem.block.attrs ) ===
+					JSON.stringify( add.block.attrs );
+				if (
+					score > bestScore &&
+					score > SIMILARITY_THRESHOLD &&
+					( score < 1 || ! attrsMatch )
+				) {
+					bestScore = score;
+					bestMatch = add;
+				}
+			}
 		}
 
 		if ( bestMatch ) {

diff --git a/packages/editor/src/components/post-revisions-preview/test/block-diff.js b/packages/editor/src/components/post-revisions-preview/test/block-diff.js
@@ -712,6 +712,65 @@ describe( 'diffRevisionContent', () => {
 			] );
 		} );
 
+		it( 'handles multiple inner block changes at once (similar content)', () => {
+			const previous = serialize( [
+				createBlock( 'core/group', {}, [
+					createBlock( 'core/paragraph', { content: 'A' } ),
+					createBlock( 'core/paragraph', {
+						content: 'The quick brown fox jumps over the lazy dog',
+					} ),
+					createBlock( 'core/paragraph', { content: 'C' } ),
+				] ),
+			] );
+			const current = serialize( [
+				createBlock( 'core/group', {}, [
+					createBlock( 'core/paragraph', { content: 'A' } ),
+					createBlock( 'core/paragraph', {
+						content: 'The quick brown fox leaps over the lazy dog',
+					} ),
+				] ),
+			] );
+			const blocks = diffRevisionContent( current, previous );
+
+			// Post-LCS pairing matches the fox sentences (high word overlap).
+			// C remains removed since it has no matching added block.
+			expect( normalizeBlockTree( blocks ) ).toMatchObject( [
+				{
+					name: 'core/group',
+					attributes: {
+						__revisionDiffStatus: undefined,
+					},
+					innerBlocks: [
+						{
+							name: 'core/paragraph',
+							attributes: {
+								content: 'A',
+								__revisionDiffStatus: undefined,
+							},
+						},
+						{
+							name: 'core/paragraph',
+							attributes: {
+								content: 'C',
+								__revisionDiffStatus: { status: 'removed' },
+							},
+						},
+						{
+							name: 'core/paragraph',
+							attributes: {
+								// jumps→leaps modification with inline diff
+								content:
+									'The quick brown fox <del title="Removed" class="revision-diff-removed">jumps</del><ins title="Added" class="revision-diff-added">leaps</ins> over the lazy dog',
+								__revisionDiffStatus: {
+									status: 'modified',
+								},
+							},
+						},
+					],
+				},
+			] );
+		} );
+
 		it( 'does not pair blocks with completely different content', () => {
 			const previous = serialize( [
 				createBlock( 'core/group', {}, [