WordPress · ellatrix · Apr 7, 2026 · Apr 8, 2026 · Apr 8, 2026 · Apr 8, 2026
diff --git a/packages/editor/src/components/post-revisions-preview/block-diff.js b/packages/editor/src/components/post-revisions-preview/block-diff.js
@@ -45,8 +45,19 @@ function stringifyValue( value ) {
 }
 
 /**
- * Calculate text similarity using word diff (semantically meaningful).
- * Returns ratio of unchanged words to total words.
+ * Calculate text similarity using word-set overlap.
+ *
+ * Uses a variant of the Jaccard index (https://en.wikipedia.org/wiki/Jaccard_index)
+ * called the overlap coefficient (https://en.wikipedia.org/wiki/Overlap_coefficient)
+ * where we divide by the larger set size rather than the union. This ensures that
+ * a small edit to a long paragraph scores high — the few changed words don't
+ * dilute the score.
+ *
+ * This replaces the previous diffWords-based similarity which was O(n*m) per pair.
+ * The word-set approach is O(n) where n is the number of words.
+ *
+ * Words are extracted using Intl.Segmenter for proper multilingual support
+ * (CJK, Thai, etc.) rather than splitting on whitespace.
  *
  * @param {string} text1 First text to compare.
  * @param {string} text2 Second text to compare.
@@ -60,17 +71,45 @@ function textSimilarity( text1, text2 ) {
 		return 0;
 	}
 
-	const changes = diffWords( text1, text2 );
-	const unchanged = changes
-		.filter( ( c ) => ! c.added && ! c.removed )
-		.reduce( ( sum, c ) => sum + c.value.length, 0 );
-	const total = Math.max( text1.length, text2.length );
-	return total > 0 ? unchanged / total : 0;
+	const segmenter = new Intl.Segmenter( undefined, {
+		granularity: 'word',
+	} );
+	const getWords = ( text ) =>
+		[ ...segmenter.segment( text ) ]
+			.filter( ( s ) => s.isWordLike )
+			.map( ( s ) => s.segment );
+	const words1 = getWords( text1 );
+	const words2 = getWords( text2 );
+
+	if ( words1.length === 0 && words2.length === 0 ) {
+		return 1;
+	}
+
+	const set1 = new Set( words1 );
+	let intersection = 0;
+	for ( const word of words2 ) {
+		if ( set1.has( word ) ) {
+			intersection++;
+		}
+	}
+
+	const total = Math.max( words1.length, words2.length );
+	return total > 0 ? intersection / total : 0;
 }
 
 /**
  * Post-process diff result to pair similar removed/added blocks as modifications.
- * This catches modifications that LCS missed due to content changes.
+ *
+ * After LCS diffing, a block whose content changed appears as a separate "removed"
+ * and "added" entry (since the full block signature differs). This function detects
+ * such pairs and merges them into a single "modified" block with inline diff.
+ *
+ * Two pairing strategies are used:
+ * 1. When exactly one block of a given type was removed and one was added,
+ *    they are paired directly — no ambiguity, no similarity check needed.
+ * 2. When multiple candidates exist, textSimilarity (overlap coefficient) is
+ *    used to find the best match. Blocks must share at least 50% of their
+ *    words to be paired, preventing unrelated paragraphs from being merged.
  *
  * @param {Array} blocks Raw blocks with diff status.
  * @return {Array} Blocks with similar pairs converted to modifications.
@@ -94,62 +133,139 @@ function pairSimilarBlocks( blocks ) {
 		return blocks;
 	}
 
-	const pairedRemoved = new Set(); // Indices of removed blocks that were paired.
-	const modifications = new Map(); // Map from added block index to modified block.
-	const SIMILARITY_THRESHOLD = 0.3;
+	const pairedRemoved = new Set(); // Indices of removed blocks filtered out.
+	const pairedAdded = new Set(); // Indices of added blocks filtered out.
+	const modifications = new Map(); // Index → modified block.
+	const SIMILARITY_THRESHOLD = 0.5;
+
+	// Group candidates by block name for efficient lookup.
+	const addedByName = new Map();
+	for ( const add of added ) {
+		const name = add.block.blockName;
+		if ( ! addedByName.has( name ) ) {
+			addedByName.set( name, [] );
+		}
+		addedByName.get( name ).push( add );
+	}
+	const removedByName = new Map();
+	for ( const rem of removed ) {
+		const name = rem.block.blockName;
+		if ( ! removedByName.has( name ) ) {
+			removedByName.set( name, [] );
+		}
+		removedByName.get( name ).push( rem );
+	}
 
 	// For each removed block, find best matching added block.
+	// Track the highest added index paired so far — new pairings must
+	// not go backwards, or the removed/added text order would break.
+	let maxPairedAddedIndex = -1;
+
 	for ( const rem of removed ) {
-		let bestMatch = null;
-		let bestScore = 0;
+		const candidates = addedByName.get( rem.block.blockName ) || [];
+		const sameNameRemoved = removedByName.get( rem.block.blockName ) || [];
+		const unpaired = candidates.filter(
+			( add ) =>
+				! modifications.has( add.index ) &&
+				add.index > maxPairedAddedIndex
+		);
 
-		for ( const add of added ) {
-			if ( modifications.has( add.index ) ) {
-				continue;
-			}
-			if ( add.block.blockName !== rem.block.blockName ) {
-				continue;
-			}
+		if ( unpaired.length === 0 ) {
+			continue;
+		}
 
-			const score = textSimilarity(
-				rem.block.innerHTML || '',
-				add.block.innerHTML || ''
-			);
-			// If content is identical (score=1), only pair if attrs differ.
-			// Otherwise identical blocks are just position swaps, not modifications.
+		let bestMatch = null;
+
+		// If there's exactly one removed and one added of this type,
+		// pair them directly — no ambiguity, no similarity check needed.
+		if ( sameNameRemoved.length === 1 && unpaired.length === 1 ) {
+			const add = unpaired[ 0 ];
 			const attrsMatch =
 				JSON.stringify( rem.block.attrs ) ===
 				JSON.stringify( add.block.attrs );
-			if (
-				score > bestScore &&
-				score > SIMILARITY_THRESHOLD &&
-				( score < 1 || ! attrsMatch )
-			) {
-				bestScore = score;
+			// Only skip pairing if both content and attrs are identical
+			// (position swap, not a modification).
+			const contentMatch =
+				( rem.block.innerHTML || '' ) === ( add.block.innerHTML || '' );
+			if ( ! contentMatch || ! attrsMatch ) {
 				bestMatch = add;
 			}
+		} else {
+			// Multiple candidates — use similarity to find best match.
+			let bestScore = 0;
+			for ( const add of unpaired ) {
+				const score = textSimilarity(
+					rem.block.innerHTML || '',
+					add.block.innerHTML || ''
+				);
+				// Skip identical blocks (score=1 with same attrs) — those
+				// are position swaps, not modifications. They should show
+				// as separate removed + added, not as a no-op "modified".
+				const attrsMatch =
+					JSON.stringify( rem.block.attrs ) ===
+					JSON.stringify( add.block.attrs );
+				if (
+					score > bestScore &&
+					score > SIMILARITY_THRESHOLD &&
+					( score < 1 || ! attrsMatch )
+				) {
+					bestScore = score;
+					bestMatch = add;
+				}
+			}
 		}
 
 		if ( bestMatch ) {
-			pairedRemoved.add( rem.index );
+			maxPairedAddedIndex = bestMatch.index;
 
-			// Create modified block with previous content stored.
-			modifications.set( bestMatch.index, {
+			const modifiedBlock = {
 				...bestMatch.block,
 				__revisionDiffStatus: { status: 'modified' },
 				__previousRawBlock: rem.block,
-			} );
+			};
+
+			// Decide where to place the modified block by checking
+			// what's between the removed and added positions.
+			// If there are unpaired added blocks between them,
+			// placing at the removed position would put the modified
+			// block before content that comes before it in the
+			// current revision — so use the added position.
+			// Otherwise, use the removed position to keep the
+			// previous revision's order intact.
+			const lo = Math.min( rem.index, bestMatch.index );
+			const hi = Math.max( rem.index, bestMatch.index );
+			let hasAddedBetween = false;
+			for ( let i = lo + 1; i < hi; i++ ) {
+				if (
+					blocks[ i ].__revisionDiffStatus?.status === 'added' &&
+					! pairedAdded.has( i )
+				) {
+					hasAddedBetween = true;
+					break;
+				}
+			}
+
+			if ( hasAddedBetween ) {
+				// Use the added position — don't jump before
+				// current-revision content.
+				modifications.set( bestMatch.index, modifiedBlock );
+				pairedRemoved.add( rem.index );
+			} else {
+				// Use the removed position — keep the previous
+				// revision's reading order.
+				modifications.set( rem.index, modifiedBlock );
+				pairedAdded.add( bestMatch.index );
+			}
 		}
 	}
 
-	// Rebuild result: filter out paired removed, replace paired added with modified.
+	// Rebuild result: replace modification targets, filter out
+	// their paired counterparts.
 	return blocks
 		.map( ( block, index ) => {
-			// Skip paired removed blocks.
-			if ( pairedRemoved.has( index ) ) {
+			if ( pairedRemoved.has( index ) || pairedAdded.has( index ) ) {
 				return null;
 			}
-			// Replace paired added blocks with modified version.
 			if ( modifications.has( index ) ) {
 				return modifications.get( index );
 			}