diff --git a/packages/editor/src/components/post-revisions-preview/block-diff.js b/packages/editor/src/components/post-revisions-preview/block-diff.js
index c0d873f807e695..50598d18efdd70 100644
--- a/packages/editor/src/components/post-revisions-preview/block-diff.js
+++ b/packages/editor/src/components/post-revisions-preview/block-diff.js
@@ -45,8 +45,19 @@ function stringifyValue( value ) {
}
/**
- * Calculate text similarity using word diff (semantically meaningful).
- * Returns ratio of unchanged words to total words.
+ * Calculate text similarity using word-set overlap.
+ *
+ * Uses a variant of the Jaccard index (https://en.wikipedia.org/wiki/Jaccard_index)
+ * related to the overlap coefficient (https://en.wikipedia.org/wiki/Overlap_coefficient):
+ * the shared-word count is divided by the larger word count rather than by the union
+ * (Jaccard) or the smaller set (overlap coefficient). This ensures that a small edit
+ * to a long paragraph scores high — the few changed words don't dilute the score.
+ *
+ * This replaces the previous diffWords-based similarity which was O(n*m) per pair.
+ * The word-set approach is O(n) where n is the number of words.
+ *
+ * Words are extracted using Intl.Segmenter for proper multilingual support
+ * (CJK, Thai, etc.) rather than splitting on whitespace.
*
* @param {string} text1 First text to compare.
* @param {string} text2 Second text to compare.
@@ -60,17 +71,45 @@ function textSimilarity( text1, text2 ) {
return 0;
}
- const changes = diffWords( text1, text2 );
- const unchanged = changes
- .filter( ( c ) => ! c.added && ! c.removed )
- .reduce( ( sum, c ) => sum + c.value.length, 0 );
- const total = Math.max( text1.length, text2.length );
- return total > 0 ? unchanged / total : 0;
+ const segmenter = new Intl.Segmenter( undefined, {
+ granularity: 'word',
+ } );
+ const getWords = ( text ) =>
+ [ ...segmenter.segment( text ) ]
+ .filter( ( s ) => s.isWordLike )
+ .map( ( s ) => s.segment );
+ const words1 = getWords( text1 );
+ const words2 = getWords( text2 );
+
+ if ( words1.length === 0 && words2.length === 0 ) {
+ return 1;
+ }
+
+ const set1 = new Set( words1 );
+ let intersection = 0;
+ for ( const word of words2 ) {
+ if ( set1.has( word ) ) {
+ intersection++;
+ }
+ }
+
+ const total = Math.max( words1.length, words2.length );
+ return total > 0 ? intersection / total : 0;
}
/**
* Post-process diff result to pair similar removed/added blocks as modifications.
- * This catches modifications that LCS missed due to content changes.
+ *
+ * After LCS diffing, a block whose content changed appears as a separate "removed"
+ * and "added" entry (since the full block signature differs). This function detects
+ * such pairs and merges them into a single "modified" block with inline diff.
+ *
+ * Two pairing strategies are used:
+ * 1. When exactly one block of a given type was removed and one was added,
+ * they are paired directly — no ambiguity, no similarity check needed.
+ * 2. When multiple candidates exist, textSimilarity (word overlap score) is
+ * used to find the best match. Blocks must share more than 50% of their
+ * words to be paired, preventing unrelated paragraphs from being merged.
*
* @param {Array} blocks Raw blocks with diff status.
* @return {Array} Blocks with similar pairs converted to modifications.
@@ -96,38 +135,77 @@ function pairSimilarBlocks( blocks ) {
const pairedRemoved = new Set(); // Indices of removed blocks that were paired.
const modifications = new Map(); // Map from added block index to modified block.
- const SIMILARITY_THRESHOLD = 0.3;
+ const SIMILARITY_THRESHOLD = 0.5;
+
+ // Group candidates by block name for efficient lookup.
+ const addedByName = new Map();
+ for ( const add of added ) {
+ const name = add.block.blockName;
+ if ( ! addedByName.has( name ) ) {
+ addedByName.set( name, [] );
+ }
+ addedByName.get( name ).push( add );
+ }
+ const removedByName = new Map();
+ for ( const rem of removed ) {
+ const name = rem.block.blockName;
+ if ( ! removedByName.has( name ) ) {
+ removedByName.set( name, [] );
+ }
+ removedByName.get( name ).push( rem );
+ }
// For each removed block, find best matching added block.
for ( const rem of removed ) {
- let bestMatch = null;
- let bestScore = 0;
+ const candidates = addedByName.get( rem.block.blockName ) || [];
+ const sameNameRemoved = removedByName.get( rem.block.blockName ) || [];
+ const unpaired = candidates.filter(
+ ( add ) => ! modifications.has( add.index )
+ );
- for ( const add of added ) {
- if ( modifications.has( add.index ) ) {
- continue;
- }
- if ( add.block.blockName !== rem.block.blockName ) {
- continue;
- }
+ if ( unpaired.length === 0 ) {
+ continue;
+ }
- const score = textSimilarity(
- rem.block.innerHTML || '',
- add.block.innerHTML || ''
- );
- // If content is identical (score=1), only pair if attrs differ.
- // Otherwise identical blocks are just position swaps, not modifications.
+ let bestMatch = null;
+
+ // If there's exactly one removed and one added of this type,
+ // pair them directly — no ambiguity, no similarity check needed.
+ if ( sameNameRemoved.length === 1 && unpaired.length === 1 ) {
+ const add = unpaired[ 0 ];
const attrsMatch =
JSON.stringify( rem.block.attrs ) ===
JSON.stringify( add.block.attrs );
- if (
- score > bestScore &&
- score > SIMILARITY_THRESHOLD &&
- ( score < 1 || ! attrsMatch )
- ) {
- bestScore = score;
+ // Only skip pairing if both content and attrs are identical
+ // (position swap, not a modification).
+ const contentMatch =
+ ( rem.block.innerHTML || '' ) === ( add.block.innerHTML || '' );
+ if ( ! contentMatch || ! attrsMatch ) {
bestMatch = add;
}
+ } else {
+ // Multiple candidates — use similarity to find best match.
+ let bestScore = 0;
+ for ( const add of unpaired ) {
+ const score = textSimilarity(
+ rem.block.innerHTML || '',
+ add.block.innerHTML || ''
+ );
+ // Skip identical blocks (score=1 with same attrs) — those
+ // are position swaps, not modifications. They should show
+ // as separate removed + added, not as a no-op "modified".
+ const attrsMatch =
+ JSON.stringify( rem.block.attrs ) ===
+ JSON.stringify( add.block.attrs );
+ if (
+ score > bestScore &&
+ score > SIMILARITY_THRESHOLD &&
+ ( score < 1 || ! attrsMatch )
+ ) {
+ bestScore = score;
+ bestMatch = add;
+ }
+ }
}
if ( bestMatch ) {
diff --git a/packages/editor/src/components/post-revisions-preview/test/block-diff.js b/packages/editor/src/components/post-revisions-preview/test/block-diff.js
index c28ec690a21382..72a91596975fa5 100644
--- a/packages/editor/src/components/post-revisions-preview/test/block-diff.js
+++ b/packages/editor/src/components/post-revisions-preview/test/block-diff.js
@@ -712,6 +712,65 @@ describe( 'diffRevisionContent', () => {
] );
} );
+ it( 'handles multiple inner block changes at once (similar content)', () => {
+ const previous = serialize( [
+ createBlock( 'core/group', {}, [
+ createBlock( 'core/paragraph', { content: 'A' } ),
+ createBlock( 'core/paragraph', {
+ content: 'The quick brown fox jumps over the lazy dog',
+ } ),
+ createBlock( 'core/paragraph', { content: 'C' } ),
+ ] ),
+ ] );
+ const current = serialize( [
+ createBlock( 'core/group', {}, [
+ createBlock( 'core/paragraph', { content: 'A' } ),
+ createBlock( 'core/paragraph', {
+ content: 'The quick brown fox leaps over the lazy dog',
+ } ),
+ ] ),
+ ] );
+ const blocks = diffRevisionContent( current, previous );
+
+ // Post-LCS pairing matches the fox sentences (high word overlap).
+ // C remains removed since it has no matching added block.
+ expect( normalizeBlockTree( blocks ) ).toMatchObject( [
+ {
+ name: 'core/group',
+ attributes: {
+ __revisionDiffStatus: undefined,
+ },
+ innerBlocks: [
+ {
+ name: 'core/paragraph',
+ attributes: {
+ content: 'A',
+ __revisionDiffStatus: undefined,
+ },
+ },
+ {
+ name: 'core/paragraph',
+ attributes: {
+ content: 'C',
+ __revisionDiffStatus: { status: 'removed' },
+ },
+ },
+ {
+ name: 'core/paragraph',
+ attributes: {
+ // jumps→leaps modification with inline diff
+ content:
+ 'The quick brown fox jumpsleaps over the lazy dog',
+ __revisionDiffStatus: {
+ status: 'modified',
+ },
+ },
+ },
+ ],
+ },
+ ] );
+ } );
+
it( 'does not pair blocks with completely different content', () => {
const previous = serialize( [
createBlock( 'core/group', {}, [