diff --git a/packages/editor/src/components/post-revisions-preview/block-diff.js b/packages/editor/src/components/post-revisions-preview/block-diff.js index c0d873f807e695..50598d18efdd70 100644 --- a/packages/editor/src/components/post-revisions-preview/block-diff.js +++ b/packages/editor/src/components/post-revisions-preview/block-diff.js @@ -45,8 +45,19 @@ function stringifyValue( value ) { } /** - * Calculate text similarity using word diff (semantically meaningful). - * Returns ratio of unchanged words to total words. + * Calculate text similarity using word-set overlap. + * + * Uses a variant of the Jaccard index (https://en.wikipedia.org/wiki/Jaccard_index) + * related to the overlap coefficient (https://en.wikipedia.org/wiki/Overlap_coefficient), + * where we divide by the larger word count rather than the union (the overlap coefficient proper divides by the smaller set). This ensures that + * a small edit to a long paragraph scores high — the few changed words don't + * dilute the score. + * + * This replaces the previous diffWords-based similarity which was O(n*m) per pair. + * The word-set approach is O(n) where n is the number of words. + * + * Words are extracted using Intl.Segmenter for proper multilingual support + * (CJK, Thai, etc.) rather than splitting on whitespace. * * @param {string} text1 First text to compare. * @param {string} text2 Second text to compare. @@ -60,17 +71,45 @@ function textSimilarity( text1, text2 ) { return 0; } - const changes = diffWords( text1, text2 ); - const unchanged = changes - .filter( ( c ) => ! c.added && ! c.removed ) - .reduce( ( sum, c ) => sum + c.value.length, 0 ); - const total = Math.max( text1.length, text2.length ); - return total > 0 ? 
unchanged / total : 0; + const segmenter = new Intl.Segmenter( undefined, { + granularity: 'word', + } ); + const getWords = ( text ) => + [ ...segmenter.segment( text ) ] + .filter( ( s ) => s.isWordLike ) + .map( ( s ) => s.segment ); + const words1 = getWords( text1 ); + const words2 = getWords( text2 ); + + if ( words1.length === 0 && words2.length === 0 ) { + return 1; + } + + const set1 = new Set( words1 ); + let intersection = 0; + for ( const word of words2 ) { + if ( set1.has( word ) ) { + intersection++; + } + } + + const total = Math.max( words1.length, words2.length ); + return total > 0 ? intersection / total : 0; } /** * Post-process diff result to pair similar removed/added blocks as modifications. - * This catches modifications that LCS missed due to content changes. + * + * After LCS diffing, a block whose content changed appears as a separate "removed" + * and "added" entry (since the full block signature differs). This function detects + * such pairs and merges them into a single "modified" block with inline diff. + * + * Two pairing strategies are used: + * 1. When exactly one block of a given type was removed and one was added, + * they are paired directly — no ambiguity, no similarity check needed. + * 2. When multiple candidates exist, textSimilarity (overlap coefficient) is + * used to find the best match. Blocks must share at least 50% of their + * words to be paired, preventing unrelated paragraphs from being merged. * * @param {Array} blocks Raw blocks with diff status. * @return {Array} Blocks with similar pairs converted to modifications. @@ -96,38 +135,77 @@ function pairSimilarBlocks( blocks ) { const pairedRemoved = new Set(); // Indices of removed blocks that were paired. const modifications = new Map(); // Map from added block index to modified block. - const SIMILARITY_THRESHOLD = 0.3; + const SIMILARITY_THRESHOLD = 0.5; + + // Group candidates by block name for efficient lookup. 
+ const addedByName = new Map(); + for ( const add of added ) { + const name = add.block.blockName; + if ( ! addedByName.has( name ) ) { + addedByName.set( name, [] ); + } + addedByName.get( name ).push( add ); + } + const removedByName = new Map(); + for ( const rem of removed ) { + const name = rem.block.blockName; + if ( ! removedByName.has( name ) ) { + removedByName.set( name, [] ); + } + removedByName.get( name ).push( rem ); + } // For each removed block, find best matching added block. for ( const rem of removed ) { - let bestMatch = null; - let bestScore = 0; + const candidates = addedByName.get( rem.block.blockName ) || []; + const sameNameRemoved = removedByName.get( rem.block.blockName ) || []; + const unpaired = candidates.filter( + ( add ) => ! modifications.has( add.index ) + ); - for ( const add of added ) { - if ( modifications.has( add.index ) ) { - continue; - } - if ( add.block.blockName !== rem.block.blockName ) { - continue; - } + if ( unpaired.length === 0 ) { + continue; + } - const score = textSimilarity( - rem.block.innerHTML || '', - add.block.innerHTML || '' - ); - // If content is identical (score=1), only pair if attrs differ. - // Otherwise identical blocks are just position swaps, not modifications. + let bestMatch = null; + + // If there's exactly one removed and one added of this type, + // pair them directly — no ambiguity, no similarity check needed. + if ( sameNameRemoved.length === 1 && unpaired.length === 1 ) { + const add = unpaired[ 0 ]; const attrsMatch = JSON.stringify( rem.block.attrs ) === JSON.stringify( add.block.attrs ); - if ( - score > bestScore && - score > SIMILARITY_THRESHOLD && - ( score < 1 || ! attrsMatch ) - ) { - bestScore = score; + // Only skip pairing if both content and attrs are identical + // (position swap, not a modification). + const contentMatch = + ( rem.block.innerHTML || '' ) === ( add.block.innerHTML || '' ); + if ( ! contentMatch || ! 
attrsMatch ) { bestMatch = add; } + } else { + // Multiple candidates — use similarity to find best match. + let bestScore = 0; + for ( const add of unpaired ) { + const score = textSimilarity( + rem.block.innerHTML || '', + add.block.innerHTML || '' + ); + // Skip identical blocks (score=1 with same attrs) — those + // are position swaps, not modifications. They should show + // as separate removed + added, not as a no-op "modified". + const attrsMatch = + JSON.stringify( rem.block.attrs ) === + JSON.stringify( add.block.attrs ); + if ( + score > bestScore && + score > SIMILARITY_THRESHOLD && + ( score < 1 || ! attrsMatch ) + ) { + bestScore = score; + bestMatch = add; + } + } } if ( bestMatch ) { diff --git a/packages/editor/src/components/post-revisions-preview/test/block-diff.js b/packages/editor/src/components/post-revisions-preview/test/block-diff.js index c28ec690a21382..72a91596975fa5 100644 --- a/packages/editor/src/components/post-revisions-preview/test/block-diff.js +++ b/packages/editor/src/components/post-revisions-preview/test/block-diff.js @@ -712,6 +712,65 @@ describe( 'diffRevisionContent', () => { ] ); } ); + it( 'handles multiple inner block changes at once (similar content)', () => { + const previous = serialize( [ + createBlock( 'core/group', {}, [ + createBlock( 'core/paragraph', { content: 'A' } ), + createBlock( 'core/paragraph', { + content: 'The quick brown fox jumps over the lazy dog', + } ), + createBlock( 'core/paragraph', { content: 'C' } ), + ] ), + ] ); + const current = serialize( [ + createBlock( 'core/group', {}, [ + createBlock( 'core/paragraph', { content: 'A' } ), + createBlock( 'core/paragraph', { + content: 'The quick brown fox leaps over the lazy dog', + } ), + ] ), + ] ); + const blocks = diffRevisionContent( current, previous ); + + // Post-LCS pairing matches the fox sentences (high word overlap). + // C remains removed since it has no matching added block. 
+ expect( normalizeBlockTree( blocks ) ).toMatchObject( [ + { + name: 'core/group', + attributes: { + __revisionDiffStatus: undefined, + }, + innerBlocks: [ + { + name: 'core/paragraph', + attributes: { + content: 'A', + __revisionDiffStatus: undefined, + }, + }, + { + name: 'core/paragraph', + attributes: { + content: 'C', + __revisionDiffStatus: { status: 'removed' }, + }, + }, + { + name: 'core/paragraph', + attributes: { + // jumps→leaps modification with inline diff + content: + 'The quick brown fox jumpsleaps over the lazy dog', + __revisionDiffStatus: { + status: 'modified', + }, + }, + }, + ], + }, + ] ); + } ); + it( 'does not pair blocks with completely different content', () => { const previous = serialize( [ createBlock( 'core/group', {}, [