Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
140 changes: 109 additions & 31 deletions packages/editor/src/components/post-revisions-preview/block-diff.js
Original file line number Diff line number Diff line change
Expand Up @@ -45,8 +45,19 @@ function stringifyValue( value ) {
}

/**
* Calculate text similarity using word diff (semantically meaningful).
* Returns ratio of unchanged words to total words.
* Calculate text similarity using word-set overlap.
*
* Uses a measure related to the Jaccard index (https://en.wikipedia.org/wiki/Jaccard_index)
* and the overlap coefficient (https://en.wikipedia.org/wiki/Overlap_coefficient).
* Unlike the Jaccard index (which divides by the size of the union) and the
* overlap coefficient (which divides by the smaller set size), the shared-word
* count is divided by the larger of the two word counts. This ensures that
* a small edit to a long paragraph scores high — the few changed words don't
* dilute the score.
*
* This replaces the previous diffWords-based similarity which was O(n*m) per pair.
* The word-set approach is O(n) where n is the number of words.
*
* Words are extracted using Intl.Segmenter for proper multilingual support
* (CJK, Thai, etc.) rather than splitting on whitespace.
*
* @param {string} text1 First text to compare.
* @param {string} text2 Second text to compare.
Expand All @@ -60,17 +71,45 @@ function textSimilarity( text1, text2 ) {
return 0;
}

const changes = diffWords( text1, text2 );
const unchanged = changes
.filter( ( c ) => ! c.added && ! c.removed )
.reduce( ( sum, c ) => sum + c.value.length, 0 );
const total = Math.max( text1.length, text2.length );
return total > 0 ? unchanged / total : 0;
const segmenter = new Intl.Segmenter( undefined, {
granularity: 'word',
} );
const getWords = ( text ) =>
[ ...segmenter.segment( text ) ]
.filter( ( s ) => s.isWordLike )
.map( ( s ) => s.segment );
const words1 = getWords( text1 );
const words2 = getWords( text2 );

if ( words1.length === 0 && words2.length === 0 ) {
return 1;
}

const set1 = new Set( words1 );
let intersection = 0;
for ( const word of words2 ) {
if ( set1.has( word ) ) {
intersection++;
}
}

const total = Math.max( words1.length, words2.length );
return total > 0 ? intersection / total : 0;
}

/**
* Post-process diff result to pair similar removed/added blocks as modifications.
* This catches modifications that LCS missed due to content changes.
*
* After LCS diffing, a block whose content changed appears as a separate "removed"
* and "added" entry (since the full block signature differs). This function detects
* such pairs and merges them into a single "modified" block with inline diff.
*
* Two pairing strategies are used:
* 1. When exactly one block of a given type was removed and one was added,
* they are paired directly — no ambiguity, no similarity check needed —
* unless both their content and attributes are identical (a position swap,
* not a modification).
* 2. When multiple candidates exist, textSimilarity (word overlap ratio) is
* used to find the best match. Blocks must score above 50% word overlap
* to be paired, preventing unrelated paragraphs from being merged.
*
* @param {Array} blocks Raw blocks with diff status.
* @return {Array} Blocks with similar pairs converted to modifications.
Expand All @@ -96,38 +135,77 @@ function pairSimilarBlocks( blocks ) {

const pairedRemoved = new Set(); // Indices of removed blocks that were paired.
const modifications = new Map(); // Map from added block index to modified block.
const SIMILARITY_THRESHOLD = 0.3;
const SIMILARITY_THRESHOLD = 0.5;

// Group candidates by block name for efficient lookup.
const addedByName = new Map();
for ( const add of added ) {
const name = add.block.blockName;
if ( ! addedByName.has( name ) ) {
addedByName.set( name, [] );
}
addedByName.get( name ).push( add );
}
const removedByName = new Map();
for ( const rem of removed ) {
const name = rem.block.blockName;
if ( ! removedByName.has( name ) ) {
removedByName.set( name, [] );
}
removedByName.get( name ).push( rem );
}

// For each removed block, find best matching added block.
for ( const rem of removed ) {
let bestMatch = null;
let bestScore = 0;
const candidates = addedByName.get( rem.block.blockName ) || [];
const sameNameRemoved = removedByName.get( rem.block.blockName ) || [];
const unpaired = candidates.filter(
( add ) => ! modifications.has( add.index )
);

for ( const add of added ) {
if ( modifications.has( add.index ) ) {
continue;
}
if ( add.block.blockName !== rem.block.blockName ) {
continue;
}
if ( unpaired.length === 0 ) {
continue;
}

const score = textSimilarity(
rem.block.innerHTML || '',
add.block.innerHTML || ''
);
// If content is identical (score=1), only pair if attrs differ.
// Otherwise identical blocks are just position swaps, not modifications.
let bestMatch = null;

// If there's exactly one removed and one added of this type,
// pair them directly — no ambiguity, no similarity check needed.
if ( sameNameRemoved.length === 1 && unpaired.length === 1 ) {
const add = unpaired[ 0 ];
const attrsMatch =
JSON.stringify( rem.block.attrs ) ===
JSON.stringify( add.block.attrs );
if (
score > bestScore &&
score > SIMILARITY_THRESHOLD &&
( score < 1 || ! attrsMatch )
) {
bestScore = score;
// Only skip pairing if both content and attrs are identical
// (position swap, not a modification).
const contentMatch =
( rem.block.innerHTML || '' ) === ( add.block.innerHTML || '' );
if ( ! contentMatch || ! attrsMatch ) {
bestMatch = add;
}
} else {
// Multiple candidates — use similarity to find best match.
let bestScore = 0;
for ( const add of unpaired ) {
const score = textSimilarity(
rem.block.innerHTML || '',
add.block.innerHTML || ''
);
// Skip identical blocks (score=1 with same attrs) — those
// are position swaps, not modifications. They should show
// as separate removed + added, not as a no-op "modified".
const attrsMatch =
JSON.stringify( rem.block.attrs ) ===
JSON.stringify( add.block.attrs );
if (
score > bestScore &&
score > SIMILARITY_THRESHOLD &&
( score < 1 || ! attrsMatch )
) {
bestScore = score;
bestMatch = add;
}
}
}

if ( bestMatch ) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -712,6 +712,65 @@ describe( 'diffRevisionContent', () => {
] );
} );

it( 'handles multiple inner block changes at once (similar content)', () => {
	// Serializes a single group block wrapping one paragraph per given string.
	const serializeGroup = ( ...paragraphs ) =>
		serialize( [
			createBlock(
				'core/group',
				{},
				paragraphs.map( ( content ) =>
					createBlock( 'core/paragraph', { content } )
				)
			),
		] );

	// Previous revision: three paragraphs. Current revision: paragraph "C"
	// is gone and a single word of the fox sentence has changed.
	const previous = serializeGroup(
		'A',
		'The quick brown fox jumps over the lazy dog',
		'C'
	);
	const current = serializeGroup(
		'A',
		'The quick brown fox leaps over the lazy dog'
	);

	const blocks = diffRevisionContent( current, previous );

	// "A" is present in both revisions, so it carries no diff status.
	const unchangedParagraph = {
		name: 'core/paragraph',
		attributes: {
			content: 'A',
			__revisionDiffStatus: undefined,
		},
	};
	// "C" has no counterpart among the added blocks, so it stays removed.
	const removedParagraph = {
		name: 'core/paragraph',
		attributes: {
			content: 'C',
			__revisionDiffStatus: { status: 'removed' },
		},
	};
	// The two fox sentences are expected to be merged into one modified
	// paragraph carrying the inline jumps→leaps diff markup.
	const modifiedParagraph = {
		name: 'core/paragraph',
		attributes: {
			content:
				'The quick brown fox <del title="Removed" class="revision-diff-removed">jumps</del><ins title="Added" class="revision-diff-added">leaps</ins> over the lazy dog',
			__revisionDiffStatus: {
				status: 'modified',
			},
		},
	};

	expect( normalizeBlockTree( blocks ) ).toMatchObject( [
		{
			name: 'core/group',
			attributes: {
				__revisionDiffStatus: undefined,
			},
			innerBlocks: [
				unchangedParagraph,
				removedParagraph,
				modifiedParagraph,
			],
		},
	] );
} );

it( 'does not pair blocks with completely different content', () => {
const previous = serialize( [
createBlock( 'core/group', {}, [
Expand Down
Loading