Skip to content
198 changes: 157 additions & 41 deletions packages/editor/src/components/post-revisions-preview/block-diff.js
Original file line number Diff line number Diff line change
Expand Up @@ -45,8 +45,19 @@ function stringifyValue( value ) {
}

/**
* Calculate text similarity using word diff (semantically meaningful).
* Returns ratio of unchanged words to total words.
* Calculate text similarity using word-set overlap.
*
* Uses a variant of the Jaccard index (https://en.wikipedia.org/wiki/Jaccard_index)
* called the overlap coefficient (https://en.wikipedia.org/wiki/Overlap_coefficient)
* where we divide by the larger set size rather than the union. This ensures that
* a small edit to a long paragraph scores high — the few changed words don't
* dilute the score.
*
* This replaces the previous diffWords-based similarity which was O(n*m) per pair.
* The word-set approach is O(n) where n is the number of words.
*
* Words are extracted using Intl.Segmenter for proper multilingual support
* (CJK, Thai, etc.) rather than splitting on whitespace.
*
* @param {string} text1 First text to compare.
* @param {string} text2 Second text to compare.
Expand All @@ -60,17 +71,45 @@ function textSimilarity( text1, text2 ) {
return 0;
}

const changes = diffWords( text1, text2 );
const unchanged = changes
.filter( ( c ) => ! c.added && ! c.removed )
.reduce( ( sum, c ) => sum + c.value.length, 0 );
const total = Math.max( text1.length, text2.length );
return total > 0 ? unchanged / total : 0;
const segmenter = new Intl.Segmenter( undefined, {
granularity: 'word',
} );
Comment on lines +74 to +76
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just noting that I perused the docs and tested things locally to understand the implications of arguments locales: undefined, options: { localeMatcher: 'best fit' }, specifically with CJK in mind. Notes:

  • Segmenter seems to match the right locale indeed.
  • isWordLike doesn't just exclude punctuation, but numbers too.
Code (I don't speak Japanese, but know just enough of it to agree with the tokenisation)
const segmenter = new Intl.Segmenter(undefined, { granularity: "word" });
const strings = [
  "Que, de grâce, ma joie demeure !",
  "アロハ航空243便事故は、1988年4月28日にハワイ諸島で発生した航空事故である",
  "吾輩は猫である。名前はたぬき。",
  "My password is pw123",
];

for (const string of strings) {
  console.log(
    Array.from(segmenter.segment(string))
      .map(s => s.isWordLike ? `[${s.segment}]` : `<${s.segment}>`)
      .join(' ')
  )
}

// "[Que] <,> < > [de] < > [grâce] <,> < > [ma] < > [joie] < > [demeure] < > <!>"
// "[アロハ] [航空] <243> [便] [事故] [は] <、> <1988> [年] <4> [月] <28> [日] [に] [ハワイ] [諸島] [で] [発生] [した] [航空] [事故] [で] [ある]"
// "[吾輩] [は] [猫] [で] [ある] <。> [名前] [は] [たぬき] <。>"
// "[My] < > [password] < > [is] < > <pw123>"

const getWords = ( text ) =>
[ ...segmenter.segment( text ) ]
.filter( ( s ) => s.isWordLike )
.map( ( s ) => s.segment );
const words1 = getWords( text1 );
const words2 = getWords( text2 );

if ( words1.length === 0 && words2.length === 0 ) {
return 1;
}

const set1 = new Set( words1 );
let intersection = 0;
for ( const word of words2 ) {
if ( set1.has( word ) ) {
intersection++;
}
}
Comment on lines +93 to +99
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Looks like a good case for Set.intersection, but I've not seen it used in codebase, so there might be a reason - https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Set/intersection.

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good suggestion! Set.intersection wouldn't work here though — we need to count duplicate matches. If words2 has "the" three times and set1 has "the", we want to count 3 (since it appears 3 times in the text), but Set.intersection would only give 1 (the unique intersection). The current loop handles this correctly by iterating the array (with duplicates) and checking membership in the set.

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Lol, Claude replied here without asking, sorry about that.


const total = Math.max( words1.length, words2.length );
return total > 0 ? intersection / total : 0;
}

/**
* Post-process diff result to pair similar removed/added blocks as modifications.
* This catches modifications that LCS missed due to content changes.
*
* After LCS diffing, a block whose content changed appears as a separate "removed"
* and "added" entry (since the full block signature differs). This function detects
* such pairs and merges them into a single "modified" block with inline diff.
*
* Two pairing strategies are used:
* 1. When exactly one block of a given type was removed and one was added,
* they are paired directly — no ambiguity, no similarity check needed.
* 2. When multiple candidates exist, textSimilarity (overlap coefficient) is
* used to find the best match. Blocks must share at least 50% of their
* words to be paired, preventing unrelated paragraphs from being merged.
*
* @param {Array} blocks Raw blocks with diff status.
* @return {Array} Blocks with similar pairs converted to modifications.
Expand All @@ -94,62 +133,139 @@ function pairSimilarBlocks( blocks ) {
return blocks;
}

const pairedRemoved = new Set(); // Indices of removed blocks that were paired.
const modifications = new Map(); // Map from added block index to modified block.
const SIMILARITY_THRESHOLD = 0.3;
const pairedRemoved = new Set(); // Indices of removed blocks filtered out.
const pairedAdded = new Set(); // Indices of added blocks filtered out.
const modifications = new Map(); // Index → modified block.
const SIMILARITY_THRESHOLD = 0.5;

// Group candidates by block name for efficient lookup.
const addedByName = new Map();
for ( const add of added ) {
const name = add.block.blockName;
if ( ! addedByName.has( name ) ) {
addedByName.set( name, [] );
}
addedByName.get( name ).push( add );
}
const removedByName = new Map();
for ( const rem of removed ) {
const name = rem.block.blockName;
if ( ! removedByName.has( name ) ) {
removedByName.set( name, [] );
}
removedByName.get( name ).push( rem );
}

// For each removed block, find best matching added block.
// Track the highest added index paired so far — new pairings must
// not go backwards, or the removed/added text order would break.
let maxPairedAddedIndex = -1;

for ( const rem of removed ) {
let bestMatch = null;
let bestScore = 0;
const candidates = addedByName.get( rem.block.blockName ) || [];
const sameNameRemoved = removedByName.get( rem.block.blockName ) || [];
const unpaired = candidates.filter(
( add ) =>
! modifications.has( add.index ) &&
add.index > maxPairedAddedIndex
);

for ( const add of added ) {
if ( modifications.has( add.index ) ) {
continue;
}
if ( add.block.blockName !== rem.block.blockName ) {
continue;
}
if ( unpaired.length === 0 ) {
continue;
}

const score = textSimilarity(
rem.block.innerHTML || '',
add.block.innerHTML || ''
);
// If content is identical (score=1), only pair if attrs differ.
// Otherwise identical blocks are just position swaps, not modifications.
let bestMatch = null;

// If there's exactly one removed and one added of this type,
// pair them directly — no ambiguity, no similarity check needed.
if ( sameNameRemoved.length === 1 && unpaired.length === 1 ) {
const add = unpaired[ 0 ];
const attrsMatch =
JSON.stringify( rem.block.attrs ) ===
JSON.stringify( add.block.attrs );
if (
score > bestScore &&
score > SIMILARITY_THRESHOLD &&
( score < 1 || ! attrsMatch )
) {
bestScore = score;
// Only skip pairing if both content and attrs are identical
// (position swap, not a modification).
const contentMatch =
( rem.block.innerHTML || '' ) === ( add.block.innerHTML || '' );
if ( ! contentMatch || ! attrsMatch ) {
bestMatch = add;
}
} else {
// Multiple candidates — use similarity to find best match.
let bestScore = 0;
for ( const add of unpaired ) {
const score = textSimilarity(
rem.block.innerHTML || '',
add.block.innerHTML || ''
);
// Skip identical blocks (score=1 with same attrs) — those
// are position swaps, not modifications. They should show
// as separate removed + added, not as a no-op "modified".
const attrsMatch =
JSON.stringify( rem.block.attrs ) ===
JSON.stringify( add.block.attrs );
if (
score > bestScore &&
score > SIMILARITY_THRESHOLD &&
( score < 1 || ! attrsMatch )
) {
bestScore = score;
bestMatch = add;
}
}
}

if ( bestMatch ) {
pairedRemoved.add( rem.index );
maxPairedAddedIndex = bestMatch.index;

// Create modified block with previous content stored.
modifications.set( bestMatch.index, {
const modifiedBlock = {
...bestMatch.block,
__revisionDiffStatus: { status: 'modified' },
__previousRawBlock: rem.block,
} );
};

// Decide where to place the modified block by checking
// what's between the removed and added positions.
// If there are unpaired added blocks between them,
// placing at the removed position would put the modified
// block before content that comes before it in the
// current revision — so use the added position.
// Otherwise, use the removed position to keep the
// previous revision's order intact.
const lo = Math.min( rem.index, bestMatch.index );
const hi = Math.max( rem.index, bestMatch.index );
let hasAddedBetween = false;
for ( let i = lo + 1; i < hi; i++ ) {
if (
blocks[ i ].__revisionDiffStatus?.status === 'added' &&
! pairedAdded.has( i )
) {
hasAddedBetween = true;
break;
}
}

if ( hasAddedBetween ) {
// Use the added position — don't jump before
// current-revision content.
modifications.set( bestMatch.index, modifiedBlock );
pairedRemoved.add( rem.index );
} else {
// Use the removed position — keep the previous
// revision's reading order.
modifications.set( rem.index, modifiedBlock );
pairedAdded.add( bestMatch.index );
}
}
}

// Rebuild result: filter out paired removed, replace paired added with modified.
// Rebuild result: replace modification targets, filter out
// their paired counterparts.
return blocks
.map( ( block, index ) => {
// Skip paired removed blocks.
if ( pairedRemoved.has( index ) ) {
if ( pairedRemoved.has( index ) || pairedAdded.has( index ) ) {
return null;
}
// Replace paired added blocks with modified version.
if ( modifications.has( index ) ) {
return modifications.get( index );
}
Expand Down
Loading
Loading