-
Notifications
You must be signed in to change notification settings - Fork 4.8k
Editor: Improve revisions diff pairing performance #77126
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: trunk
Are you sure you want to change the base?
Changes from 6 commits
c551aac
ae4278b
2169d1d
b91b92e
000316e
bac6a36
449ada6
becd385
1242e6f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -45,8 +45,19 @@ function stringifyValue( value ) { | |
| } | ||
|
|
||
| /** | ||
| * Calculate text similarity using word diff (semantically meaningful). | ||
| * Returns ratio of unchanged words to total words. | ||
| * Calculate text similarity using word-set overlap. | ||
| * | ||
| * Uses a variant of the Jaccard index (https://en.wikipedia.org/wiki/Jaccard_index) | ||
| * called the overlap coefficient (https://en.wikipedia.org/wiki/Overlap_coefficient) | ||
| * where we divide by the larger set size rather than the union. This ensures that | ||
| * a small edit to a long paragraph scores high — the few changed words don't | ||
| * dilute the score. | ||
| * | ||
| * This replaces the previous diffWords-based similarity which was O(n*m) per pair. | ||
| * The word-set approach is O(n) where n is the number of words. | ||
| * | ||
| * Words are extracted using Intl.Segmenter for proper multilingual support | ||
| * (CJK, Thai, etc.) rather than splitting on whitespace. | ||
| * | ||
| * @param {string} text1 First text to compare. | ||
| * @param {string} text2 Second text to compare. | ||
|
|
@@ -60,17 +71,45 @@ function textSimilarity( text1, text2 ) { | |
| return 0; | ||
| } | ||
|
|
||
| const changes = diffWords( text1, text2 ); | ||
| const unchanged = changes | ||
| .filter( ( c ) => ! c.added && ! c.removed ) | ||
| .reduce( ( sum, c ) => sum + c.value.length, 0 ); | ||
| const total = Math.max( text1.length, text2.length ); | ||
| return total > 0 ? unchanged / total : 0; | ||
| const segmenter = new Intl.Segmenter( undefined, { | ||
| granularity: 'word', | ||
| } ); | ||
| const getWords = ( text ) => | ||
| [ ...segmenter.segment( text ) ] | ||
| .filter( ( s ) => s.isWordLike ) | ||
| .map( ( s ) => s.segment ); | ||
Mamaduka marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| const words1 = getWords( text1 ); | ||
| const words2 = getWords( text2 ); | ||
|
|
||
| if ( words1.length === 0 && words2.length === 0 ) { | ||
| return 1; | ||
| } | ||
|
|
||
| const set1 = new Set( words1 ); | ||
| let intersection = 0; | ||
| for ( const word of words2 ) { | ||
| if ( set1.has( word ) ) { | ||
| intersection++; | ||
| } | ||
| } | ||
|
Comment on lines
+93
to
+99
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Looks like a good case for
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Good suggestion!
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Lol, Claude replied here without asking, sorry about that. |
||
|
|
||
| const total = Math.max( words1.length, words2.length ); | ||
| return total > 0 ? intersection / total : 0; | ||
| } | ||
|
|
||
| /** | ||
| * Post-process diff result to pair similar removed/added blocks as modifications. | ||
| * This catches modifications that LCS missed due to content changes. | ||
| * | ||
| * After LCS diffing, a block whose content changed appears as a separate "removed" | ||
| * and "added" entry (since the full block signature differs). This function detects | ||
| * such pairs and merges them into a single "modified" block with inline diff. | ||
| * | ||
| * Two pairing strategies are used: | ||
| * 1. When exactly one block of a given type was removed and one was added, | ||
| * they are paired directly — no ambiguity, no similarity check needed. | ||
| * 2. When multiple candidates exist, textSimilarity (overlap coefficient) is | ||
| * used to find the best match. Blocks must share at least 50% of their | ||
| * words to be paired, preventing unrelated paragraphs from being merged. | ||
| * | ||
| * @param {Array} blocks Raw blocks with diff status. | ||
| * @return {Array} Blocks with similar pairs converted to modifications. | ||
|
|
@@ -94,62 +133,139 @@ function pairSimilarBlocks( blocks ) { | |
| return blocks; | ||
| } | ||
|
|
||
| const pairedRemoved = new Set(); // Indices of removed blocks that were paired. | ||
| const modifications = new Map(); // Map from added block index to modified block. | ||
| const SIMILARITY_THRESHOLD = 0.3; | ||
| const pairedRemoved = new Set(); // Indices of removed blocks filtered out. | ||
| const pairedAdded = new Set(); // Indices of added blocks filtered out. | ||
| const modifications = new Map(); // Index → modified block. | ||
| const SIMILARITY_THRESHOLD = 0.5; | ||
|
|
||
| // Group candidates by block name for efficient lookup. | ||
| const addedByName = new Map(); | ||
| for ( const add of added ) { | ||
| const name = add.block.blockName; | ||
| if ( ! addedByName.has( name ) ) { | ||
| addedByName.set( name, [] ); | ||
| } | ||
| addedByName.get( name ).push( add ); | ||
| } | ||
| const removedByName = new Map(); | ||
| for ( const rem of removed ) { | ||
| const name = rem.block.blockName; | ||
| if ( ! removedByName.has( name ) ) { | ||
| removedByName.set( name, [] ); | ||
| } | ||
| removedByName.get( name ).push( rem ); | ||
| } | ||
|
|
||
| // For each removed block, find best matching added block. | ||
| // Track the highest added index paired so far — new pairings must | ||
| // not go backwards, or the removed/added text order would break. | ||
| let maxPairedAddedIndex = -1; | ||
|
|
||
| for ( const rem of removed ) { | ||
| let bestMatch = null; | ||
| let bestScore = 0; | ||
| const candidates = addedByName.get( rem.block.blockName ) || []; | ||
| const sameNameRemoved = removedByName.get( rem.block.blockName ) || []; | ||
| const unpaired = candidates.filter( | ||
| ( add ) => | ||
| ! modifications.has( add.index ) && | ||
| add.index > maxPairedAddedIndex | ||
| ); | ||
|
|
||
| for ( const add of added ) { | ||
| if ( modifications.has( add.index ) ) { | ||
| continue; | ||
| } | ||
| if ( add.block.blockName !== rem.block.blockName ) { | ||
| continue; | ||
| } | ||
| if ( unpaired.length === 0 ) { | ||
| continue; | ||
| } | ||
|
|
||
| const score = textSimilarity( | ||
| rem.block.innerHTML || '', | ||
| add.block.innerHTML || '' | ||
| ); | ||
| // If content is identical (score=1), only pair if attrs differ. | ||
| // Otherwise identical blocks are just position swaps, not modifications. | ||
| let bestMatch = null; | ||
|
|
||
| // If there's exactly one removed and one added of this type, | ||
| // pair them directly — no ambiguity, no similarity check needed. | ||
| if ( sameNameRemoved.length === 1 && unpaired.length === 1 ) { | ||
| const add = unpaired[ 0 ]; | ||
| const attrsMatch = | ||
| JSON.stringify( rem.block.attrs ) === | ||
| JSON.stringify( add.block.attrs ); | ||
| if ( | ||
| score > bestScore && | ||
| score > SIMILARITY_THRESHOLD && | ||
| ( score < 1 || ! attrsMatch ) | ||
| ) { | ||
| bestScore = score; | ||
| // Only skip pairing if both content and attrs are identical | ||
| // (position swap, not a modification). | ||
| const contentMatch = | ||
| ( rem.block.innerHTML || '' ) === ( add.block.innerHTML || '' ); | ||
| if ( ! contentMatch || ! attrsMatch ) { | ||
| bestMatch = add; | ||
| } | ||
| } else { | ||
| // Multiple candidates — use similarity to find best match. | ||
| let bestScore = 0; | ||
| for ( const add of unpaired ) { | ||
| const score = textSimilarity( | ||
| rem.block.innerHTML || '', | ||
| add.block.innerHTML || '' | ||
| ); | ||
| // Skip identical blocks (score=1 with same attrs) — those | ||
| // are position swaps, not modifications. They should show | ||
| // as separate removed + added, not as a no-op "modified". | ||
| const attrsMatch = | ||
| JSON.stringify( rem.block.attrs ) === | ||
| JSON.stringify( add.block.attrs ); | ||
| if ( | ||
| score > bestScore && | ||
| score > SIMILARITY_THRESHOLD && | ||
| ( score < 1 || ! attrsMatch ) | ||
| ) { | ||
| bestScore = score; | ||
| bestMatch = add; | ||
| } | ||
| } | ||
| } | ||
|
|
||
| if ( bestMatch ) { | ||
| pairedRemoved.add( rem.index ); | ||
| maxPairedAddedIndex = bestMatch.index; | ||
|
|
||
| // Create modified block with previous content stored. | ||
| modifications.set( bestMatch.index, { | ||
| const modifiedBlock = { | ||
| ...bestMatch.block, | ||
| __revisionDiffStatus: { status: 'modified' }, | ||
| __previousRawBlock: rem.block, | ||
| } ); | ||
| }; | ||
|
|
||
| // Decide where to place the modified block by checking | ||
| // what's between the removed and added positions. | ||
| // If there are unpaired added blocks between them, | ||
| // placing at the removed position would put the modified | ||
| // block before content that comes before it in the | ||
| // current revision — so use the added position. | ||
| // Otherwise, use the removed position to keep the | ||
| // previous revision's order intact. | ||
| const lo = Math.min( rem.index, bestMatch.index ); | ||
| const hi = Math.max( rem.index, bestMatch.index ); | ||
| let hasAddedBetween = false; | ||
| for ( let i = lo + 1; i < hi; i++ ) { | ||
| if ( | ||
| blocks[ i ].__revisionDiffStatus?.status === 'added' && | ||
| ! pairedAdded.has( i ) | ||
| ) { | ||
| hasAddedBetween = true; | ||
| break; | ||
| } | ||
| } | ||
|
|
||
| if ( hasAddedBetween ) { | ||
| // Use the added position — don't jump before | ||
| // current-revision content. | ||
| modifications.set( bestMatch.index, modifiedBlock ); | ||
| pairedRemoved.add( rem.index ); | ||
| } else { | ||
| // Use the removed position — keep the previous | ||
| // revision's reading order. | ||
| modifications.set( rem.index, modifiedBlock ); | ||
| pairedAdded.add( bestMatch.index ); | ||
| } | ||
| } | ||
| } | ||
|
|
||
| // Rebuild result: filter out paired removed, replace paired added with modified. | ||
| // Rebuild result: replace modification targets, filter out | ||
| // their paired counterparts. | ||
| return blocks | ||
| .map( ( block, index ) => { | ||
| // Skip paired removed blocks. | ||
| if ( pairedRemoved.has( index ) ) { | ||
| if ( pairedRemoved.has( index ) || pairedAdded.has( index ) ) { | ||
| return null; | ||
| } | ||
| // Replace paired added blocks with modified version. | ||
| if ( modifications.has( index ) ) { | ||
| return modifications.get( index ); | ||
| } | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Just noting that I perused the docs and tested things locally to understand the implications of arguments
locales: undefined, options: { localeMatcher: 'best fit' }, specifically with CJK in mind. Notes:isWordLikedoesn't just exclude punctuation, but numbers too.Code
(I don't speak Japanese, but know just enough of it to agree with the tokenisation)