From 5ab4e13a84cd9f14fcf85009a5de56e0e62e2f80 Mon Sep 17 00:00:00 2001
From: Roberto Aranda
Date: Wed, 8 Apr 2026 17:16:53 +0200
Subject: [PATCH 1/7] Add HTML block checker to enforce native Gutenberg blocks
 over core/html

Introduces a deterministic check_html_blocks tool that parses block content
via wp.blocks.parse() in a real browser, identifies core/html blocks wrapping
structural HTML (divs, sections, headings, etc.), and flags them with
replacement suggestions. The check is also integrated into validate_blocks so
both validations run in a single call. Updates the system prompt with detailed
block pattern references and stricter guidelines.
---
 apps/cli/ai/html-block-checker.ts            | 350 ++++++++++++++++++
 apps/cli/ai/system-prompt.ts                 | 117 +++++-
 apps/cli/ai/tests/html-block-checker.test.ts | 364 +++++++++++++++++++
 apps/cli/ai/tests/tools.test.ts              |   4 +
 apps/cli/ai/tools.ts                         | 142 +++++++-
 5 files changed, 962 insertions(+), 15 deletions(-)
 create mode 100644 apps/cli/ai/html-block-checker.ts
 create mode 100644 apps/cli/ai/tests/html-block-checker.test.ts

diff --git a/apps/cli/ai/html-block-checker.ts b/apps/cli/ai/html-block-checker.ts
new file mode 100644
index 0000000000..c75b0842a6
--- /dev/null
+++ b/apps/cli/ai/html-block-checker.ts
@@ -0,0 +1,350 @@
+import { EditorPage } from 'cli/ai/browser-utils';
+
+/** One core/html block whose content could be expressed with native blocks. */
+interface HtmlBlockIssue {
+	// Zero-based position of the block in the depth-first walk over all parsed
+	// blocks (inner blocks included).
+	index: number;
+	// Raw HTML of the block, truncated to the first 300 characters.
+	originalContent: string;
+	// Lower-cased tag name of the first top-level element in the block.
+	wrapperTag: string;
+	// Element nesting depth below and including the wrapper (wrapper alone = 1).
+	maxDepth: number;
+	// Number of elements in the wrapper's subtree, wrapper included.
+	totalElements: number;
+	// 'deep_nesting' when maxDepth >= MAX_NESTING_DEPTH, else 'structural_tag'.
+	reason: 'structural_tag' | 'deep_nesting';
+	// Replacement hints looked up by wrapperTag, or a generic fallback.
+	suggestedBlocks: string[];
+}
+
+/** Result returned by checkHtmlBlocks(). */
+export interface HtmlBlockReport {
+	// Every block visited, nested inner blocks included.
+	totalBlocks: number;
+	// How many of those blocks are core/html.
+	htmlBlocks: number;
+	// Number of entries in `issues`.
+	problematicBlocks: number;
+	// core/html blocks that were accepted (not flagged).
+	allowedBlocks: number;
+	// True when problematicBlocks does not exceed STRUCTURAL_BLOCK_THRESHOLD.
+	passed: boolean;
+	issues: HtmlBlockIssue[];
+	// Set when wp.blocks is unavailable or the check itself failed.
+	error?: string;
+}
+
+/**
+ * The check fails when the number of problematic HTML blocks exceeds this.
+ */
+const STRUCTURAL_BLOCK_THRESHOLD = 0;
+
+/**
+ * Nesting depth (the wrapper element counts as 1) at or above which a flagged
+ * HTML block is reported with reason 'deep_nesting' instead of
+ * 'structural_tag'. A depth of 4 means: wrapper > child > grandchild >
+ * great-grandchild.
+ */
+const MAX_NESTING_DEPTH = 4;
+
+// Cache one EditorPage per site URL so repeated checks reuse the same
+// browser tab instead of navigating and loading the block editor each time.
+const editorPages = new Map< string, EditorPage >();
+
+// Returns the cached EditorPage for `siteUrl`, creating and caching a new one
+// the first time a given URL is requested.
+function getEditorPage( siteUrl: string ): EditorPage {
+	let ep = editorPages.get( siteUrl );
+	if ( ! ep ) {
+		ep = new EditorPage( siteUrl );
+		editorPages.set( siteUrl, ep );
+	}
+	return ep;
+}
+
+/**
+ * Tags that indicate acceptable use of core/html blocks when they are the
+ * outermost wrapper element.
+ */
+const ALLOWED_WRAPPER_TAGS = new Set( [
+	'svg',
+	'form',
+	'script',
+	'canvas',
+	'iframe',
+	'video',
+	'audio',
+	'style',
+	'input',
+	'select',
+	'textarea',
+] );
+
+/**
+ * Tags whose content can be expressed with native Gutenberg blocks.
+ * If the wrapper and ALL of its descendant elements are in this set, the HTML
+ * block is convertible.
+ */
+const CONVERTIBLE_TAGS = new Set( [
+	'div',
+	'section',
+	'header',
+	'footer',
+	'main',
+	'article',
+	'aside',
+	'nav',
+	'h1',
+	'h2',
+	'h3',
+	'h4',
+	'h5',
+	'h6',
+	'p',
+	'ul',
+	'ol',
+	'li',
+	'dl',
+	'dt',
+	'dd',
+	'table',
+	'thead',
+	'tbody',
+	'tfoot',
+	'tr',
+	'th',
+	'td',
+	'img',
+	'figure',
+	'figcaption',
+	'blockquote',
+	'a',
+	'span',
+	'em',
+	'strong',
+	'b',
+	'i',
+	'u',
+	'br',
+	'hr',
+	'small',
+	'mark',
+	'sub',
+	'sup',
+	'abbr',
+	'cite',
+	'code',
+	'pre',
+	'time',
+] );
+
+/**
+ * Maps structural wrapper tags to suggested Gutenberg block replacements.
+ * Wrapper tags without an entry here get a generic "core/group or appropriate
+ * core block" suggestion from the checker.
+ */
+const REPLACEMENT_MAP: Record< string, string[] > = {
+	div: [ 'core/group (with inner blocks for children)' ],
+	section: [ 'core/group with tagName="section"' ],
+	header: [ 'core/group with tagName="header"' ],
+	footer: [ 'core/group with tagName="footer"' ],
+	main: [ 'core/group with tagName="main"' ],
+	article: [ 'core/group with tagName="article"' ],
+	aside: [ 'core/group with tagName="aside"' ],
+	nav: [ 'core/navigation' ],
+	h1: [ 'core/heading with level=1' ],
+	h2: [ 'core/heading with level=2' ],
+	h3: [ 'core/heading with level=3' ],
+	h4: [ 'core/heading with level=4' ],
+	h5: [ 'core/heading with level=5' ],
+	h6: [ 'core/heading with level=6' ],
+	p: [ 'core/paragraph' ],
+	ul: [ 'core/list' ],
+	ol: [ 'core/list' ],
+	li: [ 'core/list-item (inside core/list)' ],
+	table: [ 'core/table' ],
+	img: [ 'core/image' ],
+	figure: [ 'core/image or core/media-text' ],
+	figcaption: [ 'core/image caption or core/media-text' ],
+	blockquote: [ 'core/quote' ],
+	a: [ 'core/button or inline link in core/paragraph' ],
+	dl: [ 'core/list or core/group with core/paragraph items' ],
+	span: [ 'core/paragraph or core/group' ],
+};
+
+/**
+ * Check WordPress block content for misuse of core/html blocks.
+ *
+ * Navigates to `post-new.php` (which loads wp.blocks with all registered
+ * blocks) and runs `wp.blocks.parse()` to identify core/html blocks. Each
+ * HTML block is analysed for structural tags and deep nesting. 
+ *
+ * Failures (missing wp.blocks, navigation or evaluation errors) do not reject:
+ * they are reported as a report with `passed: false` and `error` set.
+ *
+ * @param content Serialized block markup to analyse.
+ * @param siteUrl Site URL; used as the key of the cached EditorPage.
+ * @return Counts, pass/fail verdict and one issue per convertible HTML block.
+ */
+export async function checkHtmlBlocks(
+	content: string,
+	siteUrl: string
+): Promise< HtmlBlockReport > {
+	const editorPage = getEditorPage( siteUrl );
+
+	try {
+		const page = await editorPage.getPage();
+
+		// The callback reads `window` and `DOMParser`, so it cannot close over
+		// module constants; they are handed over through `params` as plain
+		// arrays/objects and rebuilt into Sets inside the callback.
+		const report = await page.evaluate(
+			( params: {
+				html: string;
+				allowedWrapperTags: string[];
+				convertibleTags: string[];
+				replacementMap: Record< string, string[] >;
+				structuralThreshold: number;
+				maxDepth: number;
+			} ) => {
+				/* eslint-disable @typescript-eslint/no-explicit-any */
+				const wpBlocks = ( window as any ).wp?.blocks;
+				if ( ! wpBlocks ) {
+					return {
+						totalBlocks: 0,
+						htmlBlocks: 0,
+						problematicBlocks: 0,
+						allowedBlocks: 0,
+						passed: false,
+						issues: [] as any[],
+						error: 'wp.blocks is not available on this page.',
+					};
+				}
+
+				const allowedWrapperSet = new Set( params.allowedWrapperTags );
+				const convertibleSet = new Set( params.convertibleTags );
+				const blocks = wpBlocks.parse( params.html );
+				const issues: any[] = [];
+				let htmlBlockCount = 0;
+				let allowedCount = 0;
+				let totalBlockCount = 0;
+
+				// Returns the height of the element subtree rooted at `el`
+				// (a leaf element has depth 1) and the number of elements in
+				// it, `el` included.
+				function computeDepthAndCount( el: Element ): {
+					depth: number;
+					count: number;
+				} {
+					let maxChildDepth = 0;
+					let count = 1;
+					for ( const child of Array.from( el.children ) ) {
+						const result = computeDepthAndCount( child );
+						if ( result.depth > maxChildDepth ) {
+							maxChildDepth = result.depth;
+						}
+						count += result.count;
+					}
+					return { depth: 1 + maxChildDepth, count };
+				}
+
+				/**
+				 * Returns true if ALL element descendants of `el` (including
+				 * `el` itself) are tags that have native Gutenberg block
+				 * equivalents.
+				 *
+				 * When this returns true the HTML block is "convertible" —
+				 * its content can be fully expressed with core blocks.
+				 * When false the block contains SVGs, form elements,
+				 * or other non-block content and is considered an
+				 * acceptable use of core/html.
+				 *
+				 * Note: data-* attributes are NOT considered — they can
+				 * be replaced by className on core/group blocks and JS
+				 * can target those class names instead.
+				 */
+				function isConvertibleContent( el: Element ): boolean {
+					const tag = el.tagName.toLowerCase();
+					if ( ! convertibleSet.has( tag ) ) {
+						return false;
+					}
+					for ( const child of Array.from( el.children ) ) {
+						if ( ! isConvertibleContent( child ) ) {
+							return false;
+						}
+					}
+					return true;
+				}
+
+				// Depth-first walk: a block is counted before its inner blocks,
+				// so `totalBlockCount - 1` is the block's pre-order index.
+				function walkBlocks( blockList: any[] ) {
+					for ( const block of blockList ) {
+						totalBlockCount++;
+
+						if ( block.blockName === 'core/html' || block.name === 'core/html' ) {
+							htmlBlockCount++;
+							const rawHtml = block.originalContent || block.innerHTML || '';
+
+							const doc = new DOMParser().parseFromString( rawHtml, 'text/html' );
+							// Only the first top-level element is inspected; any
+							// sibling elements after it are not analysed.
+							const wrapper = doc.body.firstElementChild;
+
+							if ( ! wrapper ) {
+								// Text-only or empty HTML block — skip
+								allowedCount++;
+							} else {
+								const tag = wrapper.tagName.toLowerCase();
+
+								if ( allowedWrapperSet.has( tag ) ) {
+									// Wrapper is an explicitly allowed tag (svg, form, script, …)
+									allowedCount++;
+								} else if ( ! isConvertibleContent( wrapper ) ) {
+									// Contains at least one non-convertible element (SVG,
+									// form elements, canvas, etc.) — acceptable HTML usage.
+									// Attributes such as data-* are not inspected.
+									allowedCount++;
+								} else {
+									// All content is convertible to native blocks — flag it
+									const { depth, count } = computeDepthAndCount( wrapper );
+									const reason = depth >= params.maxDepth ? 'deep_nesting' : 'structural_tag';
+									const suggestions = params.replacementMap[ tag ] || [
+										'core/group or appropriate core block',
+									];
+
+									issues.push( {
+										index: totalBlockCount - 1,
+										originalContent: rawHtml.slice( 0, 300 ),
+										wrapperTag: tag,
+										maxDepth: depth,
+										totalElements: count,
+										reason,
+										suggestedBlocks: suggestions,
+									} );
+								}
+							}
+						}
+
+						// Recurse into inner blocks
+						const innerBlocks = block.innerBlocks || [];
+						if ( innerBlocks.length > 0 ) {
+							walkBlocks( innerBlocks );
+						}
+					}
+				}
+
+				walkBlocks( blocks );
+
+				const problematicCount = issues.length;
+				const passed = problematicCount <= params.structuralThreshold;
+
+				return {
+					totalBlocks: totalBlockCount,
+					htmlBlocks: htmlBlockCount,
+					problematicBlocks: problematicCount,
+					allowedBlocks: allowedCount,
+					passed,
+					issues,
+				};
+				/* eslint-enable @typescript-eslint/no-explicit-any */
+			},
+			{
+				html: content,
+				allowedWrapperTags: [ ...ALLOWED_WRAPPER_TAGS ],
+				convertibleTags: [ ...CONVERTIBLE_TAGS ],
+				replacementMap: REPLACEMENT_MAP,
+				structuralThreshold: STRUCTURAL_BLOCK_THRESHOLD,
+				maxDepth: MAX_NESTING_DEPTH,
+			}
+		);
+
+		return report as HtmlBlockReport;
+	} catch ( error ) {
+		// If navigation or evaluation failed, discard the cached page so the
+		// next call gets a fresh one. The cache entry is removed first and
+		// close() is best-effort: a failure while tearing down an already
+		// broken page must neither keep the stale entry cached nor replace
+		// the error report below with a rejected promise.
+		editorPages.delete( siteUrl );
+		try {
+			await editorPage.close();
+		} catch {
+			// Ignored on purpose; the original error is what gets reported.
+		}
+
+		return {
+			totalBlocks: 0,
+			htmlBlocks: 0,
+			problematicBlocks: 0,
+			allowedBlocks: 0,
+			passed: false,
+			issues: [],
+			error: `HTML block check error: ${
+				error instanceof Error ? error.message : String( error )
+			}`,
+		};
+	}
+}
+
+/** Clean up all cached editor pages. 
+ *
+ * The cache is emptied before any page is closed and every cached page gets
+ * a close() attempt, so one failing page cannot leave later pages open or
+ * stale entries behind. If any close() fails, the first failure is rethrown
+ * after all pages have been processed.
+ */
+export async function cleanupCheckerPages(): Promise< void > {
+	const pages = [ ...editorPages.values() ];
+	editorPages.clear();
+
+	let failed = false;
+	let firstError: unknown;
+	for ( const ep of pages ) {
+		try {
+			await ep.close();
+		} catch ( error ) {
+			// Remember only the first failure; keep closing the rest.
+			if ( ! failed ) {
+				failed = true;
+				firstError = error;
+			}
+		}
+	}
+
+	if ( failed ) {
+		throw firstError;
+	}
+}
diff --git a/apps/cli/ai/system-prompt.ts b/apps/cli/ai/system-prompt.ts index 17027ade07..cddfe5e100 100644 --- a/apps/cli/ai/system-prompt.ts +++ b/apps/cli/ai/system-prompt.ts @@ -5,8 +5,8 @@ IMPORTANT: You MUST use your mcp__studio__ tools to manage WordPress sites. Neve IMPORTANT: For any generated content for the site, these three principles are mandatory: - Gorgeous design: More details on the guidelines below. -- No HTML blocks and raw HTML: Check the block content guidelines below. -- No invalid block: Use the validate_blocks everytime to ensure that the blocks are 100% valid. +- Native Gutenberg blocks ONLY: Every heading MUST be \`core/heading\`, every paragraph MUST be \`core/paragraph\`, every layout section MUST be \`core/group\` or \`core/columns\`. NEVER wrap raw HTML in \`\` — see Block Content Guidelines below. +- No invalid blocks: Use \`validate_blocks\` on every piece of block content (post content, template parts). It validates block markup AND checks for HTML block misuse in a single call. ## Workflow @@ -20,10 +20,10 @@ For any request that involves a WordPress site, you MUST first determine which s Then continue with: 1. **Get site details**: Use site_info to get the site path, URL, and credentials. -2. **Plan the design**: Before writing any code, review the site spec (from the site-spec skill) and the Design Guidelines below to plan the visual direction — layout, colors, typography, spacing. +2. **Plan the design and block structure**: Before writing any code, review the site spec (from the site-spec skill) and the Design Guidelines below to plan the visual direction — layout, colors, typography, spacing. Also plan how each section maps to Gutenberg blocks: which sections use \`core/group\`, where to use \`core/columns\`, which text is \`core/heading\` vs \`core/paragraph\`, etc. 
Refer to the Block Content Guidelines for the correct markup patterns. Do NOT default to \`core/html\` — compose in native blocks from the start. 3. **Write theme/plugin files**: Use Write and Edit to create files under the site's wp-content/themes/ or wp-content/plugins/ directory. -4. **Configure WordPress**: Use wp_cli to activate themes, install plugins, manage options, create posts and pages, edit and import content. The site must be running. Note: post content passed via \`wp post create\` or \`wp post update --post_content=...\` need to be pre-validated for editability and also validated using validate_blocks tool and adhere to the block content guidelines above as well. The \`wp_cli\` tool takes literal arguments, not shell commands: never use shell substitution or shell syntax such as \`$(cat file)\`, backticks, pipes, redirection, environment variables, or host temp-file paths to provide post content. Pass the literal content directly in \`--post_content=...\`, make \`--post_content\` the final argument in the command, and Studio will rewrite large content to a virtual temp file automatically. -5. **Check the misuse of HTML blocks**: Verify if HTML blocks were used as sections or not. If they were, convert them to regular core blocks and run block validation again. +4. **Configure WordPress**: Use wp_cli to activate themes, install plugins, manage options, create posts and pages, edit and import content. The site must be running. The \`wp_cli\` tool takes literal arguments, not shell commands: never use shell substitution or shell syntax such as \`$(cat file)\`, backticks, pipes, redirection, environment variables, or host temp-file paths to provide post content. Pass the literal content directly in \`--post_content=...\`, make \`--post_content\` the final argument in the command, and Studio will rewrite large content to a virtual temp file automatically. +5. 
**Validate ALL block content**: Run \`validate_blocks\` on every piece of block content — page/post content (passed via \`--post_content\`) AND template part files (header.html, footer.html, etc.). The tool runs two checks: (a) block markup validity and (b) HTML block misuse detection. It automatically allows HTML blocks whose content contains non-block elements (inline SVGs, \`
\`, \`\`, \`