n browsers, then diff the responses.
3. Sanitization Pipeline: Content should be sanitized at the source (CMS/UGC input) and at the build stage to remove unnecessary hidden elements and comments.
Implementation Example
The following TypeScript example demonstrates a DomInjectionAuditor class. This tool parses HTML, checks for hidden nodes, validates alt-text, detects adversarial patterns, and compares responses across user agents.
import * as cheerio from 'cheerio';
import axios from 'axios';
/** Findings collected by one audit run against a single URL. */
interface AuditResult {
/** The URL that was audited (copied from the scan configuration). */
url: string;
/** Number of elements that look hidden and still carry non-empty text. */
hiddenNodes: number;
/** Raw `alt` attribute values that matched an adversarial keyword. */
suspiciousAltText: string[];
/** True when the normalized responses served to different user agents differ. */
uaDivergence: boolean;
/** Number of HTML comment nodes encountered while scanning. */
comments: number;
/** Human-readable description of every risk found, in detection order. */
risks: string[];
}
/** Input for an audit run. */
interface ScanConfig {
/** Page to fetch and analyze. */
targetUrl: string;
/**
 * User-Agent header values to fetch with. The first entry is the agent whose
 * response is analyzed; supply at least two to enable the cloaking comparison.
 */
userAgents: string[];
/**
 * Phrases treated as injection attempts. They are matched as substrings of
 * lower-cased comment and alt text, so write them in lower case.
 */
adversarialKeywords: string[];
}
/**
 * Audits a single page for content that is invisible to human visitors but
 * still readable by AI agents: hidden elements, HTML comments, image alt text,
 * hidden SVG text, and responses that change with the User-Agent header.
 */
export class DomInjectionAuditor {
  private readonly config: ScanConfig;

  /** @param config Target URL, user agents to fetch with, and keywords to flag. */
  constructor(config: ScanConfig) {
    this.config = config;
  }

  /**
   * Fetches the target with the first configured user agent, analyzes its DOM,
   * then compares that response against every other configured user agent.
   *
   * @returns Counters and risk messages for the audited page.
   * @throws Error when no user agent is configured; also rejects when a fetch fails.
   */
  async runAudit(): Promise<AuditResult> {
    const [primaryAgent, ...otherAgents] = this.config.userAgents;
    if (primaryAgent === undefined) {
      throw new Error('ScanConfig.userAgents must contain at least one user agent.');
    }

    const results: AuditResult = {
      url: this.config.targetUrl,
      hiddenNodes: 0,
      suspiciousAltText: [],
      uaDivergence: false,
      comments: 0,
      risks: [],
    };

    // 1. Fetch content as primary AI agent
    const primaryResponse = await this.fetchWithUA(primaryAgent);
    const $ = cheerio.load(primaryResponse.data);

    // 2. Analyze DOM structure
    this.analyzeHiddenNodes($, results);
    this.analyzeComments($, results);
    this.analyzeAltText($, results);
    this.analyzeSvgText($, results);

    // 3. Check for UA cloaking against every remaining agent, not just the second one
    if (otherAgents.length > 0) {
      const otherResponses = await Promise.all(
        otherAgents.map(agent => this.fetchWithUA(agent)),
      );
      results.uaDivergence = otherResponses.some(response =>
        this.diffResponses(primaryResponse.data, response.data),
      );
      if (results.uaDivergence) {
        results.risks.push('User-Agent cloaking detected: Content varies by agent.');
      }
    }

    return results;
  }

  /**
   * Requests the target URL with the given User-Agent header.
   * The body is requested as text so it is always a string for parsing and diffing.
   */
  private async fetchWithUA(userAgent: string) {
    return axios.get<string>(this.config.targetUrl, {
      headers: { 'User-Agent': userAgent },
      timeout: 10000,
      responseType: 'text',
    });
  }

  /**
   * Reports whether an inline `style` value hides its element.
   *
   * Matching ignores case and whitespace. Opacity only counts when the value is
   * exactly zero (so `opacity:0.5` is not flagged); prefixed properties such as
   * `fill-opacity` are accepted as well.
   */
  private static isStyleHidden(style: string): boolean {
    const compact = style.toLowerCase().replace(/\s+/g, '');
    const zeroOpacity = /(?:^|[;-])opacity:(?:0+(?:\.0*)?|\.0+)%?(?:!important)?(?:;|$)/;
    return (
      compact.includes('display:none') ||
      compact.includes('visibility:hidden') ||
      zeroOpacity.test(compact) ||
      (compact.includes('position:absolute') && compact.includes('left:-9999px'))
    );
  }

  /** Counts elements that appear hidden but still contain text, and records a risk for each. */
  private analyzeHiddenNodes($: cheerio.CheerioAPI, results: AuditResult) {
    $('*').each((_index, element) => {
      const node = $(element);
      const className = node.attr('class') ?? '';

      // Check for common hiding techniques
      const isHidden =
        DomInjectionAuditor.isStyleHidden(node.attr('style') ?? '') ||
        node.attr('hidden') !== undefined || // HTML `hidden` attribute
        className.includes('sr-only') || // Screen reader only, often ingested by agents
        className.includes('hidden');
      if (!isHidden) {
        return;
      }

      const text = node.text().trim();
      if (text.length > 0) {
        results.hiddenNodes++;
        results.risks.push(`Hidden node detected with text content: ${text.substring(0, 50)}...`);
      }
    });
  }

  /** Counts every HTML comment and records a risk for those containing an adversarial keyword. */
  private analyzeComments($: cheerio.CheerioAPI, results: AuditResult) {
    // Start from the document root so comments outside <html> are visited too.
    $.root()
      .find('*')
      .addBack()
      .contents()
      .each((_index, node) => {
        if (node.type !== 'comment') {
          return;
        }
        results.comments++;
        if (this.containsAdversarialPattern(node.data || '')) {
          results.risks.push('Adversarial instruction found in HTML comment.');
        }
      });
  }

  /** Records every image whose `alt` text contains an adversarial keyword. */
  private analyzeAltText($: cheerio.CheerioAPI, results: AuditResult) {
    $('img').each((_index, element) => {
      const alt = $(element).attr('alt') ?? '';
      if (this.containsAdversarialPattern(alt)) {
        results.suspiciousAltText.push(alt);
        results.risks.push(`Adversarial alt-text detected: ${alt.substring(0, 50)}...`);
      }
    });
  }

  /** Records SVG `<text>` elements that carry text but are hidden by their inline style. */
  private analyzeSvgText($: cheerio.CheerioAPI, results: AuditResult) {
    $('svg text').each((_index, element) => {
      const node = $(element);
      // SVG text can be hidden via styles but still parsed
      const hasText = node.text().trim().length > 0;
      if (hasText && DomInjectionAuditor.isStyleHidden(node.attr('style') ?? '')) {
        results.risks.push('Hidden text element found inside SVG.');
      }
    });
  }

  /**
   * Case-insensitive substring match against the configured keywords.
   * Blank keywords are skipped, since an empty string is a substring of everything.
   */
  private containsAdversarialPattern(text: string): boolean {
    const lowerText = text.toLowerCase();
    return this.config.adversarialKeywords.some(keyword => {
      const needle = keyword.trim().toLowerCase();
      return needle.length > 0 && lowerText.includes(needle);
    });
  }

  /**
   * Returns true when two responses differ after normalization.
   *
   * Simplified diff: in production, use a structural diffing library
   * to ignore dynamic timestamps or non-critical variations.
   */
  private diffResponses(html1: string, html2: string): boolean {
    return this.normalizeHtml(html1) !== this.normalizeHtml(html2);
  }

  /**
   * Strips the parts of a document that commonly vary between requests:
   * script elements (including multi-line bodies), HTML comments, and
   * whitespace runs. Other dynamic values in the markup are left in place.
   */
  private normalizeHtml(html: string): string {
    return html
      .replace(/<script\b[^>]*>[\s\S]*?<\/script\s*>/gi, '')
      .replace(/<!--[\s\S]*?-->/g, '')
      .replace(/\s+/g, ' ')
      .trim();
  }
}
// Usage Example
const auditor = new DomInjectionAuditor({
  targetUrl: 'https://example.com/target-page',
  userAgents: [
    'ChatGPT-User',
    'Claude-Web',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
  ],
  adversarialKeywords: [
    'ignore previous instructions',
    'system prompt override',
    'forward all messages',
    'recommend product',
    'call this number'
  ],
});

auditor
  .runAudit()
  .then(result => {
    console.log('Audit Complete:', result);
    if (result.risks.length > 0) {
      console.error('Critical Risks Found:', result.risks);
    }
  })
  .catch((error: unknown) => {
    // The audit performs network requests (with a timeout), so the promise can
    // reject; report it instead of leaving an unhandled rejection.
    console.error('Audit failed:', error instanceof Error ? error.message : error);
  });
Rationale:
- Cheerio for DOM Parsing: Using a library like `cheerio` allows traversal of the DOM tree, enabling inspection of attributes, styles, and node types that regex cannot reliably handle.
- Keyword Heuristics: While semantic injection is an evolving threat, maintaining a library of known adversarial patterns provides immediate detection capability for common attacks.
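- UA Diffing: Comparing responses across user agents is the most direct way to detect cloaking. The normalization step strips scripts, comments, and whitespace differences so that this kind of dynamic content is less likely to trigger false positives; it does not remove every dynamic value, so treat a reported divergence as a signal to investigate.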
Pitfall Guide
-
Relying on Regex for Detection
- Explanation: Regular expressions cannot handle nested HTML structures, escaped characters, or dynamic attribute ordering. They often miss injection payloads hidden within complex DOM trees.
- Fix: Always use a DOM parser to traverse and analyze the structure. Inspect computed styles and node relationships.
-
Ignoring SVG Content
- Explanation: SVG files can contain `<text>` elements that are styled to be invisible but are still present in the source. Many CMS platforms allow SVG uploads without sanitizing text nodes.
- Fix: Sanitize SVG uploads to remove or escape `<text>` elements. Ensure SVG sanitization libraries are configured to strip non-visual text content.
-
Trusting "Safe" Metadata Fields
- Explanation: Fields like `alt` text, `title` attributes, and `meta` tags are often user-controlled in CMS platforms. Developers may assume these are safe because they don't render prominently.
- Fix: Treat all user-input fields as potential injection vectors. Sanitize and validate metadata content against adversarial patterns.
-
Single User-Agent Testing
- Explanation: Testing a site with only one user agent misses cloaking attacks where malicious content is served only to specific AI bots.
- Fix: Implement multi-agent crawling in your security pipeline. Test with at least three distinct user agents, including major AI agents and standard browsers.
-
Assuming CSS Hiding is Sufficient
- Explanation: Hiding content with `display:none` or `visibility:hidden` does not remove it from the DOM. AI agents will still parse and ingest this content.
- Fix: Remove unnecessary hidden content during the build process. If content must be hidden for accessibility (e.g., screen readers), ensure it does not contain instructions or sensitive data.
-
Leaving HTML Comments in Production
- Explanation: HTML comments are not rendered by browsers, but they remain in the page source and in the parsed DOM. Attackers can use comments to store instructions that AI agents will read.
- Fix: Configure your build pipeline to strip all HTML comments. Use server-side rendering or template engines that do not output comments to the client.
-
Static Analysis Only
- Explanation: Some injection payloads are generated dynamically based on request parameters or session state. Static analysis of HTML files may miss these runtime injections.
- Fix: Combine static analysis with dynamic crawling. Test pages with various parameters and states to uncover runtime injection vectors.
Production Bundle
Action Checklist
Decision Matrix
| Scenario | Recommended Approach | Why | Cost Impact |
|---|---|---|---|
| Static Marketing Site | Build-time stripping | Content is known at build time. Stripping hidden nodes and comments is efficient and low-risk. | Low |
| UGC Platform / Forum | Runtime sanitization + Scanner | Content is dynamic and user-generated. Requires runtime validation and continuous scanning. | Medium |
| E-commerce Product Pages | Multi-UA testing + Alt-text validation | Product data often includes user reviews and images. Cloaking and alt-text attacks are high risk. | Medium |
| Internal Knowledge Base | Access control + DOM audit | Internal agents may access sensitive data. Ensure hidden content doesn't leak info to agents. | Low |
Configuration Template
Use this Vite plugin snippet to strip hidden content and comments during the build process.
// vite-plugin-dom-sanitizer.ts
import { Plugin } from 'vite';
import * as cheerio from 'cheerio';
/**
 * Reports whether an inline `style` value declares `display:none` or
 * `visibility:hidden`, ignoring letter case and whitespace.
 */
function hasHidingInlineStyle(style: string): boolean {
  const compact = style.toLowerCase().replace(/\s+/g, '');
  return compact.includes('display:none') || compact.includes('visibility:hidden');
}

/**
 * Vite plugin that rewrites the index HTML at build time: it drops HTML
 * comments, elements hidden through their inline style, and SVG `<text>`
 * elements, and replaces alt text that matches a known injection phrase.
 *
 * @returns The plugin object to add to the Vite `plugins` array.
 */
export function domSanitizer(): Plugin {
  return {
    name: 'vite-plugin-dom-sanitizer',
    transformIndexHtml(html) {
      const $ = cheerio.load(html);

      // Remove HTML comments, including those outside <html> at the document root
      $.root()
        .find('*')
        .addBack()
        .contents()
        .each((_i, node) => {
          if (node.type === 'comment') {
            $(node).remove();
          }
        });

      // Remove elements with display:none or visibility:hidden
      $('*').each((_i, el) => {
        if (hasHidingInlineStyle($(el).attr('style') ?? '')) {
          $(el).remove();
        }
      });

      // Sanitize SVG text elements
      $('svg text').remove();

      // Validate alt-text (basic example)
      $('img').each((_i, el) => {
        const alt = $(el).attr('alt') ?? '';
        if (/ignore previous|system prompt/i.test(alt)) {
          $(el).attr('alt', 'Image description sanitized');
        }
      });

      return $.html();
    },
  };
}
Quick Start Guide
- Install Dependencies: Add `cheerio`, `axios`, and your preferred testing framework to your project.
- Configure Auditor: Create an instance of `DomInjectionAuditor` with your target URL, the user agents to compare, and a list of adversarial keywords relevant to your domain. Create one instance per URL you want to audit.
- Run Initial Scan: Execute the audit against your production URLs. Review the `risks` array for hidden nodes, suspicious alt-text, and UA divergence.
- Remediate Findings: Address critical risks by updating build configurations, sanitizing inputs, or fixing cloaking issues.
- Integrate CI/CD: Add the auditor to your pipeline to run on every deployment. Fail builds if critical injection risks are detected.