From 141a83c492f2f35e47d5f419a1fd33f4a350c587 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Furkan=20K=C3=B6yk=C4=B1ran?= Date: Wed, 11 Mar 2026 17:03:30 +0000 Subject: [PATCH 1/3] fix(mcp): sanitize malformed Unicode in MCP responses Add sanitizeUnicode() function to replace lone surrogates with U+FFFD before JSON serialization. This prevents "invalid high surrogate in string" errors when page content contains malformed Unicode. Uses String.prototype.toWellFormed() on Node 20+, with fallback for Node 18 compatibility. Integrated into response serialization pipeline alongside existing redactText() function. --- .../src/tools/backend/response.ts | 40 ++++++++++++++++++- 1 file changed, 39 insertions(+), 1 deletion(-) diff --git a/packages/playwright-core/src/tools/backend/response.ts b/packages/playwright-core/src/tools/backend/response.ts index 6d6593303bdc3..1f8f786b3217d 100644 --- a/packages/playwright-core/src/tools/backend/response.ts +++ b/packages/playwright-core/src/tools/backend/response.ts @@ -151,7 +151,7 @@ export class Response { const content: (TextContent | ImageContent)[] = [ { type: 'text', - text: redactText(text.join('\n')), + text: sanitizeUnicode(redactText(text.join('\n'))), } ]; @@ -265,6 +265,44 @@ function trimMiddle(text: string, maxLength: number) { return text.slice(0, Math.floor(maxLength / 2)) + '...' + text.slice(- 3 - Math.floor(maxLength / 2)); } +/** + * Sanitizes a string to ensure it only contains well-formed Unicode. + * Replaces lone surrogates (high/low surrogates without their pair) with U+FFFD. + * Uses String.prototype.toWellFormed() when available (Node 20+), otherwise + * falls back to manual surrogate replacement for Node 18 compatibility. 
+ */ +function sanitizeUnicode(text: string): string { + // Use native toWellFormed() when available (Node 20+) + if (typeof text.toWellFormed === 'function') { + return text.toWellFormed(); + } + + // Fallback for Node 18: replace lone surrogates with U+FFFD + let result = ''; + for (let i = 0; i < text.length; i++) { + const code = text.charCodeAt(i); + // Check for high surrogate (0xD800-0xDBFF) + if (code >= 0xD800 && code <= 0xDBFF) { + const next = text.charCodeAt(i + 1); + // If followed by low surrogate, keep both; otherwise replace with U+FFFD + if (next >= 0xDC00 && next <= 0xDFFF) { + result += text[i] + text[i + 1]; + i++; + } else { + result += '\uFFFD'; + } + } + // Check for lone low surrogate (0xDC00-0xDFFF) + else if (code >= 0xDC00 && code <= 0xDFFF) { + result += '\uFFFD'; + } + else { + result += text[i]; + } + } + return result; +} + function parseSections(text: string): Map { const sections = new Map(); const sectionHeaders = text.split(/^### /m).slice(1); // Remove empty first element From 39ec689f85e76b4d531090579811b135c32ea682 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Furkan=20K=C3=B6yk=C4=B1ran?= Date: Wed, 11 Mar 2026 17:03:48 +0000 Subject: [PATCH 2/3] test(mcp): add unicode serialization tests Add comprehensive tests for malformed Unicode handling in MCP responses: - Lone high surrogates - Lone low surrogates - Valid surrogate pairs (emoji) - Mixed CJK content with malformed Unicode - Multiple consecutive lone surrogates - Console messages with lone surrogates All tests verify that MCP responses don't fail with JSON serialization errors when encountering malformed Unicode from page content. 
--- tests/mcp/unicode-serialization.spec.ts | 142 ++++++++++++++++++++++++ 1 file changed, 142 insertions(+) create mode 100644 tests/mcp/unicode-serialization.spec.ts diff --git a/tests/mcp/unicode-serialization.spec.ts b/tests/mcp/unicode-serialization.spec.ts new file mode 100644 index 0000000000000..cce972cbd7115 --- /dev/null +++ b/tests/mcp/unicode-serialization.spec.ts @@ -0,0 +1,142 @@ +/** + * Copyright (c) Microsoft Corporation. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import { test, expect } from './fixtures'; + +test.describe('unicode serialization', () => { + // Use --no-sandbox for running as root in CI environments + test.use({ mcpArgs: ['--no-sandbox'] }); + + test('handles lone high surrogate in page content', async ({ client, server }) => { + // Create a page with a lone high surrogate (0xD800) + // This would normally cause JSON serialization to fail + await server.setRoute('/malformed.html', (req, res) => { + res.setHeader('Content-Type', 'text/html'); + res.end(`Text with ${String.fromCharCode(0xD800)} lone surrogate`); + }); + + // The key test: this should not throw a JSON serialization error + const result = await client.callTool({ + name: 'browser_navigate', + arguments: { url: server.PREFIX + '/malformed.html' }, + }); + + // Should have successfully navigated without JSON serialization error + expect(result.content[0].text).toContain('Page URL:'); + expect(result.content[0].text).toContain('lone surrogate'); + }); + + test('handles lone low surrogate in page content', async ({ client, server }) => { + // Create a page with a lone low surrogate (0xDC00) + await server.setRoute('/malformed2.html', (req, res) => { + res.setHeader('Content-Type', 'text/html'); + res.end(`Text with ${String.fromCharCode(0xDC00)} lone low surrogate`); + }); + + // Should not throw JSON serialization error + const result = await client.callTool({ + name: 'browser_navigate', + arguments: { url: server.PREFIX + '/malformed2.html' }, + }); + + expect(result.content[0].text).toContain('Page URL:'); + expect(result.content[0].text).toContain('lone low'); + }); + + test('preserves valid surrogate pairs (emoji)', async ({ client, server }) => { + // Test with valid emoji: πŸ’€ (U+1F480) = high surrogate 0xD83D + low surrogate 0xDC80 + await server.setRoute('/valid.html', (req, res) => { + res.setHeader('Content-Type', 'text/html'); + res.end(`Valid emoji: πŸ’€ skull`); + }); + + // Should not throw JSON serialization error and preserve 
emoji content + const result = await client.callTool({ + name: 'browser_navigate', + arguments: { url: server.PREFIX + '/valid.html' }, + }); + + expect(result.content[0].text).toContain('Page URL:'); + expect(result.content[0].text).toContain('emoji'); + expect(result.content[0].text).toContain('skull'); + }); + + test('handles CJK mixed content with malformed unicode', async ({ client, server }) => { + // Test with mixed CJK content and a lone surrogate + await server.setRoute('/mixed.html', (req, res) => { + res.setHeader('Content-Type', 'text/html'); + const html = ` +

日本語

+

中文 ${String.fromCharCode(0xD800)} mixed

+

한국어

+

Emoji: 😀

+ `; + res.end(html); + }); + + // Should not throw JSON serialization error with mixed content + const result = await client.callTool({ + name: 'browser_navigate', + arguments: { url: server.PREFIX + '/mixed.html' }, + }); + + const text = result.content[0].text; + // Should successfully handle mixed CJK content without JSON serialization error + expect(text).toContain('Page URL:'); + expect(text).toContain('mixed'); + }); + + test('handles multiple consecutive lone surrogates', async ({ client, server }) => { + // Test with multiple consecutive lone surrogates + await server.setRoute('/multiple.html', (req, res) => { + res.setHeader('Content-Type', 'text/html'); + res.end(`Before ${String.fromCharCode(0xD800)}${String.fromCharCode(0xDC00)} middle`); + }); + + // Should not throw JSON serialization error + const result = await client.callTool({ + name: 'browser_navigate', + arguments: { url: server.PREFIX + '/multiple.html' }, + }); + + const text = result.content[0].text; + expect(text).toContain('Page URL:'); + expect(text).toContain('Before'); + expect(text).toContain('middle'); + }); + + test('handles lone surrogates in console messages', async ({ startClient, server }) => { + // Test that console messages with lone surrogates are also sanitized + await server.setRoute('/console.html', (req, res) => { + res.setHeader('Content-Type', 'text/html'); + res.end(``); + }); + + const { client } = await startClient({ + args: ['--console-level=debug'], + }); + + const result = await client.callTool({ + name: 'browser_navigate', + arguments: { url: server.PREFIX + '/console.html' }, + }); + + // Console messages should also be sanitized + const text = result.content[0].text; + // Should not throw JSON serialization error + expect(text).toBeDefined(); + }); +}); From 0452b2653457d3dd1b92986411f58784c4e17af5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Furkan=20K=C3=B6yk=C4=B1ran?= Date: Thu, 12 Mar 2026 14:58:03 +0000 Subject: [PATCH 3/3] fix(mcp): simplify unicode 
sanitization and rewrite tests - Remove Node 18 fallback, use toWellFormed() only - Rewrite tests to match MCP test patterns (3 focused tests) - Use server.setContent() for simpler test setup - Reduce test complexity while maintaining coverage --- .../src/tools/backend/response.ts | 34 +----- tests/mcp/unicode-serialization.spec.ts | 108 ++---------------- 2 files changed, 13 insertions(+), 129 deletions(-) diff --git a/packages/playwright-core/src/tools/backend/response.ts b/packages/playwright-core/src/tools/backend/response.ts index 1f8f786b3217d..f0b9266869ba8 100644 --- a/packages/playwright-core/src/tools/backend/response.ts +++ b/packages/playwright-core/src/tools/backend/response.ts @@ -267,40 +267,10 @@ function trimMiddle(text: string, maxLength: number) { /** * Sanitizes a string to ensure it only contains well-formed Unicode. - * Replaces lone surrogates (high/low surrogates without their pair) with U+FFFD. - * Uses String.prototype.toWellFormed() when available (Node 20+), otherwise - * falls back to manual surrogate replacement for Node 18 compatibility. + * Replaces lone surrogates with U+FFFD using String.prototype.toWellFormed(). 
*/ function sanitizeUnicode(text: string): string { - // Use native toWellFormed() when available (Node 20+) - if (typeof text.toWellFormed === 'function') { - return text.toWellFormed(); - } - - // Fallback for Node 18: replace lone surrogates with U+FFFD - let result = ''; - for (let i = 0; i < text.length; i++) { - const code = text.charCodeAt(i); - // Check for high surrogate (0xD800-0xDBFF) - if (code >= 0xD800 && code <= 0xDBFF) { - const next = text.charCodeAt(i + 1); - // If followed by low surrogate, keep both; otherwise replace with U+FFFD - if (next >= 0xDC00 && next <= 0xDFFF) { - result += text[i] + text[i + 1]; - i++; - } else { - result += '\uFFFD'; - } - } - // Check for lone low surrogate (0xDC00-0xDFFF) - else if (code >= 0xDC00 && code <= 0xDFFF) { - result += '\uFFFD'; - } - else { - result += text[i]; - } - } - return result; + return text.toWellFormed(); } function parseSections(text: string): Map { diff --git a/tests/mcp/unicode-serialization.spec.ts b/tests/mcp/unicode-serialization.spec.ts index cce972cbd7115..f5bf825a47294 100644 --- a/tests/mcp/unicode-serialization.spec.ts +++ b/tests/mcp/unicode-serialization.spec.ts @@ -17,126 +17,40 @@ import { test, expect } from './fixtures'; test.describe('unicode serialization', () => { - // Use --no-sandbox for running as root in CI environments test.use({ mcpArgs: ['--no-sandbox'] }); - test('handles lone high surrogate in page content', async ({ client, server }) => { - // Create a page with a lone high surrogate (0xD800) - // This would normally cause JSON serialization to fail - await server.setRoute('/malformed.html', (req, res) => { - res.setHeader('Content-Type', 'text/html'); - res.end(`Text with ${String.fromCharCode(0xD800)} lone surrogate`); - }); + test('handles lone surrogates in page content', async ({ client, server }) => { + server.setContent('/', `Text with ${String.fromCharCode(0xD800)} lone surrogate`, 'text/html'); - // The key test: this should not throw a JSON serialization 
error const result = await client.callTool({ name: 'browser_navigate', - arguments: { url: server.PREFIX + '/malformed.html' }, + arguments: { url: server.PREFIX }, }); - // Should have successfully navigated without JSON serialization error expect(result.content[0].text).toContain('Page URL:'); - expect(result.content[0].text).toContain('lone surrogate'); }); - test('handles lone low surrogate in page content', async ({ client, server }) => { - // Create a page with a lone low surrogate (0xDC00) - await server.setRoute('/malformed2.html', (req, res) => { - res.setHeader('Content-Type', 'text/html'); - res.end(`Text with ${String.fromCharCode(0xDC00)} lone low surrogate`); - }); + test('preserves valid emoji and surrogate pairs', async ({ client, server }) => { + server.setContent('/', 'Valid emoji: πŸ’€ skull and text', 'text/html'); - // Should not throw JSON serialization error const result = await client.callTool({ name: 'browser_navigate', - arguments: { url: server.PREFIX + '/malformed2.html' }, - }); - - expect(result.content[0].text).toContain('Page URL:'); - expect(result.content[0].text).toContain('lone low'); - }); - - test('preserves valid surrogate pairs (emoji)', async ({ client, server }) => { - // Test with valid emoji: πŸ’€ (U+1F480) = high surrogate 0xD83D + low surrogate 0xDC80 - await server.setRoute('/valid.html', (req, res) => { - res.setHeader('Content-Type', 'text/html'); - res.end(`Valid emoji: πŸ’€ skull`); + arguments: { url: server.PREFIX }, }); - // Should not throw JSON serialization error and preserve emoji content - const result = await client.callTool({ - name: 'browser_navigate', - arguments: { url: server.PREFIX + '/valid.html' }, - }); - - expect(result.content[0].text).toContain('Page URL:'); expect(result.content[0].text).toContain('emoji'); - expect(result.content[0].text).toContain('skull'); - }); - - test('handles CJK mixed content with malformed unicode', async ({ client, server }) => { - // Test with mixed CJK content and 
a lone surrogate - await server.setRoute('/mixed.html', (req, res) => { - res.setHeader('Content-Type', 'text/html'); - const html = ` -

日本語

-

中文 ${String.fromCharCode(0xD800)} mixed

-

한국어

-

Emoji: 😀

- `; - res.end(html); - }); - - // Should not throw JSON serialization error with mixed content - const result = await client.callTool({ - name: 'browser_navigate', - arguments: { url: server.PREFIX + '/mixed.html' }, - }); - - const text = result.content[0].text; - // Should successfully handle mixed CJK content without JSON serialization error - expect(text).toContain('Page URL:'); - expect(text).toContain('mixed'); }); - test('handles multiple consecutive lone surrogates', async ({ client, server }) => { - // Test with multiple consecutive lone surrogates - await server.setRoute('/multiple.html', (req, res) => { - res.setHeader('Content-Type', 'text/html'); - res.end(`Before ${String.fromCharCode(0xD800)}${String.fromCharCode(0xDC00)} middle`); - }); + test('handles console messages with lone surrogates', async ({ startClient, server }) => { + server.setContent('/', ``, 'text/html'); - // Should not throw JSON serialization error - const result = await client.callTool({ - name: 'browser_navigate', - arguments: { url: server.PREFIX + '/multiple.html' }, - }); - - const text = result.content[0].text; - expect(text).toContain('Page URL:'); - expect(text).toContain('Before'); - expect(text).toContain('middle'); - }); - - test('handles lone surrogates in console messages', async ({ startClient, server }) => { - // Test that console messages with lone surrogates are also sanitized - await server.setRoute('/console.html', (req, res) => { - res.setHeader('Content-Type', 'text/html'); - res.end(``); - }); - - const { client } = await startClient({ - args: ['--console-level=debug'], - }); + const { client } = await startClient({ args: ['--console-level=debug'] }); const result = await client.callTool({ name: 'browser_navigate', - arguments: { url: server.PREFIX + '/console.html' }, + arguments: { url: server.PREFIX }, }); - // Console messages should also be sanitized - const text = result.content[0].text; - // Should not throw JSON serialization error - 
expect(text).toBeDefined(); + expect(result.content[0].text).toBeDefined(); }); });