Skip to content

Commit fe24ba5

Browse files
committed
fix: preserve angle brackets inside fenced code blocks during sanitization
bluemonday.StrictPolicy() treats angle-bracket sequences like <int>, <T>, <string> as HTML tags and strips them, even when they appear inside fenced code blocks. This causes code samples containing generics, templates, or any angle-bracket syntax to lose content when read through the MCP tools. The fix adds a protect/restore step around FilterHTMLTags: before bluemonday runs, < and > inside fenced code blocks are replaced with NUL-delimited placeholders that bluemonday passes through unchanged; after bluemonday, the placeholders are restored. The fence detection follows the same pattern used by FilterCodeFenceMetadata, with one improvement: closing fences are accepted when they are at least as long as the opening fence (per CommonMark spec), preventing a longer closing fence from leaking placeholder-protection into subsequent non-code content. Fixes #2202
1 parent 95726ad commit fe24ba5

File tree

2 files changed

+223
-1
lines changed

2 files changed

+223
-1
lines changed

pkg/sanitize/sanitize.go

Lines changed: 75 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,12 @@ var policy *bluemonday.Policy
1212
var policyOnce sync.Once
1313

1414
func Sanitize(input string) string {
15-
return FilterHTMLTags(FilterCodeFenceMetadata(FilterInvisibleCharacters(input)))
15+
s := FilterInvisibleCharacters(input)
16+
s = FilterCodeFenceMetadata(s)
17+
s = protectCodeAngles(s)
18+
s = FilterHTMLTags(s)
19+
s = restoreCodeAngles(s)
20+
return s
1621
}
1722

1823
// FilterInvisibleCharacters removes invisible or control characters that should not appear
@@ -207,3 +212,72 @@ func shouldRemoveRune(r rune) bool {
207212

208213
return false
209214
}
215+
216+
// Placeholders used to shield angle brackets inside code regions from
217+
// the HTML sanitizer. They must not look like HTML tags themselves and
218+
// must be unlikely to appear in real content.
219+
const (
220+
codeLtPlaceholder = "\x00CODELT\x00"
221+
codeGtPlaceholder = "\x00CODEGT\x00"
222+
)
223+
224+
// protectCodeAngles replaces < and > with unique placeholders inside
225+
// fenced code blocks so that bluemonday does not strip them as HTML tags.
226+
// This must run after FilterCodeFenceMetadata (which cleans fence info
227+
// strings) and before FilterHTMLTags.
228+
func protectCodeAngles(input string) string {
229+
if input == "" {
230+
return input
231+
}
232+
233+
lines := strings.Split(input, "\n")
234+
insideFence := false
235+
currentFenceLen := 0
236+
237+
for i, line := range lines {
238+
fenceIdx := strings.Index(line, "```")
239+
240+
if fenceIdx != -1 && !hasNonWhitespace(line[:fenceIdx]) {
241+
fenceEnd := fenceIdx
242+
for fenceEnd < len(line) && line[fenceEnd] == '`' {
243+
fenceEnd++
244+
}
245+
fenceLen := fenceEnd - fenceIdx
246+
247+
if fenceLen >= 3 {
248+
if insideFence {
249+
if currentFenceLen == 0 || fenceLen >= currentFenceLen {
250+
// Valid closing fence (CommonMark: closing fence
251+
// must be at least as long as the opening fence).
252+
insideFence = false
253+
currentFenceLen = 0
254+
continue
255+
}
256+
// Fence length too short — still inside code.
257+
} else {
258+
// Opening fence.
259+
insideFence = true
260+
currentFenceLen = fenceLen
261+
continue
262+
}
263+
}
264+
}
265+
266+
if insideFence {
267+
lines[i] = strings.ReplaceAll(
268+
strings.ReplaceAll(line, "<", codeLtPlaceholder),
269+
">", codeGtPlaceholder,
270+
)
271+
}
272+
}
273+
274+
return strings.Join(lines, "\n")
275+
}
276+
277+
// restoreCodeAngles reverses the placeholder substitution performed by
278+
// protectCodeAngles.
279+
func restoreCodeAngles(input string) string {
280+
s := strings.ReplaceAll(input, codeLtPlaceholder, "<")
281+
s = strings.ReplaceAll(s, codeGtPlaceholder, ">")
282+
return s
283+
}

pkg/sanitize/sanitize_test.go

Lines changed: 148 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -300,3 +300,151 @@ func TestSanitizeRemovesInvisibleCodeFenceMetadata(t *testing.T) {
300300
result := Sanitize(input)
301301
assert.Equal(t, expected, result)
302302
}
303+
304+
func TestProtectCodeAngles(t *testing.T) {
305+
tests := []struct {
306+
name string
307+
input string
308+
expected string
309+
}{
310+
{
311+
name: "empty string",
312+
input: "",
313+
expected: "",
314+
},
315+
{
316+
name: "no code blocks",
317+
input: "Hello <b>World</b>",
318+
expected: "Hello <b>World</b>",
319+
},
320+
{
321+
name: "fenced code block with angle brackets",
322+
input: "```\nvector<int> v;\n```",
323+
expected: "```\nvector" + codeLtPlaceholder + "int" + codeGtPlaceholder + " v;\n```",
324+
},
325+
{
326+
name: "fenced code block with language tag",
327+
input: "```cpp\nmap<string, int> m;\n```",
328+
expected: "```cpp\nmap" + codeLtPlaceholder + "string, int" + codeGtPlaceholder + " m;\n```",
329+
},
330+
{
331+
name: "multiple code blocks",
332+
input: "text\n```\na<b>c\n```\nmiddle\n```\nd<e>f\n```",
333+
expected: "text\n```\na" + codeLtPlaceholder + "b" + codeGtPlaceholder + "c\n```\nmiddle\n```\nd" + codeLtPlaceholder + "e" + codeGtPlaceholder + "f\n```",
334+
},
335+
{
336+
name: "angle brackets outside code blocks preserved as-is",
337+
input: "Use <b>bold</b>\n```\ncode<T>\n```\nMore <em>text</em>",
338+
expected: "Use <b>bold</b>\n```\ncode" + codeLtPlaceholder + "T" + codeGtPlaceholder + "\n```\nMore <em>text</em>",
339+
},
340+
{
341+
name: "four-backtick fence",
342+
input: "````\nfn foo<T>()\n````",
343+
expected: "````\nfn foo" + codeLtPlaceholder + "T" + codeGtPlaceholder + "()\n````",
344+
},
345+
{
346+
name: "shorter fence inside code does not close block",
347+
input: "````\nline<A>\n```\nstill<B>\n````",
348+
expected: "````\nline" + codeLtPlaceholder + "A" + codeGtPlaceholder + "\n```\nstill" + codeLtPlaceholder + "B" + codeGtPlaceholder + "\n````",
349+
},
350+
{
351+
name: "longer closing fence closes the block (CommonMark)",
352+
input: "```\ncode<T>\n````\noutside<b>text</b>",
353+
expected: "```\ncode" + codeLtPlaceholder + "T" + codeGtPlaceholder + "\n````\noutside<b>text</b>",
354+
},
355+
{
356+
name: "unclosed fence protects remaining lines",
357+
input: "```\na<b>c\nmore<d>",
358+
expected: "```\na" + codeLtPlaceholder + "b" + codeGtPlaceholder + "c\nmore" + codeLtPlaceholder + "d" + codeGtPlaceholder,
359+
},
360+
}
361+
362+
for _, tt := range tests {
363+
t.Run(tt.name, func(t *testing.T) {
364+
result := protectCodeAngles(tt.input)
365+
assert.Equal(t, tt.expected, result)
366+
})
367+
}
368+
}
369+
370+
func TestRestoreCodeAngles(t *testing.T) {
371+
tests := []struct {
372+
name string
373+
input string
374+
expected string
375+
}{
376+
{
377+
name: "empty string",
378+
input: "",
379+
expected: "",
380+
},
381+
{
382+
name: "no placeholders",
383+
input: "Hello World",
384+
expected: "Hello World",
385+
},
386+
{
387+
name: "restores lt and gt",
388+
input: "vector" + codeLtPlaceholder + "int" + codeGtPlaceholder,
389+
expected: "vector<int>",
390+
},
391+
}
392+
393+
for _, tt := range tests {
394+
t.Run(tt.name, func(t *testing.T) {
395+
result := restoreCodeAngles(tt.input)
396+
assert.Equal(t, tt.expected, result)
397+
})
398+
}
399+
}
400+
401+
func TestSanitizePreservesAngleBracketsInCodeBlocks(t *testing.T) {
402+
tests := []struct {
403+
name string
404+
input string
405+
expected string
406+
}{
407+
{
408+
name: "issue 2202: template parameter in code block",
409+
input: "```\nlet ptr: mut_raw_ptr<int> = raw_new int;\n```",
410+
expected: "```\nlet ptr: mut_raw_ptr<int> = raw_new int;\n```",
411+
},
412+
{
413+
name: "C++ template in code block",
414+
input: "```cpp\nstd::vector<std::string> items;\n```",
415+
expected: "```cpp\nstd::vector<std::string> items;\n```",
416+
},
417+
{
418+
name: "HTML-like tags outside code blocks still sanitized",
419+
input: "<script>alert(1)</script>\n```\nvector<int> v;\n```",
420+
expected: "\n```\nvector<int> v;\n```",
421+
},
422+
{
423+
name: "allowed HTML outside code blocks preserved",
424+
input: "<b>bold</b>\n```\nfoo<T>()\n```",
425+
expected: "<b>bold</b>\n```\nfoo<T>()\n```",
426+
},
427+
{
428+
name: "multiple angle brackets in code",
429+
input: "```\nMap<String, List<Integer>> m;\n```",
430+
expected: "```\nMap<String, List<Integer>> m;\n```",
431+
},
432+
{
433+
name: "script tags after code block still sanitized",
434+
input: "```\nvector<int> v;\n```\n<script>alert(1)</script>",
435+
expected: "```\nvector<int> v;\n```\n",
436+
},
437+
{
438+
name: "longer closing fence does not leak protection",
439+
input: "```\ncode<T>\n````\n<script>alert(1)</script>",
440+
expected: "```\ncode<T>\n````\n",
441+
},
442+
}
443+
444+
for _, tt := range tests {
445+
t.Run(tt.name, func(t *testing.T) {
446+
result := Sanitize(tt.input)
447+
assert.Equal(t, tt.expected, result)
448+
})
449+
}
450+
}

0 commit comments

Comments
 (0)