More robust

This commit is contained in:
julien
2026-03-27 20:14:11 +01:00
parent 75ec966435
commit 68c547ddcb
25 changed files with 474 additions and 224 deletions

View File

@@ -10,11 +10,6 @@ class MarkdownService
'strong', 'em', 'a', 'img', 'hr', 'br',
];
private const ALLOWED_ATTRS = [
'a' => ['href', 'title', 'rel', 'target'],
'img' => ['src', 'alt', 'title', 'loading', 'decoding'],
];
public static function compile(string $markdown, Media $media): string
{
$markdown = trim($markdown);
@@ -34,109 +29,190 @@ class MarkdownService
return $html;
}
// Single DOM pass: sanitizes tags/attributes and resolves media: references.
// Allow-list reconstruction: the descendants of a disallowed tag are
// reprocessed recursively before being reinserted.
//
// NOTE(review): this span appears to show two variants of the method merged
// together (a diff rendered without +/- markers), so the braces do not balance
// as written. One variant parses into $dom and cleans it in place through
// processNode(); the other parses into $source and copies accepted nodes into
// a separate $clean document through appendSanitizedChildren().
private static function sanitizeAndResolve(string $html, Media $media): string
{
// In-place variant: parse into $dom. Internal libxml error collection is
// switched on here and not switched back within these lines.
$dom = new DOMDocument('1.0', 'UTF-8');
libxml_use_internal_errors(true);
$dom->loadHTML('<?xml encoding="UTF-8"><body>' . $html . '</body>', LIBXML_HTML_NOIMPLIED | LIBXML_HTML_NODEFDTD);
libxml_clear_errors();
// Rebuild variant: $source receives the parsed input, $clean receives the
// sanitized copy under a freshly created <body> element.
$source = new DOMDocument('1.0', 'UTF-8');
$clean = new DOMDocument('1.0', 'UTF-8');
$cleanBody = $clean->createElement('body');
$clean->appendChild($cleanBody);
$body = $dom->getElementsByTagName('body')->item(0);
if (!$body instanceof DOMElement) {
// Remember the caller's libxml error mode so it can be restored after parsing.
$previousUseInternalErrors = libxml_use_internal_errors(true);
// The input is wrapped in <body> and prefixed with an XML encoding declaration.
$source->loadHTML('<?xml encoding="UTF-8"><body>' . $html . '</body>', LIBXML_HTML_NOIMPLIED | LIBXML_HTML_NODEFDTD);
libxml_clear_errors();
libxml_use_internal_errors($previousUseInternalErrors);
$sourceBody = $source->getElementsByTagName('body')->item(0);
// No <body> element found: return an empty string.
if (!$sourceBody instanceof DOMElement) {
return '';
}
self::processNode($body, $media);
self::appendSanitizedChildren($sourceBody, $cleanBody, $clean, $media);
// Serialize each child of the body separately so the wrapping <body> tag
// is not part of the returned markup.
$out = '';
foreach ($body->childNodes as $child) {
$out .= $dom->saveHTML($child);
for ($i = 0; $i < $cleanBody->childNodes->length; $i++) {
$child = $cleanBody->childNodes->item($i);
if ($child !== null) {
$out .= $clean->saveHTML($child);
}
}
return trim($out);
}
// NOTE(review): two signatures and two bodies appear merged here (diff shown
// without +/- markers). processNode() walks $parent's children and edits the
// tree in place; appendSanitizedChildren() reads the children of $sourceParent
// and appends sanitized copies under $targetParent, creating nodes with $target.
private static function processNode(DOMNode $parent, Media $media): void
private static function appendSanitizedChildren(DOMNode $sourceParent, DOMNode $targetParent, DOMDocument $target, Media $media): void
{
// In-place variant: iterate from the last child to the first, since nodes
// are removed or unwrapped during the walk.
for ($i = $parent->childNodes->length - 1; $i >= 0; $i--) {
$child = $parent->childNodes->item($i);
if ($child === null) {
continue;
// Rebuild variant: copy the child list into a plain array before iterating.
$children = [];
for ($i = 0; $i < $sourceParent->childNodes->length; $i++) {
$child = $sourceParent->childNodes->item($i);
if ($child !== null) {
$children[] = $child;
}
}
foreach ($children as $child) {
// Comments are never kept: removed in place in one variant, simply not
// copied in the other.
if ($child instanceof DOMComment) {
$parent->removeChild($child);
continue;
}
// Text is re-created in the target document from the node value.
if ($child instanceof DOMText) {
$targetParent->appendChild($target->createTextNode($child->nodeValue ?? ''));
continue;
}
// Any remaining node that is not an element is dropped.
if (!$child instanceof DOMElement) {
$parent->removeChild($child);
continue;
}
// In-place variant: a tag outside ALLOWED_TAGS is unwrapped, then its
// attributes are cleaned and its children processed recursively.
if (!in_array($child->tagName, self::ALLOWED_TAGS, true)) {
self::unwrap($child);
continue;
}
self::sanitizeAttributes($child, $media);
// img may have been removed by sanitizeAttributes
if ($child->parentNode !== null) {
self::processNode($child, $media);
}
// Rebuild variant: element handling is delegated to appendSanitizedElement().
self::appendSanitizedElement($child, $targetParent, $target, $media);
}
}
// NOTE(review): two definitions appear merged here (diff shown without +/-
// markers). sanitizeAttributes($element, $media) strips attributes from an
// existing element in place; appendSanitizedElement() builds a fresh element
// in $target and appends it under $targetParent.
private static function sanitizeAttributes(DOMElement $element, Media $media): void
private static function appendSanitizedElement(DOMElement $sourceElement, DOMNode $targetParent, DOMDocument $target, Media $media): void
{
// In-place variant: collect the attribute names missing from the per-tag
// ALLOWED_ATTRS list first, then remove them in a second loop.
$allowed = self::ALLOWED_ATTRS[$element->tagName] ?? [];
$toRemove = [];
foreach ($element->attributes as $attribute) {
if (!in_array($attribute->name, $allowed, true)) {
$toRemove[] = $attribute->name;
}
}
foreach ($toRemove as $name) {
$element->removeAttribute($name);
// Rebuild variant: the tag name is lower-cased before the allow-list lookup.
$tag = strtolower($sourceElement->tagName);
// Disallowed tag: the element itself is not emitted, but its children are
// still sanitized and appended directly under $targetParent.
if (!in_array($tag, self::ALLOWED_TAGS, true)) {
self::appendSanitizedChildren($sourceElement, $targetParent, $target, $media);
return;
}
// In-place variant, links: href is removed unless it starts with http:,
// https:, mailto:, tel: or "/"; kept links get rel="noopener noreferrer",
// and http(s):// links also get target="_blank".
if ($element->tagName === 'a') {
$href = trim($element->getAttribute('href'));
if ($href === '' || !preg_match('~^(https?:|mailto:|tel:|/)~i', $href)) {
$element->removeAttribute('href');
} else {
$element->setAttribute('rel', 'noopener noreferrer');
if (preg_match('~^https?://~i', $href)) {
$element->setAttribute('target', '_blank');
}
// Rebuild variant, images: built by buildSanitizedImage(); a null result
// means nothing is appended. No children are copied for img.
if ($tag === 'img') {
$image = self::buildSanitizedImage($sourceElement, $target, $media);
if ($image !== null) {
$targetParent->appendChild($image);
}
return;
}
// Any other allowed tag: create an empty element of the same name, copy
// the accepted attributes onto it, attach it, then recurse into children.
$cleanElement = $target->createElement($tag);
self::sanitizeAttributes($sourceElement, $cleanElement);
$targetParent->appendChild($cleanElement);
self::appendSanitizedChildren($sourceElement, $cleanElement, $target, $media);
}
// Copies the accepted attributes of $sourceElement onto $targetElement.
// Only <a> targets receive attributes: a validated href (plus rel and,
// for http(s):// links, target) and a cleaned title.
//
// NOTE(review): the lines that use $element and $media look like the img
// handling of an in-place variant merged into this span (diff shown without
// +/- markers); neither variable is a parameter of this signature.
private static function sanitizeAttributes(DOMElement $sourceElement, DOMElement $targetElement): void
{
// Nothing is copied for tags other than <a>.
if ($targetElement->tagName !== 'a') {
return;
}
// sanitizeHref() returns null for a rejected href; the link then has no href.
$href = self::sanitizeHref((string) $sourceElement->getAttribute('href'));
if ($href !== null) {
$targetElement->setAttribute('href', $href);
$targetElement->setAttribute('rel', 'noopener noreferrer');
// Only http:// and https:// links get target="_blank".
if (preg_match('~^https?://~i', $href) === 1) {
$targetElement->setAttribute('target', '_blank');
}
}
// In-place img handling: an image whose src is empty or does not start with
// "media:" is removed from its parent.
if ($element->tagName === 'img') {
$src = trim($element->getAttribute('src'));
if ($src === '' || !str_starts_with($src, 'media:')) {
$element->parentNode?->removeChild($element);
return;
}
// Strip the 6-character "media:" prefix and look the file up; a missing
// file raises a RuntimeException.
$fileName = substr($src, 6);
$item = $media->findByFileName($fileName);
if ($item === null) {
throw new RuntimeException('Une image utilisée dans le Markdown est introuvable.');
}
$element->setAttribute('src', (string) $item['url']);
$element->setAttribute('loading', 'lazy');
$element->setAttribute('decoding', 'async');
// The title is copied only when sanitizeAttributeValue() returns non-null.
$title = self::sanitizeAttributeValue((string) $sourceElement->getAttribute('title'));
if ($title !== null) {
$targetElement->setAttribute('title', $title);
}
}
/**
 * Builds a clean <img> element in $target from a source <img> whose src is a "media:" reference.
 *
 * Only src, loading, decoding, alt and (when non-empty) title are set on the new element;
 * nothing else is copied from the source element.
 *
 * @return DOMElement|null The new element, or null when src is empty, is not a "media:"
 *                         reference, or names an empty or unsafe file name.
 *
 * @throws RuntimeException When the referenced file name is not known to $media.
 */
private static function buildSanitizedImage(DOMElement $sourceElement, DOMDocument $target, Media $media): ?DOMElement
{
    $src = trim((string) $sourceElement->getAttribute('src'));
    if ($src === '' || !str_starts_with($src, 'media:')) {
        return null;
    }

    // Drop the 6-character "media:" prefix.
    $fileName = substr($src, 6);

    // Fail closed: with the /u modifier preg_match() returns false on malformed UTF-8,
    // so only an explicit 0 proves that the name holds no control character.
    if ($fileName === '' || preg_match('/[\x00-\x1F\x7F]/u', $fileName) !== 0) {
        return null;
    }

    $item = $media->findByFileName($fileName);
    if ($item === null) {
        throw new RuntimeException('Une image utilisée dans le Markdown est introuvable.');
    }

    $image = $target->createElement('img');
    $image->setAttribute('src', (string) $item['url']);
    $image->setAttribute('loading', 'lazy');
    $image->setAttribute('decoding', 'async');

    // An alt written in the source wins, even when empty; otherwise fall back to the
    // media item's alt, or to an empty alt when the item provides none.
    $alt = $sourceElement->hasAttribute('alt')
        ? (self::sanitizeAttributeValue((string) $sourceElement->getAttribute('alt'), true) ?? '')
        : (string) ($item['alt'] ?? '');
    $image->setAttribute('alt', $alt);

    $title = self::sanitizeAttributeValue((string) $sourceElement->getAttribute('title'));
    if ($title !== null) {
        $image->setAttribute('title', $title);
    }

    return $image;
}
/**
 * Validates a link target and returns it in decoded, trimmed form.
 *
 * Accepted: http://, https://, mailto: and tel: targets (case-insensitive), plus the
 * relative forms accepted by isSafeRelativeHref().
 *
 * @return string|null The usable href, or null when it is empty, contains a control
 *                     character, is not valid UTF-8, or matches no accepted form.
 */
private static function sanitizeHref(string $href): ?string
{
    $href = trim(html_entity_decode($href, ENT_QUOTES | ENT_HTML5, 'UTF-8'));
    if ($href === '') {
        return null;
    }

    // Fail closed: with the /u modifier preg_match() returns false on malformed UTF-8,
    // so only an explicit 0 proves that the value holds no control character.
    if (preg_match('/[\x00-\x1F\x7F]/u', $href) !== 0) {
        return null;
    }

    if (preg_match('~^(https?://|mailto:|tel:)~i', $href) === 1) {
        return $href;
    }

    return self::isSafeRelativeHref($href) ? $href : null;
}
/**
 * Tells whether $href is a relative reference that stays on the current site.
 *
 * Accepted: "/" alone, an absolute path ("/page"), "./…", "../…", "#fragment" and "?query".
 * Rejected: protocol-relative "//host", its backslash spelling "/\host", and anything else.
 */
private static function isSafeRelativeHref(string $href): bool
{
    if ($href === '/') {
        return true;
    }

    // Browsers treat "\" like "/" when resolving a relative reference against an
    // http(s) page, so "/\host" leaves the site exactly as "//host" does.
    if (str_starts_with($href, '//') || str_starts_with($href, '/\\')) {
        return false;
    }

    return preg_match('~^(?:/[^/]|\./|\.\./|#|\?)~', $href) === 1;
}
/**
 * Normalizes free-text attribute content such as alt or title.
 *
 * HTML entities are decoded, every run of control characters becomes one space,
 * and the result is trimmed.
 *
 * @return string|null The cleaned text; null when it is empty and $allowEmpty is false.
 */
private static function sanitizeAttributeValue(string $value, bool $allowEmpty = false): ?string
{
    $decoded = html_entity_decode($value, ENT_QUOTES | ENT_HTML5, 'UTF-8');
    $withoutControls = preg_replace('/[\x00-\x1F\x7F]+/u', ' ', $decoded);
    $cleaned = trim((string) $withoutControls);

    if ($allowEmpty || $cleaned !== '') {
        return $cleaned;
    }

    return null;
}
private static function normalizeMarkdown(string $markdown): string
{
$markdown = str_replace(["\r\n", "\r"], "\n", $markdown);
@@ -175,18 +251,4 @@ class MarkdownService
return trim(implode("\n", $normalized));
}
/**
 * Replaces $element with its own children: each child is moved, in order, to sit
 * just before $element in the parent, then the emptied $element is removed.
 * Does nothing when $element has no parent.
 */
private static function unwrap(DOMElement $element): void
{
    $container = $element->parentNode;
    if ($container !== null) {
        // Moving a node detaches it from $element, so firstChild advances on its own.
        for ($node = $element->firstChild; $node !== null; $node = $element->firstChild) {
            $container->insertBefore($node, $element);
        }
        $container->removeChild($element);
    }
}
}