-
Notifications
You must be signed in to change notification settings - Fork 1.2k
Parse HTML properly in Scaladoc #25681
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,34 @@ | ||
| package tests.sanitization | ||
|
|
||
| /** <script>alert('hello')</script> */ | ||
| class Script | ||
|
|
||
| /** < script >alert('hello')</script | ||
| > */ | ||
| class ScriptWithSpaces | ||
|
|
||
| /** <script>alert('hello')</script> */ | ||
| class FakeSafeScript | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. the character in the comment that your browser probably doesn't render very well is the same char we internally use to mark "safe" HTML tags (why do we have such a char? because IMHO the entire scaladoc parsing stack is too hacky, and should be rewritten to be a single-pass parser... but that's for another day) |
||
|
|
||
| /** Example < Second <: Third <= Fourth */ | ||
| class NotATag | ||
|
|
||
| /** Example < Second >: Third */ | ||
| class NotATagButHasGreaterThan | ||
|
|
||
| /** a<b */ | ||
| class NotATagButNoSpaces | ||
|
|
||
| /** | ||
| * test | ||
| * ``` | ||
| * example = false | ||
| * ``` | ||
| * <script>alert('hello')</script> | ||
| */ | ||
| class TagOutsideCode | ||
|
|
||
| /** | ||
| * see [[<:<]], or [[>:>]] | ||
| */ | ||
| class LinkToTagLike | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -12,12 +12,6 @@ object Regexes { | |
| val CleanCommentLine = | ||
| new Regex("""(?:\s*\*\s?\s?)?(.*)""") | ||
|
|
||
| /** Dangerous HTML tags that should be replaced by something safer, | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This was a dangerous use of dangerous, pun intended. Those tags were not the kind of tags one should call dangerous. |
||
| * such as wiki syntax, or that should be dropped | ||
| */ | ||
| val DangerousTags = | ||
| new Regex("""<(/?(div|ol|ul|li|h[1-6]|p))( [^>]*)?/?>|<!--.*-->""") | ||
|
|
||
| /** Javadoc tags that should be replaced by something useful, such as wiki | ||
| * syntax, or that should be dropped. */ | ||
| val JavadocTags = | ||
|
|
@@ -36,27 +30,7 @@ object Regexes { | |
| } | ||
| } | ||
|
|
||
| /** Maps a dangerous HTML tag to a safe wiki replacement, or an empty string | ||
| * if it cannot be salvaged. */ | ||
| def htmlReplacement(mtch: Regex.Match): String = mtch.group(1) match { | ||
| case "p" | "div" => "\n\n" | ||
| case "h1" => "\n= " | ||
| case "/h1" => " =\n" | ||
| case "h2" => "\n== " | ||
| case "/h2" => " ==\n" | ||
| case "h3" => "\n=== " | ||
| case "/h3" => " ===\n" | ||
| case "h4" | "h5" | "h6" => "\n==== " | ||
| case "/h4" | "/h5" | "/h6" => " ====\n" | ||
| case "li" => "\n * - " | ||
| case _ => "" | ||
| } | ||
|
|
||
| /** Safe HTML tags that can be kept. */ | ||
| val SafeTags = | ||
| new Regex("""((&\w+;)|(&#\d+;)|(</?(abbr|acronym|address|area|a|bdo|big|blockquote|br|button|b|caption|cite|code|col|colgroup|dd|del|dfn|em|fieldset|form|hr|img|input|ins|i|kbd|label|legend|link|map|object|optgroup|option|param|pre|q|samp|select|small|span|strong|sub|sup|table|tbody|td|textarea|tfoot|th|thead|tr|tt|var)( [^>]*)?/?>))""") | ||
|
|
||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I removed "safe" tags that are related to user input (why would anyone do this?) or considered terminally deprecated (e.g., |
||
| val safeTagMarker = '\u000E' | ||
| val safeTagMarker = '\u000E' // IMPORTANT: Only change if you've updated the sanitization tests to match | ||
| val endOfLine = '\u000A' | ||
| val endOfText = '\u0003' | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -13,21 +13,8 @@ import java.nio.file.Files | |
| */ | ||
| class CaretTest extends BaseHtmlTest: | ||
|
|
||
| private def docHtml(cls: String, syntax: String = "markdown"): String = | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. moved one level up so the logic can be shared with the new tests |
||
| val dest = Files.createTempDirectory("test-doc").toFile | ||
| try | ||
| val args = Scaladoc.Args( | ||
| name = projectName, | ||
| tastyFiles = tastyFiles("i25517"), | ||
| output = dest, | ||
| projectVersion = Some(projectVersion), | ||
| defaultSyntax = List(syntax), | ||
| ) | ||
| Scaladoc.run(args)(using testContext) | ||
| val path = dest.toPath.resolve(s"tests/i25517/$cls.html") | ||
| val doc = org.jsoup.Jsoup.parse(dotty.tools.scaladoc.util.IO.read(path)) | ||
| doc.select(".doc").html() | ||
| finally dotty.tools.scaladoc.util.IO.delete(dest) | ||
| private def docHtml(cls: String, syntax: String): String = | ||
| super.docHtml("i25517", cls, syntax) | ||
|
|
||
| @Test def supTagsInMarkdown(): Unit = | ||
| val html = docHtml("SupDefault", "markdown") | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,46 @@ | ||
| package dotty.tools.scaladoc | ||
| package tasty | ||
| package comments | ||
|
|
||
| import org.junit.Test | ||
| import org.junit.Assert.* | ||
|
|
||
| import java.nio.file.Files | ||
|
|
||
| class TagSanitizationTest extends BaseHtmlTest: | ||
| private def docHtml(cls: String): String = | ||
| super.docHtml("sanitization", cls, "markdown") | ||
|
|
||
| @Test def scriptTag(): Unit = | ||
| val html = docHtml("Script") | ||
| assertEquals("<p>alert('hello')</p>", html) | ||
|
|
||
| @Test def scriptTagWithSpaces(): Unit = | ||
| val html = docHtml("ScriptWithSpaces") | ||
| assertEquals("<p>< script >alert('hello')</p>", html) | ||
|
|
||
| @Test def scriptTagWithSafeChar(): Unit = | ||
| val html = docHtml("FakeSafeScript") | ||
| assertEquals("<p>alert('hello')</p>", html) | ||
|
|
||
| @Test def notATag(): Unit = | ||
| val html = docHtml("NotATag") | ||
| assertEquals("<p>Example < Second <: Third <= Fourth</p>", html) | ||
|
|
||
| @Test def notATagButHasGreaterThan(): Unit = | ||
| val html = docHtml("NotATagButHasGreaterThan") | ||
| assertEquals("<p>Example < Second >: Third</p>", html) | ||
|
|
||
| @Test def notATagButNoSpaces(): Unit = | ||
| val html = docHtml("NotATagButNoSpaces") | ||
| assertEquals("<p>a<b</p>", html) | ||
|
|
||
| @Test def tagOutsideCode(): Unit = | ||
| val html = docHtml("TagOutsideCode") | ||
| assertFalse(html, html.contains("<script>")) | ||
| assertFalse(html, html.contains("</script>")) | ||
|
|
||
| @Test def linkToTagLike(): Unit = | ||
| val html = docHtml("LinkToTagLike") | ||
| // ensure we don't treat the text between <:< and >:> as a tag content | ||
| assertTrue(html, html.contains("or")) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
no reason not to do this IMHO
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There is a very strong reason: you don't want your IDE to recompile every bootstrapped project from scratch every time you change one line in the compiler!
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't follow. If you deliberately enable this off-by-default option, why not recompile whatever's necessary in that case?
If there's an issue we should at the very least put it on the same plan as
enableBspAllProjectsand make it a variable that's easy to toggle without editing the build file.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ah never mind, I saw the removal of
falseand panicked. UsingenableBspAllProjectsseems accurate.