From ee7df7ba8c5e6a4b32b0c4048d2b535d8df3cbe9 Mon Sep 17 00:00:00 2001 From: Alexander Scheel Date: Sat, 7 Dec 2019 14:49:04 -0500 Subject: [PATCH] Markdown: Sanitizer Configuration (#9075) * Support custom sanitization policy Allowing the gitea administrator to configure sanitization policy allows them to couple external renderers and custom templates to support more markup. In particular, the `pandoc` renderer allows generating KaTeX annotations, wrapping them in `<span>` elements with class `math` and either `inline` or `display` (depending on whether or not inline or block mode was requested). This iteration gives the administrator whitelisting powers; carefully crafted regexes will thus let through only the desired attributes necessary to support their custom markup. Resolves: #9054 Signed-off-by: Alexander Scheel * Document new sanitization configuration - Adds basic documentation to app.ini.sample, - Adds an example to the Configuration Cheat Sheet, and - Adds extended information to External Renderers section. Signed-off-by: Alexander Scheel * Drop extraneous length check in newMarkupSanitizer(...) Signed-off-by: Alexander Scheel * Fix plural ELEMENT and ALLOW_ATTR in docs These were left over from their initial names. Make them singular to conform with the current expectations. 
Signed-off-by: Alexander Scheel --- custom/conf/app.ini.sample | 6 + .../doc/advanced/config-cheat-sheet.en-us.md | 18 +++ .../doc/advanced/external-renderers.en-us.md | 18 +++ modules/markup/sanitizer.go | 9 ++ modules/setting/markup.go | 133 ++++++++++++++---- 5 files changed, 155 insertions(+), 29 deletions(-) diff --git a/custom/conf/app.ini.sample b/custom/conf/app.ini.sample index 8d11cfc293e1..050a0db73012 100644 --- a/custom/conf/app.ini.sample +++ b/custom/conf/app.ini.sample @@ -877,6 +877,12 @@ SHOW_FOOTER_VERSION = true ; Show template execution time in the footer SHOW_FOOTER_TEMPLATE_LOAD_TIME = true +[markup.sanitizer] +; The following keys can be used multiple times to define sanitization policy rules. +;ELEMENT = span +;ALLOW_ATTR = class +;REGEXP = ^(info|warning|error)$ + [markup.asciidoc] ENABLED = false ; List of file extensions that should be rendered by an external command diff --git a/docs/content/doc/advanced/config-cheat-sheet.en-us.md b/docs/content/doc/advanced/config-cheat-sheet.en-us.md index 9f02e888cf39..0d7a641b1900 100644 --- a/docs/content/doc/advanced/config-cheat-sheet.en-us.md +++ b/docs/content/doc/advanced/config-cheat-sheet.en-us.md @@ -578,6 +578,24 @@ Two special environment variables are passed to the render command: - `GITEA_PREFIX_SRC`, which contains the current URL prefix in the `src` path tree. To be used as prefix for links. - `GITEA_PREFIX_RAW`, which contains the current URL prefix in the `raw` path tree. To be used as prefix for image paths. + +Gitea supports customizing the sanitization policy for rendered HTML. The example below will support KaTeX output from pandoc. + +```ini +[markup.sanitizer] +; Pandoc renders TeX segments as <span>s with the "math" class, optionally +; with "inline" or "display" classes depending on context. +ELEMENT = span +ALLOW_ATTR = class +REGEXP = ^\s*((math(\s+|$)|inline(\s+|$)|display(\s+|$)))+ +``` + + - `ELEMENT`: The element this policy applies to. Must be non-empty. 
+ - `ALLOW_ATTR`: The attribute this policy allows. Must be non-empty. + - `REGEXP`: A regex to match the contents of the attribute against. Must be present but may be empty for unconditional whitelisting of this attribute. + +You may redefine `ELEMENT`, `ALLOW_ATTR`, and `REGEXP` multiple times; each time all three are defined is a single policy entry. + ## Time (`time`) - `FORMAT`: Time format to diplay on UI. i.e. RFC1123 or 2006-01-02 15:04:05 diff --git a/docs/content/doc/advanced/external-renderers.en-us.md b/docs/content/doc/advanced/external-renderers.en-us.md index a14f344e63b8..ec1ee63fb68d 100644 --- a/docs/content/doc/advanced/external-renderers.en-us.md +++ b/docs/content/doc/advanced/external-renderers.en-us.md @@ -68,4 +68,22 @@ RENDER_COMMAND = rst2html.py IS_INPUT_FILE = false ``` +If your external markup relies on additional classes and attributes on the generated HTML elements, you might need to enable custom sanitizer policies. Gitea uses the [`bluemonday`](https://godoc.org/github.com/microcosm-cc/bluemonday) package as our HTML sanitizer. The example below will support [KaTeX](https://katex.org/) output from [`pandoc`](https://pandoc.org/). + +```ini +[markup.sanitizer] +; Pandoc renders TeX segments as <span>s with the "math" class, optionally +; with "inline" or "display" classes depending on context. +ELEMENT = span +ALLOW_ATTR = class +REGEXP = ^\s*((math(\s+|$)|inline(\s+|$)|display(\s+|$)))+ + +[markup.markdown] +ENABLED = true +FILE_EXTENSIONS = .md,.markdown +RENDER_COMMAND = pandoc -f markdown -t html --katex +``` + +You may redefine `ELEMENT`, `ALLOW_ATTR`, and `REGEXP` multiple times; each time all three are defined is a single policy entry. All three must be defined, but `REGEXP` may be blank to allow unconditional whitelisting of that attribute. + Once your configuration changes have been made, restart Gitea to have changes take effect. 
diff --git a/modules/markup/sanitizer.go b/modules/markup/sanitizer.go index 0ebb3ff88b03..f7789a9e5659 100644 --- a/modules/markup/sanitizer.go +++ b/modules/markup/sanitizer.go @@ -50,6 +50,15 @@ func ReplaceSanitizer() { // Allow tags for keyboard shortcut styling sanitizer.policy.AllowElements("kbd") + + // Custom keyword markup + for _, rule := range setting.ExternalSanitizerRules { + if rule.Regexp != nil { + sanitizer.policy.AllowAttrs(rule.AllowAttr).Matching(rule.Regexp).OnElements(rule.Element) + } else { + sanitizer.policy.AllowAttrs(rule.AllowAttr).OnElements(rule.Element) + } + } } // Sanitize takes a string that contains a HTML fragment or document and applies policy whitelist. diff --git a/modules/setting/markup.go b/modules/setting/markup.go index 41f3cdd3a199..75e6d651bdde 100644 --- a/modules/setting/markup.go +++ b/modules/setting/markup.go @@ -9,11 +9,14 @@ import ( "strings" "code.gitea.io/gitea/modules/log" + + "gopkg.in/ini.v1" ) // ExternalMarkupParsers represents the external markup parsers var ( - ExternalMarkupParsers []MarkupParser + ExternalMarkupParsers []MarkupParser + ExternalSanitizerRules []MarkupSanitizerRule ) // MarkupParser defines the external parser configured in ini @@ -25,8 +28,15 @@ type MarkupParser struct { IsInputFile bool } +// MarkupSanitizerRule defines the policy for whitelisting attributes on +// certain elements. +type MarkupSanitizerRule struct { + Element string + AllowAttr string + Regexp *regexp.Regexp +} + func newMarkup() { - extensionReg := regexp.MustCompile(`\.\w`) for _, sec := range Cfg.Section("markup").ChildSections() { name := strings.TrimPrefix(sec.Name(), "markup.") if name == "" { @@ -34,33 +44,98 @@ func newMarkup() { continue } - extensions := sec.Key("FILE_EXTENSIONS").Strings(",") - var exts = make([]string, 0, len(extensions)) - for _, extension := range extensions { - if !extensionReg.MatchString(extension) { - log.Warn(sec.Name() + " file extension " + extension + " is invalid. 
Extension ignored") - } else { - exts = append(exts, extension) - } + if name == "sanitizer" { + newMarkupSanitizer(name, sec) + } else { + newMarkupRenderer(name, sec) } - - if len(exts) == 0 { - log.Warn(sec.Name() + " file extension is empty, markup " + name + " ignored") - continue - } - - command := sec.Key("RENDER_COMMAND").MustString("") - if command == "" { - log.Warn(" RENDER_COMMAND is empty, markup " + name + " ignored") - continue - } - - ExternalMarkupParsers = append(ExternalMarkupParsers, MarkupParser{ - Enabled: sec.Key("ENABLED").MustBool(false), - MarkupName: name, - FileExtensions: exts, - Command: command, - IsInputFile: sec.Key("IS_INPUT_FILE").MustBool(false), - }) } } + +func newMarkupSanitizer(name string, sec *ini.Section) { + haveElement := sec.HasKey("ELEMENT") + haveAttr := sec.HasKey("ALLOW_ATTR") + haveRegexp := sec.HasKey("REGEXP") + + if !haveElement && !haveAttr && !haveRegexp { + log.Warn("Skipping empty section: markup.%s.", name) + return + } + + if !haveElement || !haveAttr || !haveRegexp { + log.Error("Missing required keys from markup.%s. Must have all three of ELEMENT, ALLOW_ATTR, and REGEXP defined!", name) + return + } + + elements := sec.Key("ELEMENT").ValueWithShadows() + allowAttrs := sec.Key("ALLOW_ATTR").ValueWithShadows() + regexps := sec.Key("REGEXP").ValueWithShadows() + + if len(elements) != len(allowAttrs) || + len(elements) != len(regexps) { + log.Error("All three keys in markup.%s (ELEMENT, ALLOW_ATTR, REGEXP) must be defined the same number of times! 
Got %d, %d, and %d respectively.", name, len(elements), len(allowAttrs), len(regexps)) + return + } + + ExternalSanitizerRules = make([]MarkupSanitizerRule, 0, len(elements)) + + for index, pattern := range regexps { + if pattern == "" { + rule := MarkupSanitizerRule{ + Element: elements[index], + AllowAttr: allowAttrs[index], + Regexp: nil, + } + ExternalSanitizerRules = append(ExternalSanitizerRules, rule) + continue + } + + // Validate when parsing the config that this is a valid regular + // expression. Then we can use regexp.MustCompile(...) later. + compiled, err := regexp.Compile(pattern) + if err != nil { + log.Error("In module.%s: REGEXP at definition %d failed to compile: %v", name, index+1, err) + continue + } + + rule := MarkupSanitizerRule{ + Element: elements[index], + AllowAttr: allowAttrs[index], + Regexp: compiled, + } + ExternalSanitizerRules = append(ExternalSanitizerRules, rule) + } +} + +func newMarkupRenderer(name string, sec *ini.Section) { + extensionReg := regexp.MustCompile(`\.\w`) + + extensions := sec.Key("FILE_EXTENSIONS").Strings(",") + var exts = make([]string, 0, len(extensions)) + for _, extension := range extensions { + if !extensionReg.MatchString(extension) { + log.Warn(sec.Name() + " file extension " + extension + " is invalid. Extension ignored") + } else { + exts = append(exts, extension) + } + } + + if len(exts) == 0 { + log.Warn(sec.Name() + " file extension is empty, markup " + name + " ignored") + return + } + + command := sec.Key("RENDER_COMMAND").MustString("") + if command == "" { + log.Warn(" RENDER_COMMAND is empty, markup " + name + " ignored") + return + } + + ExternalMarkupParsers = append(ExternalMarkupParsers, MarkupParser{ + Enabled: sec.Key("ENABLED").MustBool(false), + MarkupName: name, + FileExtensions: exts, + Command: command, + IsInputFile: sec.Key("IS_INPUT_FILE").MustBool(false), + }) +}