From c9c7afda1a80bda7b61ded222163db796132b78f Mon Sep 17 00:00:00 2001 From: KN4CK3R Date: Wed, 23 Jun 2021 23:09:51 +0200 Subject: [PATCH] Add sanitizer rules per renderer (#16110) * Added sanitizer rules per renderer. * Updated documentation. Co-authored-by: techknowlogick --- .../doc/advanced/config-cheat-sheet.en-us.md | 4 + .../doc/advanced/external-renderers.en-us.md | 41 ++++++- modules/markup/csv/csv.go | 10 ++ modules/markup/external/external.go | 7 +- modules/markup/html_test.go | 4 +- modules/markup/markdown/markdown.go | 9 +- modules/markup/orgmode/orgmode.go | 6 + modules/markup/renderer.go | 52 ++++----- modules/markup/sanitizer.go | 88 +++++++++----- modules/setting/markup.go | 107 ++++++++++-------- 10 files changed, 215 insertions(+), 113 deletions(-) diff --git a/docs/content/doc/advanced/config-cheat-sheet.en-us.md b/docs/content/doc/advanced/config-cheat-sheet.en-us.md index a33407d15a73..8f1f9ce42d4b 100644 --- a/docs/content/doc/advanced/config-cheat-sheet.en-us.md +++ b/docs/content/doc/advanced/config-cheat-sheet.en-us.md @@ -907,13 +907,17 @@ Gitea supports customizing the sanitization policy for rendered HTML. The exampl ELEMENT = span ALLOW_ATTR = class REGEXP = ^\s*((math(\s+|$)|inline(\s+|$)|display(\s+|$)))+ +ALLOW_DATA_URI_IMAGES = true ``` - `ELEMENT`: The element this policy applies to. Must be non-empty. - `ALLOW_ATTR`: The attribute this policy allows. Must be non-empty. - `REGEXP`: A regex to match the contents of the attribute against. Must be present but may be empty for unconditional whitelisting of this attribute. + - `ALLOW_DATA_URI_IMAGES`: **false** Allow data uri images (``). Multiple sanitisation rules can be defined by adding unique subsections, e.g. `[markup.sanitizer.TeX-2]`. +To apply a sanitisation rules only for a specify external renderer they must use the renderer name, e.g. `[markup.sanitizer.asciidoc.rule-1]`. +If the rule is defined above the renderer ini section or the name does not match a renderer it is applied to every renderer. ## Time (`time`) diff --git a/docs/content/doc/advanced/external-renderers.en-us.md b/docs/content/doc/advanced/external-renderers.en-us.md index 71fabc529d7b..c0109b801409 100644 --- a/docs/content/doc/advanced/external-renderers.en-us.md +++ b/docs/content/doc/advanced/external-renderers.en-us.md @@ -64,8 +64,8 @@ IS_INPUT_FILE = false [markup.jupyter] ENABLED = true FILE_EXTENSIONS = .ipynb -RENDER_COMMAND = "jupyter nbconvert --stdout --to html --template basic " -IS_INPUT_FILE = true +RENDER_COMMAND = "jupyter nbconvert --stdin --stdout --to html --template basic" +IS_INPUT_FILE = false [markup.restructuredtext] ENABLED = true @@ -90,15 +90,50 @@ FILE_EXTENSIONS = .md,.markdown RENDER_COMMAND = pandoc -f markdown -t html --katex ``` -You must define `ELEMENT`, `ALLOW_ATTR`, and `REGEXP` in each section. +You must define `ELEMENT` and `ALLOW_ATTR` in each section. To define multiple entries, add a unique alphanumeric suffix (e.g., `[markup.sanitizer.1]` and `[markup.sanitizer.something]`). +To apply a sanitisation rules only for a specify external renderer they must use the renderer name, e.g. `[markup.sanitizer.asciidoc.rule-1]`, `[markup.sanitizer..rule-1]`. + +**Note**: If the rule is defined above the renderer ini section or the name does not match a renderer it is applied to every renderer. + Once your configuration changes have been made, restart Gitea to have changes take effect. **Note**: Prior to Gitea 1.12 there was a single `markup.sanitiser` section with keys that were redefined for multiple rules, however, there were significant problems with this method of configuration necessitating configuration through multiple sections. +### Example: Office DOCX + +Display Office DOCX files with [`pandoc`](https://pandoc.org/): +```ini +[markup.docx] +ENABLED = true +FILE_EXTENSIONS = .docx +RENDER_COMMAND = "pandoc --from docx --to html --self-contained --template /path/to/basic.html" + +[markup.sanitizer.docx.img] +ALLOW_DATA_URI_IMAGES = true +``` + +The template file has the following content: +``` +$body$ +``` + +### Example: Jupyter Notebook + +Display Jupyter Notebook files with [`nbconvert`](https://github.com/jupyter/nbconvert): +```ini +[markup.jupyter] +ENABLED = true +FILE_EXTENSIONS = .ipynb +RENDER_COMMAND = "jupyter-nbconvert --stdin --stdout --to html --template basic" + +[markup.sanitizer.jupyter.img] +ALLOW_DATA_URI_IMAGES = true +``` + ## Customizing CSS The external renderer is specified in the .ini in the format `[markup.XXXXX]` and the HTML supplied by your external renderer will be wrapped in a `
` with classes `markup` and `XXXXX`. The `markup` class provides out of the box styling (as does `markdown` if `XXXXX` is `markdown`). Otherwise you can use these classes to specifically target the contents of your rendered HTML. diff --git a/modules/markup/csv/csv.go b/modules/markup/csv/csv.go index 6572b0ee1e81..8a4df8951154 100644 --- a/modules/markup/csv/csv.go +++ b/modules/markup/csv/csv.go @@ -10,6 +10,7 @@ import ( "html" "io" "io/ioutil" + "regexp" "strconv" "code.gitea.io/gitea/modules/csv" @@ -38,6 +39,15 @@ func (Renderer) Extensions() []string { return []string{".csv", ".tsv"} } +// SanitizerRules implements markup.Renderer +func (Renderer) SanitizerRules() []setting.MarkupSanitizerRule { + return []setting.MarkupSanitizerRule{ + {Element: "table", AllowAttr: "class", Regexp: regexp.MustCompile(`data-table`)}, + {Element: "th", AllowAttr: "class", Regexp: regexp.MustCompile(`line-num`)}, + {Element: "td", AllowAttr: "class", Regexp: regexp.MustCompile(`line-num`)}, + } +} + func writeField(w io.Writer, element, class, field string) error { if _, err := io.WriteString(w, "<"); err != nil { return err diff --git a/modules/markup/external/external.go b/modules/markup/external/external.go index 62814c9914b9..c849f505e7ee 100644 --- a/modules/markup/external/external.go +++ b/modules/markup/external/external.go @@ -30,7 +30,7 @@ func RegisterRenderers() { // Renderer implements markup.Renderer for external tools type Renderer struct { - setting.MarkupRenderer + *setting.MarkupRenderer } // Name returns the external tool name @@ -48,6 +48,11 @@ func (p *Renderer) Extensions() []string { return p.FileExtensions } +// SanitizerRules implements markup.Renderer +func (p *Renderer) SanitizerRules() []setting.MarkupSanitizerRule { + return p.MarkupSanitizerRules +} + func envMark(envName string) string { if runtime.GOOS == "windows" { return "%" + envName + "%" diff --git a/modules/markup/html_test.go b/modules/markup/html_test.go index 4cdd5798c8a9..8c3d2b5395c1 100644 --- a/modules/markup/html_test.go +++ b/modules/markup/html_test.go @@ -112,7 +112,7 @@ func TestRender_links(t *testing.T) { defaultCustom := setting.Markdown.CustomURLSchemes setting.Markdown.CustomURLSchemes = []string{"ftp", "magnet"} - ReplaceSanitizer() + InitializeSanitizer() CustomLinkURLSchemes(setting.Markdown.CustomURLSchemes) test( @@ -192,7 +192,7 @@ func TestRender_links(t *testing.T) { // Restore previous settings setting.Markdown.CustomURLSchemes = defaultCustom - ReplaceSanitizer() + InitializeSanitizer() CustomLinkURLSchemes(setting.Markdown.CustomURLSchemes) } diff --git a/modules/markup/markdown/markdown.go b/modules/markup/markdown/markdown.go index 87fae2a23b2f..cac2a180faee 100644 --- a/modules/markup/markdown/markdown.go +++ b/modules/markup/markdown/markdown.go @@ -199,7 +199,7 @@ func actualRender(ctx *markup.RenderContext, input io.Reader, output io.Writer) } _ = lw.Close() }() - buf := markup.SanitizeReader(rd) + buf := markup.SanitizeReader(rd, "") _, err := io.Copy(output, buf) return err } @@ -215,7 +215,7 @@ func render(ctx *markup.RenderContext, input io.Reader, output io.Writer) error if log.IsDebug() { log.Debug("Panic in markdown: %v\n%s", err, string(log.Stack(2))) } - ret := markup.SanitizeReader(input) + ret := markup.SanitizeReader(input, "") _, err = io.Copy(output, ret) if err != nil { log.Error("SanitizeReader failed: %v", err) @@ -249,6 +249,11 @@ func (Renderer) Extensions() []string { return setting.Markdown.FileExtensions } +// SanitizerRules implements markup.Renderer +func (Renderer) SanitizerRules() []setting.MarkupSanitizerRule { + return []setting.MarkupSanitizerRule{} +} + // Render implements markup.Renderer func (Renderer) Render(ctx *markup.RenderContext, input io.Reader, output io.Writer) error { return render(ctx, input, output) diff --git a/modules/markup/orgmode/orgmode.go b/modules/markup/orgmode/orgmode.go index 851fc97f9a81..7e9f1f45c5a7 100644 --- a/modules/markup/orgmode/orgmode.go +++ b/modules/markup/orgmode/orgmode.go @@ -13,6 +13,7 @@ import ( "code.gitea.io/gitea/modules/highlight" "code.gitea.io/gitea/modules/markup" + "code.gitea.io/gitea/modules/setting" "code.gitea.io/gitea/modules/util" "github.com/alecthomas/chroma" @@ -41,6 +42,11 @@ func (Renderer) Extensions() []string { return []string{".org"} } +// SanitizerRules implements markup.Renderer +func (Renderer) SanitizerRules() []setting.MarkupSanitizerRule { + return []setting.MarkupSanitizerRule{} +} + // Render renders orgmode rawbytes to HTML func Render(ctx *markup.RenderContext, input io.Reader, output io.Writer) error { htmlWriter := org.NewHTMLWriter() diff --git a/modules/markup/renderer.go b/modules/markup/renderer.go index d60c8ad71066..04619caee335 100644 --- a/modules/markup/renderer.go +++ b/modules/markup/renderer.go @@ -81,6 +81,7 @@ type Renderer interface { Name() string // markup format name Extensions() []string NeedPostProcess() bool + SanitizerRules() []setting.MarkupSanitizerRule Render(ctx *RenderContext, input io.Reader, output io.Writer) error } @@ -136,37 +137,32 @@ func render(ctx *RenderContext, renderer Renderer, input io.Reader, output io.Wr _ = pw.Close() }() - if renderer.NeedPostProcess() { - pr2, pw2 := io.Pipe() - defer func() { - _ = pr2.Close() - _ = pw2.Close() - }() + pr2, pw2 := io.Pipe() + defer func() { + _ = pr2.Close() + _ = pw2.Close() + }() - wg.Add(1) - go func() { - buf := SanitizeReader(pr2) - _, err = io.Copy(output, buf) - _ = pr2.Close() - wg.Done() - }() + wg.Add(1) + go func() { + buf := SanitizeReader(pr2, renderer.Name()) + _, err = io.Copy(output, buf) + _ = pr2.Close() + wg.Done() + }() - wg.Add(1) - go func() { + wg.Add(1) + go func() { + if renderer.NeedPostProcess() { err = PostProcess(ctx, pr, pw2) - _ = pr.Close() - _ = pw2.Close() - wg.Done() - }() - } else { - wg.Add(1) - go func() { - buf := SanitizeReader(pr) - _, err = io.Copy(output, buf) - _ = pr.Close() - wg.Done() - }() - } + } else { + _, err = io.Copy(pw2, pr) + } + _ = pr.Close() + _ = pw2.Close() + wg.Done() + }() + if err1 := renderer.Render(ctx, input, pw); err1 != nil { return err1 } diff --git a/modules/markup/sanitizer.go b/modules/markup/sanitizer.go index 5611bd06ad00..9342d65de581 100644 --- a/modules/markup/sanitizer.go +++ b/modules/markup/sanitizer.go @@ -19,8 +19,9 @@ import ( // Sanitizer is a protection wrapper of *bluemonday.Policy which does not allow // any modification to the underlying policies once it's been created. type Sanitizer struct { - policy *bluemonday.Policy - init sync.Once + defaultPolicy *bluemonday.Policy + rendererPolicies map[string]*bluemonday.Policy + init sync.Once } var sanitizer = &Sanitizer{} @@ -30,47 +31,57 @@ var sanitizer = &Sanitizer{} // entire application lifecycle. func NewSanitizer() { sanitizer.init.Do(func() { - ReplaceSanitizer() + InitializeSanitizer() }) } -// ReplaceSanitizer replaces the current sanitizer to account for changes in settings -func ReplaceSanitizer() { - sanitizer.policy = bluemonday.UGCPolicy() +// InitializeSanitizer (re)initializes the current sanitizer to account for changes in settings +func InitializeSanitizer() { + sanitizer.rendererPolicies = map[string]*bluemonday.Policy{} + sanitizer.defaultPolicy = createDefaultPolicy() + + for name, renderer := range renderers { + sanitizerRules := renderer.SanitizerRules() + if len(sanitizerRules) > 0 { + policy := createDefaultPolicy() + addSanitizerRules(policy, sanitizerRules) + sanitizer.rendererPolicies[name] = policy + } + } +} + +func createDefaultPolicy() *bluemonday.Policy { + policy := bluemonday.UGCPolicy() // For Chroma markdown plugin - sanitizer.policy.AllowAttrs("class").Matching(regexp.MustCompile(`^is-loading$`)).OnElements("pre") - sanitizer.policy.AllowAttrs("class").Matching(regexp.MustCompile(`^(chroma )?language-[\w-]+$`)).OnElements("code") + policy.AllowAttrs("class").Matching(regexp.MustCompile(`^is-loading$`)).OnElements("pre") + policy.AllowAttrs("class").Matching(regexp.MustCompile(`^(chroma )?language-[\w-]+$`)).OnElements("code") // Checkboxes - sanitizer.policy.AllowAttrs("type").Matching(regexp.MustCompile(`^checkbox$`)).OnElements("input") - sanitizer.policy.AllowAttrs("checked", "disabled", "data-source-position").OnElements("input") + policy.AllowAttrs("type").Matching(regexp.MustCompile(`^checkbox$`)).OnElements("input") + policy.AllowAttrs("checked", "disabled", "data-source-position").OnElements("input") // Custom URL-Schemes if len(setting.Markdown.CustomURLSchemes) > 0 { - sanitizer.policy.AllowURLSchemes(setting.Markdown.CustomURLSchemes...) + policy.AllowURLSchemes(setting.Markdown.CustomURLSchemes...) } // Allow classes for anchors - sanitizer.policy.AllowAttrs("class").Matching(regexp.MustCompile(`ref-issue`)).OnElements("a") + policy.AllowAttrs("class").Matching(regexp.MustCompile(`ref-issue`)).OnElements("a") // Allow classes for task lists - sanitizer.policy.AllowAttrs("class").Matching(regexp.MustCompile(`task-list-item`)).OnElements("li") + policy.AllowAttrs("class").Matching(regexp.MustCompile(`task-list-item`)).OnElements("li") // Allow icons - sanitizer.policy.AllowAttrs("class").Matching(regexp.MustCompile(`^icon(\s+[\p{L}\p{N}_-]+)+$`)).OnElements("i") + policy.AllowAttrs("class").Matching(regexp.MustCompile(`^icon(\s+[\p{L}\p{N}_-]+)+$`)).OnElements("i") // Allow unlabelled labels - sanitizer.policy.AllowNoAttrs().OnElements("label") + policy.AllowNoAttrs().OnElements("label") // Allow classes for emojis - sanitizer.policy.AllowAttrs("class").Matching(regexp.MustCompile(`emoji`)).OnElements("img") + policy.AllowAttrs("class").Matching(regexp.MustCompile(`emoji`)).OnElements("img") // Allow icons, emojis, chroma syntax and keyword markup on span - sanitizer.policy.AllowAttrs("class").Matching(regexp.MustCompile(`^((icon(\s+[\p{L}\p{N}_-]+)+)|(emoji))$|^([a-z][a-z0-9]{0,2})$|^` + keywordClass + `$`)).OnElements("span") - - // Allow data tables - sanitizer.policy.AllowAttrs("class").Matching(regexp.MustCompile(`data-table`)).OnElements("table") - sanitizer.policy.AllowAttrs("class").Matching(regexp.MustCompile(`line-num`)).OnElements("th", "td") + policy.AllowAttrs("class").Matching(regexp.MustCompile(`^((icon(\s+[\p{L}\p{N}_-]+)+)|(emoji))$|^([a-z][a-z0-9]{0,2})$|^` + keywordClass + `$`)).OnElements("span") // Allow generally safe attributes generalSafeAttrs := []string{"abbr", "accept", "accept-charset", @@ -101,18 +112,29 @@ func ReplaceSanitizer() { "abbr", "bdo", "cite", "dfn", "mark", "small", "span", "time", "wbr", } - sanitizer.policy.AllowAttrs(generalSafeAttrs...).OnElements(generalSafeElements...) + policy.AllowAttrs(generalSafeAttrs...).OnElements(generalSafeElements...) - sanitizer.policy.AllowAttrs("itemscope", "itemtype").OnElements("div") + policy.AllowAttrs("itemscope", "itemtype").OnElements("div") // FIXME: Need to handle longdesc in img but there is no easy way to do it // Custom keyword markup - for _, rule := range setting.ExternalSanitizerRules { - if rule.Regexp != nil { - sanitizer.policy.AllowAttrs(rule.AllowAttr).Matching(rule.Regexp).OnElements(rule.Element) - } else { - sanitizer.policy.AllowAttrs(rule.AllowAttr).OnElements(rule.Element) + addSanitizerRules(policy, setting.ExternalSanitizerRules) + + return policy +} + +func addSanitizerRules(policy *bluemonday.Policy, rules []setting.MarkupSanitizerRule) { + for _, rule := range rules { + if rule.AllowDataURIImages { + policy.AllowDataURIImages() + } + if rule.Element != "" { + if rule.Regexp != nil { + policy.AllowAttrs(rule.AllowAttr).Matching(rule.Regexp).OnElements(rule.Element) + } else { + policy.AllowAttrs(rule.AllowAttr).OnElements(rule.Element) + } } } } @@ -120,11 +142,15 @@ func ReplaceSanitizer() { // Sanitize takes a string that contains a HTML fragment or document and applies policy whitelist. func Sanitize(s string) string { NewSanitizer() - return sanitizer.policy.Sanitize(s) + return sanitizer.defaultPolicy.Sanitize(s) } // SanitizeReader sanitizes a Reader -func SanitizeReader(r io.Reader) *bytes.Buffer { +func SanitizeReader(r io.Reader, renderer string) *bytes.Buffer { NewSanitizer() - return sanitizer.policy.SanitizeReader(r) + policy, exist := sanitizer.rendererPolicies[renderer] + if !exist { + policy = sanitizer.defaultPolicy + } + return policy.SanitizeReader(r) } diff --git a/modules/setting/markup.go b/modules/setting/markup.go index 43df4ce442cd..31ec1dd2eb6e 100644 --- a/modules/setting/markup.go +++ b/modules/setting/markup.go @@ -15,31 +15,34 @@ import ( // ExternalMarkupRenderers represents the external markup renderers var ( - ExternalMarkupRenderers []MarkupRenderer + ExternalMarkupRenderers []*MarkupRenderer ExternalSanitizerRules []MarkupSanitizerRule ) // MarkupRenderer defines the external parser configured in ini type MarkupRenderer struct { - Enabled bool - MarkupName string - Command string - FileExtensions []string - IsInputFile bool - NeedPostProcess bool + Enabled bool + MarkupName string + Command string + FileExtensions []string + IsInputFile bool + NeedPostProcess bool + MarkupSanitizerRules []MarkupSanitizerRule } // MarkupSanitizerRule defines the policy for whitelisting attributes on // certain elements. type MarkupSanitizerRule struct { - Element string - AllowAttr string - Regexp *regexp.Regexp + Element string + AllowAttr string + Regexp *regexp.Regexp + AllowDataURIImages bool } func newMarkup() { - ExternalMarkupRenderers = make([]MarkupRenderer, 0, 10) + ExternalMarkupRenderers = make([]*MarkupRenderer, 0, 10) ExternalSanitizerRules = make([]MarkupSanitizerRule, 0, 10) + for _, sec := range Cfg.Section("markup").ChildSections() { name := strings.TrimPrefix(sec.Name(), "markup.") if name == "" { @@ -56,50 +59,62 @@ func newMarkup() { } func newMarkupSanitizer(name string, sec *ini.Section) { - haveElement := sec.HasKey("ELEMENT") - haveAttr := sec.HasKey("ALLOW_ATTR") - haveRegexp := sec.HasKey("REGEXP") + rule, ok := createMarkupSanitizerRule(name, sec) + if ok { + if strings.HasPrefix(name, "sanitizer.") { + names := strings.SplitN(strings.TrimPrefix(name, "sanitizer."), ".", 2) + name = names[0] + } + for _, renderer := range ExternalMarkupRenderers { + if name == renderer.MarkupName { + renderer.MarkupSanitizerRules = append(renderer.MarkupSanitizerRules, rule) + return + } + } + ExternalSanitizerRules = append(ExternalSanitizerRules, rule) + } +} - if !haveElement && !haveAttr && !haveRegexp { - log.Warn("Skipping empty section: markup.%s.", name) - return +func createMarkupSanitizerRule(name string, sec *ini.Section) (MarkupSanitizerRule, bool) { + var rule MarkupSanitizerRule + + ok := false + if sec.HasKey("ALLOW_DATA_URI_IMAGES") { + rule.AllowDataURIImages = sec.Key("ALLOW_DATA_URI_IMAGES").MustBool(false) + ok = true } - if !haveElement || !haveAttr || !haveRegexp { - log.Error("Missing required keys from markup.%s. Must have all three of ELEMENT, ALLOW_ATTR, and REGEXP defined!", name) - return - } + if sec.HasKey("ELEMENT") || sec.HasKey("ALLOW_ATTR") { + rule.Element = sec.Key("ELEMENT").Value() + rule.AllowAttr = sec.Key("ALLOW_ATTR").Value() - elements := sec.Key("ELEMENT").Value() - allowAttrs := sec.Key("ALLOW_ATTR").Value() - regexpStr := sec.Key("REGEXP").Value() - - if regexpStr == "" { - rule := MarkupSanitizerRule{ - Element: elements, - AllowAttr: allowAttrs, - Regexp: nil, + if rule.Element == "" || rule.AllowAttr == "" { + log.Error("Missing required values from markup.%s. Must have ELEMENT and ALLOW_ATTR defined!", name) + return rule, false } - ExternalSanitizerRules = append(ExternalSanitizerRules, rule) - return + regexpStr := sec.Key("REGEXP").Value() + if regexpStr != "" { + // Validate when parsing the config that this is a valid regular + // expression. Then we can use regexp.MustCompile(...) later. + compiled, err := regexp.Compile(regexpStr) + if err != nil { + log.Error("In markup.%s: REGEXP (%s) failed to compile: %v", name, regexpStr, err) + return rule, false + } + + rule.Regexp = compiled + } + + ok = true } - // Validate when parsing the config that this is a valid regular - // expression. Then we can use regexp.MustCompile(...) later. - compiled, err := regexp.Compile(regexpStr) - if err != nil { - log.Error("In module.%s: REGEXP (%s) at definition %d failed to compile: %v", regexpStr, name, err) - return + if !ok { + log.Error("Missing required keys from markup.%s. Must have ELEMENT and ALLOW_ATTR or ALLOW_DATA_URI_IMAGES defined!", name) + return rule, false } - rule := MarkupSanitizerRule{ - Element: elements, - AllowAttr: allowAttrs, - Regexp: compiled, - } - - ExternalSanitizerRules = append(ExternalSanitizerRules, rule) + return rule, true } func newMarkupRenderer(name string, sec *ini.Section) { @@ -126,7 +141,7 @@ func newMarkupRenderer(name string, sec *ini.Section) { return } - ExternalMarkupRenderers = append(ExternalMarkupRenderers, MarkupRenderer{ + ExternalMarkupRenderers = append(ExternalMarkupRenderers, &MarkupRenderer{ Enabled: sec.Key("ENABLED").MustBool(false), MarkupName: name, FileExtensions: exts,