forked from gitea/gitea
Replace linkRegex with xurls library (#6261)
* Replace linkRegex with xurls library. Rather than maintaining a complicated regex to match URLs for autolinking, gitea can use this existing Go library, which takes care of the matching with very little code change to gitea itself. After spending a while trying to find the perfect regex for all cases, this library still works better, as it is more flexible than a single regex ever will be. This will also fix the following issues: #5844, #3095, #3381. This passes all our current tests, and I've added new ones mentioned in those issues as well. * Use xurls.StrictMatchingScheme instead of xurls.Strict. This is much faster, and we only care about https? links to preserve existing behavior.
This commit is contained in:
parent
01bd1fcd33
commit
f2de5dc8c8
|
@ -725,6 +725,14 @@
|
|||
pruneopts = "NUT"
|
||||
revision = "02ccfbfaf0cc627aa3aec8ef7ed5cfeec5b43f63"
|
||||
|
||||
[[projects]]
|
||||
digest = "1:63953ffb90bbc880c612d576fcfd973a5904277d25ec9e2d8d5719bf67969662"
|
||||
name = "github.com/mvdan/xurls"
|
||||
packages = ["."]
|
||||
pruneopts = "NUT"
|
||||
revision = "e52e821cbfe8fe163ff6f8628ab5869b11fc05af"
|
||||
version = "v2.0.0"
|
||||
|
||||
[[projects]]
|
||||
digest = "1:2be1d891535ce3d6d2a3db9087f07415e909744e9eff1a30f8f0b2519df60ae6"
|
||||
name = "github.com/nfnt/resize"
|
||||
|
@ -1293,6 +1301,7 @@
|
|||
"github.com/mcuadros/go-version",
|
||||
"github.com/microcosm-cc/bluemonday",
|
||||
"github.com/msteinert/pam",
|
||||
"github.com/mvdan/xurls",
|
||||
"github.com/nfnt/resize",
|
||||
"github.com/pquerna/otp",
|
||||
"github.com/pquerna/otp/totp",
|
||||
|
|
|
@ -113,3 +113,7 @@ ignored = ["google.golang.org/appengine*"]
|
|||
[[constraint]]
|
||||
name = "github.com/prometheus/client_golang"
|
||||
version = "0.9.0"
|
||||
|
||||
[[constraint]]
|
||||
name = "github.com/mvdan/xurls"
|
||||
version = "2.0.0"
|
||||
|
|
|
@ -17,6 +17,7 @@ import (
|
|||
"code.gitea.io/gitea/modules/util"
|
||||
|
||||
"github.com/Unknwon/com"
|
||||
"github.com/mvdan/xurls"
|
||||
"golang.org/x/net/html"
|
||||
"golang.org/x/net/html/atom"
|
||||
)
|
||||
|
@ -64,9 +65,7 @@ var (
|
|||
// https://html.spec.whatwg.org/multipage/input.html#e-mail-state-(type%3Demail)
|
||||
emailRegex = regexp.MustCompile("[a-zA-Z0-9.!#$%&'*+\\/=?^_`{|}~-]+@[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*")
|
||||
|
||||
// matches http/https links. used for autlinking those. partly modified from
|
||||
// the original present in autolink.js
|
||||
linkRegex = regexp.MustCompile(`(?:(?:http|https):\/\/(?:[\-;:&=\+\$,\w]+@)?[A-Za-z0-9\.\-]+(?:\.|[\-;:&=\+\$,\w]+@)[A-Za-z0-9\.\-]+)(?:(?:\/[\+~%\/\.\w\-]*)?\??(?:[\-\+:=&;%@\.\w]*)#?(?:[\.\!\/\\\w]*))?`)
|
||||
linkRegex, _ = xurls.StrictMatchingScheme("https?://")
|
||||
)
|
||||
|
||||
// regexp for full links to issues/pulls
|
||||
|
|
|
@ -104,6 +104,15 @@ func TestRender_links(t *testing.T) {
|
|||
test(
|
||||
"http://142.42.1.1/",
|
||||
`<p><a href="http://142.42.1.1/" rel="nofollow">http://142.42.1.1/</a></p>`)
|
||||
test(
|
||||
"https://github.com/go-gitea/gitea/?p=aaa/bbb.html#ccc-ddd",
|
||||
`<p><a href="https://github.com/go-gitea/gitea/?p=aaa/bbb.html#ccc-ddd" rel="nofollow">https://github.com/go-gitea/gitea/?p=aaa/bbb.html#ccc-ddd</a></p>`)
|
||||
test(
|
||||
"https://en.wikipedia.org/wiki/URL_(disambiguation)",
|
||||
`<p><a href="https://en.wikipedia.org/wiki/URL_(disambiguation)" rel="nofollow">https://en.wikipedia.org/wiki/URL_(disambiguation)</a></p>`)
|
||||
test(
|
||||
"https://foo_bar.example.com/",
|
||||
`<p><a href="https://foo_bar.example.com/" rel="nofollow">https://foo_bar.example.com/</a></p>`)
|
||||
|
||||
// Test that should *not* be turned into URL
|
||||
test(
|
||||
|
|
|
@ -0,0 +1,27 @@
|
|||
Copyright (c) 2015, Daniel Martí. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above
|
||||
copyright notice, this list of conditions and the following disclaimer
|
||||
in the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
* Neither the name of the copyright holder nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
@ -0,0 +1,299 @@
|
|||
// Generated by schemesgen
|
||||
|
||||
package xurls
|
||||
|
||||
// Schemes is a list of all IANA assigned schemes, in near-alphabetical order (a few entries, e.g. ms-project/ms-powerpoint, are not strictly sorted).
|
||||
//
|
||||
// Source:
|
||||
// https://www.iana.org/assignments/uri-schemes/uri-schemes-1.csv
|
||||
var Schemes = []string{
|
||||
`aaa`,
|
||||
`aaas`,
|
||||
`about`,
|
||||
`acap`,
|
||||
`acct`,
|
||||
`acr`,
|
||||
`adiumxtra`,
|
||||
`afp`,
|
||||
`afs`,
|
||||
`aim`,
|
||||
`appdata`,
|
||||
`apt`,
|
||||
`attachment`,
|
||||
`aw`,
|
||||
`barion`,
|
||||
`beshare`,
|
||||
`bitcoin`,
|
||||
`bitcoincash`,
|
||||
`blob`,
|
||||
`bolo`,
|
||||
`browserext`,
|
||||
`callto`,
|
||||
`cap`,
|
||||
`chrome`,
|
||||
`chrome-extension`,
|
||||
`cid`,
|
||||
`coap`,
|
||||
`coap+tcp`,
|
||||
`coap+ws`,
|
||||
`coaps`,
|
||||
`coaps+tcp`,
|
||||
`coaps+ws`,
|
||||
`com-eventbrite-attendee`,
|
||||
`content`,
|
||||
`conti`,
|
||||
`crid`,
|
||||
`cvs`,
|
||||
`data`,
|
||||
`dav`,
|
||||
`diaspora`,
|
||||
`dict`,
|
||||
`did`,
|
||||
`dis`,
|
||||
`dlna-playcontainer`,
|
||||
`dlna-playsingle`,
|
||||
`dns`,
|
||||
`dntp`,
|
||||
`dtn`,
|
||||
`dvb`,
|
||||
`ed2k`,
|
||||
`elsi`,
|
||||
`example`,
|
||||
`facetime`,
|
||||
`fax`,
|
||||
`feed`,
|
||||
`feedready`,
|
||||
`file`,
|
||||
`filesystem`,
|
||||
`finger`,
|
||||
`fish`,
|
||||
`ftp`,
|
||||
`geo`,
|
||||
`gg`,
|
||||
`git`,
|
||||
`gizmoproject`,
|
||||
`go`,
|
||||
`gopher`,
|
||||
`graph`,
|
||||
`gtalk`,
|
||||
`h323`,
|
||||
`ham`,
|
||||
`hcap`,
|
||||
`hcp`,
|
||||
`http`,
|
||||
`https`,
|
||||
`hxxp`,
|
||||
`hxxps`,
|
||||
`hydrazone`,
|
||||
`iax`,
|
||||
`icap`,
|
||||
`icon`,
|
||||
`im`,
|
||||
`imap`,
|
||||
`info`,
|
||||
`iotdisco`,
|
||||
`ipn`,
|
||||
`ipp`,
|
||||
`ipps`,
|
||||
`irc`,
|
||||
`irc6`,
|
||||
`ircs`,
|
||||
`iris`,
|
||||
`iris.beep`,
|
||||
`iris.lwz`,
|
||||
`iris.xpc`,
|
||||
`iris.xpcs`,
|
||||
`isostore`,
|
||||
`itms`,
|
||||
`jabber`,
|
||||
`jar`,
|
||||
`jms`,
|
||||
`keyparc`,
|
||||
`lastfm`,
|
||||
`ldap`,
|
||||
`ldaps`,
|
||||
`lvlt`,
|
||||
`magnet`,
|
||||
`mailserver`,
|
||||
`mailto`,
|
||||
`maps`,
|
||||
`market`,
|
||||
`message`,
|
||||
`microsoft.windows.camera`,
|
||||
`microsoft.windows.camera.multipicker`,
|
||||
`microsoft.windows.camera.picker`,
|
||||
`mid`,
|
||||
`mms`,
|
||||
`modem`,
|
||||
`mongodb`,
|
||||
`moz`,
|
||||
`ms-access`,
|
||||
`ms-browser-extension`,
|
||||
`ms-drive-to`,
|
||||
`ms-enrollment`,
|
||||
`ms-excel`,
|
||||
`ms-gamebarservices`,
|
||||
`ms-gamingoverlay`,
|
||||
`ms-getoffice`,
|
||||
`ms-help`,
|
||||
`ms-infopath`,
|
||||
`ms-inputapp`,
|
||||
`ms-lockscreencomponent-config`,
|
||||
`ms-media-stream-id`,
|
||||
`ms-mixedrealitycapture`,
|
||||
`ms-officeapp`,
|
||||
`ms-people`,
|
||||
`ms-project`,
|
||||
`ms-powerpoint`,
|
||||
`ms-publisher`,
|
||||
`ms-restoretabcompanion`,
|
||||
`ms-screenclip`,
|
||||
`ms-screensketch`,
|
||||
`ms-search`,
|
||||
`ms-search-repair`,
|
||||
`ms-secondary-screen-controller`,
|
||||
`ms-secondary-screen-setup`,
|
||||
`ms-settings`,
|
||||
`ms-settings-airplanemode`,
|
||||
`ms-settings-bluetooth`,
|
||||
`ms-settings-camera`,
|
||||
`ms-settings-cellular`,
|
||||
`ms-settings-cloudstorage`,
|
||||
`ms-settings-connectabledevices`,
|
||||
`ms-settings-displays-topology`,
|
||||
`ms-settings-emailandaccounts`,
|
||||
`ms-settings-language`,
|
||||
`ms-settings-location`,
|
||||
`ms-settings-lock`,
|
||||
`ms-settings-nfctransactions`,
|
||||
`ms-settings-notifications`,
|
||||
`ms-settings-power`,
|
||||
`ms-settings-privacy`,
|
||||
`ms-settings-proximity`,
|
||||
`ms-settings-screenrotation`,
|
||||
`ms-settings-wifi`,
|
||||
`ms-settings-workplace`,
|
||||
`ms-spd`,
|
||||
`ms-sttoverlay`,
|
||||
`ms-transit-to`,
|
||||
`ms-useractivityset`,
|
||||
`ms-virtualtouchpad`,
|
||||
`ms-visio`,
|
||||
`ms-walk-to`,
|
||||
`ms-whiteboard`,
|
||||
`ms-whiteboard-cmd`,
|
||||
`ms-word`,
|
||||
`msnim`,
|
||||
`msrp`,
|
||||
`msrps`,
|
||||
`mtqp`,
|
||||
`mumble`,
|
||||
`mupdate`,
|
||||
`mvn`,
|
||||
`news`,
|
||||
`nfs`,
|
||||
`ni`,
|
||||
`nih`,
|
||||
`nntp`,
|
||||
`notes`,
|
||||
`ocf`,
|
||||
`oid`,
|
||||
`onenote`,
|
||||
`onenote-cmd`,
|
||||
`opaquelocktoken`,
|
||||
`openpgp4fpr`,
|
||||
`pack`,
|
||||
`palm`,
|
||||
`paparazzi`,
|
||||
`pkcs11`,
|
||||
`platform`,
|
||||
`pop`,
|
||||
`pres`,
|
||||
`prospero`,
|
||||
`proxy`,
|
||||
`pwid`,
|
||||
`psyc`,
|
||||
`qb`,
|
||||
`query`,
|
||||
`redis`,
|
||||
`rediss`,
|
||||
`reload`,
|
||||
`res`,
|
||||
`resource`,
|
||||
`rmi`,
|
||||
`rsync`,
|
||||
`rtmfp`,
|
||||
`rtmp`,
|
||||
`rtsp`,
|
||||
`rtsps`,
|
||||
`rtspu`,
|
||||
`secondlife`,
|
||||
`service`,
|
||||
`session`,
|
||||
`sftp`,
|
||||
`sgn`,
|
||||
`shttp`,
|
||||
`sieve`,
|
||||
`simpleledger`,
|
||||
`sip`,
|
||||
`sips`,
|
||||
`skype`,
|
||||
`smb`,
|
||||
`sms`,
|
||||
`smtp`,
|
||||
`snews`,
|
||||
`snmp`,
|
||||
`soap.beep`,
|
||||
`soap.beeps`,
|
||||
`soldat`,
|
||||
`spiffe`,
|
||||
`spotify`,
|
||||
`ssh`,
|
||||
`steam`,
|
||||
`stun`,
|
||||
`stuns`,
|
||||
`submit`,
|
||||
`svn`,
|
||||
`tag`,
|
||||
`teamspeak`,
|
||||
`tel`,
|
||||
`teliaeid`,
|
||||
`telnet`,
|
||||
`tftp`,
|
||||
`things`,
|
||||
`thismessage`,
|
||||
`tip`,
|
||||
`tn3270`,
|
||||
`tool`,
|
||||
`turn`,
|
||||
`turns`,
|
||||
`tv`,
|
||||
`udp`,
|
||||
`unreal`,
|
||||
`urn`,
|
||||
`ut2004`,
|
||||
`v-event`,
|
||||
`vemmi`,
|
||||
`ventrilo`,
|
||||
`videotex`,
|
||||
`vnc`,
|
||||
`view-source`,
|
||||
`wais`,
|
||||
`webcal`,
|
||||
`wpid`,
|
||||
`ws`,
|
||||
`wss`,
|
||||
`wtai`,
|
||||
`wyciwyg`,
|
||||
`xcon`,
|
||||
`xcon-userid`,
|
||||
`xfire`,
|
||||
`xmlrpc.beep`,
|
||||
`xmlrpc.beeps`,
|
||||
`xmpp`,
|
||||
`xri`,
|
||||
`ymsgr`,
|
||||
`z39.50`,
|
||||
`z39.50r`,
|
||||
`z39.50s`,
|
||||
}
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,24 @@
|
|||
// Copyright (c) 2015, Daniel Martí <mvdan@mvdan.cc>
|
||||
// See LICENSE for licensing information
|
||||
|
||||
package xurls
|
||||
|
||||
// PseudoTLDs is a sorted list of some widely used unofficial TLDs.
|
||||
//
|
||||
// Sources:
|
||||
// * https://en.wikipedia.org/wiki/Pseudo-top-level_domain
|
||||
// * https://en.wikipedia.org/wiki/Category:Pseudo-top-level_domains
|
||||
// * https://tools.ietf.org/html/draft-grothoff-iesg-special-use-p2p-names-00
|
||||
// * https://www.iana.org/assignments/special-use-domain-names/special-use-domain-names.xhtml
//
// relaxedExp appends this list to TLDs and matches the combined set
// case-insensitively as the last label of a scheme-less host name.
|
||||
var PseudoTLDs = []string{
|
||||
`bit`, // Namecoin
|
||||
`example`, // Example domain
|
||||
`exit`, // Tor exit node
|
||||
`gnu`, // GNS by public key
|
||||
`i2p`, // I2P network
|
||||
`invalid`, // Invalid domain
|
||||
`local`, // Local network
|
||||
`localhost`, // Local network
|
||||
`test`, // Test domain
|
||||
`zkey`, // GNS domain name
|
||||
}
|
|
@ -0,0 +1,107 @@
|
|||
// Copyright (c) 2015, Daniel Martí <mvdan@mvdan.cc>
|
||||
// See LICENSE for licensing information
|
||||
|
||||
// Package xurls extracts urls from plain text using regular expressions.
|
||||
package xurls
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"regexp"
|
||||
)
|
||||
|
||||
//go:generate go run generate/tldsgen/main.go
|
||||
//go:generate go run generate/schemesgen/main.go
|
||||
|
||||
// Building blocks for the URL patterns. The fragments named *Char are bare
// character-class contents: they are only ever spliced in between "[" and "]"
// by the patterns below.
const (
|
||||
// letter, mark and number are the Unicode general categories L, M and N.
letter = `\p{L}`
|
||||
mark = `\p{M}`
|
||||
number = `\p{N}`
|
||||
// iriChar is the set of characters allowed inside a host name label.
iriChar = letter + mark + number
|
||||
// currency and otherSymb are the Unicode categories Sc and So.
currency = `\p{Sc}`
|
||||
otherSymb = `\p{So}`
|
||||
// endChar is the set of characters a matched path is allowed to end with
// (see pathCont).
endChar = iriChar + `/\-+_&~*%=#` + currency + otherSymb
|
||||
// otherPunc is the Unicode category Po (other punctuation).
otherPunc = `\p{Po}`
|
||||
// midChar is the set of characters allowed in the middle of a path. Because
// it is used inside a character class, the "|" is a literal pipe character,
// not an alternation.
midChar = endChar + `|` + otherPunc
|
||||
// wellParen, wellBrack and wellBrace each match a balanced (), [] or {}
// group of midChar characters, allowing one level of nesting of the same
// bracket kind.
wellParen = `\([` + midChar + `]*(\([` + midChar + `]*\)[` + midChar + `]*)*\)`
|
||||
wellBrack = `\[[` + midChar + `]*(\[[` + midChar + `]*\][` + midChar + `]*)*\]`
|
||||
wellBrace = `\{[` + midChar + `]*(\{[` + midChar + `]*\}[` + midChar + `]*)*\}`
|
||||
// wellAll matches any one of the balanced groups above.
wellAll = wellParen + `|` + wellBrack + `|` + wellBrace
|
||||
// pathCont matches the part of a URL after the scheme: one or more runs of
// optional midChar characters followed by at least one balanced group or
// endChar character. A match therefore always ends in a closing bracket or
// an endChar character, never in trailing punctuation such as "." or ",".
pathCont = `([` + midChar + `]*(` + wellAll + `|[` + endChar + `])+)+`
|
||||
|
||||
// iri matches one host name label: it starts and ends with an iriChar and
// may contain hyphens in between.
iri = `[` + iriChar + `]([` + iriChar + `\-]*[` + iriChar + `])?`
|
||||
// domain matches one or more labels, each followed by a dot.
domain = `(` + iri + `\.)+`
|
||||
// octet matches a decimal number from 0 to 255 without leading zeros.
octet = `(25[0-5]|2[0-4][0-9]|1[0-9]{2}|[1-9][0-9]|[0-9])`
|
||||
// ipv4Addr matches four dot-separated octets delimited by word boundaries.
ipv4Addr = `\b` + octet + `\.` + octet + `\.` + octet + `\.` + octet + `\b`
|
||||
// ipv6Addr matches colon-separated groups of up to four hex digits,
// including forms with a dotted-quad tail.
// NOTE(review): presumably covers the standard IPv6 textual forms; not
// verified here against the RFC grammar.
ipv6Addr = `([0-9a-fA-F]{1,4}:([0-9a-fA-F]{1,4}:([0-9a-fA-F]{1,4}:([0-9a-fA-F]{1,4}:([0-9a-fA-F]{1,4}:[0-9a-fA-F]{0,4}|:[0-9a-fA-F]{1,4})?|(:[0-9a-fA-F]{1,4}){0,2})|(:[0-9a-fA-F]{1,4}){0,3})|(:[0-9a-fA-F]{1,4}){0,4})|:(:[0-9a-fA-F]{1,4}){0,5})((:[0-9a-fA-F]{1,4}){2}|:(25[0-5]|(2[0-4]|1[0-9]|[1-9])?[0-9])(\.(25[0-5]|(2[0-4]|1[0-9]|[1-9])?[0-9])){3})|(([0-9a-fA-F]{1,4}:){1,6}|:):[0-9a-fA-F]{1,4}|([0-9a-fA-F]{1,4}:){7}:`
|
||||
// ipAddr matches either address family.
ipAddr = `(` + ipv4Addr + `|` + ipv6Addr + `)`
|
||||
// port is optional: a colon followed by zero or more digits, so a bare ":"
// is also accepted.
port = `(:[0-9]*)?`
|
||||
)
|
||||
|
||||
// AnyScheme can be passed to StrictMatchingScheme to match any possibly valid
|
||||
// scheme, and not just the known ones.
|
||||
var AnyScheme = `([a-zA-Z][a-zA-Z.\-+]*://|` + anyOf(SchemesNoAuthority...) + `:)`
|
||||
|
||||
// SchemesNoAuthority is a sorted list of some well-known url schemes that are
|
||||
// followed by ":" instead of "://".
//
// AnyScheme and strictExp pass this list to anyOf to build the "scheme:"
// alternative of their patterns. Every entry also appears in Schemes, so
// strictExp accepts these schemes in both the "scheme://" and "scheme:" forms.
|
||||
var SchemesNoAuthority = []string{
|
||||
`bitcoin`, // Bitcoin
|
||||
`file`, // Files
|
||||
`magnet`, // Torrent magnets
|
||||
`mailto`, // Mail
|
||||
`sms`, // SMS
|
||||
`tel`, // Telephone
|
||||
`xmpp`, // XMPP
|
||||
}
|
||||
|
||||
// anyOf builds a parenthesised regexp alternation that matches any one of
// strs literally. Each string is escaped with regexp.QuoteMeta, so characters
// such as "." or "+" lose their special meaning. With no arguments the result
// is "()".
func anyOf(strs ...string) string {
	var pattern bytes.Buffer
	pattern.WriteString("(")
	// The separator is empty before the first alternative and "|" afterwards.
	sep := ""
	for _, literal := range strs {
		pattern.WriteString(sep)
		pattern.WriteString(regexp.QuoteMeta(literal))
		sep = "|"
	}
	pattern.WriteString(")")
	return pattern.String()
}
|
||||
|
||||
func strictExp() string {
|
||||
schemes := `(` + anyOf(Schemes...) + `://|` + anyOf(SchemesNoAuthority...) + `:)`
|
||||
return `(?i)` + schemes + `(?-i)` + pathCont
|
||||
}
|
||||
|
||||
func relaxedExp() string {
|
||||
site := domain + `(?i)` + anyOf(append(TLDs, PseudoTLDs...)...) + `(?-i)`
|
||||
hostName := `(` + site + `|` + ipAddr + `)`
|
||||
webURL := hostName + port + `(/|/` + pathCont + `?|\b|$)`
|
||||
return strictExp() + `|` + webURL
|
||||
}
|
||||
|
||||
// Strict produces a regexp that matches any URL with a scheme in either the
|
||||
// Schemes or SchemesNoAuthority lists.
|
||||
func Strict() *regexp.Regexp {
|
||||
re := regexp.MustCompile(strictExp())
|
||||
re.Longest()
|
||||
return re
|
||||
}
|
||||
|
||||
// Relaxed produces a regexp that matches any URL matched by Strict, plus any
|
||||
// URL with no scheme.
|
||||
func Relaxed() *regexp.Regexp {
|
||||
re := regexp.MustCompile(relaxedExp())
|
||||
re.Longest()
|
||||
return re
|
||||
}
|
||||
|
||||
// StrictMatchingScheme produces a regexp similar to Strict, but requiring that
|
||||
// the scheme match the given regular expression. See AnyScheme too.
|
||||
func StrictMatchingScheme(exp string) (*regexp.Regexp, error) {
|
||||
strictMatching := `(?i)(` + exp + `)(?-i)` + pathCont
|
||||
re, err := regexp.Compile(strictMatching)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
re.Longest()
|
||||
return re, nil
|
||||
}
|
Loading…
Reference in New Issue