diff --git a/SignalServiceKit/src/Loki/API/Utilities/HTMLMetadata.swift b/SignalServiceKit/src/Loki/API/Utilities/HTMLMetadata.swift
new file mode 100644
index 000000000..5037faaa9
--- /dev/null
+++ b/SignalServiceKit/src/Loki/API/Utilities/HTMLMetadata.swift
@@ -0,0 +1,119 @@
+import Foundation
+
+/// The metadata fields extracted from a raw HTML document.
+///
+/// Every field is optional: it is `nil` when the corresponding tag was not found
+/// or its value could not be decoded.
+public struct HTMLMetadata: Equatable {
+    /// Parsed from the `<title>` tag
+    var titleTag: String?
+    /// Parsed from the `href` of a `<link rel="icon">` (or `rel="shortcut icon"`) tag
+    var faviconUrlString: String?
+    /// Parsed from the `content` of a `<meta name="description">` tag
+    var description: String?
+    /// Parsed from the og:title meta property
+    var ogTitle: String?
+    /// Parsed from the og:description meta property
+    var ogDescription: String?
+    /// Parsed from the og:image or og:image:url meta property
+    var ogImageUrlString: String?
+    /// Parsed from the og:published_time meta property
+    var ogPublishDateString: String?
+    /// Parsed from article:published_time meta property
+    var articlePublishDateString: String?
+    /// Parsed from the og:modified_time meta property
+    var ogModifiedDateString: String?
+    /// Parsed from the article:modified_time meta property
+    var articleModifiedDateString: String?
+
+    /// Scans `rawHTML` once per field and assembles the results.
+    ///
+    /// - Parameter rawHTML: The HTML document text to inspect.
+    /// - Returns: A value whose fields are populated from the tags that were found.
+    static func construct(parsing rawHTML: String) -> HTMLMetadata {
+        let properties = Self.parseMetaProperties(in: rawHTML)
+
+        let title = Self.parseTitleTag(in: rawHTML)
+        let favicon = Self.parseFaviconUrlString(in: rawHTML)
+        let summary = Self.parseDescriptionTag(in: rawHTML)
+
+        // `og:image` wins; `og:image:url` is only consulted when it is absent.
+        let imageUrlString = properties["og:image"] ?? properties["og:image:url"]
+
+        return HTMLMetadata(
+            titleTag: title,
+            faviconUrlString: favicon,
+            description: summary,
+            ogTitle: properties["og:title"],
+            ogDescription: properties["og:description"],
+            ogImageUrlString: imageUrlString,
+            ogPublishDateString: properties["og:published_time"],
+            articlePublishDateString: properties["article:published_time"],
+            ogModifiedDateString: properties["og:modified_time"],
+            articleModifiedDateString: properties["article:modified_time"]
+        )
+    }
+}
+
+// MARK: - Parsing
+extension HTMLMetadata {
+
+    /// Returns the entity-decoded text captured by the first `titleRegex` match,
+    /// or `nil` when there is no match or decoding fails.
+    private static func parseTitleTag(in rawHTML: String) -> String? {
+        titleRegex
+            .firstMatchSet(in: rawHTML)?
+            .group(idx: 0)
+            .flatMap { decodeHTMLEntities(in: String($0)) }
+    }
+
+    /// Finds the first tag matching `faviconRegex` and returns its entity-decoded
+    /// `href` value, or `nil` when no such tag or attribute is found.
+    private static func parseFaviconUrlString(in rawHTML: String) -> String? {
+        guard let matchedTag = faviconRegex
+            .firstMatchSet(in: rawHTML)
+            .map({ String($0.fullString) }) else { return nil }
+
+        // Only search inside the matched tag so an unrelated `href` is not picked up.
+        return faviconUrlRegex
+            .parseFirstMatch(inText: matchedTag)
+            .flatMap { decodeHTMLEntities(in: String($0)) }
+    }
+
+    /// Finds the first tag matching `metaDescriptionRegex` and returns its
+    /// entity-decoded `content` value, or `nil` when no such tag or attribute is found.
+    private static func parseDescriptionTag(in rawHTML: String) -> String? {
+        guard let matchedTag = metaDescriptionRegex
+            .firstMatchSet(in: rawHTML)
+            .map({ String($0.fullString) }) else { return nil }
+
+        // Only search inside the matched tag so an unrelated `content` is not picked up.
+        return metaContentRegex
+            .parseFirstMatch(inText: matchedTag)
+            .flatMap { decodeHTMLEntities(in: String($0)) }
+    }
+
+    /// Collects every `<meta property="…" content="…">` pair found in `rawHTML`.
+    ///
+    /// Keys are the property names with surrounding whitespace removed and lowercased,
+    /// so `"OG:Title "` is stored under `"og:title"`. When a property appears more than
+    /// once, the first tag that yields a decodable `content` value wins.
+    ///
+    /// - Parameter rawHTML: The HTML document text to inspect.
+    /// - Returns: A dictionary from normalized property name to entity-decoded content.
+    private static func parseMetaProperties(in rawHTML: String) -> [String: String] {
+        metaPropertyRegex
+            .allMatchSets(in: rawHTML)
+            .reduce(into: [:]) { (builder, matchSet) in
+                guard let rawProperty = matchSet.group(idx: 0) else { return }
+
+                // The regex matches case-insensitively and captures trailing whitespace,
+                // so normalize the key to what the lowercase lookups expect.
+                let property = rawProperty
+                    .trimmingCharacters(in: .whitespacesAndNewlines)
+                    .lowercased()
+                guard !property.isEmpty else { return }
+
+                // Exit early if we've already found a tag of this type
+                guard builder[property] == nil else { return }
+
+                let fullTag = String(matchSet.fullString)
+                guard let content = metaContentRegex.parseFirstMatch(inText: fullTag) else { return }
+
+                // Assigning `nil` (decode failure) leaves the key unset, so a later tag can fill it.
+                builder[property] = decodeHTMLEntities(in: content)
+            }
+    }
+
+    /// Decodes `string` by loading it as a UTF-8 HTML document into an
+    /// `NSAttributedString` and returning the resulting plain text.
+    ///
+    /// Because the input is interpreted as HTML, any markup in it is processed as
+    /// well, not only character entities.
+    ///
+    /// NOTE(review): Apple's documentation says the HTML importer should not be
+    /// called from a background thread - confirm which queue the callers run on.
+    ///
+    /// - Returns: The decoded text, or `nil` if the string cannot be encoded as UTF-8
+    ///   or the importer throws.
+    private static func decodeHTMLEntities(in string: String) -> String? {
+        guard let data = string.data(using: .utf8) else {
+            return nil
+        }
+
+        let options: [NSAttributedString.DocumentReadingOptionKey: Any] = [
+            .documentType: NSAttributedString.DocumentType.html,
+            .characterEncoding: String.Encoding.utf8.rawValue
+        ]
+
+        guard let attributedString = try? NSAttributedString(data: data, options: options, documentAttributes: nil) else {
+            return nil
+        }
+        return attributedString.string
+    }
+}
+
+ // MARK: - Regular Expressions
+extension HTMLMetadata {
+    /// Matches a `<title>` element; the capture group is the text between the tags.
+    static let titleRegex: NSRegularExpression = regex(pattern: "<\\s*title[^>]*>(.*?)<\\s*/title[^>]*>")
+    /// Matches a `<link>` tag whose double-quoted `rel` is `icon` or `shortcut icon`.
+    static let faviconRegex: NSRegularExpression = regex(pattern: "<\\s*link[^>]*rel\\s*=\\s*\"\\s*(shortcut\\s+)?icon\\s*\"[^>]*>")
+    /// Captures the double-quoted value of an `href` attribute.
+    static let faviconUrlRegex: NSRegularExpression = regex(pattern: "href\\s*=\\s*\"([^\"]*)\"")
+    /// Matches a `<meta>` tag whose double-quoted `name` starts with `description`.
+    static let metaDescriptionRegex: NSRegularExpression = regex(pattern: "<\\s*meta[^>]*name\\s*=\\s*\"\\s*description[^\"]*\"[^>]*>")
+    /// Matches a `<meta>` tag with a double-quoted `property`; the capture group is the property name.
+    static let metaPropertyRegex: NSRegularExpression = regex(pattern: "<\\s*meta[^>]*property\\s*=\\s*\"\\s*([^\"]+?)\"[^>]*>")
+    /// Captures the double-quoted value of a `content` attribute.
+    static let metaContentRegex: NSRegularExpression = regex(pattern: "content\\s*=\\s*\"([^\"]*?)\"")
+
+    /// Compiles `pattern` so that it ignores case and lets `.` match line separators.
+    ///
+    /// Uses `try!`, so an invalid pattern traps at first use; every pattern passed
+    /// here is a literal in this extension.
+    static private func regex(pattern: String) -> NSRegularExpression {
+        let matchingOptions: NSRegularExpression.Options = [.dotMatchesLineSeparators, .caseInsensitive]
+        return try! NSRegularExpression(pattern: pattern, options: matchingOptions)
+    }
+}
diff --git a/SignalServiceKit/src/Messages/Interactions/OWSLinkPreview.swift b/SignalServiceKit/src/Messages/Interactions/OWSLinkPreview.swift
index 80faf4545..cd9e63987 100644
--- a/SignalServiceKit/src/Messages/Interactions/OWSLinkPreview.swift
+++ b/SignalServiceKit/src/Messages/Interactions/OWSLinkPreview.swift
@@ -291,83 +291,6 @@ public class OWSLinkPreview: MTLModel {
return result.filterStringForDisplay()
}
- // MARK: - Whitelists
-
- // For link domains, we require an exact match - no subdomains allowed.
- //
- // Note that order matters in this whitelist since the logic for determining
- // how to render link preview domains in displayDomain(...) uses the first match.
- // We should list TLDs first and subdomains later.
- private static let linkDomainWhitelist = [
- // YouTube
- "youtube.com",
- "www.youtube.com",
- "m.youtube.com",
- "youtu.be",
-
- // Reddit
- "reddit.com",
- "www.reddit.com",
- "m.reddit.com",
- // NOTE: We don't use redd.it.
-
- // Imgur
- //
- // NOTE: Subdomains are also used for content.
- //
- // For example, you can access "user/member" pages: https://sillygoose2.imgur.com/
- // A different member page can be accessed without a subdomain: https://imgur.com/user/SillyGoose2
- //
- // I'm not sure we need to support these subdomains; they don't appear to be core functionality.
- "imgur.com",
- "www.imgur.com",
- "m.imgur.com",
-
- // Instagram
- "instagram.com",
- "www.instagram.com",
- "m.instagram.com",
-
- // Pinterest
- "pinterest.com",
- "www.pinterest.com",
- "pin.it",
-
- // Giphy
- "giphy.com",
- "media.giphy.com",
- "media1.giphy.com",
- "media2.giphy.com",
- "media3.giphy.com",
- "gph.is"
- ]
-
- // For media domains, we DO NOT require an exact match - subdomains are allowed.
- private static let mediaDomainWhitelist = [
- // YouTube
- "ytimg.com",
-
- // Reddit
- "redd.it",
-
- // Imgur
- "imgur.com",
-
- // Instagram
- "cdninstagram.com",
- "fbcdn.net",
-
- // Pinterest
- "pinimg.com",
-
- // Giphy
- "giphy.com"
- ]
-
- private static let protocolWhitelist = [
- "https"
- ]
-
@objc
public func displayDomain() -> String? {
return OWSLinkPreview.displayDomain(forUrl: urlString)
@@ -383,13 +306,7 @@ public class OWSLinkPreview: MTLModel {
owsFailDebug("Invalid url.")
return nil
}
- guard let result = whitelistedDomain(forUrl: url,
- domainWhitelist: OWSLinkPreview.linkDomainWhitelist,
- allowSubdomains: false) else {
- Logger.error("Missing domain.")
- return nil
- }
- return result
+ return url.host
}
@objc
@@ -397,9 +314,7 @@ public class OWSLinkPreview: MTLModel {
guard let url = URL(string: urlString) else {
return false
}
- return whitelistedDomain(forUrl: url,
- domainWhitelist: OWSLinkPreview.linkDomainWhitelist,
- allowSubdomains: false) != nil
+ return true
}
@objc
@@ -407,36 +322,7 @@ public class OWSLinkPreview: MTLModel {
guard let url = URL(string: urlString) else {
return false
}
- return whitelistedDomain(forUrl: url,
- domainWhitelist: OWSLinkPreview.mediaDomainWhitelist,
- allowSubdomains: true) != nil
- }
-
- private class func whitelistedDomain(forUrl url: URL, domainWhitelist: [String], allowSubdomains: Bool) -> String? {
- guard let urlProtocol = url.scheme?.lowercased() else {
- return nil
- }
- guard protocolWhitelist.contains(urlProtocol) else {
- return nil
- }
- guard let domain = url.host?.lowercased() else {
- return nil
- }
- guard url.path.count > 1 else {
- // URL must have non-empty path.
- return nil
- }
-
- for whitelistedDomain in domainWhitelist {
- if domain == whitelistedDomain.lowercased() {
- return whitelistedDomain
- }
- if allowSubdomains,
- domain.hasSuffix("." + whitelistedDomain.lowercased()) {
- return whitelistedDomain
- }
- }
- return nil
+ return true
}
// MARK: - Serial Queue
@@ -812,31 +698,27 @@ public class OWSLinkPreview: MTLModel {
}
}
- // Example:
- //
- //
- //
class func parse(linkData: Data) throws -> OWSLinkPreviewContents {
guard let linkText = String(bytes: linkData, encoding: .utf8) else {
owsFailDebug("Could not parse link text.")
throw LinkPreviewError.invalidInput
}
+
+ let content = HTMLMetadata.construct(parsing: linkText)
var title: String?
- if let rawTitle = NSRegularExpression.parseFirstMatch(pattern: "]*content\\s*=\\s*\"(.*?)\"\\s*[^>]*/?>",
- text: linkText,
- options: .dotMatchesLineSeparators) {
- if let decodedTitle = decodeHTMLEntities(inString: rawTitle) {
- let normalizedTitle = OWSLinkPreview.normalizeTitle(title: decodedTitle)
- if normalizedTitle.count > 0 {
- title = normalizedTitle
- }
+ let rawTitle = content.ogTitle ?? content.titleTag
+ if let decodedTitle = decodeHTMLEntities(inString: rawTitle ?? "") {
+ let normalizedTitle = OWSLinkPreview.normalizeTitle(title: decodedTitle)
+ if normalizedTitle.count > 0 {
+ title = normalizedTitle
}
}
+
Logger.verbose("title: \(String(describing: title))")
- guard let rawImageUrlString = NSRegularExpression.parseFirstMatch(pattern: "]*content\\s*=\\s*\"(.*?)\"[^>]*/?>", text: linkText) else {
+ guard let rawImageUrlString = content.ogImageUrlString ?? content.faviconUrlString else {
return OWSLinkPreviewContents(title: title)
}
guard let imageUrlString = decodeHTMLEntities(inString: rawImageUrlString)?.ows_stripped() else {
diff --git a/SignalServiceKit/src/Util/NSRegularExpression+SSK.swift b/SignalServiceKit/src/Util/NSRegularExpression+SSK.swift
index e4574467d..17ff74c78 100644
--- a/SignalServiceKit/src/Util/NSRegularExpression+SSK.swift
+++ b/SignalServiceKit/src/Util/NSRegularExpression+SSK.swift
@@ -52,4 +52,57 @@ public extension NSRegularExpression {
let substring = String(text[textRange])
return substring
}
+
+ /// Searches the whole of `searchString` and wraps the first match, if any, in a `MatchSet`.
+ ///
+ /// - Returns: `nil` when nothing matches or the match cannot be converted to a `MatchSet`.
+ @nonobjc
+ func firstMatchSet(in searchString: String) -> MatchSet? {
+     let wholeRange = searchString.completeNSRange
+     guard let firstResult = firstMatch(in: searchString, options: [], range: wholeRange) else {
+         return nil
+     }
+     return firstResult.createMatchSet(originalSearchString: searchString)
+ }
+
+ /// Searches the whole of `searchString` and wraps every match in a `MatchSet`.
+ ///
+ /// Matches that cannot be converted to a `MatchSet` are dropped from the result.
+ @nonobjc
+ func allMatchSets(in searchString: String) -> [MatchSet] {
+     let wholeRange = searchString.completeNSRange
+     let results = matches(in: searchString, options: [], range: wholeRange)
+     return results.compactMap { result in
+         result.createMatchSet(originalSearchString: searchString)
+     }
+ }
+
+}
+
+/// The substrings produced by a single regular-expression match.
+public struct MatchSet {
+    /// Presumably the text covered by the entire match - confirm against `createMatchSet`.
+    let fullString: Substring
+    /// The captured groups, in order. An entry may be `nil`
+    /// (presumably for a group that did not take part in the match).
+    let matchedGroups: [Substring?]
+
+    /// Returns the captured group at `idx`.
+    ///
+    /// - Parameter idx: Zero-based position within `matchedGroups`.
+    /// - Returns: The stored substring, or `nil` when `idx` is out of bounds
+    ///   (including negative values) or the stored entry itself is `nil`.
+    func group(idx: Int) -> Substring? {
+        // `indices.contains` rejects negative indices as well as those past the end.
+        guard matchedGroups.indices.contains(idx) else { return nil }
+        return matchedGroups[idx]
+    }
+}
+
+fileprivate extension String {
+ subscript(_ nsRange: NSRange) -> Substring? {
+ guard let swiftRange = Range(nsRange, in: self) else { return nil }
+ return self[swiftRange]
+ }
+
+ var completeRange: Range {
+ startIndex.. MatchSet? {
+ guard numberOfRanges > 0 else { return nil }
+ let substrings = (0..