diff --git a/SignalServiceKit/src/Loki/API/Utilities/HTMLMetadata.swift b/SignalServiceKit/src/Loki/API/Utilities/HTMLMetadata.swift
new file mode 100644
index 000000000..5037faaa9
--- /dev/null
+++ b/SignalServiceKit/src/Loki/API/Utilities/HTMLMetadata.swift
@@ -0,0 +1,145 @@
+import Foundation
+
+/// Metadata scraped from an HTML document with regular expressions (no DOM parsing).
+///
+/// Every field is optional and is `nil` when no matching tag was found.
+public struct HTMLMetadata: Equatable {
+    /// Parsed from <title>
+    var titleTag: String?
+    /// Parsed from <link rel="icon"...>
+    var faviconUrlString: String?
+    /// Parsed from <meta name="description"...>
+    var description: String?
+    /// Parsed from the og:title meta property
+    var ogTitle: String?
+    /// Parsed from the og:description meta property
+    var ogDescription: String?
+    /// Parsed from the og:image or og:image:url meta property
+    var ogImageUrlString: String?
+    /// Parsed from the og:published_time meta property
+    var ogPublishDateString: String?
+    /// Parsed from the article:published_time meta property
+    var articlePublishDateString: String?
+    /// Parsed from the og:modified_time meta property
+    var ogModifiedDateString: String?
+    /// Parsed from the article:modified_time meta property
+    var articleModifiedDateString: String?
+
+    /// Builds an `HTMLMetadata` by scanning `rawHTML` for the supported tags.
+    ///
+    /// - Parameter rawHTML: The HTML source text to scan.
+    /// - Returns: The collected metadata; fields whose tag was not found are `nil`.
+    static func construct(parsing rawHTML: String) -> HTMLMetadata {
+        let metaPropertyTags = Self.parseMetaProperties(in: rawHTML)
+        return HTMLMetadata(
+            titleTag: Self.parseTitleTag(in: rawHTML),
+            faviconUrlString: Self.parseFaviconUrlString(in: rawHTML),
+            description: Self.parseDescriptionTag(in: rawHTML),
+            ogTitle: metaPropertyTags["og:title"],
+            ogDescription: metaPropertyTags["og:description"],
+            ogImageUrlString: (metaPropertyTags["og:image"] ?? metaPropertyTags["og:image:url"]),
+            ogPublishDateString: metaPropertyTags["og:published_time"],
+            articlePublishDateString: metaPropertyTags["article:published_time"],
+            ogModifiedDateString: metaPropertyTags["og:modified_time"],
+            articleModifiedDateString: metaPropertyTags["article:modified_time"]
+        )
+    }
+}
+
+// MARK: - Parsing
+extension HTMLMetadata {
+
+    /// Returns the entity-decoded text between the first `<title>` and `</title>`, if any.
+    private static func parseTitleTag(in rawHTML: String) -> String? {
+        titleRegex
+            .firstMatchSet(in: rawHTML)?
+            .group(idx: 0)
+            .flatMap { decodeHTMLEntities(in: String($0)) }
+    }
+
+    /// Returns the entity-decoded `href` of the first `<link rel="icon">` / `rel="shortcut icon"` tag, if any.
+    private static func parseFaviconUrlString(in rawHTML: String) -> String? {
+        guard let matchedTag = faviconRegex
+            .firstMatchSet(in: rawHTML)
+            .map({ String($0.fullString) }) else { return nil }
+
+        return faviconUrlRegex
+            .parseFirstMatch(inText: matchedTag)
+            .flatMap { decodeHTMLEntities(in: String($0)) }
+    }
+
+    /// Returns the entity-decoded `content` of the first `<meta name="description...">` tag, if any.
+    private static func parseDescriptionTag(in rawHTML: String) -> String? {
+        guard let matchedTag = metaDescriptionRegex
+            .firstMatchSet(in: rawHTML)
+            .map({ String($0.fullString) }) else { return nil }
+
+        return metaContentRegex
+            .parseFirstMatch(inText: matchedTag)
+            .flatMap { decodeHTMLEntities(in: String($0)) }
+    }
+
+    /// Collects the entity-decoded `content` of every `<meta property="...">` tag, keyed by property name.
+    ///
+    /// Keys are trimmed and lowercased so they line up with the lowercase literals that
+    /// `construct(parsing:)` looks up: the regex matches case-insensitively and captures any
+    /// whitespace before the closing quote. If a property repeats, the first tag that yields a value wins.
+    private static func parseMetaProperties(in rawHTML: String) -> [String: String] {
+        metaPropertyRegex
+            .allMatchSets(in: rawHTML)
+            .reduce(into: [:]) { (builder, matchSet) in
+                guard let ogTypeSubstring = matchSet.group(idx: 0) else { return }
+                let ogType = ogTypeSubstring
+                    .trimmingCharacters(in: .whitespacesAndNewlines)
+                    .lowercased()
+                let fullTag = String(matchSet.fullString)
+
+                // Exit early if we've already found a tag of this type
+                guard builder[ogType] == nil else { return }
+                guard let content = metaContentRegex.parseFirstMatch(inText: fullTag) else { return }
+
+                builder[ogType] = decodeHTMLEntities(in: content)
+            }
+    }
+
+    /// Decodes HTML entities by importing `string` as an HTML document and returning its plain text.
+    ///
+    /// NOTE(review): Apple documents that the `NSAttributedString` HTML importer should not be used from a
+    /// background thread; confirm which thread `construct(parsing:)` is called on.
+    ///
+    /// - Returns: The decoded text, or `nil` if the string could not be UTF-8 encoded or imported.
+    private static func decodeHTMLEntities(in string: String) -> String? {
+        guard let data = string.data(using: .utf8) else {
+            return nil
+        }
+
+        let options: [NSAttributedString.DocumentReadingOptionKey: Any] = [
+            .documentType: NSAttributedString.DocumentType.html,
+            .characterEncoding: String.Encoding.utf8.rawValue
+        ]
+
+        guard let attributedString = try? NSAttributedString(data: data, options: options, documentAttributes: nil) else {
+            return nil
+        }
+        return attributedString.string
+    }
+}
+
+// MARK: - Regular Expressions
+extension HTMLMetadata {
+    static let titleRegex = regex(pattern: "<\\s*title[^>]*>(.*?)<\\s*/title[^>]*>")
+    static let faviconRegex = regex(pattern: "<\\s*link[^>]*rel\\s*=\\s*\"\\s*(shortcut\\s+)?icon\\s*\"[^>]*>")
+    static let faviconUrlRegex = regex(pattern: "href\\s*=\\s*\"([^\"]*)\"")
+    static let metaDescriptionRegex = regex(pattern: "<\\s*meta[^>]*name\\s*=\\s*\"\\s*description[^\"]*\"[^>]*>")
+    static let metaPropertyRegex = regex(pattern: "<\\s*meta[^>]*property\\s*=\\s*\"\\s*([^\"]+?)\"[^>]*>")
+    static let metaContentRegex = regex(pattern: "content\\s*=\\s*\"([^\"]*?)\"")
+
+    /// Compiles `pattern`, matching case-insensitively and letting `.` span line breaks.
+    ///
+    /// `try!` is deliberate: every pattern passed in is a literal in this file, so a failure is a programmer error.
+    static private func regex(pattern: String) -> NSRegularExpression {
+        try! NSRegularExpression(
+            pattern: pattern,
+            options: [.dotMatchesLineSeparators, .caseInsensitive])
+    }
+}
diff --git a/SignalServiceKit/src/Messages/Interactions/OWSLinkPreview.swift b/SignalServiceKit/src/Messages/Interactions/OWSLinkPreview.swift
index 80faf4545..cd9e63987 100644
--- a/SignalServiceKit/src/Messages/Interactions/OWSLinkPreview.swift
+++ b/SignalServiceKit/src/Messages/Interactions/OWSLinkPreview.swift
@@ -291,83 +291,6 @@ public class OWSLinkPreview: MTLModel {
         return result.filterStringForDisplay()
     }
 
-    // MARK: - Whitelists
-
-    // For link domains, we require an exact match - no subdomains allowed.
-    //
-    // Note that order matters in this whitelist since the logic for determining
-    // how to render link preview domains in displayDomain(...) uses the first match.
-    // We should list TLDs first and subdomains later.
-    private static let linkDomainWhitelist = [
-        // YouTube
-        "youtube.com",
-        "www.youtube.com",
-        "m.youtube.com",
-        "youtu.be",
-
-        // Reddit
-        "reddit.com",
-        "www.reddit.com",
-        "m.reddit.com",
-        // NOTE: We don't use redd.it.
-
-        // Imgur
-        //
-        // NOTE: Subdomains are also used for content.
-        //
-        // For example, you can access "user/member" pages: https://sillygoose2.imgur.com/
-        // A different member page can be accessed without a subdomain: https://imgur.com/user/SillyGoose2
-        //
-        // I'm not sure we need to support these subdomains; they don't appear to be core functionality.
-        "imgur.com",
-        "www.imgur.com",
-        "m.imgur.com",
-
-        // Instagram
-        "instagram.com",
-        "www.instagram.com",
-        "m.instagram.com",
-
-        // Pinterest
-        "pinterest.com",
-        "www.pinterest.com",
-        "pin.it",
-
-        // Giphy
-        "giphy.com",
-        "media.giphy.com",
-        "media1.giphy.com",
-        "media2.giphy.com",
-        "media3.giphy.com",
-        "gph.is"
-    ]
-
-    // For media domains, we DO NOT require an exact match - subdomains are allowed.
-    private static let mediaDomainWhitelist = [
-        // YouTube
-        "ytimg.com",
-
-        // Reddit
-        "redd.it",
-
-        // Imgur
-        "imgur.com",
-
-        // Instagram
-        "cdninstagram.com",
-        "fbcdn.net",
-
-        // Pinterest
-        "pinimg.com",
-
-        // Giphy
-        "giphy.com"
-    ]
-
-    private static let protocolWhitelist = [
-        "https"
-    ]
-
     @objc
     public func displayDomain() -> String? {
         return OWSLinkPreview.displayDomain(forUrl: urlString)
@@ -383,13 +306,7 @@ public class OWSLinkPreview: MTLModel {
             owsFailDebug("Invalid url.")
             return nil
         }
-        guard let result = whitelistedDomain(forUrl: url,
-                                             domainWhitelist: OWSLinkPreview.linkDomainWhitelist,
-                                             allowSubdomains: false) else {
-            Logger.error("Missing domain.")
-            return nil
-        }
-        return result
+        return url.host
     }
 
     @objc
@@ -397,9 +314,7 @@ public class OWSLinkPreview: MTLModel {
         guard let url = URL(string: urlString) else {
             return false
         }
-        return whitelistedDomain(forUrl: url,
-                                 domainWhitelist: OWSLinkPreview.linkDomainWhitelist,
-                                 allowSubdomains: false) != nil
+        return true
     }
 
     @objc
@@ -407,36 +322,7 @@ public class OWSLinkPreview: MTLModel {
         guard let url = URL(string: urlString) else {
             return false
         }
-        return whitelistedDomain(forUrl: url,
-                                 domainWhitelist: OWSLinkPreview.mediaDomainWhitelist,
-                                 allowSubdomains: true) != nil
-    }
-
-    private class func whitelistedDomain(forUrl url: URL, domainWhitelist: [String], allowSubdomains: Bool) -> String? {
-        guard let urlProtocol = url.scheme?.lowercased() else {
-            return nil
-        }
-        guard protocolWhitelist.contains(urlProtocol) else {
-            return nil
-        }
-        guard let domain = url.host?.lowercased() else {
-            return nil
-        }
-        guard url.path.count > 1 else {
-            // URL must have non-empty path.
-            return nil
-        }
-
-        for whitelistedDomain in domainWhitelist {
-            if domain == whitelistedDomain.lowercased() {
-                return whitelistedDomain
-            }
-            if allowSubdomains,
-                domain.hasSuffix("." + whitelistedDomain.lowercased()) {
-                return whitelistedDomain
-            }
-        }
-        return nil
+        return true
     }
 
     // MARK: - Serial Queue
@@ -812,31 +698,28 @@ public class OWSLinkPreview: MTLModel {
         }
     }
 
-    // Example:
-    //
-    //     <meta property="og:title" content="Randomness is Random - Numberphile">
-    //     <meta property="og:image" content="https://i.ytimg.com/vi/tP-Ipsat90c/maxresdefault.jpg">
     class func parse(linkData: Data) throws -> OWSLinkPreviewContents {
         guard let linkText = String(bytes: linkData, encoding: .utf8) else {
             owsFailDebug("Could not parse link text.")
             throw LinkPreviewError.invalidInput
         }
+
+        let content = HTMLMetadata.construct(parsing: linkText)
 
         var title: String?
-        if let rawTitle = NSRegularExpression.parseFirstMatch(pattern: "<meta\\s+property\\s*=\\s*\"og:title\"\\s+[^>]*content\\s*=\\s*\"(.*?)\"\\s*[^>]*/?>",
-                                                              text: linkText,
-                                                              options: .dotMatchesLineSeparators) {
-            if let decodedTitle = decodeHTMLEntities(inString: rawTitle) {
-                let normalizedTitle = OWSLinkPreview.normalizeTitle(title: decodedTitle)
-                if normalizedTitle.count > 0 {
-                    title = normalizedTitle
-                }
+        // HTMLMetadata has already decoded HTML entities in these fields; decoding them a
+        // second time would mangle titles that legitimately contain entity-like text.
+        if let decodedTitle = content.ogTitle ?? content.titleTag {
+            let normalizedTitle = OWSLinkPreview.normalizeTitle(title: decodedTitle)
+            if !normalizedTitle.isEmpty {
+                title = normalizedTitle
             }
         }
 
         Logger.verbose("title: \(String(describing: title))")
 
-        guard let rawImageUrlString = NSRegularExpression.parseFirstMatch(pattern: "<meta\\s+property\\s*=\\s*\"og:image\"\\s+[^>]*content\\s*=\\s*\"(.*?)\"[^>]*/?>", text: linkText) else {
+        // NOTE(review): this value is already entity-decoded too, yet it is decoded again just below - confirm.
+        guard let rawImageUrlString = content.ogImageUrlString ?? content.faviconUrlString else {
             return OWSLinkPreviewContents(title: title)
         }
         guard let imageUrlString = decodeHTMLEntities(inString: rawImageUrlString)?.ows_stripped() else {
diff --git a/SignalServiceKit/src/Util/NSRegularExpression+SSK.swift b/SignalServiceKit/src/Util/NSRegularExpression+SSK.swift
index e4574467d..17ff74c78 100644
--- a/SignalServiceKit/src/Util/NSRegularExpression+SSK.swift
+++ b/SignalServiceKit/src/Util/NSRegularExpression+SSK.swift
@@ -52,4 +52,72 @@ public extension NSRegularExpression {
         let substring = String(text[textRange])
         return substring
     }
+
+    /// Returns the first match of the receiver in `searchString` as a `MatchSet`, or `nil` if there is none.
+    @nonobjc
+    func firstMatchSet(in searchString: String) -> MatchSet? {
+        firstMatch(in: searchString, options: [], range: searchString.completeNSRange)?.createMatchSet(originalSearchString: searchString)
+    }
+
+    /// Returns every match of the receiver in `searchString`, in the order `matches(in:options:range:)` reports them.
+    @nonobjc
+    func allMatchSets(in searchString: String) -> [MatchSet] {
+        matches(in: searchString, options: [], range: searchString.completeNSRange).compactMap { $0.createMatchSet(originalSearchString: searchString) }
+    }
+
+}
+
+/// The text of one regular-expression match together with its capture groups.
+public struct MatchSet {
+    /// The text covered by the whole match (range 0).
+    let fullString: Substring
+    /// The capture groups in pattern order; an entry is `nil` when its range could not be mapped into the string.
+    let matchedGroups: [Substring?]
+
+    /// Returns capture group `idx` (zero-based, so `0` is the first parenthesized group),
+    /// or `nil` when `idx` is out of bounds or the group has no text.
+    func group(idx: Int) -> Substring? {
+        // `indices.contains` also rejects negative indices, which would otherwise trap.
+        guard matchedGroups.indices.contains(idx) else { return nil }
+        return matchedGroups[idx]
+    }
+}
+
+fileprivate extension String {
+    /// Returns the substring covered by `nsRange`, or `nil` if the range does not map onto this string.
+    subscript(_ nsRange: NSRange) -> Substring? {
+        guard let swiftRange = Range(nsRange, in: self) else { return nil }
+        return self[swiftRange]
+    }
+
+    /// The range spanning the entire string.
+    var completeRange: Range<String.Index> {
+        startIndex..<endIndex
+    }
+
+    /// `completeRange` expressed as an `NSRange` for Foundation APIs.
+    var completeNSRange: NSRange {
+        NSRange(completeRange, in: self)
+    }
 }
+
+fileprivate extension NSTextCheckingResult {
+    /// Converts this result into a `MatchSet` by slicing `string` with each of the result's ranges.
+    ///
+    /// - Parameter string: The string that was searched to produce this result.
+    /// - Returns: `nil` when the result has no ranges or range 0 cannot be mapped into `string`.
+    func createMatchSet(originalSearchString string: String) -> MatchSet? {
+        guard numberOfRanges > 0 else { return nil }
+        let substrings = (0..<numberOfRanges)
+            .map { range(at: $0) }
+            .map { string[$0] }
+
+        guard let fullString = substrings[0] else {
+            owsFailDebug("Missing expected full string")
+            return nil
+        }
+
+        return MatchSet(fullString: fullString, matchedGroups: Array(substrings[1...]))
+    }
+}
+