/* This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ const lazy = {}; ChromeUtils.defineESModuleGetters(lazy, { NetUtil: "resource://gre/modules/NetUtil.sys.mjs", ReaderMode: "moz-src:///toolkit/components/reader/ReaderMode.sys.mjs", }); /** * Represents a child actor for handling link previews in the browser. * Interacts with content windows and handles events related to link previews. * * @class LinkPreviewChild * @augments {JSWindowActorChild} */ export class LinkPreviewChild extends JSWindowActorChild { /** * Handles incoming messages from the parent actor. * * @param {object} message - The message object containing name and data. * @param {string} message.name - The name of the message. * @param {object} message.data - Data associated with the message. * @returns {Promise|undefined} The result of fetchPageData if applicable. */ async receiveMessage({ name, data }) { if (name === "LinkPreview:FetchPageData") { return this.fetchPageData(data.url); } //expected a return value. consistent-return (eslint) return undefined; } /** * Fetches the HTML content from the given URL. * * @param {string} url - The URL to fetch. * @returns {Promise} The HTML content as a string. * @throws {Error} If the fetch fails or the content type is invalid. */ fetchHTML(url) { const uri = lazy.NetUtil.newURI(url); if (!uri.schemeIs("https")) { throw Components.Exception( "Only handling https", Cr.NS_ERROR_UNKNOWN_PROTOCOL ); } // Make requests with a channel to automatically get safe browsing checks. // Use null principals in combination with anonymous for now ahead of // fetching content with cookies to handle sites requiring login. const principal = Services.scriptSecurityManager.createNullPrincipal({}); const channel = lazy.NetUtil.newChannel({ contentPolicyType: Ci.nsIContentPolicy.TYPE_DOCUMENT, loadingPrincipal: principal, securityFlags: Ci.nsILoadInfo.SEC_ALLOW_CROSS_ORIGIN_INHERITS_SEC_CONTEXT, triggeringPrincipal: principal, uri, }).QueryInterface(Ci.nsIHttpChannel); channel.loadFlags = Ci.nsIRequest.LOAD_ANONYMOUS; // Specially identify this request, e.g., for publishers to opt out channel.setRequestHeader("x-firefox-ai", "1", false); const { promise, resolve, reject } = Promise.withResolvers(); const MAX_CONTENT_LENGTH = 5 * 1024 * 1024; // 5 MB limit let charset = "utf-8"; const byteChunks = []; let totalLength = 0; channel.asyncOpen({ onDataAvailable(request, stream, offset, count) { totalLength += count; if (totalLength > MAX_CONTENT_LENGTH) { request.cancel(Cr.NS_ERROR_FILE_TOO_BIG); } else { byteChunks.push(lazy.NetUtil.readInputStream(stream, count)); } }, onStartRequest(request) { const http = request.QueryInterface(Ci.nsIHttpChannel); // Enforce text/html if provided by server let contentType = ""; try { contentType = http.getResponseHeader("content-type"); } catch (ex) {} if (contentType && !contentType.startsWith("text/html")) { request.cancel(Cr.NS_ERROR_FILE_UNKNOWN_TYPE); } // Save charset without quotes or spaces for TextDecoder const match = contentType.match(/charset=["' ]*([^;"' ]+)/i); if (match) { charset = match[1]; } // Enforce max length if provided by server try { if (http.getResponseHeader("content-length") > MAX_CONTENT_LENGTH) { request.cancel(Cr.NS_ERROR_FILE_TOO_BIG); } } catch (ex) {} }, onStopRequest(_request, status) { if (Components.isSuccessCode(status)) { const bytes = new Uint8Array(totalLength); let offset = 0; for (const chunk of byteChunks) { bytes.set(new Uint8Array(chunk), offset); offset += chunk.byteLength; } const decoder = new TextDecoder(charset); resolve(decoder.decode(bytes)); } else { reject(Components.Exception("Failed to fetch HTML", status)); } }, }); return promise; } /** * Fetches HTML content from a URL and parses its meta tags and page text. * * @param {string} url - The URL to fetch and parse. * @returns {Promise} An object containing meta information, page text, and HTML code. */ async fetchPageData(url) { const ret = { article: {}, rawMetaInfo: {}, url, }; try { const htmlCode = await this.fetchHTML(url); ret.urlComponents = this.extractUrlComponents(url); const parser = new DOMParser(); const doc = parser.parseFromString(htmlCode, "text/html"); ret.rawMetaInfo = this.parseMetaTagsFromDoc(doc); if (!this.isProbablyReaderable(doc)) { // Add normalized metadata even if the document isn't reader-able ret.meta = this.extractNormalizedMetadata(ret.rawMetaInfo); return ret; } ret.article = await this.getArticleDataFromDoc(doc); ret.meta = this.extractNormalizedMetadata(ret.rawMetaInfo, ret.article); } catch (error) { console.error(`Failed to fetch and parse page data: ${error}`); ret.error = { message: error.message, result: error.result }; // Add empty normalized metadata in case of error ret.meta = this.extractNormalizedMetadata(); } return ret; } /** * Extracts and normalizes metadata from the page's meta tags and article content. * * @param {object} metaData - Metadata extracted from the page's meta tags (Open Graph, Twitter, HTML) * @param {object} articleData - Data extracted from the article content using ReaderMode * @returns {object} Normalized metadata containing: * - title: Page title prioritizing Open Graph, Twitter, then HTML title * - description: Content excerpt or meta description from various sources * - imageUrl: HTTPS-only URL of the page's primary image * - isMissingMetadata: Boolean flag indicating if description is missing */ extractNormalizedMetadata(metaData = {}, articleData = {}) { const title = metaData["og:title"] || metaData["twitter:title"] || metaData["html:title"] || ""; const description = articleData.excerpt || metaData["og:description"] || metaData["twitter:description"] || metaData.description || ""; let imageUrl = metaData["og:image"] || metaData["twitter:image:src"] || ""; if (!imageUrl.startsWith("https://")) { imageUrl = ""; } return { title, description, imageUrl, }; } /** * Extracts URL components including domain and filename. * * @param {string} url - The URL to extract information from. * @returns {object} Object containing domain and filename. */ extractUrlComponents(url) { try { const urlObj = new URL(url); const domain = urlObj.hostname; // Extract the filename (last part of pathname) let pathname = urlObj.pathname; // Remove trailing slash if present if (pathname.endsWith("/")) { pathname = pathname.slice(0, -1); } // Get last segment of path const pathParts = pathname.split("/"); const filename = pathParts[pathParts.length - 1] || domain; return { domain, filename }; } catch (e) { // Return both properties with same fallback value if URL is invalid return { domain: url, filename: url }; } } /** * Parses meta tags from the provided Document into a key-value object. * Also extracts the title if available. * * @param {Document} doc - The parsed HTML document. * @returns {object} An object containing meta tag key-value pairs. */ parseMetaTagsFromDoc(doc) { const metaTags = doc.querySelectorAll("meta"); const metaInfo = {}; // TODO: Define the meta tags we are interested in const desiredMetaNames = [ "description", "og:image", "title", "og:title", "twitter:title", "og:description", "twitter:description", "twitter:image:src", ]; metaTags.forEach(tag => { const name = tag.getAttribute("name") || tag.getAttribute("property"); const content = tag.getAttribute("content"); if (name && content) { if (desiredMetaNames.includes(name.toLowerCase())) { metaInfo[name] = content; } } }); const title = doc.querySelector("title")?.textContent; if (title) { metaInfo["html:title"] = title; } return metaInfo; } /** * Extracts article data from the provided Document using ReaderMode. * * @param {Document} doc - The parsed HTML document. * @returns {Promise} The extracted article data including specified fields. */ async getArticleDataFromDoc(doc) { try { const article = await lazy.ReaderMode.parseDocument(doc); if (article) { const { title, byline, content, detectedLanguage, length, siteName, excerpt, readingTimeMinsSlow, readingTimeMinsFast, } = article; // parseDocument return a `textContent` that strips structure and newlines, which we need for the model. // So we convert the HTML `content` to plain text directly, preserving formatting and newlines. const textContent = Cc["@mozilla.org/parserutils;1"] .getService(Ci.nsIParserUtils) .convertToPlainText( content, Ci.nsIDocumentEncoder.OutputSelectionOnly | // Use only selected reader-view fragment Ci.nsIDocumentEncoder.OutputAbsoluteLinks | Ci.nsIDocumentEncoder.OutputFormatted, // Pretty-print formatting 0 // No line-wrapping ); return { title, byline, textContent, detectedLanguage, length, siteName, excerpt, readingTimeMinsFast, readingTimeMinsSlow, }; } } catch (error) { console.error("Error parsing document with ReaderMode:", error); } return {}; } /** * Decides whether or not the document is reader-able without parsing the whole thing. * * @param {Document} doc - The document to check for readability * @param {object} [options={}] Configuration object. * @param {number} [options.minContentLength=140] The minimum node content length used to decide if the document is readerable. * @param {number} [options.minScore=20] The minumum cumulated 'score' used to determine if the document is readerable. * @param {Function} [options.visibilityChecker=isNodeVisible] The function used to determine if a node is visible. * @returns {boolean} Whether or not we suspect Readability.parse() will suceeed at returning an article object. */ isProbablyReaderable(doc, options = {}) { // For backward compatibility reasons 'options' can either be a configuration object or the function used // to determine if a node is visible. if (typeof options == "function") { options = { visibilityChecker: options }; } var defaultOptions = { minScore: 20, minContentLength: 140, visibilityChecker: this.isNodeVisible, }; options = Object.assign(defaultOptions, options); var nodes = doc.querySelectorAll("p, pre, article"); // Get
nodes which have
node(s) and append them into the `nodes` variable. // Some articles' DOM structures might look like //
// Sentences
//
// Sentences
//
var brNodes = doc.querySelectorAll("div > br"); if (brNodes.length) { var set = new Set(nodes); [].forEach.call(brNodes, function (node) { set.add(node.parentNode); }); nodes = Array.from(set); } var score = 0; // This is a little cheeky, we use the accumulator 'score' to decide what to return from // this callback: return [].some.call(nodes, function (node) { if (!options.visibilityChecker(node)) { return false; } var REGEXPS = { // NOTE: These two regular expressions are duplicated in // Readability.js. Please keep both copies in sync. unlikelyCandidates: /-ad-|ai2html|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|footer|gdpr|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i, okMaybeItsACandidate: /and|article|body|column|content|main|shadow/i, }; var matchString = node.className + " " + node.id; if ( REGEXPS.unlikelyCandidates.test(matchString) && !REGEXPS.okMaybeItsACandidate.test(matchString) ) { return false; } if (node.matches("li p")) { return false; } var textContentLength = node.textContent.trim().length; if (textContentLength < options.minContentLength) { return false; } score += Math.sqrt(textContentLength - options.minContentLength); if (score > options.minScore) { return true; } return false; }); } /** * Determines whether a node is visible in the document. * * @param {Node} node - The DOM node to check for visibility * @returns {boolean} True if the node is considered visible, false otherwise * * This method checks several visibility attributes: * - Verifies the node's display style is not 'none' * - Checks that the node doesn't have a 'hidden' attribute * - Ensures the aria-hidden attribute is not 'true' (with an exception for fallback images) */ isNodeVisible(node) { // Have to null-check node.style and node.className.includes to deal with SVG and MathML nodes. return ( (!node.style || node.style.display != "none") && !node.hasAttribute("hidden") && //check for "fallback-image" so that wikimedia math images are displayed (!node.hasAttribute("aria-hidden") || node.getAttribute("aria-hidden") != "true" || (node.className && node.className.includes && node.className.includes("fallback-image"))) ); } }