// jsClass representing the Puppeteer-based wrapper for get-page-textjsconst playwright = requireplaywright getPageText = requireinjectedget-page-text;const URL_PREFIX = httpsexamplecom;class TextExtract
/*
Class representing the jsdom-based replacement for the Puppeteer-based wrapper for get-page-text.js
*/
const { JSDOM } = require("jsdom");
const URL_PREFIX = "https://example.com/";
/**
 * Extracts the text content of HTML files using one shared jsdom document.
 *
 * Options:
 *   getFile: function(href) returning (or resolving to) {type:, contents:}
 *   logError: function(msg)
 */
class TextExtractor {
  /**
   * @param {{getFile: Function, logError: Function}} options - Callbacks used
   *   to load a file by href and to report problems.
   */
  constructor(options) {
    this.getFile = options.getFile;
    this.logError = options.logError;
    // A single empty DOM is created here and reused by every getPageText call.
    this.dom = new JSDOM();
    this.window = this.dom.window;
    this.document = this.window.document;
  }

  /** No-op: nothing needs to be initialised for jsdom. */
  async initialise() {}

  /**
   * Loads the file for `href` through `getFile` and returns its text content.
   *
   * @param {string} href - Identifier passed straight to `getFile`.
   * @returns {Promise<string>} The `textContent` of the document element after
   *   the file contents have been assigned to its `innerHTML`, or "" when the
   *   file is missing (no result object, or a result with a falsy `type`).
   */
  async getPageText(href) {
    const file = await this.getFile(href);

    // A loader that yields null/undefined is treated exactly like one that
    // yields an object without a type: report it and return empty text,
    // rather than crashing while unpacking the result.
    if (!file || !file.type) {
      this.logError(`Missing file \`${href}\``);
      return "";
    }

    // Overwrites whatever the previous call left in the shared document.
    const root = this.document.documentElement;
    root.innerHTML = file.contents;
    return root.textContent;
  }

  /** No-op: nothing needs to be closed for jsdom. */
  async close() {}
}
exports.TextExtractor = TextExtractor;
// Source: https://www.cveoy.top/t/topic/hYN6 (copyright belongs to the author; do not repost or scrape.)