/*
Class that extracts the text of a page using jsdom
*/

const { JSDOM } = require("jsdom");

const URL_PREFIX = "https://example.com/";

/*
Extracts the text content of pages by loading their HTML into a jsdom document.
One jsdom document is created in the constructor and reused for every call to
getPageText(); each call replaces the markup of the document's root element.
*/
class TextExtractor {

	/*
	Options:
	getFile: function(href); its awaited result is destructured as {type:, contents:}.
	         A falsy `type` is treated as "file is missing".
	logError: function(msg), called with a message when a file is missing
	*/
	constructor(options) {
		this.getFile = options.getFile;
		this.logError = options.logError;
		this.dom = new JSDOM();
		this.window = this.dom.window;
		this.document = this.window.document;
	}

	/*
	Nothing to set up: the jsdom document is already created in the constructor.
	*/
	async initialise() {
		// No need to initialise anything for jsdom
	}

	/*
	Returns the text content of the page identified by `href`.

	href: passed unchanged to the getFile option
	Returns: a promise resolving to the textContent of the document's root
	         element after `contents` has been assigned to its innerHTML, or to
	         "" (after calling logError) when getFile reports no `type`.
	*/
	async getPageText(href) {
		const { type, contents } = await this.getFile(href);

		// Guard clause: a falsy type means the file could not be found
		if (!type) {
			this.logError(`Missing file \`${href}\``);
			return "";
		}

		// Replace the markup of the shared document, then read its text back.
		// NOTE(review): textContent also includes the text inside <script> and
		// <style> elements - confirm that is what callers expect.
		const root = this.document.documentElement;
		root.innerHTML = contents;
		return root.textContent;
	}

	/*
	Nothing to release: no browser or other external process is started.
	*/
	async close() {
		// No need to close anything for jsdom
	}

}

exports.TextExtractor = TextExtractor;
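// NOTE(review): the lines below are not code; they appear to be residue from the web page this file was copied from.
// jsClass representing the Puppeteer-based wrapper for get-page-textjsconst playwright = requireplaywright	getPageText = requireinjectedget-page-text;const URL_PREFIX = httpsexamplecom;class TextExtract
//
// Original source: https://www.cveoy.top/t/topic/hYN6 - copyright belongs to the author. Do not reproduce or scrape.
//
// Free AI: click here, no registration or login required.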