HTML 文件文本提取测试 - getPageText() 功能测试
/*
Run tests of getPageText() against the HTML fixture files found under `inputDir`.

Exit status: 0 when every fixture matches its expected results, 1 when any
fixture fails or an unexpected error is thrown.
*/

const inputDir = "./bin/fixtures/html/";

const fs = require("fs"),
	path = require("path"),
	{promisify} = require("util"),
	readFileAsync = promisify(fs.readFile),
	writeFileAsync = promisify(fs.writeFile),
	{TextExtractor} = require("./text-extractor"),
	flattenTree = require("./flatten-tree").flattenTree;

/*
Run every fixture page through the text extractor, one at a time.

Resolves to an array containing the results object of each page whose output
did not match its expectations (an empty array means every page passed).
*/
async function main() {
	// Collect up paths of .html files in the input directory (extension match is case-sensitive)
	const filepaths = [];
	function scanDirectory(pathname) {
		for(const filename of fs.readdirSync(pathname)) {
			const p = path.resolve(pathname,filename);
			// lstat does not follow symlinks, so symlinked directories are not descended into
			if(fs.lstatSync(p).isDirectory()) {
				scanDirectory(p);
			} else if(path.extname(filename) === ".html") {
				filepaths.push(p);
			}
		}
	}
	scanDirectory(inputDir);
	// Accumulate test failures
	const failures = [];
	// Test each page in turn
	for(const filepath of filepaths) {
		const results = await testPage(filepath);
		// Compare the results
		if(!compareResults(results)) {
			failures.push(results);
		}
	}
	return failures;
}

/*
Extract the text of a single fixture page.

filepath: path of the HTML file to be served to the extractor as "index.html"

Resolves to the object returned by getPageText(), with each chunk's `nodes`
replaced by a flattened `text` property and with `filepath` added.
*/
async function testPage(filepath) {
	// Setup the text extractor
	const textExtractor = new TextExtractor({
		// Only "index.html" is served (from the fixture file); any other href yields null type and contents
		getFile: async fileHref => {
			if(fileHref === "index.html") {
				return {
					type: "text/html",
					contents: await readFileAsync(filepath,"utf8")
				};
			}
			return {
				type: null,
				contents: null
			};
		},
		logError: msg => {
			console.log("Text extractor error: " + msg);
		}
	});
	// Get the text of the page
	const results = await textExtractor.getPageText("index.html");
	// Flatten the nodes of the results so that chunks can be compared as plain text
	for(const chunk of results.chunks) {
		chunk.text = flattenTree(chunk.nodes);
		delete chunk.nodes;
	}
	results.filepath = filepath;
	return results;
}

/*
Compare the extracted chunks of a page against its expected results.

results: object as returned by testPage(). `chunks` and `expectedResults` are
compared pairwise on `text` and on the comma-joined `anchorIds` (a missing
`anchorIds` is treated as an empty list).
NOTE(review): `expectedResults` is presumably populated by getPageText() from
the fixture itself -- confirm in text-extractor.

Returns true when both lists have the same length and every pair matches.
*/
function compareResults(results) {
	const actual = results.chunks,
		expected = results.expectedResults;
	// A page lacking either list is reported as a failure rather than throwing
	// a TypeError that would abort the whole run without naming the page
	if(!actual || !expected || actual.length !== expected.length) {
		return false;
	}
	for(let index = 0; index < actual.length; index++) {
		const r = actual[index],
			e = expected[index];
		if(r.text !== e.text || (r.anchorIds || []).join(",") !== (e.anchorIds || []).join(",")) {
			return false;
		}
	}
	return true;
}

main().then(results => {
	// Check for failures
	if(results.length === 0) {
		process.exit(0);
	} else {
		console.error("Tests failed");
		console.error(JSON.stringify(results,null,4));
		process.exit(1);
	}
}).catch(err => {
	console.error(err);
	process.exit(1);
});
原文地址: https://www.cveoy.top/t/topic/pH3P 著作权归作者所有。请勿转载和采集!