diff options
author | Sarah German <sgerman@gitlab.com> | 2022-08-18 18:02:30 +0300 |
---|---|---|
committer | Marcel Amirault <mamirault@gitlab.com> | 2022-08-18 18:02:30 +0300 |
commit | ab74a97edcc8a2c366cc68ffa2c5434fb32df423 (patch) | |
tree | 8684d88d9b5d1a4b335d7bb2638242313463f60e /scripts | |
parent | 5a2916f6bcdf04f06376213c2e6435f5b44aaae1 (diff) |
Add Lunr.js index script
Diffstat (limited to 'scripts')
-rw-r--r-- | scripts/lunr/preindex.js | 106 |
1 files changed, 106 insertions, 0 deletions
// Contents of scripts/lunr/preindex.js (new file, mode 100644).
// In the repository the file begins with the hashbang `#!/usr/bin/env node`.

/**
 * @file preindex.js
 * Creates data files required for Lunr search.
 *
 * This script creates two JSON files:
 * - lunr-index.json: A serialized search index.
 * - lunr-map.json: Maps index item IDs to their human-readable titles.
 *
 * @see https://lunrjs.com/guides/index_prebuilding.html
 */

/* eslint-disable no-console */

const fs = require('fs');
const lunr = require('lunr');
const cheerio = require('cheerio');
const glob = require('glob');

const htmlSrc = 'public/';
const outputDir = `${htmlSrc}assets/javascripts`;

/**
 * Find all HTML files within a given path,
 * then execute a callback function to build the index.
 *
 * @param {String} path
 *   Directory to search (recursively) for .html files
 * @param {Function} callback
 *   Passed straight to glob; invoked as callback(err, filenames)
 */
const buildIndex = (path, callback) => {
  glob(`${path}/**/*.html`, callback);
};

/**
 * Extracts text from a given HTML element.
 *
 * @param {cheerio} $
 *   A Cheerio page object
 * @param {String} element
 *   An HTML element (selector) to search for
 *
 * @return {String}
 *   Text of every matching element, newlines removed, joined with spaces.
 *   Empty string when nothing matches.
 */
const getText = ($, element) =>
  $(element)
    .toArray()
    // A string pattern would only strip the first newline, so use a
    // global regex to remove all of them.
    .map((el) => $(el).text().replace(/\n/g, ''))
    .join(' ');

/**
 * Serializes data as JSON and writes it into the output directory.
 * Write failures are logged; they do not abort the script.
 *
 * @param {String} name
 *   File name (relative to outputDir)
 * @param {*} data
 *   Any JSON-serializable value
 */
const writeJson = (name, data) => {
  fs.writeFile(`${outputDir}/${name}`, JSON.stringify(data), (fsErr) => {
    if (fsErr) {
      console.error(fsErr);
    }
  });
};

/**
 * Build the index and output files.
 */
buildIndex(htmlSrc, (err, filenames) => {
  if (err) {
    // Without a file list there is nothing to index; stop here instead of
    // failing later with an unrelated TypeError, and signal failure to the
    // calling build step.
    console.error(err);
    process.exitCode = 1;
    return;
  }

  // Create an array of objects containing each page's text content.
  // Pages without any h1 text are skipped.
  const pages = [];
  filenames.forEach((filename) => {
    const $ = cheerio.load(fs.readFileSync(filename));
    const title = getText($, 'h1');

    if (title.length) {
      pages.push({
        // Drops the first htmlSrc.length characters; assumes glob returns
        // paths that start with htmlSrc.
        id: filename.slice(htmlSrc.length),
        h1: title,
        h2: getText($, 'h2'),
        h3: getText($, 'h3'),
      });
    }
  });

  // Build the serialized Lunr search index.
  // Higher-level headings are boosted so they rank above lower-level ones.
  const idx = lunr((builder) => {
    builder.ref('id');
    builder.field('h1', { boost: 10 });
    builder.field('h2', { boost: 5 });
    builder.field('h3', { boost: 2 });
    pages.forEach((doc) => {
      builder.add(doc);
    });
  });

  // Write the index file.
  writeJson('lunr-index.json', idx);

  // Write the map file.
  // We can drop h2s and h3s from this since we don't display those in results.
  const pageMap = pages.map(({ id, h1 }) => ({ id, h1 }));
  writeJson('lunr-map.json', pageMap);
});