Saturday, October 15, 2022

Web Crawler with Redis Indexing

Summary

In this post, I'll demonstrate a simple web crawler implemented in Node.js. Crawled documents are parsed with Apache Tika, and the extracted text is stored in Redis and indexed with RediSearch.

Architecture

High Level


Detailed


Crawler POST Endpoint


 
/**
 * Crawl endpoint.  Starts a Worker thread (spider.js) that crawls the given fqdn, extracts text via Tika,
 * and then stores the resulting text in Redis as JSON documents.
 * This returns immediately and provides a taskID for the Worker thread.
 *
 * Request body: { fqdn: string }
 * Responses:
 *   201 { taskID } - the Worker was started
 *   400 { error }  - fqdn was missing/blank, or the Worker could not be constructed
 */
app.post('/crawl', (req, res) => {
    // req.body is undefined when no body parser matched the request; don't throw on that.
    const fqdn = req.body ? req.body.fqdn : undefined;
    console.log(`app - POST /crawl ${fqdn}`);

    // Reject bad input up front rather than spawning a Worker that has nothing to crawl.
    if (typeof fqdn !== 'string' || fqdn.trim() === '') {
        res.status(400).json({ 'error': 'fqdn is required' });
        return;
    }

    const taskID = uuidv4();
    try {
        const worker = new Worker('./app/spider.js', { workerData : { 'fqdn': fqdn, 'taskID': taskID }});
        // Failures inside the Worker surface after this handler has returned, so the try/catch
        // below cannot see them.  Without a listener, the Worker's 'error' event would be
        // thrown as an uncaught exception in the main thread.
        worker.on('error', (err) => {
            console.error(`app - POST /crawl ${fqdn} - task ${taskID} - ${err.message}`);
        });
        res.status(201).json({'taskID': taskID});
    }
    catch (err) {
        console.error(`app - POST /crawl ${fqdn} - ${err.message}`);
        res.status(400).json({ 'error': err.message });
    }
});

Text Extraction with Tika

    async extract(doc, data, hash) {
        const stream = Readable.from(data);  //get a stream from the arrayBuffer obj
        const response = await axios({  //send that stream to Tika for automatic mime-type detection and text extraction
            method: 'PUT',
            url: `${tikaUrl}/tika`,
            data: stream,
            responseType: 'text',
            headers: {
                'Content-Type': 'application/octet-stream',
                'Accept': 'text/plain'
            }
        });
        const json = { "doc": doc, "text": response.data, "hash": hash };
        await this.client.json.set(`${PREFIX}:${doc}`, '.', json);
    }

Index Creation

/**
 * Creates the 'docIdx' search index over JSON documents whose keys start with 'DOC'.
 * The '$.doc' and '$.text' JSON paths are indexed as TEXT fields under the aliases
 * 'doc' and 'text'.  A failure from the create call is logged, not rethrown.
 */
async function buildIndex() {
    console.log(`app - buildIndex`);
    const rc = await clientFactory();

    // Both indexed fields share the same shape; only the alias differs.
    const textField = (alias) => ({ type: redis.SchemaFieldTypes.TEXT, AS: alias });

    try {
        await rc.ft.create(
            'docIdx',
            { '$.doc': textField('doc'), '$.text': textField('text') },
            { ON: 'JSON', PREFIX: 'DOC' }
        );
    }
    catch (err) {
        console.error(`app - buildIndex - ${err.message}`);
    }
}

Source


Copyright ©1993-2024 Joey E Whelan, All rights reserved.