Summary
In this post, I'll demonstrate a simple web crawler implemented in Node.js. The crawled documents are parsed with Apache Tika, stored in Redis as JSON, and indexed for full-text search with RediSearch.
Architecture
High Level
Detailed
Crawler POST Endpoint
/**
 * Crawl endpoint.  Starts a Worker thread (spider.js) that crawls the given fqdn, extracts text via Tika,
 * and then stores the resulting text in Redis as JSON documents.
 * This returns immediately and provides a taskID for the Worker thread.
 */
app.post('/crawl', (req, res) => {
    console.log(`app - POST /crawl ${req.body.fqdn}`);
    const taskID = uuidv4();
    try {
        new Worker('./app/spider.js', { workerData: { 'fqdn': req.body.fqdn, 'taskID': taskID } });
        res.status(201).json({ 'taskID': taskID });
    } catch (err) {
        console.error(`app - POST /crawl ${req.body.fqdn} - ${err.message}`);
        res.status(400).json({ 'error': err.message });
    }
});
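Once the service is up, kicking off a crawl is a single POST. Here's a minimal sketch with curl; the hostname, port, and target site are my assumptions, not from the original code:

curl -X POST \
    -H 'Content-Type: application/json' \
    -d '{"fqdn": "https://example.com"}' \
    http://localhost:8080/crawl

The endpoint responds immediately with something like {"taskID":"5b1e6c2a-..."}, while the Worker thread carries on crawling in the background.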
Text Extraction with Tika
async extract(doc, data, hash) {
    const stream = Readable.from(data);  // get a stream from the arrayBuffer obj
    const response = await axios({       // send that stream to Tika for automatic MIME-type detection and text extraction
        method: 'PUT',
        url: `${tikaUrl}/tika`,
        data: stream,
        responseType: 'text',
        headers: {
            'Content-Type': 'application/octet-stream',
            'Accept': 'text/plain'
        }
    });
    const json = {
        "doc": doc,
        "text": response.data,
        "hash": hash
    };
    await this.client.json.set(`${PREFIX}:${doc}`, '.', json);
}
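For context, here's a rough sketch of how a caller in the spider might produce the data and hash arguments before invoking extract(): fetch the raw bytes with axios and compute a digest for change detection. The function name fetchAndExtract and the choice of SHA-256 are illustrative assumptions, not taken from the original spider.js:

const axios = require('axios');
const crypto = require('crypto');

// Hypothetical caller: download a document as raw bytes, hash it,
// then hand both to the extractor above.
async function fetchAndExtract(extractor, url) {
    const response = await axios({
        method: 'GET',
        url: url,
        responseType: 'arraybuffer'   // raw bytes; Tika will detect the MIME type later
    });
    const hash = crypto.createHash('sha256').update(response.data).digest('hex');
    await extractor.extract(url, response.data, hash);
}

Hashing the raw bytes gives the crawler a cheap way to skip re-indexing documents that haven't changed between crawls.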
Index Creation
async function buildIndex() {
    console.log(`app - buildIndex`);
    let rc = await clientFactory();
    try {
        await rc.ft.create('docIdx', {
            '$.doc': {
                type: redis.SchemaFieldTypes.TEXT,
                AS: 'doc'
            },
            '$.text': {
                type: redis.SchemaFieldTypes.TEXT,
                AS: 'text'
            }
        }, {
            ON: 'JSON',
            PREFIX: 'DOC'
        });
    } catch (err) {
        console.error(`app - buildIndex - ${err.message}`);
    }
}
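With the index in place, the crawled documents can be queried with RediSearch's full-text syntax. A minimal sketch, assuming the same node-redis client (rc) as above; the query term is illustrative:

const results = await rc.ft.search('docIdx', '@text:(redis)');
console.log(`total hits: ${results.total}`);
for (const doc of results.documents) {
    console.log(doc.id, doc.value.doc);  // Redis key and the indexed 'doc' field
}

Because the index is built ON JSON, a search returns the matching keys along with their JSON values, so the source URL and extracted text come back in a single round trip.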
Source
Copyright ©1993-2024 Joey E Whelan, All rights reserved.