This is the second part of our Puppeteer-based simple scraper series. In this article, we show how to scrape any website with a given set of rules using the Puppeteer library. If you're new to Puppeteer, check out our introduction on how to set up Puppeteer and an overview of its capabilities, or the first part of this series, which provides a step-by-step tutorial for scraping content using Puppeteer in Node.js.

Rules format

First let's define the format of our rules:

interface Rule {
    selector: string;
    type: 'text' | 'link' | 'image' | 'attr' | 'obj';
    attribute?: string;
    multiple?: boolean;
    children?: { [name: string]: Rule };
}
  • selector: the CSS selector of the element(s) to scrape,
  • type: the type of content to extract from the matching element,
  • attribute: optional, required only when type is attr,
  • multiple: whether to scrape all matching elements or just the first one,
  • children: when type is obj, a map of field names to rules used to scrape data from each matching element (see the example below).
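For example, a rule that grabs the title and link of every article card on a page might look like this (the selectors here are hypothetical):

// a sample rule: scrape the title and link of every article card on a page
const rule = {
    selector: '.article-card',   // matches every card element
    type: 'obj',                 // extract an object per matching element
    multiple: true,              // all matches, not just the first one
    children: {
        title: {selector: '.article-title', type: 'text'},
        url: {selector: 'a.article-link', type: 'link'}
    }
};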

Scrape helper functions

Let's define a set of functions for extracting each of the types above:

vim ./utils/scrape-utils.js

1. Scrape element of type text

// ./utils/scrape-utils.js

/**
 * @param {Document|Element} document
 * @param {string} selector
 * @param {boolean} isMultiple
 * @returns {(string|null)[]|string|null}
 */
function scrapeText(document, selector, isMultiple = false) {
    if (!isMultiple) {
        let element = document.querySelector(selector);
        return element ? element.textContent : null;
    }

    const elements = Array.from(document.querySelectorAll(selector));
    return elements.map(el => el ? el.textContent : null);

}

2. Scrape element of type attr

// ./utils/scrape-utils.js

/**
 *
 * @param {Document|Element} document
 * @param {string} selector
 * @param {string} attribute
 * @param {boolean} isMultiple
 * @returns {(string|null)[]|string|null}
 */
function scrapeAttribute(document, selector, attribute, isMultiple = false) {
    if (!isMultiple) {
        let element = document.querySelector(selector);
        return element ? element.getAttribute(attribute) : null;
    }

    const elements = Array.from(document.querySelectorAll(selector));
    return elements.map(el => el ? el.getAttribute(attribute) : null);

}

3. Scrape element of type image and link

These two types are just special cases of type attr.

// ./utils/scrape-utils.js

/**
 * @param {Document|Element} document
 * @param {string} selector
 * @param {boolean} isMultiple
 * @returns {(string|null)[]|string|null}
 */
function scrapeImage(document, selector, isMultiple = false) {
    return scrapeAttribute(document, selector, 'src', isMultiple);
}

/**
 * @param {Document|Element} document
 * @param {string} selector
 * @param {boolean} isMultiple
 * @returns {(string|null)[]|string|null}
 */
function scrapeLink(document, selector, isMultiple = false) {
    return scrapeAttribute(document, selector, 'href', isMultiple);
}

4. Scrape element of type obj

// ./utils/scrape-utils.js

/**
 * @param {Document|Element} document
 * @param {string} selector
 * @param {boolean} isMultiple
 * @param {Object} children - map of field names to rules
 * @returns {{}[]|{}|null}
 */
function scrapeObject(document, selector, isMultiple, children) {

    // drop null matches so a missing element doesn't throw
    const elements = (isMultiple ? Array.from(document.querySelectorAll(selector)) : [document.querySelector(selector)]).filter(Boolean);

    const data = elements.map(element => {
        const item = {};
        for (const key in children) {
            item[key] = scrapeRule(element, children[key]);
        }
        return item;
    });

    return isMultiple ? data : (data[0] ?? null);

}

The document parameter can be either the global document object or an HTML element within the page.
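For instance, in the browser context the same helper can run against the whole page or be scoped to a single element (the selectors are illustrative):

// scrape all titles in the page
scrapeText(document, '.title', true);

// scrape the title of a single product card only
const card = document.querySelector('.product-card');
scrapeText(card, '.title');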

Now let's define the scrapeRule function that ties them all together:

// ./utils/scrape-utils.js

/**
 * @param {Document|Element} document
 * @param {Rule} rule
 * @returns {{}[]|{}|(string|null)[]|string|null}
 */
function scrapeRule(document, rule) {
    switch (rule.type) {
        case 'text':
            return scrapeText(document, rule.selector, rule.multiple);

        case 'link':
            return scrapeLink(document, rule.selector, rule.multiple);

        case 'image':
            return scrapeImage(document, rule.selector, rule.multiple);

        case 'attr':
            return scrapeAttribute(document, rule.selector, rule.attribute, rule.multiple);

        case 'obj':
            return scrapeObject(document, rule.selector, rule.multiple, rule.children);

        default:
            throw new Error(`Unknown rule type ${rule.type}`);

    }
}
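Since we will require these helpers from ./utils/page-utils.js later on, ./utils/scrape-utils.js also needs to export them:

// ./utils/scrape-utils.js

module.exports = {
    scrapeText,
    scrapeAttribute,
    scrapeImage,
    scrapeLink,
    scrapeObject,
    scrapeRule
};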

Customize our scraper function

Let's go back to the scraper function we defined in the previous article. Here it is one more time:

// simple-scraper.js

const debug = require('debug')('simple-scraper');

// import the function we created before
const {startBrowser} = require("./utils/browser-utils");


async function scraper(url, options = {}) {
    let browser;

    const {
        headless = true,
        gotoOptions = {
            timeout: null,
            waitUntil: 'domcontentloaded'
        },
    } = options;

    try {
        debug('Starting');
        browser = await startBrowser({
            headless,
        });

        const page = await browser.newPage();

        debug(`Navigate to URL ${url}`);
        await page.goto(url, {...gotoOptions});

        debug(`Wait for selector: .products-list`)
        await page.waitForSelector('.products-list', {timeout: 0})


        // we used .$$ to select all products
        debug(`Getting product elements: .product-card`);
        const products = await page.$$('.product-card');
        const data = [];

        debug(`Scraping product information of ${products.length} products`);
        for (const product of products) {
            const productData = await product.evaluate(node => {
                return {
                    'title': node.querySelector('.title').innerText,
                    'price': node.querySelector('.price').innerText,
                    'description': node.querySelector('.description').innerText,
                    'image': node.querySelector('.card__image>img').getAttribute('src'),
                }
            });
            data.push(productData);
        }

        return data;


    } catch (e) {
        // handle error
        debug(`Error ${e.message}`);
        return null;
    } finally {

        // close browser
        if (browser) {
            await browser.close();
        }
    }
}

Now let's customize it...

Improve waitFor

First, let's improve the waitFor part so it's configurable: we can wait for an element to load, run a custom function, or simply wait for a given amount of time:

vim ./utils/page-utils.js
// ./utils/page-utils.js

/**
 *
 * @param {import("puppeteer").Page} page
 * @param {number|string|null} waitFor - number of ms to wait, a selector to wait for in the page, or a custom function (as a string) to execute
 * @param {number} waitForTimeout - timeout for waitFor when a selector or function is provided
 */
async function waitForTask(page, waitFor, waitForTimeout) {

    // if it's a number just wait
    if (!isNaN(waitFor)) {
        return await new Promise((r) => setTimeout(r, parseInt(waitFor)));
    }

    // Let's assume it's a selector or a function if it's not a number
    const defaultWaitTimeout = 30000;
    // validate the selector
    const isSelector = await page.evaluate((s) => {
        try {
            document.createDocumentFragment().querySelector(s);
        } catch (e) {
            return false;
        }
        return true;
    }, waitFor);

    let waitForPromise;
    if (isSelector) {
        waitForPromise = page.waitForSelector(waitFor, {timeout: waitForTimeout ? 0 : defaultWaitTimeout});
    } else {
        
        // if it's not a selector, let's assume it's a function
        waitForPromise = page.evaluate(`(${waitFor})()`);
    }

    // if timeout is defined...
    if (waitForTimeout) {

        // wait for the selection/function or timeout
        return await Promise.race([
            waitForPromise,
            new Promise((r) => setTimeout(r, waitForTimeout))
        ]);

    }


    return await waitForPromise;

}

module.exports = {
    waitForTask
}
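To illustrate, waitFor accepts a number of milliseconds, a CSS selector, or a function passed as a string. The values below are hypothetical and assume page is an open Puppeteer page inside an async function:

// wait for 3 seconds
await waitForTask(page, 3000);

// wait for a selector to appear in the page (give up after 10 seconds)
await waitForTask(page, '.products-list', 10000);

// wait until a custom function (passed as a string) resolves (give up after 10 seconds)
const waitFn = `() => new Promise((resolve) => {
    const check = () => document.querySelectorAll('.product-card').length >= 12 ? resolve() : setTimeout(check, 100);
    check();
})`;
await waitForTask(page, waitFn, 10000);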

Let's incorporate our helper function:

// simple-scraper.js

const debug = require('debug')('simple-scraper');

// import the function we created before
const {startBrowser} = require("./utils/browser-utils");

// import waitForTask we defined before
const {waitForTask} = require("./utils/page-utils");


async function scraper(url, rules, options = {}) {
    
    const {
        headless = true,
        gotoOptions = {
            timeout: null,
            waitUntil: 'domcontentloaded'
        },
        waitFor, // number of ms, selector, or function to wait for
        waitForTimeout // waitFor timeout
    } = options;
    
    // ... code before waitFor
    
    // only if waitFor is specified
    if (waitFor) {
        debug(`Wait for ${waitFor}`)
        await waitForTask(page, waitFor, waitForTimeout);
    }
    
    // ... code after waitFor
}

Define the scrapeTask helper function

Next, let's define a function that takes a list of rules and uses it to extract data from a page.

Edit file ./utils/page-utils.js

// ./utils/page-utils.js

// debug logger, used by scrapeTask below
const debug = require('debug')('simple-scraper');

// import scrape functions we defined earlier
let scrapeFunctions = require("./scrape-utils");

//... rest of code


/**
 *
 * @param {import("puppeteer").Page} page
 * @param {Object[]} rules - the list of scraping rules to apply to the page
 */
async function scrapeTask(page, rules) {
    debug(`Start scraping`);
    
    // prepare functions so we can inject them in browser context
    for (const func in scrapeFunctions) {
        scrapeFunctions[func] = scrapeFunctions[func].toString();
    }


    return await page.evaluate((rules, scrapeFunctions) => {
        console.log(`Start scraping rules`, rules);

        // let's define all functions in the global scope,
        // so they can be accessible from each other
        // another way would be to use page.exposeFunction or page.addScriptTag
        for (const func in scrapeFunctions) {
            globalThis[func] = new Function('return ' + scrapeFunctions[func])();
        }

        return rules.map((rule) => {
            const data = {};
            for (const name in rule) {
                // scrapeRule function is defined in the global scope...
                data[name] = scrapeRule(document, rule[name]);
            }

            return data;
        });
    }, rules, scrapeFunctions);
}


module.exports = {
    waitForTask,
    scrapeTask
}


Our scrapeTask function takes a list of rules in the format we defined earlier, then extracts data from the matching elements per those rules.

We used JavaScript's Function constructor to inject our scrape helper functions (defined earlier) into the browser's page context, and we used the globalThis variable, which refers to the window object in the browser context, to define our functions globally. This is necessary because the Function constructor creates functions that execute in the global scope only, and our functions would not have access to each other if we defined them locally.
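Here is a minimal sketch of the technique on its own, outside our scraper code. The shout helper is purely illustrative, and page is assumed to be an open Puppeteer page:

// a helper defined on the Node side
function shout(text) {
    return text.toUpperCase();
}

// serialize it, rebuild it inside the page with the Function constructor,
// attach it to globalThis, then call it from the page context
const title = await page.evaluate((shoutSource) => {
    globalThis.shout = new Function('return ' + shoutSource)();
    return shout(document.title);
}, shout.toString());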

Customize the scraper to use the scrapeTask function

// simple-scraper.js

const debug = require('debug')('simple-scraper');

// import the function we created before
const {startBrowser} = require("./utils/browser-utils");

// import waitForTask and scrapeTask we defined before
const {waitForTask, scrapeTask} = require("./utils/page-utils");


async function scraper(url, rules, options = {}) {
    
    const {
        headless = true,
        gotoOptions = {
            timeout: null,
            waitUntil: 'domcontentloaded'
        },
        waitFor, // number of ms, selector, or function to wait for
        waitForTimeout // waitFor timeout
    } = options;
    
    // ... code before waitFor
    
    // only if waitFor is specified
    if (waitFor) {
        debug(`Wait for ${waitFor}`)
        await waitForTask(page, waitFor, waitForTimeout);
    }
    
    debug(`Start scraping`);
    // Scrape data using the DOM API
    return await scrapeTask(page, rules);
    
    // ... code after waitFor
}

Our scraper function now looks like this:

// simple-scraper.js

const debug = require('debug')('simple-scraper');

// import the function we created before
const {startBrowser} = require("./utils/browser-utils");

// import waitForTask and scrapeTask we defined before
const {waitForTask, scrapeTask} = require("./utils/page-utils");



async function scraper(url, rules, options = {}) {
    let browser;

    const {
        headless = true,
        gotoOptions = {
            timeout: null,
            waitUntil: 'domcontentloaded'
        },
        waitFor, // number of ms, selector, or function to wait for
        waitForTimeout // waitFor timeout
    } = options;

    try {
        debug('Starting');
        browser = await startBrowser({
            headless,
        });

        const page = await browser.newPage();

        debug(`Navigate to URL ${url}`);
        await page.goto(url, {...gotoOptions});

        // only if waitFor is specified
        if (waitFor) {
            debug(`Wait for ${waitFor}`)
            await waitForTask(page, waitFor, waitForTimeout);
        }

        debug(`Start scraping`);

        // Scrape data using the DOM API
        return await scrapeTask(page, rules);


    } catch (e) {
        // handle error
        debug(`Error ${e.message}`);
        return null;
    } finally {

        // close browser
        if (browser) {
            await browser.close();
        }
    }
}

module.exports = {
    scraper,
}

Testing it all

const {scraper} = require("./simple-scraper");


(async () => {
    const url = "https://ujeebu.com/docs/scrape-me/load-more";
    const data = await scraper(url, [
        {
            "products": {
                "selector": ".product-card",
                "type": "obj",
                "multiple": true,
                "children": {
                    "name": {
                        "selector": ".title",
                        "type": "text"
                    },
                    "description": {
                        "selector": ".description",
                        "type": "text"
                    },
                    "price": {
                        "selector": ".price",
                        "type": "text"
                    },
                    "image": {
                        "selector": ".card__image > img",
                        "type": "image",
                    }
                }
            }
        }
    ],  {
        headless: true,
        waitFor: '.products-list',
    });

    console.log(JSON.stringify(data, null, 2));
})();
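If you save the snippet above as, say, test.js (a hypothetical file name), you can run it with debug messages enabled through the DEBUG environment variable used by the debug package:

DEBUG=simple-scraper node test.js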

The above will output the following:

[
  {
    "products": [
      {
        "name": "Fantastic Plastic Salad",
        "description": "Andy shoes are designed to keeping in mind durability as well as trends, the most stylish range of shoes & sandals",
        "price": "2082.23$",
        "image": "https://placeimg.com/500/500"
      },
      {
        "name": "Intelligent Concrete Chicken",
        "description": "Andy shoes are designed to keeping in mind durability as well as trends, the most stylish range of shoes & sandals",
        "price": "7152.76$",
        "image": "https://placeimg.com/500/500"
      },
      {
        "name": "Refined Frozen Ball",
        "description": "Andy shoes are designed to keeping in mind durability as well as trends, the most stylish range of shoes & sandals",
        "price": "3396.32$",
        "image": "https://placeimg.com/500/500"
      },
      {
        "name": "Awesome Fresh Chair",
        "description": "New ABC 13 9370, 13.3, 5th Gen CoreA5-8250U, 8GB RAM, 256GB SSD, power UHD Graphics, OS 10 Home, OS Office A & J 2016",
        "price": "3242.52$",
        "image": "https://placeimg.com/500/500"
      },
      {
        "name": "Awesome Steel Towels",
        "description": "New ABC 13 9370, 13.3, 5th Gen CoreA5-8250U, 8GB RAM, 256GB SSD, power UHD Graphics, OS 10 Home, OS Office A & J 2016",
        "price": "3956.33$",
        "image": "https://placeimg.com/500/500"
      },
      {
        "name": "Small Rubber Shoes",
        "description": "The slim & simple Maple Gaming Keyboard from Dev Byte comes with a sleek body and 7- Color RGB LED Back-lighting for smart functionality",
        "price": "7216.89$",
        "image": "https://placeimg.com/500/500"
      },
      {
        "name": "Fantastic Frozen Soap",
        "description": "Carbonite web goalkeeper gloves are ergonomically designed to give easy fit",
        "price": "7332.42$",
        "image": "https://placeimg.com/500/500"
      },
      {
        "name": "Rustic Granite Fish",
        "description": "Bostons most advanced compression wear technology increases muscle oxygenation, stabilizes active muscles",
        "price": "1180.94$",
        "image": "https://placeimg.com/500/500"
      },
      {
        "name": "Sleek Plastic Chips",
        "description": "The Football Is Good For Training And Recreational Purposes",
        "price": "1571.19$",
        "image": "https://placeimg.com/500/500"
      },
      {
        "name": "Incredible Fresh Bike",
        "description": "Bostons most advanced compression wear technology increases muscle oxygenation, stabilizes active muscles",
        "price": "654.61$",
        "image": "https://placeimg.com/500/500"
      },
      {
        "name": "Awesome Metal Computer",
        "description": "New range of formal shirts are designed keeping you in mind. With fits and styling that will make you stand apart",
        "price": "4858.48$",
        "image": "https://placeimg.com/500/500"
      },
      {
        "name": "Awesome Steel Towels",
        "description": "The Nagasaki Lander is the trademarked name of several series of Nagasaki sport bikes, that started with the 1984 ABC800J",
        "price": "8467.43$",
        "image": "https://placeimg.com/500/500"
      }
    ]
  }
]

Conclusion

In this article, we built a rule-based scraper using Puppeteer. In the next post, we will add more options to further enhance it.

Why Puppeteer? At Ujeebu we use a plethora of scraping tools and are constantly experimenting with both new and proven technologies to help our customers achieve their data extraction goals in a cost-effective way.

If you don't have time to deal with headless browsers and libraries such as Puppeteer and would like to automate your scraping efforts as much as possible, we have an API just for you. Try us out today: the first 5000 credits (approx. 1000 requests) are on us, and no credit card is required.