In the previous article of our Puppeteer series, we implemented a rule-based scraper on top of headless Chrome using Puppeteer. We injected our scraping functions into the browser's context (window), then used them to execute scraping scenarios inside the browser.
In this article we will achieve the same thing, but this time using Puppeteer's own methods, without injecting functions into the browser's context.
Rewrite scrape helper functions
In our rule-based extractor tutorial we defined several types in the rules data structure, namely text, link, image, attr and obj. We then implemented a helper function to extract data for each of those types. Now let's rewrite those functions to use Puppeteer's built-in methods.
Puppeteer provides us with methods to select element(s) in the DOM and evaluate custom JavaScript functions on the selected elements. We are particularly interested in the following:
$eval: selects an element in the DOM by its selector (passed as first argument) and evaluates a JavaScript function (passed as second argument) on that element in the context of the given page, then returns the result of the function.
$$eval: does the same thing as $eval but on multiple elements. $$eval is to $eval what document.querySelectorAll is to document.querySelector.
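As a quick standalone illustration (the .title selector here is hypothetical):
// evaluate a function on the first element matching the selector
const title = await page.$eval('.title', el => el.textContent);
// evaluate a function on every element matching the selector
const titles = await page.$$eval('.title', els => els.map(el => el.textContent));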
We start by creating a Node.js class that holds all of those helper methods:
vim ./utils/rules-scraper.js
// file ./utils/rules-scraper.js
const debug = require('debug')('rules-scraper');
class RulesScraper {
/**
*
* @param {boolean} ignoreErrors
*/
constructor(ignoreErrors = false) {
// whether to ignore errors if a selector is not found or to stop script execution
this.ignoreErrors = ignoreErrors;
}
// A helper method to handle errors
returnOrThrow(e) {
if (this.ignoreErrors) {
debug(`IgnoredError: ${e.message}`);
return null;
}
throw e;
}
}
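Since ignored errors are logged with the debug library under the rules-scraper namespace, you can surface them by setting the DEBUG environment variable when running your script (the file name below is just a placeholder):
DEBUG=rules-scraper node my-scraper.js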
1. Scrape elements of type text
Here we read the element's textContent DOM property inside the evaluated function to extract its text:
// ./utils/rules-scraper.js
const debug = require('debug')('rules-scraper');
class RulesScraper {
// other methods...
/**
* @param {import('puppeteer').Page|import('puppeteer').ElementHandle} pageOrEl
* @param {string} selector
* @param {boolean} isMultiple
* @returns {(string|null)[]|string|null}
*/
async scrapeText(pageOrEl, selector, isMultiple = false) {
try {
if (!isMultiple) {
return await pageOrEl.$eval(selector, el => el.textContent);
}
return await pageOrEl.$$eval(selector, (els) => els.map(el => el.textContent));
} catch (e) {
return this.returnOrThrow(e);
}
}
}
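Assuming a page is already open, usage might look like this (selectors are hypothetical):
const scraper = new RulesScraper(true);
// text of the first element matching the selector
const heading = await scraper.scrapeText(page, 'h1');
// text of every element matching the selector
const items = await scraper.scrapeText(page, 'li.item', true);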
2. Scrape elements of type attr
Inside the evaluated function we call the DOM element's getAttribute method, which takes the attribute's name as an argument and returns the attribute's value:
// ./utils/rules-scraper.js
const debug = require('debug')('rules-scraper');
class RulesScraper {
// other methods...
/**
*
* @param {import('puppeteer').Page|import('puppeteer').ElementHandle} pageOrEl
* @param {string} selector
* @param {string} attribute
* @param {boolean} isMultiple
* @returns {(string|null)[]|string|null}
*/
async scrapeAttribute(pageOrEl, selector, attribute, isMultiple = false) {
try {
if (!isMultiple) {
return await pageOrEl.$eval(selector, (el, attr) => el.getAttribute(attr), attribute);
}
return await pageOrEl.$$eval(selector, (els, attr) => els.map(el => el.getAttribute(attr)), attribute);
} catch (e) {
return this.returnOrThrow(e);
}
}
}
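Note that $eval and $$eval forward any extra arguments to the evaluated function, which is how the attribute name reaches the browser context above. Usage might look like this (selectors are hypothetical):
// value of the 'lang' attribute of the <html> element
const lang = await scraper.scrapeAttribute(page, 'html', 'lang');
// 'data-id' attribute of every product card
const ids = await scraper.scrapeAttribute(page, '.product-card', 'data-id', true);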
3. Scrape elements of type image and link
image and link are just special cases of the attr type:
// ./utils/rules-scraper.js
const debug = require('debug')('rules-scraper');
class RulesScraper {
// other methods...
/**
* @param {import('puppeteer').Page|import('puppeteer').ElementHandle} pageOrEl
* @param {string} selector
* @param {boolean} isMultiple
* @returns {(string|null)[]|string|null}
*/
async scrapeImage(pageOrEl, selector, isMultiple = false) {
return this.scrapeAttribute(pageOrEl, selector, 'src', isMultiple);
}
/**
* @param {import('puppeteer').Page|import('puppeteer').ElementHandle} pageOrEl
* @param {string} selector
* @param {boolean} isMultiple
* @returns {(string|null)[]|string|null}
*/
async scrapeLink(pageOrEl, selector, isMultiple = false) {
return this.scrapeAttribute(pageOrEl, selector, 'href', isMultiple);
}
}
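For example (hypothetical selectors):
// 'src' attribute of the first matching image
const cover = await scraper.scrapeImage(page, '.card__image > img');
// 'href' attribute of every matching link
const urls = await scraper.scrapeLink(page, 'a.product-link', true);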
4. Scrape elements of type obj
// ./utils/rules-scraper.js
const debug = require('debug')('rules-scraper');
class RulesScraper {
// other methods...
/**
*
* @param {import('puppeteer').Page|import('puppeteer').ElementHandle} pageOrEl
* @param {Record<string, Rule>} children
* @returns {Promise<Record<string, *>>}
*/
async scrapeChildren(pageOrEl, children) {
let data = {};
for (const name in children) {
data[name] = await this.scrapeRule(pageOrEl, children[name]);
}
return data;
}
/**
* @param {import('puppeteer').Page|import('puppeteer').ElementHandle} pageOrEl
* @param {string} selector
* @param {boolean} isMultiple
* @param {Record<string, Rule>} children
* @returns {{}[]|{}}
*/
async scrapeObject(pageOrEl, selector, isMultiple, children) {
let elements = [];
if (isMultiple) {
elements = await pageOrEl.$$(selector);
} else {
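// note: .$ returns null when nothing matches; the resulting error is caught by returnOrThrow downstream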
elements.push(await pageOrEl.$(selector));
}
const results = [];
for (const element of elements) {
results.push(await this.scrapeChildren(element, children));
}
return isMultiple ? results : results[0];
}
}
Above we're using Puppeteer's .$$ and .$ methods, which simply select and return the elements matching a given selector in the DOM. They are essentially the .$$eval and .$eval methods without the evaluate function.
The scrapeChildren method loops over the sub-rules of an obj rule and scrapes each of them from the element(s) matching the rule's selector, as illustrated below.
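For instance, a hypothetical obj rule with two children could be scraped like this:
// scrape every '.product-card' element into an object with 'name' and 'image' fields
const products = await scraper.scrapeObject(page, '.product-card', true, {
  name: {selector: '.title', type: 'text'},
  image: {selector: 'img', type: 'image'},
});
// => [{name: '...', image: '...'}, ...]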
The scrapeRule method dispatches to the matching helper based on the rule's type. It's defined as follows:
// ./utils/rules-scraper.js
const debug = require('debug')('rules-scraper');
class RulesScraper {
// other methods...
/**
* @param {import('puppeteer').Page|import('puppeteer').ElementHandle} pageOrEl
* @param {Rule} rule
* @returns {{}[]|{}|(string|null)[]|string|null}
*/
async scrapeRule(pageOrEl, rule) {
switch (rule.type) {
case 'text':
return this.scrapeText(pageOrEl, rule.selector, rule.multiple);
case 'link':
return this.scrapeLink(pageOrEl, rule.selector, rule.multiple);
case 'image':
return this.scrapeImage(pageOrEl, rule.selector, rule.multiple);
case 'attr':
return this.scrapeAttribute(pageOrEl, rule.selector, rule.attribute, rule.multiple);
case 'obj':
return this.scrapeObject(pageOrEl, rule.selector, rule.multiple, rule.children);
default:
throw new Error(`Unknown rule type ${rule.type}`);
}
}
}
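The Rule type referenced in the JSDoc annotations is never defined in the file; a possible typedef, inferred from how the fields are used, could be:
/**
 * @typedef {Object} Rule
 * @property {string} selector - CSS selector of the element(s) to scrape
 * @property {'text'|'link'|'image'|'attr'|'obj'} type - how to extract the data
 * @property {boolean} [multiple] - scrape all matching elements instead of the first one
 * @property {string} [attribute] - the attribute name (for 'attr' rules only)
 * @property {Record<string, Rule>} [children] - the sub-rules (for 'obj' rules only)
 */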
Finally, let's add a scrapeRules method, which extracts the provided rules from a page:
// ./utils/rules-scraper.js
const debug = require('debug')('rules-scraper');
class RulesScraper {
// other methods
/**
* @param {import('puppeteer').Page} page
* @param {Record<string, Rule>[]} rules
* @returns {{}[]|{}|(string|null)[]|string|null}
*/
async scrapeRules(page, rules) {
const results = [];
for (const rule of rules) {
results.push(await this.scrapeChildren(page, rule));
}
return results;
}
}
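Usage then boils down to a single call (the rule below is illustrative):
const scraper = new RulesScraper(true);
const results = await scraper.scrapeRules(page, [{title: {selector: 'h1', type: 'text'}}]);
// => [{title: '...'}]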
In all the methods outlined here, the first argument pageOrEl can be either Puppeteer's Page object (for top-level rules) or Puppeteer's ElementHandle (for sub-rules of the obj type).
Our complete class should look something like this:
// ./utils/rules-scraper.js
const debug = require('debug')('rules-scraper');
class RulesScraper {
/**
*
* @param {boolean} ignoreErrors
*/
constructor(ignoreErrors = false) {
// whether to ignore errors if a selector is not found or to stop script execution
this.ignoreErrors = ignoreErrors;
}
// A helper method to handle errors
returnOrThrow(e) {
if (this.ignoreErrors) {
debug(`IgnoredError: ${e.message}`);
return null;
}
throw e;
}
/**
* @param {import('puppeteer').Page|import('puppeteer').ElementHandle} pageOrEl
* @param {string} selector
* @param {boolean} isMultiple
* @returns {(string|null)[]|string|null}
*/
async scrapeText(pageOrEl, selector, isMultiple = false) {
try {
if (!isMultiple) {
return await pageOrEl.$eval(selector, el => el.textContent);
}
return await pageOrEl.$$eval(selector, (els) => els.map(el => el.textContent));
} catch (e) {
return this.returnOrThrow(e);
}
}
/**
*
* @param {import('puppeteer').Page|import('puppeteer').ElementHandle} pageOrEl
* @param {string} selector
* @param {string} attribute
* @param {boolean} isMultiple
* @returns {(string|null)[]|string|null}
*/
async scrapeAttribute(pageOrEl, selector, attribute, isMultiple = false) {
try {
if (!isMultiple) {
return await pageOrEl.$eval(selector, (el, attr) => el.getAttribute(attr), attribute);
}
return await pageOrEl.$$eval(selector, (els, attr) => els.map(el => el.getAttribute(attr)), attribute);
} catch (e) {
return this.returnOrThrow(e);
}
}
/**
* @param {import('puppeteer').Page|import('puppeteer').ElementHandle} pageOrEl
* @param {string} selector
* @param {boolean} isMultiple
* @returns {(string|null)[]|string|null}
*/
async scrapeImage(pageOrEl, selector, isMultiple = false) {
return this.scrapeAttribute(pageOrEl, selector, 'src', isMultiple);
}
/**
* @param {import('puppeteer').Page|import('puppeteer').ElementHandle} pageOrEl
* @param {string} selector
* @param {boolean} isMultiple
* @returns {(string|null)[]|string|null}
*/
async scrapeLink(pageOrEl, selector, isMultiple = false) {
return this.scrapeAttribute(pageOrEl, selector, 'href', isMultiple);
}
/**
*
* @param {import('puppeteer').Page|import('puppeteer').ElementHandle} pageOrEl
* @param {Record<string, Rule>} children
* @returns {Promise<Record<string, *>>}
*/
async scrapeChildren(pageOrEl, children) {
let data = {};
for (const name in children) {
data[name] = await this.scrapeRule(pageOrEl, children[name]);
}
return data;
}
/**
* @param {import('puppeteer').Page|import('puppeteer').ElementHandle} pageOrEl
* @param {string} selector
* @param {boolean} isMultiple
* @param {Record<string, Rule>} children
* @returns {{}[]|{}}
*/
async scrapeObject(pageOrEl, selector, isMultiple, children) {
let elements = [];
if (isMultiple) {
elements = await pageOrEl.$$(selector);
} else {
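// note: .$ returns null when nothing matches; the resulting error is caught by returnOrThrow downstream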
elements.push(await pageOrEl.$(selector));
}
const results = [];
for (const element of elements) {
results.push(await this.scrapeChildren(element, children));
}
return isMultiple ? results : results[0];
}
/**
* @param {import('puppeteer').Page|import('puppeteer').ElementHandle} pageOrEl
* @param {Rule} rule
* @returns {{}[]|{}|(string|null)[]|string|null}
*/
async scrapeRule(pageOrEl, rule) {
switch (rule.type) {
case 'text':
return this.scrapeText(pageOrEl, rule.selector, rule.multiple);
case 'link':
return this.scrapeLink(pageOrEl, rule.selector, rule.multiple);
case 'image':
return this.scrapeImage(pageOrEl, rule.selector, rule.multiple);
case 'attr':
return this.scrapeAttribute(pageOrEl, rule.selector, rule.attribute, rule.multiple);
case 'obj':
return this.scrapeObject(pageOrEl, rule.selector, rule.multiple, rule.children);
default:
throw new Error(`Unknown rule type ${rule.type}`);
}
}
/**
* @param {import('puppeteer').Page} page
* @param {Record<string, Rule>[]} rules
* @returns {{}[]|{}|(string|null)[]|string|null}
*/
async scrapeRules(page, rules) {
const results = [];
for (const rule of rules) {
results.push(await this.scrapeChildren(page, rule));
}
return results;
}
}
module.exports = {
RulesScraper
}
Customize our scraper function
Now let's make our scraper function, which we first defined here and then customized here, use this class. We will define a new task, puppeteerScrapeTask, in our ./utils/page-utils.js file:
// ./utils/page-utils.js
// ...other imports
const {RulesScraper} = require("./rules-scraper");
// ...other tasks
/**
*
* @param {import("puppeteer").Page} page
* @param {Record<string, Rule>[]} rules - the scraping rules to extract from the page
* @param {{ignoreErrors: boolean}} options
*/
async function puppeteerScrapeTask(page, rules, options = {}) {
const {
ignoreErrors
} = options;
const scraper = new RulesScraper(ignoreErrors);
return scraper.scrapeRules(page, rules);
}
module.exports = {
// ...other exports
puppeteerScrapeTask,
}
We then make the scraper function in simple-scraper.js configurable to use either puppeteerScrapeTask or the in-browser scrapeTask, based on the inBrowserScraper option.
// simple-scraper.js
// ...other imports
// import waitForTask and scrapeTask we defined before
const {waitForTask, scrapeTask: browserScrapeTask, puppeteerScrapeTask} = require("./utils/page-utils");
async function scraper(url, rules, options = {}) {
let browser;
const {
// ...previous options
inBrowserScraper = false, // use puppeteerScrapeTask by default
ignoreErrors = true, // ignore errors by default (e.g. selector not found)
} = options;
try {
// ...previous calls
debug(`Start scraping`);
if (inBrowserScraper) {
// Scrape data using the DOM API
return await browserScrapeTask(page, rules);
}
return await puppeteerScrapeTask(page, rules, {ignoreErrors});
}
// ... catch and finally blocks
}
module.exports = {
scraper,
}
Our scraper function should now look like this:
// simple-scraper.js
const debug = require('debug')('simple-scraper');
// import the function we created before
const {startBrowser} = require("./utils/browser-utils");
// import waitForTask and scrapeTask we defined before
const {waitForTask, scrapeTask: browserScrapeTask, puppeteerScrapeTask} = require("./utils/page-utils");
async function scraper(url, rules, options = {}) {
let browser;
const {
headless = true,
gotoOptions = {
timeout: null,
waitUntil: 'domcontentloaded'
},
waitFor, // the selector to wait for
waitForTimeout, // waitFor timeout
inBrowserScraper = false, // use puppeteerScrapeTask by default
ignoreErrors = true, // ignore errors by default (e.g. selector not found)
} = options;
try {
debug('Starting');
browser = await startBrowser({
headless,
});
const page = await browser.newPage();
debug(`Navigate to URL ${url}`);
await page.goto(url, {...gotoOptions});
// only if waitFor is specified
if (waitFor) {
debug(`Wait for ${waitFor}`)
await waitForTask(page, waitFor, waitForTimeout);
}
debug(`Start scraping`);
if (inBrowserScraper) {
// Scrape data using the DOM API
return await browserScrapeTask(page, rules);
}
return await puppeteerScrapeTask(page, rules, {ignoreErrors});
} catch (e) {
// handle error
debug(`Error ${e.message}`);
return null;
} finally {
// close browser
if (browser) {
await browser.close();
}
}
}
module.exports = {
scraper,
}
Testing the scraper function
We will use the same test function as in our previous article:
const {scraper} = require("./simple-scraper");
(async () => {
const url = "https://ujeebu.com/docs/scrape-me/load-more";
const data = await scraper(url, [
{
"products": {
"selector": ".product-card",
"type": "obj",
"multiple": true,
"children": {
"name": {
"selector": ".title",
"type": "text"
},
"description": {
"selector": ".description",
"type": "text"
},
"price": {
"selector": ".price",
"type": "text"
},
"image": {
"selector": ".card__image > img",
"type": "image",
}
}
}
}
], {
headless: true,
waitFor: '.products-list',
inBrowserScraper: false,
ignoreErrors: true
});
console.log(JSON.stringify(data, null, 2));
})();
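If everything works, this should print an array containing a single object whose products key holds one entry per product card, shaped roughly like this (values are placeholders):
[
  {
    "products": [
      {
        "name": "Product name",
        "description": "Product description",
        "price": "$9.99",
        "image": "https://..."
      }
    ]
  }
]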
Conclusion
In this post we showed how to use Puppeteer's native methods to implement rule-based scraping functionality.
Why Puppeteer? Puppeteer provides a high-level API for controlling Chrome or Chromium over the DevTools Protocol, which makes it a great tool for scraping automation with headless browsers.
If you don't have time to manage headless browsers with a library such as Puppeteer and would like to automate your scraping efforts as much as possible, we have an API just for you. Try us out today. The first 5000 credits (approx. 1000 requests) are on us, and no credit card is required.