This is the second part of our Puppeteer-based simple scraper series. In this article, we show how to scrape any website with a given set of rules using the Puppeteer library. If you're new to Puppeteer, check out our introduction on how to set up Puppeteer and an overview of its capabilities, or the first part of this series, which provides a step-by-step tutorial for scraping content with Puppeteer in Node.js.
Rules format
First let's define the format of our rules:
interface Rule {
  selector: string;
  type: 'text' | 'link' | 'image' | 'attr' | 'obj';
  attribute?: string;
  multiple?: boolean;
  children?: { [name: string]: Rule };
}
- selector: the CSS selector of the element to scrape,
- type: the type of content to extract from the element,
- attribute: optional, required only when type is 'attr',
- multiple: whether to scrape all matches or just the first one,
- children: when type is 'obj', the named rules used to scrape data from within each matching element.
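For example, here is a rule in this format, mirroring the product page we will scrape later (the selectors are specific to that page):
// a rule extracting the name and image of every .product-card element
const productsRule = {
  selector: '.product-card',
  type: 'obj',
  multiple: true,
  children: {
    name: {selector: '.title', type: 'text'},
    image: {selector: '.card__image > img', type: 'image'}
  }
};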
Scrape helper functions
Let's define a set of functions for extracting each of the types above:
vim ./utils/scrape-utils.js
1. Scrape element of type text
// ./utils/scrape-utils.js
/**
 * @param {Document|Element} document
 * @param {string} selector
 * @param {boolean} isMultiple
 * @returns {(string|null)[]|string|null}
 */
function scrapeText(document, selector, isMultiple = false) {
  if (!isMultiple) {
    const element = document.querySelector(selector);
    return element ? element.textContent : null;
  }
  const elements = Array.from(document.querySelectorAll(selector));
  return elements.map(el => el ? el.textContent : null);
}
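Run inside the page context (for example via page.evaluate), on hypothetical markup with two .title elements, it behaves like this:
// hypothetical markup: <h2 class="title">First</h2><h2 class="title">Second</h2>
scrapeText(document, '.title');       // "First"
scrapeText(document, '.title', true); // ["First", "Second"]
scrapeText(document, '.missing');     // null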
2. Scrape element of type attr
// ./utils/scrape-utils.js
/**
 * @param {Document|Element} document
 * @param {string} selector
 * @param {string} attribute
 * @param {boolean} isMultiple
 * @returns {(string|null)[]|string|null}
 */
function scrapeAttribute(document, selector, attribute, isMultiple = false) {
  if (!isMultiple) {
    const element = document.querySelector(selector);
    return element ? element.getAttribute(attribute) : null;
  }
  const elements = Array.from(document.querySelectorAll(selector));
  return elements.map(el => el ? el.getAttribute(attribute) : null);
}
3. Scrape elements of type image and link
These two types are just special cases of type attr:
// ./utils/scrape-utils.js
/**
 * @param {Document|Element} document
 * @param {string} selector
 * @param {boolean} isMultiple
 * @returns {(string|null)[]|string|null}
 */
function scrapeImage(document, selector, isMultiple = false) {
  return scrapeAttribute(document, selector, 'src', isMultiple);
}

/**
 * @param {Document|Element} document
 * @param {string} selector
 * @param {boolean} isMultiple
 * @returns {(string|null)[]|string|null}
 */
function scrapeLink(document, selector, isMultiple = false) {
  return scrapeAttribute(document, selector, 'href', isMultiple);
}
4. Scrape element of type obj
// ./utils/scrape-utils.js
/**
 * @param {Document|Element} document
 * @param {string} selector
 * @param {boolean} isMultiple
 * @param {Object.<string, Rule>} children - named child rules to apply within each match
 * @returns {{}[]|{}}
 */
function scrapeObject(document, selector, isMultiple, children) {
  const elements = isMultiple
    ? Array.from(document.querySelectorAll(selector))
    : [document.querySelector(selector)];
  const data = elements.map(element => {
    // guard against a selector with no match, like the other helpers do
    if (!element) {
      return null;
    }
    const item = {};
    for (const key in children) {
      item[key] = scrapeRule(element, children[key]);
    }
    return item;
  });
  return isMultiple ? data : data[0];
}
The document parameter can be either the global document object or an HTML element in the page; when an element is passed, the selectors are resolved relative to that element's subtree.
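For instance, scoping the same helper to a single element (assuming the .product-card markup used later) restricts the search to that element:
// document: search the whole page
scrapeText(document, '.title', true);  // titles of all product cards
// element: search only within the first card
const firstCard = document.querySelector('.product-card');
scrapeText(firstCard, '.title');       // title of the first card only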
Now let's define a scrapeRule function that uses them all:
// ./utils/scrape-utils.js
/**
 * @param {Document|Element} document
 * @param {Rule} rule
 * @returns {{}[]|{}|(string|null)[]|string|null}
 */
function scrapeRule(document, rule) {
  switch (rule.type) {
    case 'text':
      return scrapeText(document, rule.selector, rule.multiple);
    case 'link':
      return scrapeLink(document, rule.selector, rule.multiple);
    case 'image':
      return scrapeImage(document, rule.selector, rule.multiple);
    case 'attr':
      return scrapeAttribute(document, rule.selector, rule.attribute, rule.multiple);
    case 'obj':
      return scrapeObject(document, rule.selector, rule.multiple, rule.children);
    default:
      throw new Error(`Unknown rule type ${rule.type}`);
  }
}

// export the helpers so they can be required from page-utils.js later
module.exports = {
  scrapeText,
  scrapeAttribute,
  scrapeImage,
  scrapeLink,
  scrapeObject,
  scrapeRule
};
Customize our scraper function
Let's go back to the scraper function we defined in the last article. Here it is one more time:
// simple-scraper.js
const debug = require('debug')('simple-scraper');
// import the function we created before
const {startBrowser} = require("./utils/browser-utils");

async function scraper(url, options = {}) {
  let browser;
  const {
    headless = true,
    gotoOptions = {
      timeout: null,
      waitUntil: 'domcontentloaded'
    },
  } = options;
  try {
    debug('Starting');
    browser = await startBrowser({
      headless,
    });
    const page = await browser.newPage();
    debug(`Navigate to URL ${url}`);
    await page.goto(url, {...gotoOptions});
    debug(`Wait for selector: .products-list`);
    await page.waitForSelector('.products-list', {timeout: 0});
    // we used .$$ to select all products
    debug(`Getting product elements: .product-card`);
    const products = await page.$$('.product-card');
    const data = [];
    debug(`Scraping product information of ${products.length} products`);
    for (const product of products) {
      const productData = await product.evaluate(node => {
        return {
          'title': node.querySelector('.title').innerText,
          'price': node.querySelector('.price').innerText,
          'description': node.querySelector('.description').innerText,
          'image': node.querySelector('.card__image>img').getAttribute('src'),
        };
      });
      data.push(productData);
    }
    return data;
  } catch (e) {
    // handle error
    debug(`Error ${e.message}`);
    return null;
  } finally {
    // close browser
    if (browser) {
      await browser.close();
    }
  }
}
Now let's customize it...
Improve waitFor
First let's improve the waitFor part so it's configurable: we can wait for an element to load, wait for a fixed amount of time, or wait for a custom function to resolve:
vim ./utils/page-utils.js
// ./utils/page-utils.js
/**
 * @param {import("puppeteer").Page} page
 * @param {number|string|null} waitFor - number of ms to wait, a selector to wait for, or the source of a custom function to run in the page
 * @param {number} waitForTimeout - timeout in ms when a selector or function is provided
 */
async function waitForTask(page, waitFor, waitForTimeout) {
  // if it's a number, just wait
  if (!isNaN(waitFor)) {
    return await new Promise((r) => setTimeout(r, parseInt(waitFor)));
  }
  // it's not a number, so assume it's a selector or a function
  const defaultWaitTimeout = 30000;
  // validate the selector
  const isSelector = await page.evaluate((s) => {
    try {
      document.createDocumentFragment().querySelector(s);
    } catch (e) {
      return false;
    }
    return true;
  }, waitFor);
  let waitForPromise;
  if (isSelector) {
    // when a custom timeout is given, disable Puppeteer's own timeout and race against it below instead
    waitForPromise = page.waitForSelector(waitFor, {timeout: waitForTimeout ? 0 : defaultWaitTimeout});
  } else {
    // it's not a selector, so assume it's a function
    waitForPromise = page.evaluate(`(${waitFor})()`);
  }
  // if a timeout is defined...
  if (waitForTimeout) {
    // wait for the selector/function or the timeout, whichever comes first
    return await Promise.race([
      waitForPromise,
      new Promise((r) => setTimeout(r, waitForTimeout))
    ]);
  }
  return await waitForPromise;
}

module.exports = {
  waitForTask
};
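All three forms of waitFor work; for example (inside an async function with a Puppeteer page at hand, with an illustrative selector and predicate):
// wait for a fixed 2 seconds
await waitForTask(page, 2000);
// wait for a selector to appear, giving up after 10 seconds
await waitForTask(page, '.products-list', 10000);
// wait for a custom predicate, passed as function source, again capped at 10 seconds
await waitForTask(page, "async () => { while (!document.querySelector('.products-list')) { await new Promise(r => setTimeout(r, 100)); } }", 10000);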
Let's incorporate our helper function:
// simple-scraper.js
const debug = require('debug')('simple-scraper');
// import the function we created before
const {startBrowser} = require("./utils/browser-utils");
// import waitForTask we defined before
const {waitForTask} = require("./utils/page-utils");

async function scraper(url, rules, options = {}) {
  const {
    headless = true,
    gotoOptions = {
      timeout: null,
      waitUntil: 'domcontentloaded'
    },
    waitFor, // time in ms, selector, or function to wait for
    waitForTimeout // waitFor timeout
  } = options;
  // ... code before waitFor
  // only if waitFor is specified
  if (waitFor) {
    debug(`Wait for ${waitFor}`);
    await waitForTask(page, waitFor, waitForTimeout);
  }
  // ... code after waitFor
}
Define the scrapeTask helper function
Next, let's define a function that takes a list of rules and uses it to extract data from a page.
Edit file ./utils/page-utils.js
// ./utils/page-utils.js
const debug = require('debug')('simple-scraper');
// import scrape functions we defined earlier
const scrapeFunctions = require("./scrape-utils");
//... rest of code

/**
 * @param {import("puppeteer").Page} page
 * @param {Object[]} rules - the list of scrape rules to apply to the page
 */
async function scrapeTask(page, rules) {
  debug(`Start scraping`);
  // serialize the functions so we can inject them into the browser context
  // (copied into a separate object so we don't mutate the required module)
  const functionSources = {};
  for (const func in scrapeFunctions) {
    functionSources[func] = scrapeFunctions[func].toString();
  }
  return await page.evaluate((rules, functionSources) => {
    console.log(`Start scraping rules`, rules);
    // define all functions in the global scope,
    // so they are accessible from each other;
    // alternatives would be page.exposeFunction or page.addScriptTag
    for (const func in functionSources) {
      globalThis[func] = new Function('return ' + functionSources[func])();
    }
    return rules.map((rule) => {
      const data = {};
      for (const name in rule) {
        // scrapeRule is now defined in the global scope...
        data[name] = scrapeRule(document, rule[name]);
      }
      return data;
    });
  }, rules, functionSources);
}

module.exports = {
  waitForTask,
  scrapeTask
};
Our scrapeTask function takes a list of rules in the format we defined earlier, then extracts data from the matching elements per those rules.
We used JavaScript's Function constructor to inject our scrape helpers (defined earlier) into the browser's page context, and we used globalThis, which refers to the window object in the browser context, to define them globally. This is necessary because the Function constructor creates functions that execute in the global scope only, so our helpers would not have access to each other if we defined them locally.
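To see the mechanism in isolation, here is a minimal sketch in plain Node.js, without Puppeteer (the greet and helper functions are made up for illustration):
// hypothetical helpers, serialized the same way scrapeTask serializes ours
const sources = {
  helper: "function helper(n) { return 'hi ' + n; }",
  greet: "function greet(name) { return helper(name); }"
};
// rebuild each function from its source and attach it to globalThis
for (const name in sources) {
  globalThis[name] = new Function('return ' + sources[name])();
}
console.log(greet('world')); // "hi world": greet can call helper because both are global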
Customize scraper to use the scrapeTask function:
// simple-scraper.js
const debug = require('debug')('simple-scraper');
// import the function we created before
const {startBrowser} = require("./utils/browser-utils");
// import waitForTask and scrapeTask we defined before
const {waitForTask, scrapeTask} = require("./utils/page-utils");

async function scraper(url, rules, options = {}) {
  const {
    headless = true,
    gotoOptions = {
      timeout: null,
      waitUntil: 'domcontentloaded'
    },
    waitFor, // time in ms, selector, or function to wait for
    waitForTimeout // waitFor timeout
  } = options;
  // ... code before waitFor
  // only if waitFor is specified
  if (waitFor) {
    debug(`Wait for ${waitFor}`);
    await waitForTask(page, waitFor, waitForTimeout);
  }
  debug(`Start scraping`);
  // scrape data using the DOM API
  return await scrapeTask(page, rules);
  // ... rest of the function (catch/finally)
}
Our scraper function now looks like this:
// simple-scraper.js
const debug = require('debug')('simple-scraper');
// import the function we created before
const {startBrowser} = require("./utils/browser-utils");
// import waitForTask and scrapeTask we defined before
const {waitForTask, scrapeTask} = require("./utils/page-utils");

async function scraper(url, rules, options = {}) {
  let browser;
  const {
    headless = true,
    gotoOptions = {
      timeout: null,
      waitUntil: 'domcontentloaded'
    },
    waitFor, // time in ms, selector, or function to wait for
    waitForTimeout // waitFor timeout
  } = options;
  try {
    debug('Starting');
    browser = await startBrowser({
      headless,
    });
    const page = await browser.newPage();
    debug(`Navigate to URL ${url}`);
    await page.goto(url, {...gotoOptions});
    // only if waitFor is specified
    if (waitFor) {
      debug(`Wait for ${waitFor}`);
      await waitForTask(page, waitFor, waitForTimeout);
    }
    debug(`Start scraping`);
    // scrape data using the DOM API
    return await scrapeTask(page, rules);
  } catch (e) {
    // handle error
    debug(`Error ${e.message}`);
    return null;
  } finally {
    // close browser
    if (browser) {
      await browser.close();
    }
  }
}

module.exports = {
  scraper,
};
Testing it all
const {scraper} = require("./simple-scraper");

(async () => {
  const url = "https://ujeebu.com/docs/scrape-me/load-more";
  const data = await scraper(url, [
    {
      "products": {
        "selector": ".product-card",
        "type": "obj",
        "multiple": true,
        "children": {
          "name": {
            "selector": ".title",
            "type": "text"
          },
          "description": {
            "selector": ".description",
            "type": "text"
          },
          "price": {
            "selector": ".price",
            "type": "text"
          },
          "image": {
            "selector": ".card__image > img",
            "type": "image"
          }
        }
      }
    }
  ], {
    headless: true,
    waitFor: '.products-list',
  });
  console.log(JSON.stringify(data, null, 2));
})();
The above will output the following:
[
  {
    "products": [
      {
        "name": "Fantastic Plastic Salad",
        "description": "Andy shoes are designed to keeping in mind durability as well as trends, the most stylish range of shoes & sandals",
        "price": "2082.23$",
        "image": "https://placeimg.com/500/500"
      },
      {
        "name": "Intelligent Concrete Chicken",
        "description": "Andy shoes are designed to keeping in mind durability as well as trends, the most stylish range of shoes & sandals",
        "price": "7152.76$",
        "image": "https://placeimg.com/500/500"
      },
      {
        "name": "Refined Frozen Ball",
        "description": "Andy shoes are designed to keeping in mind durability as well as trends, the most stylish range of shoes & sandals",
        "price": "3396.32$",
        "image": "https://placeimg.com/500/500"
      },
      {
        "name": "Awesome Fresh Chair",
        "description": "New ABC 13 9370, 13.3, 5th Gen CoreA5-8250U, 8GB RAM, 256GB SSD, power UHD Graphics, OS 10 Home, OS Office A & J 2016",
        "price": "3242.52$",
        "image": "https://placeimg.com/500/500"
      },
      {
        "name": "Awesome Steel Towels",
        "description": "New ABC 13 9370, 13.3, 5th Gen CoreA5-8250U, 8GB RAM, 256GB SSD, power UHD Graphics, OS 10 Home, OS Office A & J 2016",
        "price": "3956.33$",
        "image": "https://placeimg.com/500/500"
      },
      {
        "name": "Small Rubber Shoes",
        "description": "The slim & simple Maple Gaming Keyboard from Dev Byte comes with a sleek body and 7- Color RGB LED Back-lighting for smart functionality",
        "price": "7216.89$",
        "image": "https://placeimg.com/500/500"
      },
      {
        "name": "Fantastic Frozen Soap",
        "description": "Carbonite web goalkeeper gloves are ergonomically designed to give easy fit",
        "price": "7332.42$",
        "image": "https://placeimg.com/500/500"
      },
      {
        "name": "Rustic Granite Fish",
        "description": "Bostons most advanced compression wear technology increases muscle oxygenation, stabilizes active muscles",
        "price": "1180.94$",
        "image": "https://placeimg.com/500/500"
      },
      {
        "name": "Sleek Plastic Chips",
        "description": "The Football Is Good For Training And Recreational Purposes",
        "price": "1571.19$",
        "image": "https://placeimg.com/500/500"
      },
      {
        "name": "Incredible Fresh Bike",
        "description": "Bostons most advanced compression wear technology increases muscle oxygenation, stabilizes active muscles",
        "price": "654.61$",
        "image": "https://placeimg.com/500/500"
      },
      {
        "name": "Awesome Metal Computer",
        "description": "New range of formal shirts are designed keeping you in mind. With fits and styling that will make you stand apart",
        "price": "4858.48$",
        "image": "https://placeimg.com/500/500"
      },
      {
        "name": "Awesome Steel Towels",
        "description": "The Nagasaki Lander is the trademarked name of several series of Nagasaki sport bikes, that started with the 1984 ABC800J",
        "price": "8467.43$",
        "image": "https://placeimg.com/500/500"
      }
    ]
  }
]
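Tip: since scraper logs through the debug module, you can watch its progress by setting the DEBUG environment variable when running the test script (assuming you saved it as test.js):
DEBUG=simple-scraper node test.js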
Conclusion
In this article, we built a rule-based scraper using Puppeteer. In the next post we will add more options to further enhance our rule-based scraper.
Why Puppeteer? At Ujeebu we use a plethora of tools for scraping and are constantly experimenting with new as well as proven technologies to help our customers achieve their data extraction goals in a cost-effective way.
If you don't have time to deal with headless browsers and libraries such as Puppeteer, and would like to automate your scraping efforts as much as possible, we have an API just for you. Try us out today. The first 5,000 credits (approx. 1,000 requests) are on us, and no credit card is required.