// This script parses the sitemap.xml of different Jekyll-powered sites to show the number of URLs each website consists of.
// The outcome was used for my thesis presentation at the FH Hagenberg.
const Promise = require('bluebird');
const got = require('got');
const xml = require('xml2js').parseString;
// Hostnames of the sites to inspect; the sitemap.xml of each one is fetched.
const urls = ['jekyllrb.com', 'dynatrace.com', 'help.dynatrace.com'];
// Build one promise per site. Each promise fulfills with a summary line
// ("<site> contains <n> URL entries.") once that site's sitemap.xml has been
// fetched and parsed, and rejects if the request, the XML parsing or the
// document shape check fails.
const promises = urls.map(url => got(`https://${url}/sitemap.xml`, {
  // Set the encoding to 'utf8' to not receive a Buffer
  encoding: 'utf8',
})
  .then(res => new Promise((resolve, reject) => {
    // The XML parser reports its result through a callback, so adapt it to a
    // promise here.
    xml(res.body, (err, json) => (err ? reject(err) : resolve(json)));
  }))
  .then((result) => {
    // A document without a <urlset> root (for example a <sitemapindex>) cannot
    // be counted; fail with a message that names the site instead of letting
    // the property access below throw an opaque TypeError.
    if (!result || result.urlset === undefined) {
      throw new Error(`${url}/sitemap.xml does not contain a <urlset> root element.`);
    }
    // When `url` is absent (e.g. a <urlset> without any <url> children),
    // report zero entries rather than crashing on `.length`.
    const entries = (result.urlset && result.urlset.url) || [];
    return `${url} contains ${entries.length} URL entries.`;
  }));
// Wait until every entry of `promises` has fulfilled, then print the resulting
// summary lines in the order of `promises`.
Promise.all(promises)
  .then(results => results.forEach(entry => console.log(entry)))
  // Promise.all rejects as soon as any single promise rejects. Report that
  // failure on stderr and signal it through the exit code instead of leaving
  // the rejection unhandled.
  .catch((err) => {
    console.error(`Failed to count sitemap entries: ${err && err.message ? err.message : err}`);
    process.exitCode = 1;
  });