Skip to content

Commit

Permalink
Merge pull request #28 from paceaux/develop
Browse files Browse the repository at this point in the history
Merges update for allowing nested sitemaps
  • Loading branch information
paceaux authored Sep 28, 2023
2 parents 8065a31 + 7a551fa commit 8163fae
Show file tree
Hide file tree
Showing 5 changed files with 140 additions and 14 deletions.
9 changes: 6 additions & 3 deletions cli.js
Original file line number Diff line number Diff line change
Expand Up @@ -215,13 +215,14 @@ ${mainConfig.useExportedSitemap ? '' : '| Ignore any existing .sitemap.json file
`);
await siteCrawler.produceSiteLinks();

const numberOfSiteLinks = siteCrawler.linkSet.size;
if (!mainConfig.useExportedSitemap) {
await log.toConsole(`
||-> Site links exported to ${siteCrawler.exportFileName}
||-> ${numberOfSiteLinks} URLs exported to ${siteCrawler.exportFileName}.sitemap.json
`);
} else {
await log.toConsole(`
||-> Site links read from ${siteCrawler.exportFileName}
||-> ${numberOfSiteLinks} URLs read from ${siteCrawler.exportFileName}.sitemap.json
`);
}

Expand All @@ -247,8 +248,10 @@ ${mainConfig.useExportedSitemap ? '' : '| Ignore any existing .sitemap.json file
await outputter.writeDataAsync(formattedResult, outputFileName);

log.endTimer();
const { elapsedTime } = log;
const friendlyTime = elapsedTime > 300 ? `${(elapsedTime / 60).toFixed(2)}m` : `${elapsedTime}s`;
const endMessage = `
| Finished after ${log.elapsedTime}s
| Finished after ${friendlyTime}
| Pages Scanned: ${totalPagesSearched}
| Pages with a Match: ${pagesWithSelector.length}
| Total Results: ${totalMatches}
Expand Down
4 changes: 2 additions & 2 deletions package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "selector-hound",
"version": "2.0.0",
"version": "2.1.0",
"description": "Find an element that matches a particular CSS selector on a website ",
"type": "module",
"keywords": [
Expand Down
60 changes: 53 additions & 7 deletions src/site-crawler.js
Original file line number Diff line number Diff line change
Expand Up @@ -158,17 +158,22 @@ export default class SiteCrawler {
const parser = new this.libraries.Parser();
parsedXml = await parser.parseStringPromise(data);
} catch (getSitemapError) {
await log
.errorToFileAsync(getSitemapError)
.errorToConsoleAsync(
`Couldn't get the sitemap:\n ${getSitemapError}`,
);
await log.errorToFileAsync(getSitemapError);
await log.errorToConsoleAsync(
`Couldn't get the sitemap:\n ${getSitemapError}`,
);
}
return parsedXml;
}

/**
* @description gets links to pages from a sitemap
* @param {Object} sitemapJson
* @returns {string[]} an array of href values to the pages listed in the sitemap
*/
static getLinksFromSitemap(sitemapJson) {
if (!sitemapJson) throw new Error('Sitemap JSON was not provided');
if (!sitemapJson.urlset) return [];
const pageLinks = sitemapJson
.urlset
.url // note: each url node in the xml becomes object in an array called url
Expand All @@ -177,6 +182,22 @@ export default class SiteCrawler {
return pageLinks;
}

/**
* @description gets links to sitemaps from a sitemap
* @param {object} sitemapJson
* @returns {string[]} an array of href values to sitemaps
*/
static getSitemapsFromSitemap(sitemapJson) {
if (!sitemapJson) throw new Error('Sitemap JSON was not provided');
if (!sitemapJson.sitemapindex) return [];
const sitemapLinks = sitemapJson
.sitemapindex
.sitemap
.map((urlObject) => urlObject.loc[0]);

return sitemapLinks;
}

/**
* @description Gets only links from a string containing markup
* @param {string} pageMarkup string containing markup
Expand Down Expand Up @@ -301,6 +322,32 @@ export default class SiteCrawler {
}
}

/**
* @description Fetches a sitemap and returns the links from it
* @param {string} [sitemapUrl=this.config.startPage]
* @returns {string[]} an array of href values
*/
async getSitemapLinks(sitemapUrl = this.config.startPage) {
let sitemapUrls = [];
let nestedSitemaps = [];
try {
const sitemapJson = await this.getSitemapAsync(sitemapUrl);
sitemapUrls = SiteCrawler.getLinksFromSitemap(sitemapJson);
nestedSitemaps = SiteCrawler.getSitemapsFromSitemap(sitemapJson);

if (nestedSitemaps.length > 0) {
await forEachAsync(nestedSitemaps, async (nestedSitemap) => {
const nestedSitemapLinks = await this.getSitemapLinks(nestedSitemap);
sitemapUrls = [...sitemapUrls, ...nestedSitemapLinks];
});
}
} catch (setSitemapError) {
await log.errorToFileAsync(setSitemapError);
}

return sitemapUrls;
}

/**
* @description Fetches a sitemap and adds links to linkset
* @param {string} [sitemapUrl=this.config.startPage]
Expand All @@ -309,8 +356,7 @@ export default class SiteCrawler {
this.config.startPage = sitemapUrl;

try {
const sitemapJson = await this.getSitemapAsync(sitemapUrl);
const sitemapUrls = SiteCrawler.getLinksFromSitemap(sitemapJson);
const sitemapUrls = await this.getSitemapLinks(sitemapUrl);
this.addLinks(sitemapUrls);
} catch (setSitemapError) {
await this.errorToFileAsync(setSitemapError);
Expand Down
79 changes: 78 additions & 1 deletion test/site-crawler.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,40 @@ const MOCK_DATA = {
</url>
</urlset>
`,
otherSitemap: `<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
<url>
<loc>http://frankmtaylor.com/foo.html</loc>
<lastmod>2022-01-06T16:36:33.516Z</lastmod>
<changefreq>monthly</changefreq>
</url>
<url>
<loc>http://frankmtaylor.com/bar.html</loc>
<lastmod>2022-01-06T16:36:33.618Z</lastmod>
<changefreq>monthly</changefreq>
</url>
<url>
<loc>http://frankmtaylor.com/baz.html</loc>
<lastmod>2022-01-06T16:36:33.664Z</lastmod>
<changefreq>monthly</changefreq>
</url>
<url>
<loc>http://frankmtaylor.com/beep.html</loc>
<lastmod>2022-01-06T16:36:33.721Z</lastmod>
<changefreq>monthly</changefreq>
</url>
</urlset>
`,

nestedSitemap: `<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:image="http://www.google.com/schemas/sitemap-image/1.1">
<sitemap>
<loc>https://frankmtaylor.com/sitemap.xml</loc>
<lastmod>2022-01-06T16:36:33.721Z</lastmod>
</sitemap>
<sitemap>
<loc>https://frankmtaylor.com/other-sitemap.xml</loc>
<lastmod>2022-01-06T16:36:33.721Z</lastmod>
</sitemap>
</sitemapindex>`,
};

axios.mockImplementation((url) => {
Expand All @@ -99,6 +133,14 @@ axios.mockImplementation((url) => {
return Promise.resolve({
data: MOCK_DATA.sitemap,
});
case 'https://frankmtaylor.com/nested-sitemap.xml':
return Promise.resolve({
data: MOCK_DATA.nestedSitemap,
});
case 'https://frankmtaylor.com/other-sitemap.xml':
return Promise.resolve({
data: MOCK_DATA.otherSitemap,
});
case 'https://frankmtaylor.com/portfolio/':
return Promise.resolve({
data: MOCK_DATA.portfolio,
Expand Down Expand Up @@ -132,7 +174,6 @@ describe('getting file', () => {
const siteCrawler = new SiteCrawler();
siteCrawler.libraries.ajax = axios;


test('getFileAsync', async () => {
const result = await siteCrawler.getFileAsync('https://frankmtaylor.com/qualified/');

Expand Down Expand Up @@ -387,6 +428,32 @@ describe('SiteCrawler: Fetching Sitemap', () => {
expect(sitemapLinks.length).toEqual(7);
});
});
// Static helper: extracting nested sitemap URLs from a parsed sitemap index.
describe('static getsitemaps', () => {
  test('it will create an array from a json object', async () => {
    const crawler = new SiteCrawler();
    // route requests through the mocked axios
    crawler.libraries.ajax = axios;

    const parsedIndex = await crawler.getSitemapAsync('https://frankmtaylor.com/nested-sitemap.xml');
    const nestedSitemapUrls = SiteCrawler.getSitemapsFromSitemap(parsedIndex);

    expect(nestedSitemapUrls).toBeInstanceOf(Array);
    expect(nestedSitemapUrls).toHaveLength(2);
  });
});
// Instance method: collecting page links from flat and nested sitemaps.
describe('getSitemapLinks', () => {
  // one crawler, backed by the mocked axios, is shared by both tests
  const crawler = new SiteCrawler();
  crawler.libraries.ajax = axios;

  test('it gets links from a sitemap', async () => {
    const links = await crawler.getSitemapLinks('https://frankmtaylor.com/sitemap.xml');

    expect(links).toBeInstanceOf(Array);
    expect(links).toHaveLength(7);
  });

  test('it gets links from a nested sitemap', async () => {
    const links = await crawler.getSitemapLinks('https://frankmtaylor.com/nested-sitemap.xml');

    expect(links).toBeInstanceOf(Array);
    // the index lists two sitemaps holding 7 and 4 URLs
    expect(links).toHaveLength(11);
  });
});
describe('setSitemap', () => {
test('The linkSet will have the same links from sitemap', async () => {
const siteCrawler = new SiteCrawler({ startPage: 'https://frankmtaylor.com/sitemap.xml' });
Expand All @@ -406,10 +473,20 @@ describe('SiteCrawler: Fetching Sitemap', () => {
// Checks that produceSiteLinks() fills the link set from the mocked sitemap
// and records that the links were exported.
describe('produceSiteLinks', () => {
test('when produceSiteLinks is run, a file is created and it knows it, and still has data', async () => {
const siteCrawler = new SiteCrawler({ startPage: 'https://frankmtaylor.com/sitemap.xml' });
// Use the mocked axios so the sitemap comes from MOCK_DATA.
siteCrawler.libraries.ajax = axios;
await siteCrawler.produceSiteLinks();
expect(siteCrawler.hasExportedLinks).toEqual(true);
expect(siteCrawler.linkSet.size).toBeGreaterThan(0);
// NOTE(review): this expect() has no matcher, so it asserts nothing. It likely
// needs `.toEqual(true)` — first confirm the mocked sitemap contains this exact URL.
expect(siteCrawler.linkSet.has('http://frankmtaylor.com'));
});
});
// End-to-end: setSitemap() follows a sitemap index into its nested sitemaps.
describe('nested sitemap', () => {
  test('it can crawl a nested sitemap', async () => {
    const crawler = new SiteCrawler({ startPage: 'https://frankmtaylor.com/nested-sitemap.xml' });
    // route requests through the mocked axios
    crawler.libraries.ajax = axios;

    await crawler.setSitemap();

    const { linkSet } = crawler;
    expect(linkSet.size).toEqual(11);
  });
});
});

0 comments on commit 8163fae

Please sign in to comment.