Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merges update for allowing nested sitemaps #28

Merged
merged 11 commits into from
Sep 28, 2023
9 changes: 6 additions & 3 deletions cli.js
Original file line number Diff line number Diff line change
Expand Up @@ -215,13 +215,14 @@ ${mainConfig.useExportedSitemap ? '' : '| Ignore any existing .sitemap.json file
`);
await siteCrawler.produceSiteLinks();

const numberOfSiteLinks = siteCrawler.linkSet.size;
if (!mainConfig.useExportedSitemap) {
await log.toConsole(`
||-> Site links exported to ${siteCrawler.exportFileName}
||-> ${numberOfSiteLinks} URLs exported to ${siteCrawler.exportFileName}.sitemap.json
`);
} else {
await log.toConsole(`
||-> Site links read from ${siteCrawler.exportFileName}
||-> ${numberOfSiteLinks} URLs read from ${siteCrawler.exportFileName}.sitemap.json
`);
}

Expand All @@ -247,8 +248,10 @@ ${mainConfig.useExportedSitemap ? '' : '| Ignore any existing .sitemap.json file
await outputter.writeDataAsync(formattedResult, outputFileName);

log.endTimer();
const { elapsedTime } = log;
const friendlyTime = elapsedTime > 300 ? `${(elapsedTime / 60).toFixed(2)}m` : `${elapsedTime}s`;
const endMessage = `
| Finished after ${log.elapsedTime}s
| Finished after ${friendlyTime}
| Pages Scanned: ${totalPagesSearched}
| Pages with a Match: ${pagesWithSelector.length}
| Total Results: ${totalMatches}
Expand Down
4 changes: 2 additions & 2 deletions package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "selector-hound",
"version": "2.0.0",
"version": "2.1.0",
"description": "Find an element that matches a particular CSS selector on a website ",
"type": "module",
"keywords": [
Expand Down
60 changes: 53 additions & 7 deletions src/site-crawler.js
Original file line number Diff line number Diff line change
Expand Up @@ -158,17 +158,22 @@ export default class SiteCrawler {
const parser = new this.libraries.Parser();
parsedXml = await parser.parseStringPromise(data);
} catch (getSitemapError) {
await log
.errorToFileAsync(getSitemapError)
.errorToConsoleAsync(
`Couldn't get the sitemap:\n ${getSitemapError}`,
);
await log.errorToFileAsync(getSitemapError);
await log.errorToConsoleAsync(
`Couldn't get the sitemap:\n ${getSitemapError}`,
);
}
return parsedXml;
}

/**
* @description gets links to pages from a sitemap
* @param {Object} sitemapJson
* @returns {string[]} an array of href values to pages
*/
static getLinksFromSitemap(sitemapJson) {
if (!sitemapJson) throw new Error('Sitemap JSON was not provided');
if (!sitemapJson.urlset) return [];
const pageLinks = sitemapJson
.urlset
.url // note: each url node in the xml becomes object in an array called url
Expand All @@ -177,6 +182,22 @@ export default class SiteCrawler {
return pageLinks;
}

/**
* @description gets links to sitemaps from a sitemap
* @param {object} sitemapJson
* @returns {string[]} an array of href values to sitemaps
*/
static getSitemapsFromSitemap(sitemapJson) {
if (!sitemapJson) throw new Error('Sitemap JSON was not provided');
if (!sitemapJson.sitemapindex) return [];
const sitemapLinks = sitemapJson
.sitemapindex
.sitemap
.map((urlObject) => urlObject.loc[0]);

return sitemapLinks;
}

/**
* @description Gets only links from a string containing markup
* @param {string} pageMarkup string containing markup
Expand Down Expand Up @@ -301,6 +322,32 @@ export default class SiteCrawler {
}
}

/**
* @description Fetches a sitemap and returns the links from it
* @param {string} [sitemapUrl=this.config.startPage]
* @returns {string[]} an array of href values
*/
async getSitemapLinks(sitemapUrl = this.config.startPage) {
let sitemapUrls = [];
let nestedSitemaps = [];
try {
const sitemapJson = await this.getSitemapAsync(sitemapUrl);
sitemapUrls = SiteCrawler.getLinksFromSitemap(sitemapJson);
nestedSitemaps = SiteCrawler.getSitemapsFromSitemap(sitemapJson);

if (nestedSitemaps.length > 0) {
await forEachAsync(nestedSitemaps, async (nestedSitemap) => {
const nestedSitemapLinks = await this.getSitemapLinks(nestedSitemap);
sitemapUrls = [...sitemapUrls, ...nestedSitemapLinks];
});
}
} catch (setSitemapError) {
await log.errorToFileAsync(setSitemapError);
}

return sitemapUrls;
}

/**
* @description Fetches a sitemap and adds links to linkset
* @param {string} [sitemapUrl=this.config.startPage]
Expand All @@ -309,8 +356,7 @@ export default class SiteCrawler {
this.config.startPage = sitemapUrl;

try {
const sitemapJson = await this.getSitemapAsync(sitemapUrl);
const sitemapUrls = SiteCrawler.getLinksFromSitemap(sitemapJson);
const sitemapUrls = await this.getSitemapLinks(sitemapUrl);
this.addLinks(sitemapUrls);
} catch (setSitemapError) {
await this.errorToFileAsync(setSitemapError);
Expand Down
79 changes: 78 additions & 1 deletion test/site-crawler.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,40 @@ const MOCK_DATA = {
</url>
</urlset>
`,
otherSitemap: `<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
<url>
<loc>http://frankmtaylor.com/foo.html</loc>
<lastmod>2022-01-06T16:36:33.516Z</lastmod>
<changefreq>monthly</changefreq>
</url>
<url>
<loc>http://frankmtaylor.com/bar.html</loc>
<lastmod>2022-01-06T16:36:33.618Z</lastmod>
<changefreq>monthly</changefreq>
</url>
<url>
<loc>http://frankmtaylor.com/baz.html</loc>
<lastmod>2022-01-06T16:36:33.664Z</lastmod>
<changefreq>monthly</changefreq>
</url>
<url>
<loc>http://frankmtaylor.com/beep.html</loc>
<lastmod>2022-01-06T16:36:33.721Z</lastmod>
<changefreq>monthly</changefreq>
</url>
</urlset>
`,

nestedSitemap: `<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:image="http://www.google.com/schemas/sitemap-image/1.1">
<sitemap>
<loc>https://frankmtaylor.com/sitemap.xml</loc>
<lastmod>2022-01-06T16:36:33.721Z</lastmod>
</sitemap>
<sitemap>
<loc>https://frankmtaylor.com/other-sitemap.xml</loc>
<lastmod>2022-01-06T16:36:33.721Z</lastmod>
</sitemap>
</sitemapindex>`,
};

axios.mockImplementation((url) => {
Expand All @@ -99,6 +133,14 @@ axios.mockImplementation((url) => {
return Promise.resolve({
data: MOCK_DATA.sitemap,
});
case 'https://frankmtaylor.com/nested-sitemap.xml':
return Promise.resolve({
data: MOCK_DATA.nestedSitemap,
});
case 'https://frankmtaylor.com/other-sitemap.xml':
return Promise.resolve({
data: MOCK_DATA.otherSitemap,
});
case 'https://frankmtaylor.com/portfolio/':
return Promise.resolve({
data: MOCK_DATA.portfolio,
Expand Down Expand Up @@ -132,7 +174,6 @@ describe('getting file', () => {
const siteCrawler = new SiteCrawler();
siteCrawler.libraries.ajax = axios;


test('getFileAsync', async () => {
const result = await siteCrawler.getFileAsync('https://frankmtaylor.com/qualified/');

Expand Down Expand Up @@ -387,6 +428,32 @@ describe('SiteCrawler: Fetching Sitemap', () => {
expect(sitemapLinks.length).toEqual(7);
});
});
// SiteCrawler.getSitemapsFromSitemap: reads the nested sitemap urls out of a parsed sitemap index.
describe('static getsitemaps', () => {
  test('it will create an array from a json object', async () => {
    // Use the mocked axios as the crawler's ajax library.
    const crawler = new SiteCrawler();
    crawler.libraries.ajax = axios;

    const parsedIndex = await crawler.getSitemapAsync('https://frankmtaylor.com/nested-sitemap.xml');
    const nestedSitemapUrls = SiteCrawler.getSitemapsFromSitemap(parsedIndex);

    expect(nestedSitemapUrls).toBeInstanceOf(Array);
    expect(nestedSitemapUrls).toHaveLength(2);
  });
});
// getSitemapLinks: page links from a flat sitemap and from a sitemap index.
describe('getSitemapLinks', () => {
  // One crawler, wired to the mocked axios, is shared by both tests.
  const crawler = new SiteCrawler();
  crawler.libraries.ajax = axios;

  test('it gets links from a sitemap', async () => {
    const pageLinks = await crawler.getSitemapLinks('https://frankmtaylor.com/sitemap.xml');

    expect(pageLinks).toBeInstanceOf(Array);
    expect(pageLinks).toHaveLength(7);
  });

  test('it gets links from a nested sitemap', async () => {
    const pageLinks = await crawler.getSitemapLinks('https://frankmtaylor.com/nested-sitemap.xml');

    expect(pageLinks).toBeInstanceOf(Array);
    expect(pageLinks).toHaveLength(11);
  });
});
describe('setSitemap', () => {
test('The linkSet will have the same links from sitemap', async () => {
const siteCrawler = new SiteCrawler({ startPage: 'https://frankmtaylor.com/sitemap.xml' });
Expand All @@ -406,10 +473,20 @@ describe('SiteCrawler: Fetching Sitemap', () => {
// Runs produceSiteLinks for a crawler that starts at the sitemap url and checks the crawler's state afterwards.
describe('produceSiteLinks', () => {
test('when produceSiteLinks is run, a file is created and it knows it, and still has data', async () => {
const siteCrawler = new SiteCrawler({ startPage: 'https://frankmtaylor.com/sitemap.xml' });
// Use the mocked axios as the crawler's ajax library.
siteCrawler.libraries.ajax = axios;
await siteCrawler.produceSiteLinks();
expect(siteCrawler.hasExportedLinks).toEqual(true);
expect(siteCrawler.linkSet.size).toBeGreaterThan(0);
// NOTE(review): this expect() has no matcher, so it asserts nothing.
// Confirm the intended check (presumably `.toEqual(true)`) against the mock data.
expect(siteCrawler.linkSet.has('http://frankmtaylor.com'));
});
});
// setSitemap on a sitemap index: links from the nested sitemaps end up in linkSet.
describe('nested sitemap', () => {
  test('it can crawl a nested sitemap', async () => {
    const crawlerOptions = { startPage: 'https://frankmtaylor.com/nested-sitemap.xml' };
    const crawler = new SiteCrawler(crawlerOptions);
    // Use the mocked axios as the crawler's ajax library.
    crawler.libraries.ajax = axios;

    await crawler.setSitemap();

    const { size: linkCount } = crawler.linkSet;
    expect(linkCount).toEqual(11);
  });
});
});