const homeurl = 'cats.com'; // bare domain - the 'https://' scheme is added where needed
let iurls = [ 'https://' + homeurl ]; // urls waiting to be crawled (seeded with the home page)
let ourls = []; // urls that have been crawled successfully
let maxiters = 100; // safety catch so we can't ever get stuck in an infinite loop
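// crawl loop: pop a url off the wait list, fetch it, harvest its <a> links, and queue any new same-domain links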
while (iurls.length>0 && maxiters>0)
{
maxiters--;
let iurl = iurls.pop();
iurl = iurl.trim();
if ( ourls.includes( iurl ) ) { console.log('repeat:', iurl ); continue; }
if ( !iurl.includes( 'https://' + homeurl ) ) { continue; } // only crawl pages on our own domain
console.log('iteration:', maxiters, 'url:', iurl, 'num out:', ourls.length, 'remaining:', iurls.length );
// get contents and find any child links
const response = await fetch( iurl , {cache: "no-store"} );
if ( !response.ok ) // covers 404s and any other failed request
{
console.log('fetch error (' + response.status + '): ' + iurl );
continue;
}
ourls.push( iurl );
let text = await response.text();
iurl = iurl.replace('www.'+homeurl, homeurl); // normalize e.g., www.cats.com to cats.com
const url = iurl;
const domain = new URL(url);
const urlhost = domain.protocol + '//' + domain.hostname;
const urlbody = domain.pathname.substring(0, domain.pathname.lastIndexOf('/')+1);
const urlbase = urlhost+urlbody;
const parser = new DOMParser();
const htmlDoc = parser.parseFromString(text, 'text/html');
let baseEl = htmlDoc.createElement('base'); // <base> so relative hrefs resolve against this page
baseEl.setAttribute('href', urlbase);
htmlDoc.head.append(baseEl);
let linksx = [...htmlDoc.getElementsByTagName('a')]; // all anchor tags as a real array
console.log('url:', url);
console.log(' friendly url:', urlhost + domain.pathname )
console.log(' url base:', urlbase )
console.log(' num links:', linksx.length)
for (let bb=0; bb<linksx.length; bb++)
{
let link = linksx[bb].href.trim();
link = link.replaceAll('http://', 'https://'); // normalize before the duplicate checks below
if ( !link.includes( 'https://'+homeurl ) ) { continue; }
if ( ourls.includes( link ) )
{
console.log(' link ', bb, ' :', link, ' (already in list)' );
continue;
}
if ( iurls.includes ( link ) )
{
console.log(' link ', bb, ' :', link, ' (in wait list)' );
continue;
}
console.log(' new link ', bb, ' :', link );
iurls.push( link );
}// end for bb
}// end iteration loop
console.log('');
console.log('number unique urls found:', ourls.length );
console.log('list of unique urls:');
console.log( ourls.join("\n") );
Building a SiteMap.xml
Once you've processed the website, you've got all the URLs and webpages - so you can generate a sitemap.xml, which search engines usually want.
let xmlsitemap = '<?xml version="1.0" encoding="UTF-8"?>';
xmlsitemap += `
<!-- generator="SimpleSitemapGenerator/1.2.0" -->
<!-- sitemap-generator-url="https://www.xbdev.net"
sitemap-generator-version="1.2.0" -->
<urlset xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xsi:schemaLocation="http://www.sitemaps.org/schemas/sitemap/0.9 http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd">
`;
ourls.forEach( (uu)=>{
uu = uu.replaceAll('&', '&amp;'); // escape ampersands so the xml stays valid
xmlsitemap += `<url><loc>${uu}</loc>
<lastmod>${ new Date().toISOString().slice(0,10) }</lastmod>
</url>\n`;
});
xmlsitemap += `</urlset>`;
xmlsitemap += "\n";
// note: the crawled urls were already normalized to https when they were queued, so no blanket
// http->https replace is needed here (it would also corrupt the http:// namespace urls in the <urlset> header)
// add a download button so you can download the sitemap after finished
const blob = new Blob([ xmlsitemap ], {type: 'text/xml'});
const elem = window.document.createElement('a');
document.body.appendChild( elem );
elem.innerHTML = 'Download Sitemap.xml';
elem.href = window.URL.createObjectURL(blob);
elem.download = 'sitemap.xml';
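If you don't want to wait for a manual click, the link can also be triggered programmatically once it has been added to the page - a small optional extra:
elem.click(); // start the sitemap.xml download straight away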
Things to Try
• Develop a visualization resource showing which pages link to which other pages
• Create a table which lists all the pages and how many links are on each page
• Extract other information for each page (e.g., h1 tag, title, metadata, number of words)
• Use a web worker to run the process in the background - so it doesn't lock up when the page loses focus
• Write an asynchronous, non-blocking version that crawls multiple pages at the same time
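Rough starting points for each of these ideas are sketched below, in the same order; the extra variables and helpers they use (pagelinks, pagelinkcounts, pageinfo, batchsize) are illustrative additions, not part of the crawler above.

For the visualization, one low-effort option is to record which page links to which during the crawl and emit a Graphviz DOT description, which can be pasted into any DOT viewer:
// assumes a 'pagelinks' map declared next to ourls:  const pagelinks = new Map();
// and filled inside the crawl loop:  pagelinks.set( iurl, linksx.map( a=>a.href ) );
let dot = 'digraph sitemap {\n';
pagelinks.forEach( (children, parent)=>{
children.forEach( (child)=>{
if ( pagelinks.has( child ) ) dot += `  "${parent}" -> "${child}";\n`;
});
});
dot += '}';
console.log( dot );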
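For the table of pages and link counts, the sketch below assumes the crawl loop also records each page's count into a map declared alongside ourls:
// assumes:  const pagelinkcounts = new Map();  and  pagelinkcounts.set( iurl, linksx.length );  in the crawl loop
let tablehtml = '<table border="1"><tr><th>page</th><th>num links</th></tr>';
pagelinkcounts.forEach( (count, page)=>{
tablehtml += `<tr><td>${page}</td><td>${count}</td></tr>`;
});
tablehtml += '</table>';
document.body.insertAdjacentHTML('beforeend', tablehtml);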
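For pulling out extra page details, a small helper (here called pageinfo, purely illustrative) can reuse the same DOMParser approach on the text already fetched in the loop; the word count is a rough whitespace split:
function pageinfo( htmltext )
{
const doc = (new DOMParser()).parseFromString( htmltext, 'text/html' );
const h1 = doc.querySelector('h1');
const desc = doc.querySelector('meta[name="description"]');
return {
title   : doc.title,
h1      : h1 ? h1.textContent.trim() : '',
desc    : desc ? desc.getAttribute('content') : '',
numwords: (doc.body ? doc.body.textContent : '').trim().split(/\s+/).length
};
}
// e.g., inside the crawl loop:  console.log( pageinfo( text ) );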
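For the web worker idea, note that DOMParser isn't available inside workers, so a simple split is to move only the fetching off the main thread and post the raw HTML back for parsing; the sketch below builds the worker from a Blob so no separate .js file is needed:
const workersrc = `
onmessage = async (e) => {
const r = await fetch( e.data, {cache:'no-store'} );
postMessage( { url:e.data, ok:r.ok, status:r.status, text: r.ok ? await r.text() : '' } );
};`;
const worker = new Worker( URL.createObjectURL( new Blob([workersrc], {type:'text/javascript'}) ) );
worker.onmessage = (e)=>{
console.log('fetched in background:', e.data.url, 'status:', e.data.status);
// parse e.data.text with DOMParser here, exactly as in the main loop
};
worker.postMessage( 'https://' + homeurl );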
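For the asynchronous version, the main loop can be restructured to pull a small batch off the wait list and fetch the whole batch with Promise.all before parsing; batchsize is an arbitrary choice:
const batchsize = 5; // how many pages to fetch in parallel
while (iurls.length>0 && maxiters>0)
{
maxiters--;
const batch = iurls.splice(0, batchsize).filter( (u)=> !ourls.includes(u) );
const pages = await Promise.all( batch.map( async (u)=>{
const r = await fetch( u, {cache:'no-store'} );
return { url:u, ok:r.ok, text: r.ok ? await r.text() : '' };
}) );
for (const page of pages)
{
if ( !page.ok ) continue;
ourls.push( page.url );
// parse page.text and queue any new links, exactly as in the single-page loop above
}
}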