Client-Based Javascript Webcrawler (Runs in the Web-Browser)


Javascript has come a long way in the past few years - including the ability to load and parse other HTML webpages. This short tutorial will take you through how to develop a webcrawler that can scrape a website for all of its links.

This can be extremely useful for:
• generating the sitemap.xml for search engines
• checking for broken links
• plotting and identifying which webpages link to which other webpages



Websites can have thousands of pages - imagine having to check each page and every sub-page link by hand.
Websites can have thousands of pages, and each page can have hundreds of links. How would you build a site list of all the pages and which pages are connected (linked) to which other pages? And what if you wanted to crawl all your webpages and check them (titles, formatting, load times, broken links..)? Webcrawlers are little scripts that help you do exactly this - and you can write one in Javascript!




This JS webcrawler will only work for webpages whose HTML is generated on the server side (not pages whose content is generated by client-side JS).

For the examples, we'll limit the webcrawler to a single website (i.e., we're not going to scrape the entire internet and external sites) - just the pages under one base url (e.g., all the pages on a site like 'www.cats.com').
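
As a minimal sketch of that restriction (the helper name and example urls below are just for illustration, not part of the crawler itself), you can compare hostnames with the built-in URL class:

// minimal sketch: keep only links whose hostname matches the site we're crawling
function sameSite(link, baseurl)
{
    try {
        const a = new URL(link, baseurl);  // resolves relative links against the base
        const b = new URL(baseurl);
        return a.hostname.replace('www.', '') == b.hostname.replace('www.', '');
    }
    catch (e) {
        return false;                      // not something we can treat as a url
    }
}

console.log( sameSite('/about.html',        'https://www.cats.com') ); // true
console.log( sameSite('https://dogs.com/x', 'https://www.cats.com') ); // false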


Basics - Getting an HTML Webpage (Loading the File)


You can grab a website's contents quickly and easily using the fetch function. For example:

let p = await fetch('https://www.news.com');
let t = await p.text();

console.log('website html:', t);


Be careful of caching when using 'fetch' - you can set the cache option (e.g., { cache: "no-store" }) as an argument if you're repeatedly loading HTML pages - otherwise it might give you back content which is out of date.
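
For instance, a small sketch of the same fetch call with the cache disabled (the url is just a placeholder):

// bypass the browser cache so repeated crawls always get fresh html
let response = await fetch('https://www.news.com', { cache: "no-store" });
let html     = await response.text();
console.log('fresh html length:', html.length);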

Hard Way


The first way of writing a webcrawler is to grab the HTML contents and perform a string search using regular expressions.

let homeurl  = 'https://cats.com';
let iurls    = [ homeurl ]; // pages waiting to be crawled (seed with the home page)
let ourls    = [];          // external links we found but won't crawl
let maxpages = 100;         // safety catch so we can't ever get stuck in an infinite loop

while (iurls.length > 0 && maxpages > 0)
{
    maxpages--;

    const iurl     = iurls.pop();
    const response = await fetch( iurl, { cache: "no-store" } );

    // response.ok     => false
    // response.status => 404

    if ( response.ok == false || response.status == 404 )
    {
        console.log('404 error: ' + iurl);
        continue;
    }

    let text = await response.text();

    // find all the links
    const linkRx = /<a\s+(?:[^>]*?\s+)?href=(["'])(.*?)\1/g;
    let linksx = text.matchAll( linkRx );
    linksx = [...linksx];

    // this is where it gets messy
    // a url can have any of these formats:
    // https:/..
    // http:/..
    // site.com
    // index.html
    // ../index.html
    // ../../../index.html
    // ./test.php
    // mailto:me@happychicken.com

    // for testing, I'll just do a vanilla loop
    for (let kk = 0; kk < linksx.length; kk++)
    {
        let link = linksx[kk][2]; // second capture group holds the href value

        // do checks and fix the link

        if ( link.startsWith('/')  ) { link = homeurl + link;              }
        if ( link.startsWith('./') ) { link = homeurl + link.substring(1); }

        // just a page without any base (e.g., cat/cats.html - need to prepend the base url)
        if ( !link.startsWith( 'https:' ) &&
             !link.startsWith( 'http:'  ) &&
             !link.startsWith( 'www.' ) )
        {
            link = homeurl + '/' + link;
        }

        // if it isn't from this website - we skip reading its contents
        if ( !link.includes( homeurl ) )
        {
            ourls.push( link );
            continue;
        }

        // add the webpage to the input list
        iurls.push( link );

    }// end for kk
}// end while iurls
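
Note that the loop above uses await, so it has to run inside an ES module or an async function - a minimal sketch of the latter (the wrapper is an assumption about how you host the script, not part of the crawler itself):

// wrap the crawl in an async function so 'await' is legal in a classic <script> tag
(async () => {
    // ... paste the crawl loop from above in here ...
    console.log('crawl finished');
})();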


Smarter Way (Built-in Webpage Parser)


When you get the page contents, instead of processing the strings yourself, you can take advantage of the built-in XML/HTML parser - it will automatically fix up the markup and let you run searches as you would on a standard webpage (e.g., getElementsByTagName(..)).

Use the DOMParser to process the page and get the correct 'url' for each link. The secret to making this work is setting the 'base' element for the page (so any relative links are resolved against the correct page url).

let baseEl = htmlDoc.createElement('base');
baseEl.setAttribute('href', urlbase);
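
As a quick, self-contained illustration of why the base element matters (the html string and urlbase value here are made up for the example):

// relative hrefs resolve against the injected <base> element
const urlbase = 'https://cats.com/pets/';
const html    = '<a href="../index.html">home</a> <a href="tabby.html">tabby</a>';

const parser  = new DOMParser();
const htmlDoc = parser.parseFromString(html, 'text/html');

let baseEl = htmlDoc.createElement('base');
baseEl.setAttribute('href', urlbase);
htmlDoc.head.append(baseEl);

for (const a of htmlDoc.getElementsByTagName('a'))
{
    console.log(a.href); // https://cats.com/index.html and https://cats.com/pets/tabby.html
}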


There can be a CORS security issue when trying to crawl websites that the script is not hosted on - of course, one quick and easy solution is to use userscripts to run the JS on that website.

The following gives the improved working version, which crawls a specific domain for all of its links.

let homeurl  = 'cats.com';
let iurls    = [ 'https://' + homeurl ]; // pages waiting to be crawled (seed with the home page)
let ourls    = [];                       // pages we have already visited
let maxiters = 100;                      // safety catch so we can't ever get stuck in an infinite loop

while (iurls.length > 0 && maxiters > 0)
{
    maxiters--;

    let iurl = iurls.pop();
    iurl     = iurl.trim();
    if ( ourls.includes( iurl ) ) { console.log('repeat:', iurl); continue; }

    if ( !iurl.includes( 'https://' + homeurl ) ) { continue; } // only recurse this address

    console.log('iteration:', maxiters, 'url:', iurl, 'num out:', ourls.length, 'remaining:', iurls.length);

    // get contents and find any child links
    const response = await fetch( iurl, { cache: "no-store" } );

    // response.ok     => false
    // response.status => 404

    if ( response.ok == false || response.status == 404 )
    {
        console.log('404 error: ' + iurl);
        continue;
    }

    ourls.push( iurl );

    let text = await response.text();

    iurl = iurl.replace('www.' + homeurl, homeurl); // e.g., www.cats.com to cats.com

    const url     = iurl;
    const domain  = new URL(url);
    const urlhost = domain.protocol + '//' + domain.hostname;
    const urlbody = domain.pathname.substring(0, domain.pathname.lastIndexOf('/') + 1);
    const urlbase = urlhost + urlbody;

    var parser  = new DOMParser();
    var htmlDoc = parser.parseFromString(text, 'text/html');
    let baseEl  = htmlDoc.createElement('base');
    baseEl.setAttribute('href', urlbase);
    htmlDoc.head.append(baseEl);
    var linksx  = htmlDoc.getElementsByTagName('a');
    linksx = [...linksx];

    console.log('url:', url);
    console.log('  friendly url:', urlhost + domain.pathname);
    console.log('  url base:',     urlbase);
    console.log('    num links:',  linksx.length);

    for (let bb = 0; bb < linksx.length; bb++)
    {
        let link = linksx[bb].href;

        if ( !link.includes( 'https://' + homeurl ) ) { continue; }

        if ( ourls.includes( link ) )
        {
            console.log('    link ', bb, ':', link, ' (already in list)');
            continue;
        }
        if ( iurls.includes( link ) )
        {
            console.log('    link ', bb, ':', link, ' (in wait list)');
            continue;
        }
        console.log('    new link ', bb, ':', link);

        link = link.trim();

        link = link.replaceAll('http://', 'https://');

        iurls.push( link );
    }// end for bb

}// end iteration loop

console.log('');
console.log('number unique urls found:', ourls.length);
console.log('list of unique urls:');
console.log( ourls.join("\n") );


Building a SiteMap.xml


Once you've crawled the website, you've got all the URLs and webpages - so you can generate the sitemap.xml that search engines usually want.

// build the '<?xml .. ?>' header from pieces
let xmlsitemap = '<' + '?' + 'xml version="1.0" encoding="UTF-8"' + '?' + '>';

xmlsitemap += `
<!-- generator="SimpleSitemapGenerator/1.2.0" -->
<!-- sitemap-generator-url="https://www.xbdev.net"
     sitemap-generator-version="1.2.0" -->
<urlset xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xsi:schemaLocation="http://www.sitemaps.org/schemas/sitemap/0.9 http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd">
`;

ourls.forEach( (uu)=>{
    uu = uu.replaceAll('&', '&amp;');
    xmlsitemap += `<url><loc>${uu}</loc>
                        <lastmod>2022-10-21</lastmod>
                   </url>\n`;
});
xmlsitemap += `</urlset>`;
xmlsitemap += "\n";

xmlsitemap = xmlsitemap.replaceAll('http://', 'https://');

// add a download button so you can download the sitemap after it's finished
const blob = new Blob([ xmlsitemap ], {type: 'text/xml'});
const elem = window.document.createElement('a');
document.body.appendChild( elem );
elem.innerHTML = 'Download Sitemap.xml';
elem.href      = window.URL.createObjectURL(blob);
elem.download  = 'sitemap.xml';
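
If you'd rather not wait for a click, you can (optionally) trigger the download yourself and release the blob url once the browser has had a moment to start it - a small sketch:

// optional: start the download automatically, then free the blob url
elem.click();
setTimeout( ()=>{ window.URL.revokeObjectURL(elem.href); }, 1000 );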



Things to Try

• Develop a visualization resource showing which pages are linked to which other pages
• Create a table which lists all the pages and how many links are on each page
• Extract other information for each page (e.g., h1 tag, title, metadata, number of words, ...)
• Use a webworker to run the process in the background - so it doesn't lock up when the page loses focus
• Asynchronous non-blocking version (crawl multiple pages at the same time) - see the sketch after this list
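
For the last item, here is a minimal sketch of fetching a batch of waiting urls concurrently with Promise.all (the helper name and batch size are assumptions, not part of the crawler above):

// fetch several waiting urls at the same time instead of one after another
async function crawlBatch(iurls, batchSize = 5)
{
    const batch = iurls.splice(0, batchSize);   // take up to batchSize urls off the wait list

    const pages = await Promise.all( batch.map( async (u)=>{
        const r = await fetch(u, { cache: "no-store" });
        return { url: u, ok: r.ok, html: r.ok ? await r.text() : '' };
    }) );

    for (const page of pages)
    {
        if (!page.ok) { console.log('failed:', page.url); continue; }
        console.log('fetched:', page.url, 'length:', page.html.length);
        // ... parse page.html with DOMParser and push any new links onto iurls ...
    }
}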
