Puppeteer timeout error when crawling a heavy website, even after setting the timeout to 0

I am crawling this website: https://startupjobs.asia/job/search?q=&job-list-dpl-page=. The site is heavy and loads very slowly.

The website has 7 pages, and so far I've only managed to crawl up to the third page, or the fourth if I'm lucky.

I've tried cloning the code so that each copy opens its own browser; that got me to the 4th page, but the problem persisted when I cloned it for the 5th and 6th pages.
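Roughly, each clone is a separate script like this (a trimmed sketch; startPage4 is just my name for the copy that has page 4 hard-coded into the URL):

// One "clone": its own script with its own browser, just for page 4.
const puppeteer = require('puppeteer');

async function startPage4 () {
    const browser = await puppeteer.launch({
        headless: true,
        defaultViewport: null,
        args: [ '--ignore-certificate-errors' ]
    });
    const page = await browser.newPage();
    page.setDefaultNavigationTimeout(0);
    await page.goto('https://startupjobs.asia/job/search?q=&job-list-dpl-page=4', {
        waitUntil: "networkidle2",
        timeout: 0
    });
    // ... same per-job scraping loop as in the full code below ...
    await browser.close();
}

startPage4()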

Below is my code.

const puppeteer = require('puppeteer');
const fs = require("fs");


async function start (){

    // One shared browser instance for the whole crawl.
    const browser = await puppeteer.launch({
        headless: true,
        defaultViewport: null,
        args: [ '--ignore-certificate-errors' ]
    });
    

    var name     = ["Job Name"];
    var country  = ["Country"];
    var company  = ["Company Name"];
    var type     = ["Job Type"];
    var salary   = ["Salary"];
    var skills   = ["Skills Require"];
    var desc     = ["Job Description"];
    var req      = ["Job Requirements"];
    var resp     = ["Job Responsibility"];
    var industry = ["Industry"];

    for (var j = 1; j < 3; j++) {
        // Open a fresh tab for each results page (only pages 1-2 in this
        // version; the full site has 7) and disable the default 30 s
        // navigation timeout.
        const page = await browser.newPage();
        page.setDefaultNavigationTimeout(0);

        // Navigate with no timeout, waiting until the network is mostly idle.
        await page.goto('https://startupjobs.asia/job/search?q=&job-list-dpl-page=' + j, {
            waitUntil: "networkidle2",
            timeout: 0
        });

        for (var i = 1; i < 31; i++) {

            // Wait for the i-th job link in the results list, then click it
            // so the job's detail section loads.
            await page.waitForXPath("/html/body/div[1]/div[3]/div[1]/div/div[1]/ul/li[" + i + "]/div/div[1]/div/h5/a");
            var [jobLink] = await page.$x("/html/body/div[1]/div[3]/div[1]/div/div[1]/ul/li[" + i + "]/div/div[1]/div/h5/a");
            await jobLink.click();
            
            // XPath / property-name pairs for each field in the job detail view.
            const elementsToFind = [
                { xpath: "/html/body/div[1]/div[3]/div[1]/div/div[1]/ul/li["+i+"]/div/div[1]/div/h5/a",  propName: 'job_name' },
                { xpath: '/html/body/div[1]/div[3]/div[2]/div[1]/div[2]/div/h6[2]/a',                    propName: 'country' },      
                { xpath: "/html/body/div[1]/div[3]/div[1]/div/div[1]/ul/li["+i+"]/div/div[1]/div/p[1]/a",propName: 'company' },
                { xpath: '/html/body/div[1]/div[3]/div[2]/div[2]/div[1]/div[2]/div[1]/div[3]/p',         propName: 'job_type' },
                { xpath: '/html/body/div[1]/div[3]/div[2]/div[2]/div[1]/div[2]/div[1]/div[1]/p',         propName: 'salary' },
                { xpath: '/html/body/div[1]/div[3]/div[2]/div[2]/div[1]/div[2]/div[1]/div[4]/p',         propName: 'skills' },
                { xpath: '/html/body/div[1]/div[3]/div[2]/div[2]/div[1]/div[2]/div[2]/div[1]/div',       propName: 'job_description' },
                { xpath: '/html/body/div[1]/div[3]/div[2]/div[2]/div[1]/div[2]/div[2]/div[3]/div',       propName: 'job_requirement' },
                { xpath: '/html/body/div[1]/div[3]/div[2]/div[2]/div[1]/div[2]/div[2]/div[2]/div',       propName: 'job_responsibility' },
                { xpath: '/html/body/div[1]/div[3]/div[2]/div[2]/div[1]/div[2]/div[1]/div[2]/p',         propName: 'industry' },
                // ...
            ];

            var results = {};

            // Resolve each XPath; store the element's text content, or
            // 'Not Found' if the element is missing.
            for (var { xpath, propName } of elementsToFind) {
                var [el] = await page.$x(xpath);
                results[propName] = !el ? 'Not Found' : await (await el.getProperty('textContent')).jsonValue();
            }

            name.push(results['job_name']);
            country.push(results['country']);
            company.push(results['company']);
            type.push(results['job_type']);
            salary.push(results['salary']);
            skills.push(results['skills']);
            desc.push(results['job_description']);
            req.push(results['job_requirement']);
            resp.push(results['job_responsibility']);
            industry.push(results['industry']);

            //await page.evaluate(() => document.querySelector("#suj-single-jobdetail-wrapper > div.detail-body > div.row > div.col.s12.tabs-wrapper.suj-company-review-tabs-wrapper > ul > li:nth-child(2) > a").click())            
        }
        await page.close();
    }

    await browser.close();

    // Print the collected job names with a running index.
    name.forEach(function (a, index) {
        console.log(index + 1, a);
    });
}

start().catch(console.error);

This is the error I got: a Puppeteer navigation timeout.

Is there any way I can crawl the rest of the pages, either by removing the timeout or by repeating the crawl until it succeeds?
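For example, would a bounded retry around page.goto work? Here is a sketch of what I have in mind (gotoWithRetry is a made-up helper name, and the 60-second timeout is an arbitrary guess):

// Hypothetical helper: retry the navigation a few times with a finite
// timeout instead of disabling the timeout entirely with timeout: 0.
async function gotoWithRetry (page, url, retries = 3) {
    for (let attempt = 1; attempt <= retries; attempt++) {
        try {
            await page.goto(url, { waitUntil: "networkidle2", timeout: 60000 });
            return;                             // navigation succeeded
        } catch (err) {
            console.log('Attempt ' + attempt + ' failed: ' + err.message);
            if (attempt === retries) throw err; // give up after the last attempt
        }
    }
}

// Usage inside the page loop:
// await gotoWithRetry(page, 'https://startupjobs.asia/job/search?q=&job-list-dpl-page=' + j);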



Sources

This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.

Source: Stack Overflow
