我应该在哪里实现带分页的for循环?

人气:827 发布:2022-10-16 标签: javascript node.js web-scraping puppeteer

问题描述

我正在尝试用Puppeteer和Node.js抓取https://www.ventureloop.com/ventureloop/job_search.php?g=0&jcat=46&dc=all&ldata=%&jt=1&jc=1&jd=1&d=5&btn=1

为此,我首先使用函数scrapeJobsInIndexPage(url)获取每个作业的url,然后运行函数scrapeDescriptionPage(url, page),该函数迭代每个作业的url并抓取作业描述。

问题:代码获取每个作业的url并成功分页,但是它不执行scrapeDescriptionPage(url, page)函数,并且我无法获取作业描述。

如何打开每个作业URL并获取每个作业的作业说明?

此部分代码起作用-它获取每个作业的url。

const puppeteer = require("puppeteer");
const cheerio = require("cheerio");

async function scrapeJobsInIndexPage(url) {
  try {
    const [page] = await browser.pages();

    await page.goto("https://www.ventureloop.com/ventureloop/login.php", {
      waitUntil: "networkidle0",
    });
    await page.click("#close-cookies", {
      delay: 200,
    });
    await page.type("[name='email_1']", "natan.chapman@gmail.com", {
      delay: 200,
    });
    await page.type("[name='pass']", "Aw8rbJ!9bXt*dpb", { delay: 200 });
    await page.click("#formContainer > form > div:nth-child(5) > input", {
      delay: 200,
    });

    await page.waitForNavigation();
    await page.goto(url, { waitUntil: "networkidle0" });

    const totalPagesSelector = ".pag_txt_tot";
    const currentPageSelector = ".pag_txt_current";

    await page.waitForSelector(totalPagesSelector);

    const totalPages = await page.$eval(totalPagesSelector, (el) =>
      Number(el.innerText)
    );

    for (let currentPage = 1; currentPage <= totalPages; currentPage++) {
      await page.waitForFunction(
        (sel, page) => document.querySelector(sel)?.innerText === String(page),
        {},
        currentPageSelector,
        currentPage
      );
      const html = await page.evaluate(() => document.body.innerHTML);
      const $ = await cheerio.load(html);

      const jobs = $(".tsize a:even")
        .map(
          (i, element) =>
            "https://www.ventureloop.com/ventureloop/" + $(element).attr("href")
        )
        .get();
      console.log(jobs);

  

      const data = await page.evaluate(() => {
        const firstDataCell =
          document.querySelector("#news_tbl tr td")?.innerText;
        return firstDataCell;
      });
      console.log(`${currentPage}: ${data}`);

      await page.evaluate(() => {
        document
          .querySelector("span.current")
          .nextElementSibling?.querySelector("a")
          .click();
      });
    }
  } catch (err) {
    console.error(err);
  }
}

此部件应在URL打开后获取作业详细信息,但是,我不知道如何将其连接到以前的函数。

async function scrapeDescriptionPage(url, page) {
    let jobText;

    try {
      jobText = $("#formContainer").text();

      const companyImage = await page.$eval(
        ".cs-media img",
        (img) => img.src
      );

      const applyLinkRedirect = $(".ltp-btn").attr("href");
      const jobDescription = $(
        "#formContainer > form > div > div > div.company-detail > div:nth-child(3)"
      ).html();
      await page.goto(applyLinkRedirect, { waitUntil: "networkidle0" });
      const applyLink = await page.url();

      let ventureLoopResult = new testVentureLoopDB({
        url,
        applyLink,
        jobDescription,
        companyImage,
      });
      ventureLoopResults.push(ventureLoopResult);
      console.log(ventureLoopResults);
      ventureLoopResult.save();
      return ventureLoopResults;
    } catch (err) {
      console.log(err);
    }
  }


let browser;

这是将前两个函数连接在一起的最后一个函数(但scrapeDescriptionPage不起作用)

async function main() {
  browser = await puppeteer.launch({ headless: false });
  const descriptionPage = await browser.newPage();
  const jobs = await scrapeJobsInIndexPage(
    "https://www.ventureloop.com/ventureloop/job_search.php?g=0&jcat=46&dc=all&ldata=%&jt=1&jc=1&jd=1&d=5&btn=1"
  );
  for (var i = 1; i < jobs.length; i++) {
    const result = await scrapeDescriptionPage(jobs[i], descriptionPage);
    console.log(result);
  }
}

main();

推荐答案

我不知道cheerio,所以这些只是一些猜测。

要收集所有作业URL,需要在循环外声明jobs,循环后返回:
async function scrapeJobsInIndexPage(url) {
  try {
    //...
    const jobs = [];

    for (let currentPage = 1; currentPage <= totalPages; currentPage++) {
      // ...
      const currentJobs = $(".tsize a:even")
        .map(
          (i, element) =>
            "https://www.ventureloop.com/ventureloop/" + $(element).attr("href")
        )
        .get();
      console.log(currentJobs);
      jobs.push(...currentJobs);
      // ...
    }

    return jobs;
  } catch (err) {
    console.error(err);
  }
}
那么,scrapeDescriptionPage似乎是从不同的上下文中采用的函数。但是,如果您需要对每个作业页面使用cheerio,则需要添加以前已经使用过的内容:
async function scrapeDescriptionPage(url, page) {
    await page.goto(url, { waitUntil: "networkidle0" });
    const html = await page.evaluate(() => document.body.innerHTML);
    const $ = await cheerio.load(html);

    let jobText;

    // ...

815