puppeteer抓取SPA(客户端渲染)InfoQ.cn内容

Posted on 2020-08-30 20:47:12
Author: 可乐小可爱メ
1. 背景

内容数据不够,那就去抓一些大站的公开数据来吧。


2. 目标网站分析

target: https://www.infoq.cn/ InfoQ是一个实践驱动的社区资讯站点,致力于促进软件开发及相关领域知识与创新的传播

查看源码分析: 网站技术选型为 Vue.js 客户端渲染; 

即 所有内容是在客户端fetch --> render 完成;


3. 技术选型

http://puppeteerjs.com/

    3.1 模拟访问 target site; 

    3.2 模拟交互行为,获取全部渲染内容;

    3.3 解析dom树,整理数据。


4. 数据存入 & 定时任务

    mysql && node-schedule or crontab;


5. 代码部分

    5.1 数据抓取   fetchInfoQ.js

const puppeteer = require("puppeteer");
// Entry URL of the site whose home page is opened by the scraper.
const target = "https://www.infoq.cn";

// Desktop Chrome (Windows 10) user-agent string; applied to the page via
// page.setUserAgent() before navigating to `target`.
const userAgent =
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36";

/**
 * Convert the date label shown on an article card into an epoch timestamp.
 *
 * Recognised shapes (whitespace is ignored):
 *   - "N小时前"            -> `now` minus N hours
 *   - any other "...前"    -> `now` (finer-grained relative labels are not resolved)
 *   - "M月D日"             -> local midnight of that day in the year of `now`
 *   - "YYYY年M月D日"       -> local midnight of that day in the given year
 * Anything else falls back to `now` instead of producing NaN.
 *
 * @param {string} dateText - Raw label text taken from the page.
 * @param {Date} [now] - Reference time; defaults to the current time.
 * @returns {number} Milliseconds since the Unix epoch.
 */
function parseCreateTime(dateText, now = new Date()) {
  const text = String(dateText).replace(/\s/g, "");
  const nowMs = now.getTime();

  if (text.includes("前")) {
    const amount = Number.parseInt(text, 10);
    if (text.includes("小时") && !Number.isNaN(amount)) {
      return nowMs - amount * 60 * 60 * 1000;
    }
    return nowMs;
  }

  const match = text.match(/(?:(\d{4})年)?(\d{1,2})月(\d{1,2})/);
  if (!match) {
    return nowMs;
  }
  const year = match[1] ? Number(match[1]) : now.getFullYear();
  // Numeric constructor (0-based month) avoids parsing a non-ISO date string.
  return new Date(year, Number(match[2]) - 1, Number(match[3])).getTime();
}

/**
 * Open the target home page in a puppeteer-controlled browser, scroll to the
 * bottom several times so the client-side app loads more items, and scrape the
 * article cards.
 *
 * @returns {Promise<Array<{img: string, title: string, href: string,
 *   description: string, author: string, createTime: number}>>}
 *   At most the first five articles found on the page.
 * @throws Rejects with the underlying puppeteer error if launching, navigating
 *   or evaluating fails; the browser is closed in every case.
 */
const article = async () => {
  const maxWait = 100;
  const maxLoop = 5;
  const select = "#layout .page-home.layout-content .main .list .item-main";

  const browser = await puppeteer.launch({ headless: false });
  try {
    const page = await browser.newPage();
    await page.setUserAgent(userAgent);
    await page.goto(target);

    for (let i = 0; i < maxLoop; i++) {
      await page.evaluate(() =>
        window.scrollTo(0, document.body.scrollHeight)
      );
      try {
        await page.waitForNavigation({
          timeout: maxWait,
          waitUntil: ["networkidle0"],
        });
      } catch (err) {
        // Scrolling only triggers background requests, not a navigation, so
        // the wait is expected to time out; it acts as a short pause between
        // scrolls. Any other failure is a real error.
        if (!err || err.name !== "TimeoutError") {
          throw err;
        }
      }
    }

    // Runs in the page: collect raw strings only; date parsing happens in Node.
    const rawItems = await page.evaluate((selector) => {
      const textOf = (root, sel) => {
        const el = root.querySelector(sel);
        return el ? el.innerText : "";
      };
      return Array.from(document.querySelectorAll(selector), (item) => {
        const img = item.querySelector("img");
        const link = item.querySelector("h6 a");
        return {
          img: img ? img.src : "",
          title: link ? link.innerText : "",
          href: link ? link.href : "",
          description: textOf(item, ".summary"),
          author: textOf(item, ".com-author-name"),
          dateText: textOf(item, ".date"),
        };
      });
    }, select);

    const now = new Date();
    return rawItems.slice(0, 5).map(({ dateText, ...rest }) => ({
      ...rest,
      createTime: parseCreateTime(dateText, now),
    }));
  } finally {
    await browser.close();
  }
};

module.exports = article;

    

    5.2 数据处理 

     5.2.1 mysql.js

const mysql = require("mysql");
/**
 * Thin wrapper around a single `mysql` connection that is opened lazily on
 * the first query.
 */
class MysqlInit {
  constructor() {
    // Connection settings handed to mysql.createConnection().
    this.config = {
      host: "127.0.0.1",
      port: "3306",
      user: "root",
      password: "",
      database: "agent_resources",
    };
    // Active connection, or null while disconnected.
    this.db = null;
  }

  /**
   * Create the connection and start connecting. A connect failure is logged;
   * it is not thrown.
   */
  connect() {
    this.db = mysql.createConnection(this.config);
    this.db.connect((err) => {
      if (err) return console.log("sql", { loc: "connect mysql err", err });
    });
  }

  /**
   * Run a query, connecting first if there is no active connection.
   *
   * @param {string|object} querySQL - SQL text or a mysql query-options object.
   * @param {...*} args - Forwarded unchanged to the connection's `query`:
   *   either `(callback)` or `(values, callback)`.
   */
  query(querySQL, ...args) {
    if (!this.db) {
      this.connect();
    }
    this.db.query(querySQL, ...args);
  }

  /**
   * Close the connection if one is open. Calling this while disconnected is a
   * no-op. The cached handle is dropped so a later query() reconnects instead
   * of reusing a closed connection.
   */
  end() {
    const db = this.db;
    if (!db) {
      return;
    }
    this.db = null;
    db.end((err) => {
      if (err) return console.log("sql", { loc: "end mysql err", err });
      console.log(`mysql end.`);
    });
  }
}

module.exports = MysqlInit;


     5.2.2 sql.js

const MysqlInit = require("./mysql");
const Sql = new MysqlInit();
const articleArr = require("./fetchInfoQ");
/**
 * Fetch the scraped articles and bulk-insert them into `external_article`.
 *
 * Values are passed as placeholders (a nested array expands to one
 * parenthesised group per row) rather than concatenated into the SQL text,
 * because titles and summaries are scraped text that may contain quotes.
 * The connection is closed whether or not the insert succeeds so the process
 * can exit.
 */
(async () => {
  let agentResult;
  try {
    agentResult = await articleArr();
  } catch (err) {
    return console.log("sql", { loc: "fetch article err", err });
  }

  // An empty VALUES list is invalid SQL, so there is nothing to run.
  if (!Array.isArray(agentResult) || agentResult.length === 0) {
    return console.log("no article fetched, skip insert.");
  }

  const rows = agentResult.map((item) => [
    item.createTime,
    item.author,
    item.href,
    item.title,
    item.img,
    item.description,
  ]);
  const key = `create_time, author, href, title, ilustri, description`;
  const insertSQL = `INSERT INTO external_article (${key}) VALUES ?`;

  // Options-object form keeps the (query, callback) call shape of Sql.query.
  Sql.query({ sql: insertSQL, values: [rows] }, (err) => {
    if (err) {
      console.log("sql", { loc: "add article err", insertSQL, err });
    } else {
      console.log("insert success!");
    }
    Sql.end();
  });
})();


    5.3 定时任务(两种实现)

     5.3.1 node-schedule方案    schedule-fetch.js

const schedule = require("node-schedule");
const shell = require("shelljs");
/**
 * Register the recurring job with node-schedule. The six-field rule
 * "0 10 9 * * *" (second minute hour day month weekday) fires at 09:10:00
 * every day and runs `node sql.js` through shelljs.
 */
const doFetch = () => {
  const runSqlScript = () => {
    shell.exec("node sql.js");
  };
  schedule.scheduleJob("0 10 9 * * *", runSqlScript);
};
doFetch();

     5.3.2 crontab方案

10 9 * * * /root/.nvm/versions/node/v12.10.0/bin/node /home/www/sql.js


6. 注意点

    6.1 puppeteer.launch({ headless: false }) 需要将 headless 设置为 false(有界面模式),以便页面在客户端完成渲染后再解析。

    6.2 page.setUserAgent(userAgent); 需要设置 User-Agent, 用于伪装成普通浏览器客户端的请求。

    6.3  模拟客户端行为(滚动到页面底部),加载更多首页资讯数据;

// Scroll the page to the bottom of the document.
await page.evaluate(() =>
window.scrollTo(0, document.body.scrollHeight)
);
// Wait at most `maxWait` ms for a navigation that reaches network idle.
// NOTE(review): a scroll presumably triggers only background requests, not a
// navigation, so this call may reject with a TimeoutError - confirm and catch
// it if so. `maxWait` is defined outside this excerpt.
await page.waitForNavigation({
timeout: maxWait,
waitUntil: ["networkidle0"],
});

    6.4 定时任务: node-schedule 与 crontab 的定时语法不一致;node-schedule 的规则在最前面多一个秒(second)字段,共 6 位,而 crontab 为 5 位。


    

当前评论 (0) 登录后评论

暂无评论