一个用 Node.js 写的美女写真网站爬虫
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

110 lines
3.3 KiB

3 years ago
const axios = require("axios");
const cheerio = require("cheerio");
3 years ago
const saveImages = require("./utils/saveImages");
3 years ago
// Shared axios instance used for every page fetch in this script. The headers
// imitate a navigation request from a desktop Edge/Chromium browser.
// NOTE(review): "accept-encoding" advertises "br" — confirm the installed axios
// version can decode brotli responses.
// NOTE(review): the fixed "if-modified-since" / "if-none-match" values make every
// request conditional, so the server may answer 304 Not Modified — confirm this
// is intended.
const request = axios.create({
  // axios reads the HTTP verb from the `method` option.
  method: "GET",
  // Give up on a request after 20 seconds.
  timeout: 20 * 1000,
  headers: {
    accept:
      "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
    "accept-encoding": "gzip, deflate, br",
    "accept-language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
    dnt: "1",
    "if-modified-since": "Thu, 15 Jan 2022 17:48:09 GMT",
    "if-none-match": "1641491296",
    "sec-ch-ua":
      '" Not A;Brand";v="99", "Chromium";v="96", "Microsoft Edge";v="96"',
    "sec-ch-ua-mobile": "?0",
    "sec-ch-ua-platform": "Windows",
    "sec-fetch-dest": "document",
    "sec-fetch-mode": "navigate",
    "sec-fetch-site": "none",
    "sec-fetch-user": "?1",
    "upgrade-insecure-requests": 1,
    "user-agent":
      "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36 Edg/96.0.1054.62",
  },
});
// Category listing base URLs on the target site. A page number followed by
// ".html" is appended to one of these to form a listing-page URL.
// Declared with `const` so the name is a real binding rather than an implicit
// global (an undeclared assignment throws in strict mode / ES modules).
const urlArray = [
  "https://www.yitudao.com/meinv/xinggan/",
  "https://www.yitudao.com/meinv/siwameitui/",
  "https://www.yitudao.com/meinv/chemo/",
  "https://www.yitudao.com/meinv/wangluomeinv/",
  "https://www.yitudao.com/meinv/tiyumeinv/",
  "https://www.yitudao.com/meinv/rentiyishu/",
];
3 years ago
// Listing page currently being crawled. Starts at the first page to fetch and
// is incremented by the crawl loop, so it must stay `let`.
let CURRY_PAGENUMBER = 2;
// Highest listing page number to crawl (inclusive). Never reassigned.
const MAX_PAGENUMBER = 573;
// Crawl queue
/**
 * Crawls the listing pages of one category, from CURRY_PAGENUMBER through
 * MAX_PAGENUMBER. On each listing page every `a[title]` link is collected and
 * passed to loadHtml, one at a time.
 *
 * A listing page that fails to load or parse is logged and skipped, so a single
 * failure neither aborts the crawl nor surfaces as an unhandled rejection.
 *
 * @param {string} soureUrl - Category base URL; `<page>.html` is appended to it.
 * @returns {Promise<void>} Resolves after the last listing page was processed.
 */
const spiderQueue = async (soureUrl) => {
  // do...while: the current page is always attempted once before the upper
  // bound is checked.
  do {
    const url = `${soureUrl}${CURRY_PAGENUMBER}.html`;
    try {
      const { data } = await request({ url });
      const $ = cheerio.load(data);
      const requestQueue = [];
      $("a[title]").each((i, elem) => {
        const href = $(elem).attr("href");
        // An anchor without an href cannot be followed; skip it.
        if (!href) return;
        requestQueue.push({
          title: $(elem).attr("title"),
          url: href,
        });
      });
      // Detail pages are processed strictly one after another.
      for (const item of requestQueue) {
        console.log(
          `正在抓取第${CURRY_PAGENUMBER}页 ==>`,
          item.title,
          item.url
        );
        await loadHtml(item.url, item.title);
      }
      console.log(`${CURRY_PAGENUMBER}页全部抓取完成`);
    } catch (error) {
      console.log(`spiderQueue: 抓取第${CURRY_PAGENUMBER}页时出现错误!`);
      console.log(error);
    }
    CURRY_PAGENUMBER++;
  } while (CURRY_PAGENUMBER <= MAX_PAGENUMBER);
};
// Load HTML
/**
 * Processes one image set: fetches its first page, reads how many pages the set
 * has, resolves the image URL of every page via loadImages, and passes the
 * collected URLs to saveImages.
 *
 * Any error is logged and swallowed, so the returned promise does not reject.
 *
 * @param {string} url - URL of the set's first page.
 * @param {string} title - Set title; used as the key of the object handed to saveImages.
 * @returns {Promise<void>}
 */
const loadHtml = async (url, title) => {
  try {
    const { data } = await request({ url });
    const $ = cheerio.load(data);
    // Drops the last five characters — assumes url ends with ".html"; verify against caller.
    const soureURL = url.substring(0, url.length - 5);
    const sumHtml = $("#title .imageset-sum").html();
    // The first two characters are skipped before the numeric conversion —
    // presumably a prefix in front of the count; verify against the page markup.
    const forNum = sumHtml == null ? NaN : +sumHtml.slice(2);
    if (!Number.isInteger(forNum) || forNum < 1) {
      // Without a usable count there is nothing to iterate or save.
      throw new Error(`cannot read the image count on ${url}`);
    }
    const loadQueue = [];
    for (let index = 1; index <= forNum; index++) {
      // Page 1 has no numeric suffix; later pages are "<stem>_<index>.html".
      const pageUrl =
        index === 1 ? `${soureURL}.html` : `${soureURL}_${index}.html`;
      // loadImages yields undefined when it fails; the slot is kept as-is.
      loadQueue.push(await loadImages(pageUrl));
    }
    await saveImages({ [title]: loadQueue }, forNum);
  } catch (error) {
    console.log(`loadHtml: 下载${title}时出现错误!`);
    console.log(error);
  }
};
// Get image URL
/**
 * Fetches a single page and reads the `src` attribute of the element matched
 * by `.img_box a img`.
 *
 * @param {string} url - Page to fetch.
 * @returns {Promise<string|undefined>} The `src` value; undefined when no
 *   matching element carries one or when the request/parsing fails (the error
 *   is logged, not rethrown).
 */
const loadImages = async (url) => {
  try {
    const { data: pageHtml } = await request({ url });
    const imageNode = cheerio.load(pageHtml)(".img_box a img");
    return imageNode.attr("src");
  } catch (error) {
    console.log(`loadImages: 下载${url}时出现错误!`);
    console.log(error);
  }
};
// Entry point: start crawling with the first category listed in urlArray.
const [firstCategoryUrl] = urlArray;
spiderQueue(firstCategoryUrl);