You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
104 lines
3.1 KiB
104 lines
3.1 KiB
const axios = require("axios"); |
|
const cheerio = require("cheerio"); |
|
const saveImages = require("./utils/saveImages"); |
|
|
|
/**
 * Shared axios instance used for every HTTP request the crawler makes.
 *
 * Every request is a GET that is abandoned after 20 seconds and carries a
 * header set copied from a desktop Edge/Chromium browser session.
 */
const request = axios.create({
  // axios reads the HTTP verb from `method`. The former key `methods` is not
  // an axios option and was silently ignored.
  method: "GET",
  timeout: 20 * 1000, // milliseconds
  headers: {
    accept:
      "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
    // NOTE(review): advertising "br" requires the installed axios version to
    // be able to decode brotli responses - confirm.
    "accept-encoding": "gzip, deflate, br",
    "accept-language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
    dnt: "1",
    // NOTE(review): the next two headers are conditional-request validators
    // with fixed values. A server that honours them may reply
    // "304 Not Modified" with an empty body - confirm they are wanted.
    "if-modified-since": "Thu, 15 Jan 2022 17:48:09 GMT",
    "if-none-match": "1641491296",
    "sec-ch-ua":
      '" Not A;Brand";v="99", "Chromium";v="96", "Microsoft Edge";v="96"',
    "sec-ch-ua-mobile": "?0",
    "sec-ch-ua-platform": "Windows",
    "sec-fetch-dest": "document",
    "sec-fetch-mode": "navigate",
    "sec-fetch-site": "none",
    "sec-fetch-user": "?1",
    "upgrade-insecure-requests": 1,
    "user-agent":
      "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36 Edg/96.0.1054.62",
  },
});
|
|
|
// Category listing roots to crawl. A listing page URL is formed by appending
// `<pageNumber>.html` to one of these.
// Declared with `const`: the bare assignment used before created an implicit
// global and throws a ReferenceError in strict-mode or ES-module code.
const urlArray = [
  "https://www.yitudao.com/meinv/xinggan/",
  "https://www.yitudao.com/meinv/siwameitui/",
  "https://www.yitudao.com/meinv/chemo/",
  "https://www.yitudao.com/meinv/wangluomeinv/",
  "https://www.yitudao.com/meinv/tiyumeinv/",
  "https://www.yitudao.com/meinv/rentiyishu/",
];
|
|
|
// Listing page the crawl starts from. Incremented by spiderQueue after each
// page, so it must stay reassignable.
let CURRY_PAGENUMBER = 1;

// Last listing page to crawl (inclusive). Never reassigned, hence `const`.
const MAX_PAGENUMBER = 573;
|
|
|
// Crawl queue
/**
 * Crawls the listing pages of one category, one page at a time.
 *
 * Starting at CURRY_PAGENUMBER, each listing page `<soureUrl><page>.html` is
 * fetched, every `<a title="...">` on it is treated as an album link, and the
 * albums are downloaded sequentially through loadHtml. The page counter is
 * then advanced until it exceeds MAX_PAGENUMBER.
 *
 * The returned promise settles only once the crawl has finished. If a listing
 * page cannot be fetched or parsed, the error is logged and the crawl stops
 * (the previous floating `.then()` chain produced an unhandled rejection
 * instead).
 *
 * @param {string} soureUrl - Category root URL; the page number and ".html"
 *   are appended to it.
 * @returns {Promise<void>}
 */
const spiderQueue = async (soureUrl) => {
  // do...while keeps the original behaviour of always attempting the current
  // page once, even if the counter is already past the maximum.
  do {
    const url = `${soureUrl}${CURRY_PAGENUMBER}.html`;

    let requestQueue;
    try {
      const res = await request({ url });
      const $ = cheerio.load(res.data);

      requestQueue = $("a[title]")
        .toArray()
        .map((elem) => ({
          title: $(elem).attr("title"),
          url: $(elem).attr("href"),
        }));
    } catch (error) {
      console.log(`spiderQueue: 抓取${url}时出现错误!`);
      console.log(error);
      return;
    }

    // Albums are processed strictly one after another. loadHtml logs and
    // swallows its own failures, so one broken album does not end the page.
    for (const album of requestQueue) {
      console.log(`正在抓取第${CURRY_PAGENUMBER}页 ==>`, album.title);
      await loadHtml(album.url, album.title);
    }

    console.log(`第${CURRY_PAGENUMBER}页全部抓取完成`);
    CURRY_PAGENUMBER++;
  } while (CURRY_PAGENUMBER <= MAX_PAGENUMBER);
};
|
|
|
// Load HTML
/**
 * Downloads one album.
 *
 * Fetches the album's first page, reads the page counter from
 * `#title .imageset-sum`, resolves the image URL of every follow-up page
 * (`<base>_2.html` ... `<base>_<count>.html`) through loadImages, and passes
 * the collected URLs to saveImages as `{ [title]: urls }`.
 *
 * Any failure is logged and swallowed, so the returned promise never rejects.
 *
 * @param {string} url - URL of the album's first page.
 * @param {string} title - Album title, used as the key handed to saveImages.
 * @returns {Promise<void>}
 */
const loadHtml = async (url, title) => {
  try {
    const { data } = await request({ url });
    const $ = cheerio.load(data);

    // Strip the ".html" suffix only when it is actually there, instead of
    // blindly cutting the last five characters.
    const soureURL = url.replace(/\.html$/, "");

    const countText = $("#title .imageset-sum").html();
    if (countText == null) {
      throw new Error(`page counter "#title .imageset-sum" not found in ${url}`);
    }

    // The two leading characters are skipped before the number, as before.
    // NOTE(review): presumably a fixed textual prefix - verify against the markup.
    const forNum = Number.parseInt(countText.slice(2), 10);
    if (Number.isNaN(forNum)) {
      throw new Error(`cannot read page count from "${countText}" in ${url}`);
    }

    // NOTE(review): the loop starts at page 2, so the image on the album's
    // first page (`url` itself) is never collected - confirm this is intended.
    const loadQueue = [];
    for (let index = 2; index <= forNum; index++) {
      const imageUrl = await loadImages(`${soureURL}_${index}.html`);
      // loadImages resolves to undefined when the request failed or no image
      // matched; such entries are left out of the save list.
      if (imageUrl !== undefined) {
        loadQueue.push(imageUrl);
      }
    }

    await saveImages({ [title]: loadQueue });
  } catch (error) {
    console.log(`loadHtml: 下载${title}时出现错误!`);
    console.log(error);
  }
};
|
|
|
// Resolve image URL
/**
 * Fetches a single album page and extracts the address of its image.
 *
 * @param {string} url - URL of the album page to fetch.
 * @returns {Promise<string|undefined>} The `src` attribute of the first
 *   element matching `.img_box a img`, or undefined when nothing matches or
 *   the request/parsing failed (the failure is logged, not rethrown).
 */
const loadImages = async (url) => {
  try {
    const { data: html } = await request({ url });
    const imageNode = cheerio.load(html)(".img_box a img");
    return imageNode.attr("src");
  } catch (err) {
    console.log(`loadImages: 下载${url}时出现错误!`);
    console.log(err);
    return undefined;
  }
};
|
|
|
// Entry point: crawl the first category in urlArray. A rejection handler is
// attached so the returned promise is never left floating.
spiderQueue(urlArray[0]).catch((error) => {
  console.log(error);
});