本文代码出自 GitHub 用户 alsotang 的 node-lessons 项目(https://github.com/alsotang/node-lessons),感谢大神。
代码
const async = require('async')
const superagent = require('superagent')
const cheerio = require('cheerio')
var eventproxy = require('eventproxy')
const url = require('url')
// Number of simulated fetches currently in flight.
let concurrencyCount = 0;

/**
 * Simulates fetching a URL: no request is made. The function bumps the
 * in-flight counter, logs the current concurrency, and after a random delay
 * decrements the counter and reports completion.
 *
 * @param {string} url - The URL being "fetched"; passed back as the result.
 * @param {function(null, string): void} callback - Invoked as
 *   `callback(null, url)` once the delay has elapsed.
 */
const fetchUrl = (url, callback) => {
  // Random integer delay in [0, 2000) milliseconds.
  // Math.trunc is used rather than parseInt: parseInt stringifies its
  // argument, and numbers below 1e-6 stringify in exponent notation
  // (e.g. "5e-7"), which parseInt would read as 5 instead of 0.
  const delay = Math.trunc((Math.random() * 10000000) % 2000);
  concurrencyCount++;
  console.log('现在的并发数是', concurrencyCount, ',正在抓取的是', url, `,耗时${delay}毫秒`);
  setTimeout(() => {
    concurrencyCount--;
    callback(null, url);
  }, delay);
};
// Absolute URLs of the topics found on the index page; filled in by the crawl below.
const urls = [];
const condeUrl = 'https://cnodejs.org/';

// Crawl flow:
//   1. Fetch the index page and collect the topic URLs.
//   2. Run the simulated fetchUrl over them with at most 5 in flight at once.
//   3. Fetch every topic page for real and, once all have reported back,
//      extract a summary object per topic and print the list.
superagent.get(condeUrl)
  .end((err, res) => {
    if (err) return console.error(err);
    const $ = cheerio.load(res.text);
    // Collect the topic links on the front page.
    $('#topic_list .topic_title').each((idx, el) => {
      const rawHref = $(el).attr('href');
      // Skip elements without an href; url.resolve expects a string target.
      if (!rawHref) return;
      // The href is site-relative, e.g. /topic/542acd7d5d28233425538b04;
      // url.resolve expands it against the site root into
      // https://cnodejs.org/topic/542acd7d5d28233425538b04.
      urls.push(url.resolve(condeUrl, rawHref));
    });
    // The parameter is named pageUrl so it does not shadow the `url` module.
    async.mapLimit(urls, 5, (pageUrl, callback) => {
      fetchUrl(pageUrl, callback);
    }, (mapErr, result) => {
      if (mapErr) return console.error(mapErr);
      console.log('final:');
      console.log(result);
      const ep = new eventproxy();
      // Registered for urls.length 'topic_html' events; each event carries a
      // [topicUrl, topicHtml] pair.
      ep.after('topic_html', urls.length, (topicPairs) => {
        const topics = topicPairs.map(([topicUrl, topicHtml]) => {
          const $topic = cheerio.load(topicHtml);
          return {
            title: $topic('.topic_full_title').text().trim(),
            href: topicUrl,
            comment1: $topic('.reply_content').eq(0).text().trim(),
            author: $topic('.user_card .user_name a').text().trim(),
            score: $topic('.user_card .floor .big').text().trim(),
            content: $topic('.markdown-text p').text().trim(),
          };
        });
        console.log('topics:');
        console.log(topics);
      });
      urls.forEach((topicUrl) => {
        superagent.get(topicUrl)
          .end((fetchErr, topicRes) => {
            if (fetchErr) {
              // Report the failure but still emit (with an empty page) so the
              // 'topic_html' count registered above is reached and the
              // remaining topics are printed.
              console.error(`fetch ${topicUrl} failed`, fetchErr);
              ep.emit('topic_html', [topicUrl, '']);
              return;
            }
            console.log(`fetch ${topicUrl} successful`);
            ep.emit('topic_html', [topicUrl, topicRes.text]);
          });
      });
    });
  });