文章出自 GitHub 用户 alsotang 的 node-lessons(https://github.com/alsotang/node-lessons),感谢大神。

代码

const async = require('async')
const superagent = require('superagent')
const cheerio = require('cheerio')
var eventproxy = require('eventproxy')
const url = require('url')

// Number of simulated fetches currently in flight.
let concurrencyCount = 0;

/**
 * Simulates fetching a URL: bumps the in-flight counter, logs the current
 * concurrency, and after a random delay decrements the counter and reports
 * completion. No network request is made here.
 *
 * @param {*} url - Identifier of the resource; it is only logged and handed
 *   back to the callback unchanged.
 * @param {function(null, *): void} callback - Invoked as `callback(null, url)`
 *   once the simulated delay has elapsed.
 */
const fetchUrl = (url, callback) => {
    // Random integer delay in [0, 2000) milliseconds.
    // Math.floor is used instead of parseInt: parseInt converts its argument
    // to a string first, so a tiny value such as 5e-7 would be read as "5e-7"
    // and produce 5 rather than 0.
    const delay = Math.floor((Math.random() * 10000000) % 2000);
    concurrencyCount++;
    console.log('现在的并发数是', concurrencyCount, ',正在抓取的是', url, ',耗时' + delay + '毫秒');
    setTimeout(() => {
        concurrencyCount--;
        callback(null, url);
    }, delay);
};

// Absolute topic URLs collected from the front page; filled in by the request below.
var urls = []
// Front page that is crawled for topic links (also the base for resolving relative hrefs).
var condeUrl = 'https://cnodejs.org/'

// Crawl flow:
//   1. fetch the front page and collect every topic link,
//   2. run the simulated fetchUrl over the links with at most 5 in flight,
//   3. request every topic page for real and print the extracted fields.
superagent.get(condeUrl)
    .end((err, res) => {
        if (err) return console.error(err);

        var $ = cheerio.load(res.text)
        // Collect the topic URLs listed on the front page.
        $('#topic_list .topic_title').each((idx, el) => {
            // The raw href looks like /topic/542acd7d5d28233425538b04;
            // url.resolve turns it into the absolute form
            // https://cnodejs.org/topic/542acd7d5d28233425538b04.
            var rawHref = $(el).attr('href')
            // An element without an href would make url.resolve throw; skip it.
            if (!rawHref) return
            urls.push(url.resolve(condeUrl, rawHref))
        })

        // The iteratee parameter is named topicUrl so it does not shadow the `url` module.
        async.mapLimit(urls, 5, (topicUrl, callback) => {
            fetchUrl(topicUrl, callback)
        }, (err, result) => {
            if (err) return console.error(err);

            console.log('final:')
            console.log(result)

            var ep = new eventproxy()

            // Runs once 'topic_html' has been emitted urls.length times; each
            // emitted payload is a [topicUrl, topicHtml] pair.
            ep.after('topic_html', urls.length, (topics) => {
                topics = topics.map((topicPair) => {
                    var topicUrl = topicPair[0]
                    var topicHtml = topicPair[1]
                    var $ = cheerio.load(topicHtml)
                    return ({
                        title: $('.topic_full_title').text().trim(),
                        href: topicUrl,
                        comment1: $('.reply_content').eq(0).text().trim(),
                        author: $('.user_card .user_name a').text().trim(),
                        score: $('.user_card .floor .big').text().trim(),
                        content: $('.markdown-text p').text().trim(),
                    })
                })

                console.log('topics:');
                console.log(topics);
            })

            urls.forEach((topicUrl) => {
                superagent.get(topicUrl)
                    .end((err, res) => {
                        if (err) {
                            // Still emit (with whatever body is available, or an
                            // empty page) so the ep.after counter reaches
                            // urls.length and the summary is printed; a failed
                            // page simply yields empty fields.
                            console.error('fetch ' + topicUrl + ' failed:', err.message || err);
                            ep.emit('topic_html', [topicUrl, (res && res.text) || '']);
                            return;
                        }
                        console.log('fetch ' + topicUrl + ' successful');
                        ep.emit('topic_html', [topicUrl, res.text]);
                    })
            })

        })
    })