这篇文章主要介绍了nodejs爬虫中如何设置动态ip,具有一定借鉴价值,感兴趣的朋友可以参考下,希望大家阅读完这篇文章之后大有收获,下面让小编带着大家一起了解一下。
说明
1、设置动态IP需要使用superagent-proxy插件。
2、为了避免每次爬取时都要获得一次动态IP列表,在redis中保存爬取到的动态IP列表,并设置10分钟的过期时间。数据过期后,再重新发送获取动态IP的请求。
实例
package.json { "name": "xxx", "version": "1.0.0", "description": "xxx", "main": "arf.js", "scripts": { "arf": "nodemon src/app.js --exec babel-node --config package.json" }, "keywords": [ "爬虫" ], "author": "lidikang", "license": "MIT", "dependencies": { "bluebird": "^3.5.1", "cheerio": "^1.0.0-rc.2", "eventproxy": "^1.0.0", "mongoose": "^4.13.6", "mongoose-findorcreate": "^2.0.0", "progress": "^2.0.0", "redis": "^2.8.0", "superagent": "^3.8.1", "superagent-proxy": "^1.0.2" }, "devDependencies": { "babel-cli": "^6.26.0", "babel-preset-es2015": "^6.24.1", "babel-preset-stage-2": "^6.24.1", "nodemon": "^1.12.4" }, "nodemonConfig": { "ignore": [ "ips.json", "docs/*" ], "delay": "2500" } } app.js import request from 'superagent' import requestProxy from 'superagent-proxy' import redis from 'redis' // superagent添加使用代理ip的插件 requestProxy(request) // redis promise化 bluebird.promisifyAll(redis.RedisClient.prototype) bluebird.promisifyAll(redis.Multi.prototype) // 建立mongoose和redis连接 const redisClient = connectRedis() /** * 初始化redis */ function connectRedis() { let client = redis.createClient(config.REDIS_URL) client.on("ready", function(err) { console.log('redis连接 √') }) client.on("error", function(err) { console.log(`redis错误,${err} ×`); }) return client } /** * 请求免费代理,读取redis,如果代理信息已经过期,重新请求免费代理请求 */ async function getProxyIp() { // 先从redis读取缓存ip let localIpStr = await redisClient.getAsync('proxy_ips') let ips = null // 如果本地存在,则随机返回其中一个ip,否则重新请求 if (localIpStr) { let localIps = localIpStr.split(',') return localIps[parseInt(Math.random() * localIps.length)] } else { let ipsJson = (await request.get('http://api.pcdaili.com/?orderid=888888888&num=100&protocol=1&method=1&an_ha=1&sp1=1&sp2=1&format=json&sep=1')).body let isRequestSuccess = false if (ipsJson && ipsJson.data.proxy_list) { ips = ipsJson.data.proxy_list isRequestSuccess = true } else { ips = ['http://127.0.0.1'] } // 将爬取结果存入本地,缓存时间10分钟 if (isRequestSuccess) { redisClient.set("proxy_ips", ips.join(','), 'EX', 10 * 
60) } return ips[parseInt(Math.random() * ips.length)] } } async function doRequest(){ let userAgent = userAgents[parseInt(Math.random() * userAgents.length)] let ip = await getProxyIp() let useIp = 'http://' + ip request.get('http://www.xxx.com') .set({ 'User-Agent': userAgent }) .timeout({ response: 5000, deadline: 60000 }) .proxy(ip) .end(async(err, res) => { // 处理数据 }) }
感谢你能够认真阅读完这篇文章,希望小编分享的“nodejs爬虫中如何设置动态ip”这篇文章对大家有帮助,同时也希望大家多多支持亿速云,关注亿速云行业资讯频道,更多相关知识等着你来学习!
免责声明:本站发布的内容(图片、视频和文字)以原创、转载和分享为主,文章观点不代表本网站立场,如果涉及侵权请联系站长邮箱:is@yisu.com进行举报,并提供相关证据,一经查实,将立刻删除涉嫌侵权内容。