26

request停止维护:用node.js实现http网页爬虫抓取,模拟ajax\post请求,大文件上传下载

 4 years ago
source link: http://ourjs.com/detail/k881e3m0s7d7
Go to the source link to view the article. You can view the picture content, updated content and better typesetting reading experience. If the link is broken, please click the button below to view the snapshot at that time.

最近 node.js 一个比较出名的 http request 模块停止维护了。其实这个模块已经变得非常臃肿,模块依赖过多,体积过大,接口不统一。

其实现在node.js的http模块已经非常完善,几行代码就能自己写一个,比如:

Node.js网页抓取:一个最简单的http请求客户端示例(request client)

不过上面的示例并不支持post和文件下载,只要稍加改动即可。

源代码

var http = require('http')
var https = require('https')
var url = require('url')
var qs = require('querystring')

// Registry of request filters (middleware). Every filter is invoked as
// filter(settings, options) just before a request is sent and may mutate either.
var filters = []

// True when `headers` already carries `name` (lower-case) under any letter case.
var hasHeader = function(headers, name) {
  return Object.keys(headers).some(function(key) {
    return key.toLowerCase() === name
  })
}

// Copy of `headers` without the entries that describe a request body or the target host.
var withoutBodyHeaders = function(headers) {
  var dropped = ['content-length', 'content-type', 'transfer-encoding', 'host']
  var kept = {}
  Object.keys(headers).forEach(function(key) {
    if (dropped.indexOf(key.toLowerCase()) === -1) {
      kept[key] = headers[key]
    }
  })
  return kept
}

// Turns the caller's payload into { stream } or { rawData } and fills in body headers.
var prepareBody = function(data, type, headers) {
  if (!data) {
    return {}
  }

  if (typeof data.pipe === 'function') {
    return { stream: data }
  }

  var payload = data
  var defaultContentType = null

  if (!Buffer.isBuffer(data)) {
    if (typeof data !== 'object') {
      // numbers / booleans would make Buffer.from throw, so send their text form
      payload = String(data)
    } else if (type === 'qs') {
      payload = qs.stringify(data)
      defaultContentType = 'application/x-www-form-urlencoded'
    } else {
      payload = JSON.stringify(data)
      defaultContentType = 'application/json'
    }

    // an empty encoding (e.g. qs of {}) means there is nothing to send
    if (!payload) {
      return {}
    }

    payload = Buffer.from(payload)
  }

  // never override a content-type the caller set explicitly
  if (defaultContentType && !hasHeader(headers, 'content-type')) {
    headers['content-type'] = defaultContentType
  }
  headers['content-length'] = payload.length

  return { rawData: payload }
}

/**
 * Minimal HTTP/HTTPS client built on Node's core http/https modules.
 *
 * @param {string|object} settings  A URL string, or an object with:
 *   - url          {string}  absolute request URL
 *   - method       {string}  defaults to 'POST' when a body is present, else 'GET'
 *   - headers      {object}  request headers (copied, never mutated)
 *   - data | body | json     payload: a readable stream (piped as-is), a Buffer
 *                            (sent as-is), an object (encoded per `type`) or a
 *                            primitive (sent as text)
 *   - type         {string}  'qs' for querystring encoding; anything else means JSON
 *   - dataType     {string}  'json' (default) / 'qs' / 'raw' / 'stream'
 *   - stream       {boolean} same effect as dataType: 'stream'
 *   - maxRedirects {number}  how many 301/302 hops to follow (default 10)
 * @param {function} [cb]  Called once as cb(err, res, data). `res` is null when the
 *   connection itself failed; `err` is set for status codes >= 300. In stream mode
 *   `data` is {} and the caller consumes `res`.
 */
var request = function(settings, cb) {
  if (typeof settings === 'string') {
    settings = { url: settings }
  }

  // Work on copies so a settings/headers object reused by the caller stays untouched
  settings = Object.assign({}, settings)
  settings.headers = Object.assign({}, settings.headers)

  var dataType = settings.dataType
  var body = prepareBody(
    settings.data || settings.body || settings.json,
    settings.type,
    settings.headers
  )
  var stream = body.stream
  var rawData = body.rawData
  var redirectsLeft = typeof settings.maxRedirects === 'number' ? settings.maxRedirects : 10

  var reqUrl = settings.url
  var urlObj = url.parse(reqUrl)

  var options = {
    hostname: urlObj.hostname,
    port: urlObj.port,
    path: urlObj.path,
    method: settings.method || ((stream || rawData) ? 'POST' : 'GET'),
    headers: settings.headers
  }

  for (var i = 0; i < filters.length; i++) {
    filters[i](settings, options)
  }

  // Guarantees the callback fires at most once, whichever event arrives first
  var finished = false
  var finish = function(err, res, resData) {
    if (finished) {
      return
    }
    finished = true
    cb && cb(err, res, resData)
  }

  var requestHandler = function(res) {
    var statusCode = res.statusCode
    var location = res.headers.location

    // Redirect
    if ((statusCode === 301 || statusCode === 302) && location && redirectsLeft > 0) {
      // the follow-up request owns the callback from here on
      finished = true
      // discard this response's body so its socket can be released
      res.resume()

      // A sent body cannot be replayed (a stream is already consumed), so such
      // requests are followed with a body-less GET; other methods are kept.
      var keepMethod = !stream && !rawData && options.method !== 'POST'
      var next = Object.assign({}, settings, {
        url: url.resolve(reqUrl, location),
        method: keepMethod ? options.method : 'GET',
        headers: withoutBodyHeaders(settings.headers),
        maxRedirects: redirectsLeft - 1
      })
      delete next.data
      delete next.body
      delete next.json

      request(next, cb)
      return
    }

    var err = null
    if (statusCode >= 300) {
      err = new Error('Request Failed. Status Code: ' + statusCode + ' ' + reqUrl)
    }

    // Stream mode: hand the raw response over without buffering or parsing
    if (dataType === 'stream' || settings.stream) {
      if (!cb) {
        // nobody will consume the response; drain it so the socket is freed
        res.resume()
      }
      finish(err, res, {})
      return
    }

    var receives = []

    res.on('data', function(chunk) {
      receives.push(chunk)
    })

    res.on('error', function(e) {
      finish(e, res, {})
    })

    res.on('end', function() {
      var resData = Buffer.concat(receives).toString()
      if (dataType !== 'raw') {
        try {
          resData = dataType === 'qs' ? qs.parse(resData) : JSON.parse(resData)
        } catch (e) {
          // deliberate best effort: an unparsable body is delivered as the raw string
        }
      }

      finish(err, res, resData)
    })
  }

  var transport = urlObj.protocol === 'https:' ? https : http
  var req = transport.request(options, requestHandler)

  req.on('error', function(e) {
    finish(e, null, {})
  })

  if (stream) {
    // a failing source would otherwise leave the request open forever
    stream.on('error', function(e) {
      req.destroy(e)
    })
    stream.pipe(req)
  } else {
    if (rawData) {
      req.write(rawData)
    }
    req.end()
  }
}

/**
 * Registers a request filter (middleware) by appending it to `filters`.
 *
 * A value that is not a function is not registered; a message is logged and
 * nothing is thrown.
 *
 * @param {function} filter  callback to register
 */
var addFilter = function(filter) {
  if (typeof filter !== 'function') {
    console.log('request middleware is not a function')
    return
  }

  filters.push(filter)
}

// Public surface of the module: `request` sends a request, `use` registers a filter.
var publicApi = {}
publicApi.request = request
publicApi.use = addFilter

module.exports = publicApi

参数

请求的网址: url: '/sync/list'
请求POST的数据,如果没有则为GET: data: { a,b,c } / stream
请求的数据类型: type: 'qs' / 'json'
返回的数据类型: dataType: 'json' / 'qs' / 'raw' / 'stream'

使用方法

模拟GET

// GET example: no `data` is given, so the request goes out as GET
const { request } = require('./request')

request({ url: 'http://ourjs.com/home' }, (err, response, data) => {
  console.log(data)
})

模拟POST

指定 data 即可:

// POST example: supplying `data` switches the request to POST
const postSettings = { url: 'http://ourjs.com/home', data: { abc: 1 } }

request(postSettings, (err, response, data) => {
  console.log(data)
})

下载流文件

将请求的文件下载到本地。使用流可以避免把整个响应内容缓存在进程内存中,因此可以下载大文件

const fs = require('fs')

// Download example: dataType 'stream' hands over the raw response so it can be piped to disk
request({ url: 'http://ourjs.com/home', dataType: 'stream' }, function(err, response) {
  if (err) {
    // response is null when the connection failed; for an HTTP error status,
    // drain the body instead of saving an error page to the file
    response && response.resume()
    console.error(err)
    return
  }

  response.pipe(fs.createWriteStream('./ourjs.text'))
})

文件上传到http request流

为简化操作、提高性能,这里并没有使用 HTTP form(表单)的文件上传模式,而是直接将文件流输出到 http 请求流,需要在 http 接收端直接将流写入文件。一次仅支持上传一个文件。同样支持大文件上传。

// Upload example: a readable stream passed as `data` is piped straight into the request body
const uploadSource = fs.createReadStream('./ourjs.text')

request({ url: 'http://receive.url', data: uploadSource }, (err, response, data) => {
  console.log(data)
})

About Joyk


Aggregate valuable and interesting links.
Joyk means Joy of geeK