使用Express给Python提供API
Express 初步认识
Express是一个简洁而灵活的 Node.js Web应用框架, 提供了一系列强大特性帮助你创建各种 Web 应用,和丰富的 HTTP 工具。使用 Express 可以快速地搭建一个完整功能的网站。
Express 官方文档
教程参考:Node.js Express 框架
安装Express
在项目目录中使用指令:
npm install express --save
来安装(--save
和-S
等效)
DEMO
// index.js
// Minimal Express server: answers GET / with a fixed greeting on port 8081.
const express = require('express');
const app = express();

// Root route: reply with a plain-text greeting.
app.get('/', (req, res) => {
  res.send('Hello World');
});

// Start listening and print the bound address once the socket is ready.
const server = app.listen(8081, () => {
  const { address: host, port } = server.address();
  console.log("应用实例,访问地址为 http://%s:%s", host, port);
});
启动
$ node index.js
应用实例,访问地址为 http://0.0.0.0:8081
在浏览器中访问localhost:8081
会看到喜闻乐见的Hello World
Express在爬虫中的应用
当我们遇到一些JS加密函数的时候,可以不用尝试完全用Python改写,而是直接利用它,因为我们仅仅需要加密后的结果。因此,如何让JS源码运行并把结果交给Python才是我们关心的。除了使用Python的第三方库,我们还可以使用Express这个基于Node.js环境的服务器去运行JS函数,然后提供API给Python调用
简易的Python调用API示例
//index.js
var express = require('express');
var app = express();
/**
 * Stand-in for the JS encryption routine whose output the Python side needs.
 * @returns {string} The placeholder "encrypted" value.
 */
function target_func() {
  const secret = 'I AM SECRET';
  return secret;
}
// GET / : run the encryption function and send its result as the response body.
app.get('/', function (req, res) {
  // Fixed typo: the original called `taget_func`, which is not defined and
  // made every request fail with a ReferenceError.
  const data = target_func();
  res.send(data);
});
// Start the HTTP server on port 8081 and print the bound address once it is listening.
const server = app.listen(8081, () => {
  const { address: host, port } = server.address();
  console.log("应用实例,访问地址为 http://%s:%s", host, port);
});
此时便可以用Python获取
import requests

# The Express server above listens on port 8081 (the original example used 8001,
# which nothing is bound to). `localhost` is used as the destination because
# 0.0.0.0 is a bind address and is not a portable connect target.
res = requests.get('http://localhost:8081/')
print(res.text)
输出结果:
I AM SECRET
参数传递
继续之前例子,新增以下函数:
// GET /params : echo the parsed query string back to the caller.
app.get('/params', (req, res) => {
  // req.query is an Object built from the URL's query string.
  const query = req.query;
  res.send(query);
});
尝试访问:localhost:8081/params?name=EXAMPLE
响应结果:
{"name":"EXAMPLE"}
表单传递
注意,表单传递需要安装body-parser
才能使用req.body
npm install body-parser -S
完整代码如下:
const express = require('express');
const bodyParser = require('body-parser');

const app = express();

// Parse both URL-encoded form bodies and JSON bodies into req.body.
app.use(bodyParser.urlencoded({ extended: false }));
app.use(bodyParser.json());

// POST /form : echo the parsed request body back to the caller.
app.post('/form', (req, res) => {
  const data = req.body; // parsed body, an Object
  res.send(data);
});

// Without listen() the script exits immediately and nothing can reach the route.
// Port 8081 matches the other examples and the Python client.
app.listen(8081);
而对于Python这边,如果需要传递JSON数据,则需要在请求头加入content-type
一般为application/json
,并且需要使用json.dumps()
处理字典然后传递表单。完整Python代码如下:
import requests
import json

form = {
    "name": "example"
}
# Post to the /form route that the Express script defines (the original targeted
# /getIt, which that script does not serve). The JSON content-type header makes
# the server parse the body as JSON.
res = requests.post(url='http://localhost:8081/form', data=json.dumps(form), headers={"content-type": "application/json"})
# res.text is the decoded body; res.content would print a bytes literal (b'...').
print(res.text)
响应结果:
{"name":"example"}
例子:七麦数据榜单
七麦数据榜单API有一个JS生成的加密参数analysis
,参考此篇文章:七麦数据榜单API加密参数逆向
Express 提供加密参数生成 API
在逆向文章中,收尾部分提到了名为params
的信息很关键:
"params": {
"brand": "free",
"device": "iphone",
"country": "cn",
"genre": "36",
"date": "2022-11-20",
"page": 2,
"is_rank_index": 1,
"snapshot": "19:42:05"
}
它直接反映了请求的榜单关键信息,因此我会要求Python脚本提供这些参数,而对于其它基本不会变动的信息则由express
中脚本负责(拼接、传递),所以JS代码会是如下:
负责生成参数的js文件中追加:
// Expose the analysis generator under the name `get` for require() callers.
module.exports = { get: get_analysis };
EXPRESS脚本:
const analysis = require('./qm')
const express = require('express')
const bodyParser = require('body-parser')
const app = express()
app.use(bodyParser.urlencoded({extended:false}))
app.use(bodyParser.json())
const get_analysis = analysis.get
/**
 * Build the request-config object handed to the analysis generator.
 * Everything except `params` is a fixed template (it appears to mirror an
 * axios request config — confirm against the generator's expectations).
 *
 * @param {Object} params - Ranking query parameters posted by the client, e.g.
 *   brand, device, country, genre, date, page, is_rank_index, snapshot.
 *   Used as-is; no defaults are merged in.
 * @returns {Object} The full config with `params` in its original key position.
 */
function buildRankRequest(params) {
  return {
    url: '/rank/index',
    method: 'get',
    headers: {
      "common": {
        "Accept": 'application/json, text/plain, */*',
      },
      "delete": {},
      "get": {},
      "head": {},
      "post": {
        "Content-Type": 'application/x-www-form-urlencoded',
      },
      "put": {
        "Content-Type": 'application/x-www-form-urlencoded',
      },
      "patch": {
        "Content-Type": 'application/x-www-form-urlencoded',
      },
    },
    params: params,
    baseURL: 'https://api.qimai.cn',
    transformRequest: [null],
    transformResponse: [null],
    timeout: 15000,
    withCredentials: true,
    xsrfCookieName: 'XSRF-TOKEN',
    xsrfHeaderName: 'X-XSRF-TOKEN',
    maxContentLength: -1,
    maxBodyLength: -1,
  };
}

// POST /getIt : take the ranking params from the request body and answer with
// the generated `analysis` value.
app.post('/getIt', (req, res) => {
  const data = req.body;
  console.log(data);

  const requestConfig = buildRankRequest(data);
  res.send({
    msg: 'DONE',
    analysis: get_analysis(requestConfig),
  });
});
// Start the HTTP server on port 8081 and print the bound address once it is listening.
const server = app.listen(8081, () => {
  const { address: host, port } = server.address();
  console.log("应用实例,访问地址为 http://%s:%s", host, port);
});
脚本会监听localhost:8081
,向localhost:8081/getIt
发送类似于以下表单:
{
"brand": "free",
"device": "iphone",
"country": "cn",
"genre": "36",
"date": "2022-11-20",
"page": 2,
"is_rank_index": 1,
"snapshot": "19:42:05"
}
响应结果:
{"msg":"DONE","analysis":"ezUnVisSKAN6W3ZSKQtwQSwIKhA1LT9BfF0iHi9UeBQ4DS4UIy1ZRlYkXRRjZ1xfI0dXCxtbWhgKCgVcTiFBVF1OS0gFAwNaViEaBQ=="}
Python请求API
Python简单DEMO:
import requests
import json

# Ranking parameters forwarded to the Express API, which uses them to
# generate the `analysis` value.
form = {
    "brand": "free",
    "device": "iphone",
    "country": "cn",
    "genre": "36",
    "date": "2022-11-20",
    "page": 2,
    "is_rank_index": 1,
    "snapshot": "19:42:05"
}
# Fixed: the response was assigned to `req` but read back as `res`, which
# raised a NameError. It is now consistently named `res`.
res = requests.post(url='http://localhost:8081/getIt', data=json.dumps(form), headers={"content-type": "application/json"})
res_json = res.json()
print(res_json['analysis'])
结果:
ezUnVisSKAN6W3ZSKQtwQSwIKhA1LT9BfF0iHi9UeBQ4DS4UIy1ZRlYkXRRjZ1xfI0dXCxtbWhgKCgVcTiFBVF1OS0kMAgBVUiEaBQ==
至于params
上面Python传递的params
:
{
"brand": "free",
"device": "iphone",
"country": "cn",
"genre": "36",
"date": "2022-11-20",
"page": 2,
"is_rank_index": 1,
"snapshot": "19:42:05"
}
brand
对应的是 免费榜单 类型,device
对应的是获取的是iOS商店的榜单,country
对应的是商店分区,genre
暂时不明,写死即可,is_rank_index
同理,这些参数基本不用变化,或者在爬取前根据需求手动改写一下即可,而对于date
、page
、snapshot
这三个需要在爬取时动态更新的参数,下面给出解决办法:
首先可以直接获取https://api.qimai.cn/rank/index
,请求参数如下(JSON格式展示,下同):
{
"analysis": "exs4CQYVIwNvZmETByZRQAMPNlk4WlVHUFkISwhXUgQaI0dPQEwGCQVXVlAJdkJR",
"brand": "free",
"device": "iphone",
"country": "cn",
"genre": "36"
}
只根据四个参数尝试生成analysis
(利用上面EXPRESS脚本API):
POST http://0.0.0.0:8081/getIt
content-type: application/json
{
"brand": "free",
"device": "iphone",
"country": "cn",
"genre": "36"
}
生成结果:
{
"msg": "DONE",
"analysis": "exs4CQYVIwNvZmETByZRQAMPNlk4WlVHUFkISwhXUgQaI0dPQEwAAgRWVlUJdkJR"
}
可以看到,生成结果与上面抓到的analysis基本一致(仅靠近末尾的少数字符不同,推测与生成时刻有关)
请求响应结果:
{
"code": 10000,
"msg": "\u6210\u529f",
"downloadVip": false,
"maxPage": 4,
"rankInfo": [...],
"snapshot": "14:54:04",
"tag": 0,
"is_logout": 0
}
其中rankInfo
是返回的榜单数据(即第一页数据),此处省略。
该返回数据包含了两个较为有用的信息。首先是snapshot
的值,对应的是网页上14:54~至今
,而maxPage
则是指明该分区下最大的数据页。
如果需要获取其它时间分区下的数据,如上图的01:31
,可以请求以下https://api.qimai.cn/rank/indexSnapshot
,参数如下:
{
"analysis": "ezUnGik8MAN5ZXIbKDVwTCwYOQ4aFCBMa2A1FAJ+XhcADjFFOVkaQ1YND05QWAUHGzcWGApGWVgXJEIOD1dRW1ZMQE4FcRRQ",
"brand": "free",
"device": "iphone",
"country": "cn",
"genre": "36",
"date": "2022-11-23",
"page": "1",
"is_rank_index": "1"
}
这里省略生成analysis
测试,参考上面
响应结果:
{
"code": 10000,
"msg": "\u6210\u529f",
"timeData": [
{
"id": "5839748",
"btn": "01:31",
"up_num": "24",
"down_num": "0",
"new_num": "1",
"out_num": "1",
"date": "2022-11-23",
"param": "01:31:03",
"clear": ""
},
{
"id": "5839758",
"btn": "01:46",
"up_num": "0",
"down_num": "19",
"new_num": "1",
"out_num": "1",
"date": "2022-11-23",
"param": "01:46:04",
"clear": ""
},
{
"id": "5839770",
"btn": "02:49",
"up_num": "67",
"down_num": "87",
"new_num": "2",
"out_num": "2",
"date": "2022-11-23",
"param": "02:49:04",
"clear": ""
},
{
"id": "5839899",
"btn": "04:46",
"up_num": "0",
"down_num": "127",
"new_num": "2",
"out_num": "2",
"date": "2022-11-23",
"param": "04:46:04",
"clear": ""
},
{
"id": "5839954",
"btn": "05:47",
"up_num": "53",
"down_num": "68",
"new_num": "1",
"out_num": "1",
"date": "2022-11-23",
"param": "05:47:03",
"clear": ""
},
{
"id": "5840022",
"btn": "07:50",
"up_num": "0",
"down_num": "81",
"new_num": "4",
"out_num": "4",
"date": "2022-11-23",
"param": "07:50:03",
"clear": ""
},
{
"id": "5840038",
"btn": "08:48",
"up_num": "88",
"down_num": "61",
"new_num": "1",
"out_num": "1",
"date": "2022-11-23",
"param": "08:48:04",
"clear": ""
},
{
"id": "5840159",
"btn": "09:35",
"up_num": "91",
"down_num": "91",
"new_num": "3",
"out_num": "3",
"date": "2022-11-23",
"param": "09:35:03",
"clear": ""
},
{
"id": "5840246",
"btn": "11:50",
"up_num": "91",
"down_num": "72",
"new_num": "5",
"out_num": "5",
"date": "2022-11-23",
"param": "11:50:04",
"clear": ""
},
{
"id": "5840388",
"btn": "14:54",
"up_num": "90",
"down_num": "72",
"new_num": "4",
"out_num": "4",
"date": "2022-11-23",
"param": "14:54:04",
"clear": ""
}
],
"is_logout": 0
}
可以看到已经获取到其对应值了:
{
"id": "5839748",
"btn": "01:31",
"up_num": "24",
"down_num": "0",
"new_num": "1",
"out_num": "1",
"date": "2022-11-23",
"param": "01:31:03",
"clear": ""
}
即01:31:03
,将其加入params
即可
现在三个参数获取途径均已知晓,通过代码获取拼接即可。