Express 初步认识

Express是一个简洁而灵活的 Node.js Web应用框架, 提供了一系列强大特性帮助你创建各种 Web 应用,和丰富的 HTTP 工具。使用 Express 可以快速地搭建一个完整功能的网站。

Express 官方文档
教程参考:Node.js Express 框架

安装Express

项目目录中使用指令:
npm install express --save
来安装(--save 与 -S 等效)

DEMO

// index.js — minimal Express demo: one GET route plus a listening server.
const express = require('express');

const app = express();

// Answer GET / with a fixed plain-text greeting.
app.get('/', (request, response) => {
  response.send('Hello World');
});

// Listen on port 8081; once the server is ready, log the address it reports.
const server = app.listen(8081, () => {
  const { address: host, port } = server.address();
  console.log("应用实例,访问地址为 http://%s:%s", host, port);
});

启动

$ node index.js
应用实例,访问地址为 http://0.0.0.0:8081

在浏览器中访问localhost:8081会看到喜闻乐见的Hello World

Express在爬虫中的应用

当我们遇到一些JS加密函数的时候,可以不用尝试完全用Python改写,而是直接利用它。因为我们仅仅需要加密后的结果,所以如何让JS源码运行并把结果交给Python才是我们关心的。除了使用Python的第三方库,我们还可以使用Express这个基于Node.js环境的服务器去跑JS函数,然后提供API给Python调用

简易的Python调用API示例

// index.js — serves the result of a JS "encryption" function over HTTP so
// that another program (e.g. a Python script) can fetch it.
const express = require('express');

const app = express();

/**
 * Placeholder for the real encryption function.
 * @returns {string} The value sent back to clients.
 */
function target_func() {
  return 'I AM SECRET';
}

// GET / responds with whatever target_func() returns.
app.get('/', (req, res) => {
  // Must match the declared name exactly: a misspelled identifier here
  // raises a ReferenceError on every request.
  const data = target_func();
  res.send(data);
});

// Listen on port 8081; once the server is ready, log the address it reports.
const server = app.listen(8081, () => {
  const { address: host, port } = server.address();
  console.log("应用实例,访问地址为 http://%s:%s", host, port);
});

此时便可以用Python获取

import requests

# The Express script listens on port 8081, so the request must use that port.
res = requests.get('http://0.0.0.0:8081/')
print(res.text)

输出结果:

I AM SECRET

参数传递

继续之前例子,新增以下函数:

// GET /params echoes the request's query parameters back to the caller.
app.get('/params', (req, res) => {
  const { query } = req; // an Object
  res.send(query);
});

尝试访问:localhost:8081/params?name=EXAMPLE
响应结果:

{"name":"EXAMPLE"}

表单传递

注意,表单传递需要安装body-parser才能通过req.body读取请求体(req.body是属性,不是方法)
npm install body-parser -S

完整代码如下:

const express = require('express');
const bodyParser = require('body-parser');

const app = express();

// Register the urlencoded and JSON body parsers so that req.body is populated.
app.use(bodyParser.urlencoded({ extended: false }));
app.use(bodyParser.json());

// POST /form echoes the parsed request body back to the caller.
app.post('/form', (req, res) => {
  const data = req.body; // an Object
  res.send(data);
});

// Start the server: without a listen() call the script never serves requests.
const server = app.listen(8081, () => {
  const { address: host, port } = server.address();
  console.log("应用实例,访问地址为 http://%s:%s", host, port);
});

而对于Python这边,如果需要传递JSON数据,则需要在请求头加入content-type一般为application/json,并且需要使用json.dumps()处理字典然后传递表单。完整Python代码如下:

import json

import requests

form = {
    "name": "example",
}

# Post to the /form route defined by the Express script. The JSON body is
# sent with a matching content-type header.
res = requests.post(
    url='http://localhost:8081/form',
    data=json.dumps(form),
    headers={"content-type": "application/json"},
)
# res.text is the decoded body; res.content would print a bytes literal (b'...').
print(res.text)

响应结果:

{"name":"example"}

例子:七麦数据榜单

七麦数据榜单API有一个JS生成的加密参数analysis,参考此篇文章:七麦数据榜单API加密参数逆向

Express 提供加密参数生成 API

在逆向文章中,收尾部分提到了名为params的信息很关键:

 "params": {
"brand": "free",
"device": "iphone",
"country": "cn",
"genre": "36",
"date": "2022-11-20",
"page": 2,
"is_rank_index": 1,
"snapshot": "19:42:05"
}

它直接反映了请求的榜单关键信息,因此我会要求Python脚本提供这些参数,而对于其它基本不会变动的信息则由express中脚本负责(拼接、传递),所以JS代码会是如下:

负责生成参数的js文件中追加:

// Export get_analysis under the key `get` so other scripts can require() it.
module.exports = {get:get_analysis}

EXPRESS脚本:

const analysis = require('./qm')
const express = require('express')
const bodyParser = require('body-parser')

// Create the app and register the urlencoded and JSON body parsers.
const app = express();
app.use(bodyParser.urlencoded({ extended: false }));
app.use(bodyParser.json());

// Pull the `get` export of ./qm into a local name.
const { get: get_analysis } = analysis;

// POST /getIt: put the posted params into a request-config object, pass it to
// get_analysis, and reply with the generated `analysis` value.
app.post('/getIt', (req, res) => {
  const postedParams = req.body;
  console.log(postedParams);

  // Key order is kept exactly as in the original config object.
  const requestConfig = {
    url: "/rank/index",
    method: "get",
    headers: {
      common: { Accept: "application/json, text/plain, */*" },
      delete: {},
      get: {},
      head: {},
      post: { "Content-Type": "application/x-www-form-urlencoded" },
      put: { "Content-Type": "application/x-www-form-urlencoded" },
      patch: { "Content-Type": "application/x-www-form-urlencoded" },
    },
    // Placeholder values; replaced below by the posted params.
    params: {
      brand: "",
      device: "",
      country: "",
      genre: "",
      date: "",
      page: 1,
      is_rank_index: 1,
      snapshot: "",
    },
    baseURL: "https://api.qimai.cn",
    transformRequest: [null],
    transformResponse: [null],
    timeout: 15000,
    withCredentials: true,
    xsrfCookieName: "XSRF-TOKEN",
    xsrfHeaderName: "X-XSRF-TOKEN",
    maxContentLength: -1,
    maxBodyLength: -1,
  };
  requestConfig.params = postedParams;

  res.send({
    msg: "DONE",
    analysis: get_analysis(requestConfig),
  });
});


// Listen on port 8081; once the server is ready, log the address it reports.
const server = app.listen(8081, () => {
  const { address: host, port } = server.address();
  console.log("应用实例,访问地址为 http://%s:%s", host, port);
});

脚本会监听localhost:8081,向localhost:8081/getIt发送类似于以下表单:

{
"brand": "free",
"device": "iphone",
"country": "cn",
"genre": "36",
"date": "2022-11-20",
"page": 2,
"is_rank_index": 1,
"snapshot": "19:42:05"
}

响应结果:

{"msg":"DONE","analysis":"ezUnVisSKAN6W3ZSKQtwQSwIKhA1LT9BfF0iHi9UeBQ4DS4UIy1ZRlYkXRRjZ1xfI0dXCxtbWhgKCgVcTiFBVF1OS0gFAwNaViEaBQ=="}

Python请求API

Python简单DEMO:

import json

import requests

# Ranking parameters posted to the Express API.
form = {
    "brand": "free",
    "device": "iphone",
    "country": "cn",
    "genre": "36",
    "date": "2022-11-20",
    "page": 2,
    "is_rank_index": 1,
    "snapshot": "19:42:05",
}

# The response must be bound to `res`, the name read on the next line;
# binding it to a different name raises NameError.
res = requests.post(
    url='http://localhost:8081/getIt',
    data=json.dumps(form),
    headers={"content-type": "application/json"},
)
res_json = res.json()
print(res_json['analysis'])

结果:

ezUnVisSKAN6W3ZSKQtwQSwIKhA1LT9BfF0iHi9UeBQ4DS4UIy1ZRlYkXRRjZ1xfI0dXCxtbWhgKCgVcTiFBVF1OS0kMAgBVUiEaBQ==

至于params

上面Python传递的params

{
"brand": "free",
"device": "iphone",
"country": "cn",
"genre": "36",
"date": "2022-11-20",
"page": 2,
"is_rank_index": 1,
"snapshot": "19:42:05"
}

brand对应的是 免费榜单 类型,device对应的是iOS商店的榜单,country对应的是商店分区,genre暂时不明,写死即可,is_rank_index同理。这些参数基本不用变化,或者在爬取前根据需求手动改写一下即可。而对于date、page、snapshot这三个需要在爬取时动态更新的参数,下面给出解决办法:

首先可以直接获取https://api.qimai.cn/rank/index,请求参数如下(JSON格式展示,下同):

{
"analysis": "exs4CQYVIwNvZmETByZRQAMPNlk4WlVHUFkISwhXUgQaI0dPQEwGCQVXVlAJdkJR",
"brand": "free",
"device": "iphone",
"country": "cn",
"genre": "36"
}

只根据四个参数尝试生成analysis(利用上面EXPRESS脚本API):

POST http://0.0.0.0:8081/getIt
content-type: application/json

{
"brand": "free",
"device": "iphone",
"country": "cn",
"genre": "36"
}

生成结果:

{
"msg": "DONE",
"analysis": "exs4CQYVIwNvZmETByZRQAMPNlk4WlVHUFkISwhXUgQaI0dPQEwAAgRWVlUJdkJR"
}

可以看到,生成结果基本一致(仅末尾少数字符不同,推测与生成时间有关,待确认)

请求响应结果:

{
"code": 10000,
"msg": "\u6210\u529f",
"downloadVip": false,
"maxPage": 4,
"rankInfo": [...],
"snapshot": "14:54:04",
"tag": 0,
"is_logout": 0
}

其中rankInfo是返回的榜单数据(即第一页数据),此处省略。
该返回数据包含了两个较为有用的信息。首先是snapshot的值,对应的是网页上14:54~至今,而maxPage则是指明该分区下最大的数据页。
BANNER

如果需要获取其它时间分区下的数据,如上图的01:31,可以请求以下https://api.qimai.cn/rank/indexSnapshot,参数如下:

{
"analysis": "ezUnGik8MAN5ZXIbKDVwTCwYOQ4aFCBMa2A1FAJ+XhcADjFFOVkaQ1YND05QWAUHGzcWGApGWVgXJEIOD1dRW1ZMQE4FcRRQ",
"brand": "free",
"device": "iphone",
"country": "cn",
"genre": "36",
"date": "2022-11-23",
"page": "1",
"is_rank_index": "1"
}

这里省略生成analysis测试,参考上面

响应结果:

{
"code": 10000,
"msg": "\u6210\u529f",
"timeData": [
{
"id": "5839748",
"btn": "01:31",
"up_num": "24",
"down_num": "0",
"new_num": "1",
"out_num": "1",
"date": "2022-11-23",
"param": "01:31:03",
"clear": ""
},
{
"id": "5839758",
"btn": "01:46",
"up_num": "0",
"down_num": "19",
"new_num": "1",
"out_num": "1",
"date": "2022-11-23",
"param": "01:46:04",
"clear": ""
},
{
"id": "5839770",
"btn": "02:49",
"up_num": "67",
"down_num": "87",
"new_num": "2",
"out_num": "2",
"date": "2022-11-23",
"param": "02:49:04",
"clear": ""
},
{
"id": "5839899",
"btn": "04:46",
"up_num": "0",
"down_num": "127",
"new_num": "2",
"out_num": "2",
"date": "2022-11-23",
"param": "04:46:04",
"clear": ""
},
{
"id": "5839954",
"btn": "05:47",
"up_num": "53",
"down_num": "68",
"new_num": "1",
"out_num": "1",
"date": "2022-11-23",
"param": "05:47:03",
"clear": ""
},
{
"id": "5840022",
"btn": "07:50",
"up_num": "0",
"down_num": "81",
"new_num": "4",
"out_num": "4",
"date": "2022-11-23",
"param": "07:50:03",
"clear": ""
},
{
"id": "5840038",
"btn": "08:48",
"up_num": "88",
"down_num": "61",
"new_num": "1",
"out_num": "1",
"date": "2022-11-23",
"param": "08:48:04",
"clear": ""
},
{
"id": "5840159",
"btn": "09:35",
"up_num": "91",
"down_num": "91",
"new_num": "3",
"out_num": "3",
"date": "2022-11-23",
"param": "09:35:03",
"clear": ""
},
{
"id": "5840246",
"btn": "11:50",
"up_num": "91",
"down_num": "72",
"new_num": "5",
"out_num": "5",
"date": "2022-11-23",
"param": "11:50:04",
"clear": ""
},
{
"id": "5840388",
"btn": "14:54",
"up_num": "90",
"down_num": "72",
"new_num": "4",
"out_num": "4",
"date": "2022-11-23",
"param": "14:54:04",
"clear": ""
}
],
"is_logout": 0
}

可以看到已经获取到其对应值了:

   {
"id": "5839748",
"btn": "01:31",
"up_num": "24",
"down_num": "0",
"new_num": "1",
"out_num": "1",
"date": "2022-11-23",
"param": "01:31:03",
"clear": ""
}

01:31:03,将其加入params即可

现在三个参数获取途径均已知晓,通过代码获取拼接即可。