Express 初步认识

Express是一个简洁而灵活的 Node.js Web应用框架, 提供了一系列强大特性帮助你创建各种 Web 应用,和丰富的 HTTP 工具。使用 Express 可以快速地搭建一个完整功能的网站。

Express 官方文档
教程参考:Node.js Express 框架

安装Express

项目目录中使用指令:
npm install express --save
来安装(--save 与 -S 等效)

DEMO

// index.js
// Minimal Express application: replies to GET / with a plain-text greeting.
const express = require('express');

const app = express();

app.get('/', (req, res) => {
  res.send('Hello World');
});

// Listen on port 8081 and report the bound address once the server is up.
const server = app.listen(8081, () => {
  const { address: host, port } = server.address();

  console.log("应用实例,访问地址为 http://%s:%s", host, port);
});

启动

$ node index.js
应用实例,访问地址为 http://0.0.0.0:8081

在浏览器中访问localhost:8081会看到喜闻乐见的Hello World

Express在爬虫中的应用

当我们遇到一些JS加密函数时候,我们可以不用去尝试完全用Python改写,而是可以直接利用它,因为我们仅仅需要加密后的结果,因此如何让JS源码运行然后得到结果给Python才是我们关心的,除了使用Python的第三方库,我们还可以使用Express这个基于Node.js环境下的服务器去跑JS函数然后提供API给Python调用

简易的Python调用API示例

// index.js
// Serves the result of a JS "encryption" function over HTTP so that another
// program (for example a Python crawler) can fetch it.
const express = require('express');

const app = express();

/**
 * Stand-in for the real encryption routine.
 * @returns {string} The value to hand back to the caller (a fixed sample here).
 */
function target_func() {
  return 'I AM SECRET';
}

app.get('/', (req, res) => {
  // The original called the misspelled `taget_func`, which raised a
  // ReferenceError on every request; call the function defined above.
  const data = target_func();
  res.send(data);
});

// Listen on port 8081 and report the bound address once the server is up.
const server = app.listen(8081, () => {
  const { address: host, port } = server.address();

  console.log("应用实例,访问地址为 http://%s:%s", host, port);
});

此时便可以用Python获取

import requests

# The Express server listens on port 8081 (the original used 8001, which
# nothing is bound to). Connect via localhost: 0.0.0.0 is a bind address and
# is not a valid destination on every platform.
res = requests.get('http://localhost:8081/')
print(res.text)

输出结果:

I AM SECRET

参数传递

继续之前例子,新增以下函数:

// Echo the parsed query-string parameters back to the caller.
app.get('/params', (req, res) => {
  const query = req.query; // parsed query string, an object
  res.send(query);
});

尝试访问:localhost:8081/params?name=EXAMPLE
响应结果:

{"name":"EXAMPLE"}

表单传递

注意,表单传递需要安装body-parser才能通过req.body读取请求体(req.body是属性而不是方法;Express 4.16及以上版本也可以直接使用内置的express.json()和express.urlencoded())
npm install body-parser -S

完整代码如下:

// Complete form-handling server: parses the request body and echoes it back.
const express = require('express');
const bodyParser = require('body-parser');

const app = express();

// Populate req.body for both URL-encoded form bodies and JSON bodies.
app.use(bodyParser.urlencoded({ extended: false }));
app.use(bodyParser.json());

// Echo the parsed request body back to the caller.
app.post('/form', (req, res) => {
  const data = req.body; // parsed body, an object
  res.send(data);
});

// The original snippet never called listen(), so the process exited at once
// and no client could reach the route. Listen on 8081 like the other examples.
const server = app.listen(8081, () => {
  const { address: host, port } = server.address();

  console.log("应用实例,访问地址为 http://%s:%s", host, port);
});

而对于Python这边,如果需要传递JSON数据,则需要在请求头中加入content-type(一般为application/json),并且需要先用json.dumps()把字典序列化为JSON字符串,再作为请求体发送。完整Python代码如下:

import requests
import json

form = {
    "name": "example"
}

# Post to the /form route registered by the Express script above (the original
# targeted /getIt, which that script does not define). The body is sent as a
# JSON string, so the content-type header must say so.
res = requests.post(url='http://localhost:8081/form',data=json.dumps(form),headers={"content-type": "application/json"})
# res.text is the decoded body; res.content would print a bytes literal (b'...').
print(res.text)

响应结果:

{"name":"example"}

例子:七麦数据榜单

七麦数据榜单API有一个JS生成的加密参数analysis,参考此篇文章:七麦数据榜单API加密参数逆向

Express 提供加密参数生成 API

在逆向文章中,收尾部分提到了名为params的信息很关键:

  "params": {
        "brand": "free",
        "device": "iphone",
        "country": "cn",
        "genre": "36",
        "date": "2022-11-20",
        "page": 2,
        "is_rank_index": 1,
        "snapshot": "19:42:05"
 }

它直接反映了请求的榜单关键信息,因此我会要求Python脚本提供这些参数,而对于其它基本不会变动的信息则由express中脚本负责(拼接、传递),所以JS代码会是如下:

负责生成参数的js文件中追加:

// Expose the analysis generator to other modules under the name `get`.
module.exports = { get: get_analysis };

EXPRESS脚本:

// Express service that turns caller-supplied ranking params into the
// `analysis` value by delegating to the generator exported from ./qm.
const express = require('express');
const bodyParser = require('body-parser');
const { get: get_analysis } = require('./qm');

const app = express();

// Populate req.body for both URL-encoded form bodies and JSON bodies.
app.use(bodyParser.urlencoded({ extended: false }));
app.use(bodyParser.json());

/**
 * Build a fresh request-description object for get_analysis.
 *
 * Everything except `params` is fixed. `params` is taken verbatim from the
 * caller and is not merged with any defaults, so the generator only sees the
 * keys the caller actually sent (e.g. brand, device, country, genre, date,
 * page, is_rank_index, snapshot).
 *
 * @param {object} params - Ranking parameters received in the request body.
 * @returns {object} A new object on every call; nothing is shared between requests.
 */
function buildRequestConfig(params) {
  const formContentType = "application/x-www-form-urlencoded";

  return {
    "url": "/rank/index",
    "method": "get",
    "headers": {
      "common": { "Accept": "application/json, text/plain, */*" },
      "delete": {},
      "get": {},
      "head": {},
      "post": { "Content-Type": formContentType },
      "put": { "Content-Type": formContentType },
      "patch": { "Content-Type": formContentType }
    },
    "params": params,
    "baseURL": "https://api.qimai.cn",
    "transformRequest": [null],
    "transformResponse": [null],
    "timeout": 15000,
    "withCredentials": true,
    "xsrfCookieName": "XSRF-TOKEN",
    "xsrfHeaderName": "X-XSRF-TOKEN",
    "maxContentLength": -1,
    "maxBodyLength": -1
  };
}

// POST /getIt: log the incoming params, then reply with the generated analysis.
app.post('/getIt', (req, res) => {
  const payload = req.body;
  console.log(payload);

  const requestConfig = buildRequestConfig(payload);

  res.send({
    "msg": "DONE",
    "analysis": get_analysis(requestConfig)
  });
});

// Listen on port 8081 and report the bound address once the server is up.
const server = app.listen(8081, () => {
  const { address: host, port } = server.address();

  console.log("应用实例,访问地址为 http://%s:%s", host, port);
});

脚本会监听localhost:8081,向localhost:8081/getIt发送类似于以下表单:

{
    "brand": "free",
    "device": "iphone",
    "country": "cn",
    "genre": "36",
    "date": "2022-11-20",
    "page": 2,
    "is_rank_index": 1,
    "snapshot": "19:42:05"
}

响应结果:

{"msg":"DONE","analysis":"ezUnVisSKAN6W3ZSKQtwQSwIKhA1LT9BfF0iHi9UeBQ4DS4UIy1ZRlYkXRRjZ1xfI0dXCxtbWhgKCgVcTiFBVF1OS0gFAwNaViEaBQ=="}

Python请求API

Python简单DEMO:

import requests
import json

# Ranking parameters forwarded to the Express /getIt endpoint.
form = {
    "brand": "free",
    "device": "iphone",
    "country": "cn",
    "genre": "36",
    "date": "2022-11-20",
    "page": 2,
    "is_rank_index": 1,
    "snapshot": "19:42:05"
}


# The original stored the response in `req` but then read `res`, which raised
# a NameError; use one name for the response throughout.
res = requests.post(url='http://localhost:8081/getIt',data=json.dumps(form),headers={"content-type": "application/json"})
res_json = res.json()
print(res_json['analysis'])

结果:

ezUnVisSKAN6W3ZSKQtwQSwIKhA1LT9BfF0iHi9UeBQ4DS4UIy1ZRlYkXRRjZ1xfI0dXCxtbWhgKCgVcTiFBVF1OS0kMAgBVUiEaBQ==

至于params

上面Python传递的params

{
    "brand": "free",
    "device": "iphone",
    "country": "cn",
    "genre": "36",
    "date": "2022-11-20",
    "page": 2,
    "is_rank_index": 1,
    "snapshot": "19:42:05"
}

brand对应的是免费榜单类型,device对应的是所获取的iOS商店榜单,country对应的是商店分区,genre暂时不明,写死即可,is_rank_index同理。这些参数基本不用变化,或者在爬取前根据需求手动改写一下即可。而对于date、page、snapshot这三个需要在爬取时动态更新的参数,下面给出解决办法:

首先可以直接获取https://api.qimai.cn/rank/index,请求参数如下(JSON格式展示,下同):

{
    "analysis": "exs4CQYVIwNvZmETByZRQAMPNlk4WlVHUFkISwhXUgQaI0dPQEwGCQVXVlAJdkJR",
    "brand": "free",
    "device": "iphone",
    "country": "cn",
    "genre": "36"
}

只根据四个参数尝试生成analysis(利用上面EXPRESS脚本API):

POST http://0.0.0.0:8081/getIt
content-type: application/json

{
    "brand": "free",
    "device": "iphone",
    "country": "cn",
    "genre": "36"
}

生成结果:

{
  "msg": "DONE",
  "analysis": "exs4CQYVIwNvZmETByZRQAMPNlk4WlVHUFkISwhXUgQaI0dPQEwAAgRWVlUJdkJR"
}

可以看到,除靠近末尾的少数几个字符不同(推测与生成时间有关)之外,生成结果与浏览器请求中的analysis一致

请求响应结果:

{
    "code": 10000,
    "msg": "\u6210\u529f",
    "downloadVip": false,
    "maxPage": 4,
    "rankInfo": [...],
    "snapshot": "14:54:04",
    "tag": 0,
    "is_logout": 0
}

其中rankInfo是返回的榜单数据(即第一页数据),此处省略。
该返回数据包含了两个较为有用的信息。首先是snapshot的值,对应的是网页上14:54~至今,而maxPage则是指明该分区下最大的数据页。
BANNER

如果需要获取其它时间分区下的数据,如上图的01:31,可以请求以下接口:https://api.qimai.cn/rank/indexSnapshot,参数如下:

{
    "analysis": "ezUnGik8MAN5ZXIbKDVwTCwYOQ4aFCBMa2A1FAJ+XhcADjFFOVkaQ1YND05QWAUHGzcWGApGWVgXJEIOD1dRW1ZMQE4FcRRQ",
    "brand": "free",
    "device": "iphone",
    "country": "cn",
    "genre": "36",
    "date": "2022-11-23",
    "page": "1",
    "is_rank_index": "1"
}

这里省略生成analysis测试,参考上面

响应结果:

{
    "code": 10000,
    "msg": "\u6210\u529f",
    "timeData": [
        {
            "id": "5839748",
            "btn": "01:31",
            "up_num": "24",
            "down_num": "0",
            "new_num": "1",
            "out_num": "1",
            "date": "2022-11-23",
            "param": "01:31:03",
            "clear": ""
        },
        {
            "id": "5839758",
            "btn": "01:46",
            "up_num": "0",
            "down_num": "19",
            "new_num": "1",
            "out_num": "1",
            "date": "2022-11-23",
            "param": "01:46:04",
            "clear": ""
        },
        {
            "id": "5839770",
            "btn": "02:49",
            "up_num": "67",
            "down_num": "87",
            "new_num": "2",
            "out_num": "2",
            "date": "2022-11-23",
            "param": "02:49:04",
            "clear": ""
        },
        {
            "id": "5839899",
            "btn": "04:46",
            "up_num": "0",
            "down_num": "127",
            "new_num": "2",
            "out_num": "2",
            "date": "2022-11-23",
            "param": "04:46:04",
            "clear": ""
        },
        {
            "id": "5839954",
            "btn": "05:47",
            "up_num": "53",
            "down_num": "68",
            "new_num": "1",
            "out_num": "1",
            "date": "2022-11-23",
            "param": "05:47:03",
            "clear": ""
        },
        {
            "id": "5840022",
            "btn": "07:50",
            "up_num": "0",
            "down_num": "81",
            "new_num": "4",
            "out_num": "4",
            "date": "2022-11-23",
            "param": "07:50:03",
            "clear": ""
        },
        {
            "id": "5840038",
            "btn": "08:48",
            "up_num": "88",
            "down_num": "61",
            "new_num": "1",
            "out_num": "1",
            "date": "2022-11-23",
            "param": "08:48:04",
            "clear": ""
        },
        {
            "id": "5840159",
            "btn": "09:35",
            "up_num": "91",
            "down_num": "91",
            "new_num": "3",
            "out_num": "3",
            "date": "2022-11-23",
            "param": "09:35:03",
            "clear": ""
        },
        {
            "id": "5840246",
            "btn": "11:50",
            "up_num": "91",
            "down_num": "72",
            "new_num": "5",
            "out_num": "5",
            "date": "2022-11-23",
            "param": "11:50:04",
            "clear": ""
        },
        {
            "id": "5840388",
            "btn": "14:54",
            "up_num": "90",
            "down_num": "72",
            "new_num": "4",
            "out_num": "4",
            "date": "2022-11-23",
            "param": "14:54:04",
            "clear": ""
        }
    ],
    "is_logout": 0
}

可以看到已经获取到其对应值了:

   {
        "id": "5839748",
        "btn": "01:31",
        "up_num": "24",
        "down_num": "0",
        "new_num": "1",
        "out_num": "1",
        "date": "2022-11-23",
        "param": "01:31:03",
        "clear": ""
}

其中param字段的值01:31:03就是所需的snapshot,将其加入params即可

现在三个参数获取途径均已知晓,通过代码获取拼接即可。