- 1、先用xpath定位到该script标签,并且抽取出文本内容
# Step 1: download the page and pull out the inline <script> that assigns
# window.__NUXT__.
# `requests` here is curl_cffi's requests-compatible module; the
# `impersonate` keyword is passed through to it as in the original call.
from curl_cffi import requests
from lxml.html import soupparser

url = 'https://bingx.com/ja-jp/events/'
res = requests.get(url, impersonate="chrome101")

# Match the text node of the <script> whose body contains "window.__NUXT__=".
pattern = '//script[contains(text(), "window.__NUXT__=")]/text()'
tree = soupparser.fromstring(res.text)
result = tree.xpath(pattern)
if result:
    js_code = result[0]
    print(js_code)
else:
    # The later steps use js_code unconditionally, so stop here with a clear
    # message instead of failing later with a NameError.
    raise SystemExit('No <script> containing "window.__NUXT__=" was found')
- 2、给解析出的原始文本在头部加上一行
window = {};\n
如果不加,执行时将会报
ReferenceError: window is not defined
的错误,因为脚本运行环境中确实没有定义 window 这个变量,所以需要先把它初始化(定义)一下
# Declare an empty `window` object ahead of the extracted script so that its
# `window.__NUXT__=` assignment has something to attach to.
window_stub = 'window = {};\n'
js_code = window_stub + js_code
- 3、用
execjs
解析该数据
import execjs

# NOTE(review): js_code is JavaScript downloaded from a remote site and it is
# executed by the local JS runtime that execjs drives. Only do this for
# sources you trust.
# Compile the script, then read back the value it assigned to window.__NUXT__.
ctx = execjs.compile(js_code)
nuxt_data = ctx.eval('window.__NUXT__')

# Pull the `items` array out of the `fetch` section.
# NOTE(review): the key below looks like a build-generated component id and
# may change when the site is redeployed -- confirm. Try it first, then fall
# back to the first fetch entry that carries an `items` field.
fetch_key = 'data-v-d36bc1ac:0'
fetch_state = nuxt_data['fetch']
entry = fetch_state.get(fetch_key) if isinstance(fetch_state, dict) else None
if not (isinstance(entry, dict) and 'items' in entry):
    candidates = fetch_state.values() if isinstance(fetch_state, dict) else fetch_state
    entry = next(
        (c for c in candidates if isinstance(c, dict) and 'items' in c),
        None,
    )
if entry is None:
    # Same exception type the original direct lookup raised on a missing key.
    raise KeyError("no fetch entry with an 'items' field in window.__NUXT__")
items = entry['items']
- 4、完整代码示例
# Complete example: fetch the BingX events page, evaluate its embedded
# window.__NUXT__ script with execjs, and print every event item.
import execjs
from curl_cffi import requests
from lxml.html import soupparser
from dateutil import parser

url = 'https://bingx.com/ja-jp/events/'
res = requests.get(url, impersonate="chrome101")

# Step 1: locate the inline <script> that assigns window.__NUXT__ and take
# its text content.
pattern = '//script[contains(text(), "window.__NUXT__=")]/text()'
tree = soupparser.fromstring(res.text)
result = tree.xpath(pattern)
if not result:
    # Everything below needs the script text; stop with a clear message
    # instead of failing later with a NameError.
    raise SystemExit('No <script> containing "window.__NUXT__=" was found')
js_code = result[0]
print(js_code)

# Step 2: declare `window` first, otherwise evaluating the script fails
# because that name is not defined in the JS runtime.
js_code = 'window = {};\n' + js_code

# Step 3: compile and run the JavaScript, then read window.__NUXT__ back.
# NOTE(review): this executes JavaScript downloaded from a remote site in the
# local JS runtime that execjs drives. Only do this for sources you trust.
ctx = execjs.compile(js_code)
nuxt_data = ctx.eval('window.__NUXT__')

# Pull the `items` array out of the `fetch` section.
# NOTE(review): the key below looks like a build-generated component id and
# may change when the site is redeployed -- confirm. Try it first, then fall
# back to the first fetch entry that carries an `items` field.
fetch_key = 'data-v-d36bc1ac:0'
fetch_state = nuxt_data['fetch']
entry = fetch_state.get(fetch_key) if isinstance(fetch_state, dict) else None
if not (isinstance(entry, dict) and 'items' in entry):
    candidates = fetch_state.values() if isinstance(fetch_state, dict) else fetch_state
    entry = next(
        (c for c in candidates if isinstance(c, dict) and 'items' in c),
        None,
    )
if entry is None:
    # Same exception type the original direct lookup raised on a missing key.
    raise KeyError("no fetch entry with an 'items' field in window.__NUXT__")
items = entry['items']

for item in items:
    print(item)
    print(f"标题: {item['title']}")
    print(f"内容: {item['content']}")
    print(f"活动URL: {item['activityUrl']}")
    # Publication time: parse the `onTime` value with isoparse and reformat it.
    on_time = item['onTime']
    dt = parser.isoparse(on_time)
    formatted_time = dt.strftime("%Y-%m-%d %H:%M:%S")
    # The original computed formatted_time but never used it; print it.
    print(f"发布时间: {formatted_time}")
解决方案来自 Grok 3 的 Think 模式;对比了 ChatGPT(GPT-4.5)和 DeepSeek 之后,Grok 3 的回答效果最好
Top comments (0)