Skip to content

03 Douban

import json

import requests
from lxml import etree

# Fetch the Douban movie chart page, pull one record per table row and
# dump the collected records to ``a.json`` as ``{"movies": [...]}``.
#
# NOTE(review): the page is requested with POST, as in the original script;
# a plain page fetch is normally a GET -- confirm which one the site expects.
page = requests.post(
    "https://movie.douban.com/chart",
    headers={
        "referer": "https://movie.douban.com/explore",
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 Edg/131.0.0.0"
    },
    # Without a timeout requests may block forever on a stalled connection.
    timeout=10,
)
# Fail loudly on an HTTP error instead of parsing an error page and
# silently writing an empty movie list.
page.raise_for_status()

parsed = etree.HTML(page.text)
movies = parsed.xpath("""//table/tr""")
data = []
for movie in movies:
    cover_urls = movie.xpath("""./td[1]/a/img/@src""")
    names = movie.xpath("""./td[1]/a/img/@alt""")
    if not cover_urls or not names:
        # "//table/tr" matches every table row in the document; a row
        # without a cover image in its first cell is not a movie entry.
        continue
    cover_url = cover_urls[0]
    name = names[0]

    descriptions = movie.xpath("""./td[2]/div/p/text()""")
    description = descriptions[0] if descriptions else ""

    # The first span's class is sliced after its first 7 characters and the
    # next two digits are read as "<units>.<tenths>" -- presumably the class
    # has the form "allstarNN" (e.g. "allstar45" -> 4.5); verify against the
    # page markup. Anything else yields ``None`` (JSON ``null``) rather than
    # an IndexError/ValueError.
    star_classes = movie.xpath("""./td[2]/div/div/span[1]/@class""")
    star_digits = star_classes[0][7:] if star_classes else ""
    if len(star_digits) >= 2 and star_digits[:2].isdecimal():
        star = float(f"{star_digits[0]}.{star_digits[1]}")
    else:
        star = None

    # The score is used only when exactly one text node is found; otherwise
    # the placeholder string is stored instead.
    rating_nums = movie.xpath("""./td[2]/div/div/span[@class="rating_nums"]/text()""")
    rating_nums = rating_nums[0] if len(rating_nums) == 1 else "无评分"

    comment_texts = movie.xpath("""./td[2]/div/div/span[@class="pl"]/text()""")
    comment_nums = comment_texts[0] if comment_texts else ""

    data.append({
        "movie": name,
        "cover_url": cover_url,
        "description": description,
        "star": star,
        "rating_nums": rating_nums,
        "comment_nums": comment_nums
    })

# Use a context manager so the file is flushed and closed deterministically.
with open("a.json", 'w', encoding="utf-8") as output_file:
    output_file.write(json.dumps({"movies": data}, ensure_ascii=False))