Python

爬取网页上的情景对话

Jaydon · 2月13日 · 2019年 ·

爬取多段情景对话,网址如下:https://fangj.github.io/friends/

代码如下

import requests
from pyquery import PyQuery as pq


def get_html(url, timeout=10):
    """Fetch a single page and return its body as text.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        The response body decoded as text, or ``None`` when the request
        failed (connection error, timeout, or a non-2xx status). The failure
        is printed rather than raised so callers can skip the page.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:57.0) Gecko/20100101 Firefox/57.0"
    }
    try:
        r = requests.get(url=url, headers=headers, timeout=timeout)
        # Turn 4xx/5xx responses into an exception so they are reported
        # the same way as network failures.
        r.raise_for_status()
    except requests.RequestException as e:
        # Best-effort scraping: report the problem and signal it with None.
        print(repr(e))
        return None
    return r.text


def get_chat_url(url):
    """Yield the href of every dialogue link found on the index page.

    The links are taken from ``<a>`` elements nested in ``<ul><li>`` items
    and are yielded exactly as written in the page (they are not resolved
    against ``url``).

    Args:
        url: Address of the index page listing the dialogues.

    Yields:
        The ``href`` attribute of each matching anchor. Anchors without an
        ``href`` are skipped. Nothing is yielded when the index page could
        not be downloaded.
    """
    response = get_html(url)
    if not response:
        # get_html returns None on failure; there is nothing to parse.
        return
    doc = pq(response)
    for item in doc("ul li a").items():
        href = item.attr("href")
        if href:
            yield href


def write_text(content: list):
    """Append the given lines to ``dialogue.txt``.

    Each entry of ``content`` is written on its own line, and a single empty
    line is written afterwards so that consecutive calls produce blocks
    separated by a blank line. The file is opened in append mode with UTF-8
    encoding and is created if it does not exist.

    Args:
        content: Strings to write, one per output line.
    """
    with open("dialogue.txt", "a+", encoding="utf8") as out:
        out.writelines(entry + "\n" for entry in content)
        # Blank separator line after each block of dialogue.
        out.write("\n")


if __name__ == '__main__':
    # Script entry point: download every dialogue page linked from the index
    # and append the text of each page, from its first "Scene" paragraph
    # onwards, to the output file via write_text.
    from urllib.parse import urljoin

    base_url = "https://fangj.github.io/friends/"
    # Resolve each link against the index page. Unlike plain string
    # concatenation, urljoin also handles absolute and root-relative hrefs.
    page_urls = [urljoin(base_url, href) for href in get_chat_url(base_url) if href]
    for page_url in page_urls:
        html = get_html(page_url)
        if not html:
            # The download failed (already reported by get_html); skip it.
            continue
        doc = pq(html)
        recording = False
        lines = []
        for paragraph in doc("p").items():
            text = paragraph.text()
            # The dialogue starts at the first paragraph containing the
            # keyword "Scene"; everything before it is ignored.
            if "Scene" in text:
                recording = True
            if recording:
                # Collapse each paragraph onto a single output line.
                lines.append(text.replace("\n", ""))
        write_text(lines)
1 条回应
  1. Jaydon · 2019-7-21 · 10:01