import re
import csv
import time

import requests
from requests.packages import urllib3

# Suppress the InsecureRequestWarning triggered by verify=False below.
urllib3.disable_warnings()

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36 Edg/99.0.1150.36'
}


def match(mode: str) -> list:
    # Apply a regex to the most recently fetched page (the module-level `html`).
    return re.findall(mode, html, re.S)


def writer_csv(data_list):
    '''Append one row of scraped data to data.csv.'''
    with open('./data.csv', 'a+', newline='', encoding='utf-8-sig') as f:
        writer = csv.writer(f)
        writer.writerow(data_list)


def get_detail(detail_url_list):
    '''Scrape each detail page for its URL, movie title, rating, and plot summary.'''
    global html
    for detail in detail_url_list:
        detail_url = f'https://ssr4.scrape.center{detail}'
        print(f'Scraping detail page {detail_url}')
        time.sleep(1)  # throttle: pause one second between requests
        response = requests.get(
            detail_url, headers=headers, timeout=20, verify=False)
        html = response.text
        movies_name = match(r'class="m-b-sm">(.*?)</h2>')
        rating = match(r'm-b-n-sm">\n *(.*?)</p>')
        plot_summary = match(r'<p data-v-63864230="">\n *(.*?)\n *</p></div>')
        data_list = [
            detail_url,
            movies_name[0],
            rating[0],
            plot_summary[0]
        ]
        writer_csv(data_list)


def get_list_page(page):
    '''Scrape one list page for detail-page links, to be processed next.'''
    global html
    time.sleep(1)
    page_url = f'https://ssr4.scrape.center/page/{page}'
    response = requests.get(page_url, headers=headers, timeout=20, verify=False)
    html = response.text
    detail_url_list = match(r'data-v-7f856186="" href="(.*?)"')
    get_detail(detail_url_list)


def main():
    # Crawl list pages 1 through 10 in order.
    for page in range(1, 11):
        get_list_page(page)


if __name__ == '__main__':
    main()
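A quick note on running it: the script crawls list pages 1 through 10 of the scraping practice site, then every detail page they link to, appending one row per movie to data.csv with the columns detail-page URL, title, rating, and plot summary. Two design choices worth flagging: verify=False together with urllib3.disable_warnings() disables TLS certificate verification and silences the resulting warnings, which is acceptable for a practice site but should not be carried over to production code; and the one-second time.sleep() between requests keeps the crawl polite to the server.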