python-爬蟲-css提取-寫入csv-爬取貓眼電影榜單

2023-04-05 21:00:19

貓眼有一個電影榜單 TOP100,我們將榜單上的電影資料(電影名、主演、上映時間、評分)抓下來,儲存成可以用 Excel 開啟的 CSV 檔案

本案例使用css方式提取頁面資料,所以會用到以下庫

import time
import requests
import parsel
#解析庫,解析css
import csv
#爬取的資料寫入csv

建立csv檔案檔頭資訊,也就是表格第一排內容

# Open the output CSV. newline='' lets the csv module control line endings.
# mode='w' starts a fresh file on every run; with mode='a' each re-run would
# append another header row followed by a duplicate set of movie rows.
f = open('book.csv', mode='w', encoding='utf-8', newline='')
# Column headers: these become the first row of the table.
csv_writer = csv.DictWriter(f, fieldnames=['電影名字', '主演', '上映時間', '評分'])
csv_writer.writeheader()

 

 

分析地址:每一頁地址的區別在最後一個「=」號後面的數字(offset 參數),第一頁是「0」,第二頁是「10」,第三頁是「20」,以此類推到第十頁的「90」,所以寫個迴圈翻頁

https://www.maoyan.com/board/4?timeStamp=1680685769327&channelId=40011&index=8&signKey=6fa9e474efd1ed595c394e9bc497cdaf&sVersion=1&webdriver=false&offset=10

https://www.maoyan.com/board/4?timeStamp=1680685769327&channelId=40011&index=8&signKey=6fa9e474efd1ed595c394e9bc497cdaf&sVersion=1&webdriver=false&offset=20

https://www.maoyan.com/board/4?timeStamp=1680685769327&channelId=40011&index=8&signKey=6fa9e474efd1ed595c394e9bc497cdaf&sVersion=1&webdriver=false&offset=90

# Step through the ten board pages; the offset query parameter takes the
# values 0, 10, 20, ..., 90.
for offset in range(0, 100, 10):
    # Sleep two seconds at the start of every iteration (presumably to
    # throttle the requests sent to the site).
    time.sleep(2)
    url = 'https://www.maoyan.com/board/4?timeStamp=1680685769327&channelId=40011&index=8&signKey=6fa9e474efd1ed595c394e9bc497cdaf&sVersion=1&webdriver=false&offset={}'.format(offset)
    # Show which page is about to be processed.
    print(url)

分析頁面,找到需要的資料

 

 提取資料指令碼如下(其中的 headers 是請求標頭字典,定義見文末的「全部程式碼」)

    # Download the page. The timeout keeps a stalled connection from
    # blocking the script forever.
    response = requests.get(url, headers=headers, timeout=10)
    selector = parsel.Selector(response.text)
    # Every <dd> inside .board-wrapper is processed as one movie entry.
    li_s = selector.css('.board-wrapper dd')
    for li in li_s:
        # Movie title
        name = li.css('.name a::text').get()
        # Starring actors. default='' avoids calling strip() on None when
        # the node is missing from an entry.
        star = li.css('.star::text').get(default='')
        # strip() removes leading/trailing whitespace and newlines.
        star_string = star.strip()
        # Release date
        releasetime = li.css('.releasetime::text').get(default='')
        data_time = releasetime.strip()
        # The text of all <i> nodes under .score is collected and joined
        # into a single string; on str.join see
        # https://blog.csdn.net/weixin_50853979/article/details/125119368
        follow = li.css('.score i::text').getall()
        score = ''.join(follow)

最後將獲取到的資料字典化後存到csv檔案中

   dit = {
            '電影名字': name,
            '主演': star_string,
            '上映時間': data_time,
            '評分': score,
        }
        csv_writer.writerow(dit)

執行後csv檔案的內容

 

 全部程式碼

import time
import requests
import parsel
#解析庫,解析css
import csv
#爬取的資料寫入csv

# Request headers are the same for every page, so they are built once.
# NOTE(review): the Cookie value looks like it was captured from a browser
# session and will likely expire - replace it with your own if requests fail.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36',
    'Cookie': '__mta=20345351.1670903159717.1670903413872.1670903436333.5; uuid_n_v=v1; uuid=A8065B807A9811ED82C293D7E110319C9B09821067E1411AB6F4EC82889E1869; _csrf=916b8446658bd722f56f2c092eaae35ea3cd3689ef950542e202b39ddfe7c91e; Hm_lvt_703e94591e87be68cc8da0da7cbd0be2=1670903160; _lxsdk_cuid=1850996db5dc8-07670e36da28-26021151-1fa400-1850996db5d67; _lxsdk=A8065B807A9811ED82C293D7E110319C9B09821067E1411AB6F4EC82889E1869; __mta=213622443.1670903327420.1670903417327.1670903424017.4; Hm_lpvt_703e94591e87be68cc8da0da7cbd0be2=1670903436; _lxsdk_s=1850996db5e-8b2-284-88a%7C%7C18',
    'Host': 'www.maoyan.com',
    'Referer': 'https://www.maoyan.com/films/1200486',
}

# mode='w' starts a fresh file on each run; with mode='a' every re-run
# appended another header row plus a duplicate set of movie rows.
# The with-block guarantees the file is flushed and closed even when a
# request or parse step raises. newline='' lets the csv module control
# line endings.
# NOTE(review): if the file is meant to be opened directly in Excel,
# encoding='utf-8-sig' may be needed for the Chinese text to display
# correctly - confirm before changing.
with open('book.csv', mode='w', encoding='utf-8', newline='') as f:
    csv_writer = csv.DictWriter(f, fieldnames=['電影名字', '主演', '上映時間', '評分'])
    # Header row
    csv_writer.writeheader()

    # The board is paged through the offset query parameter: 0, 10, ..., 90.
    for offset in range(0, 100, 10):
        # Sleep two seconds before each request.
        time.sleep(2)
        url = 'https://www.maoyan.com/board/4?timeStamp=1680685769327&channelId=40011&index=8&signKey=6fa9e474efd1ed595c394e9bc497cdaf&sVersion=1&webdriver=false&offset={}'.format(offset)
        print(url)

        # The timeout keeps a stalled connection from blocking forever.
        response = requests.get(url, headers=headers, timeout=10)
        selector = parsel.Selector(response.text)

        # Every <dd> inside .board-wrapper is processed as one movie entry.
        for li in selector.css('.board-wrapper dd'):
            # Movie title
            name = li.css('.name a::text').get()
            # Starring actors. default='' avoids calling strip() on None
            # when the node is missing; strip() trims surrounding
            # whitespace and newlines.
            star_string = li.css('.star::text').get(default='').strip()
            # Release date
            data_time = li.css('.releasetime::text').get(default='').strip()
            # The text of all <i> nodes under .score is joined into a
            # single string; on str.join see
            # https://blog.csdn.net/weixin_50853979/article/details/125119368
            score = ''.join(li.css('.score i::text').getall())

            # One CSV row per movie, keyed by the column headers above.
            dit = {
                '電影名字': name,
                '主演': star_string,
                '上映時間': data_time,
                '評分': score,
            }
            csv_writer.writerow(dit)