Caching means saving the downloaded page to disk so the same page never has to be downloaded twice.
import os
import requests
from pyquery import PyQuery as pq
"""
This crawler walks all 10 pages and scrapes every TOP250 movie.
It also caches each page on disk,
so we never repeat a request (network requests waste a lot of time).
Caching gives us two benefits:
1. When we add new fields later (say, the number of raters), we don't have to hit the network again.
2. When something goes wrong, we have the original data to check against (e.g. 消失的愛人 (Gone Girl) has no quote).
"""
# base class: gives every subclass a readable repr that lists all of its attributes
class Model:
    def __repr__(self):
        name = self.__class__.__name__
        properties = ('{}=({})'.format(k, v) for k, v in self.__dict__.items())
        s = '\n<{} \n {}>'.format(name, '\n '.join(properties))
        return s
# one scraped movie; the fields match exactly what movie_from_div extracts
class Movie(Model):
    def __init__(self):
        self.name = ''
        self.other = ''
        self.score = 0
        self.quote = ''
        self.cover_url = ''
        self.ranking = 0
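For example, printing a freshly constructed Movie produces the following (per the format string in Model.__repr__):

print(Movie())
# <Movie
#  name=()
#  other=()
#  score=(0)
#  quote=()
#  cover_url=()
#  ranking=(0)>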
def cached_page(url):
    folder = 'cacheddouban'
    if not os.path.exists(folder):
        os.makedirs(folder)
    # https://movie.douban.com/top250?start=100
    # name the cache file after the start parameter, e.g. 100.html
    filename = '{}.html'.format(url.split('=', 1)[-1])
    path = os.path.join(folder, filename)
    print(path)
    if os.path.exists(path):
        # cache hit: read the saved page from disk
        with open(path, 'rb') as f:
            s = f.read()
            return s
    else:
        # cache miss: send the network request and write the result into the folder
        r = requests.get(url)
        with open(path, 'wb') as f:
            f.write(r.content)
        return r.content
- If the target folder doesn't exist, create it first
- os.path is the standard-library module for path handling; os.path.exists() checks whether a path is already on disk
- Each page is stored in its own file, named after the url (the part after the first '=', i.e. the start parameter)
- os.path.join() joins the folder and filename into one path (handling the separator for us)
- If that path already exists, just read the file and return its contents
- If it doesn't, fetch the page and save the content into the file
Note:
s == r.content (the bytes read back equal the bytes written, since the file is opened in binary mode both times)
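A quick sketch of this behaviour (the url below is just the first TOP250 page):

url = 'https://movie.douban.com/top250?start=0'
first = cached_page(url)   # cache miss: downloads and writes cacheddouban/0.html
second = cached_page(url)  # cache hit: served from disk, no network request
assert first == second     # s == r.content, as noted above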
def movie_from_div(div):
    e = pq(div)
    m = Movie()
    # .text() always returns a string, so score and ranking end up as strings here
    m.name = e('.title').text()
    m.other = e('.other').text()
    m.score = e('.rating_num').text()
    m.quote = e('.inq').text()
    m.cover_url = e('img').attr('src')
    m.ranking = e('.pic').find('em').text()
    return m
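If we later want the number of raters (benefit 1 from the docstring), a line like the one below would go inside movie_from_div. The selector is an assumption about Douban's markup, not something verified here; inspect a cached file in cacheddouban/ first:

# hypothetical extra field -- the '.star span' index is an assumption, check the cached HTML
m.number_of_raters = e('.star span').eq(3).text()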
def movies_from_url(url):
    page = cached_page(url)
    e = pq(page)
    # every movie sits in a div with class 'item'
    items = e('.item')
    # call movie_from_div on each div
    movies = [movie_from_div(i) for i in items]
    return movies
def main():
    # 10 pages, 25 movies per page: start = 0, 25, ..., 225
    for i in range(0, 250, 25):
        url = 'https://movie.douban.com/top250?start={}'.format(i)
        movies = movies_from_url(url)
        print('top250 movies', movies)

if __name__ == '__main__':
    main()
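And benefit 2 in action: once the pages are cached we can re-examine the raw HTML offline, e.g. to find movies that lack a quote (消失的愛人 is one). '0.html' below is just the first cached page; pick whichever file you want to inspect:

# minimal offline sketch: reparse a cached page without touching the network
with open(os.path.join('cacheddouban', '0.html'), 'rb') as f:
    for div in pq(f.read())('.item'):
        m = movie_from_div(div)
        if not m.quote:  # no .inq element -> .text() returns ''
            print('missing quote:', m.name)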