There is not much to say about this one; the main takeaway is that the pyquery library is very handy, letting you parse a web page much as if you were manipulating the DOM.
Main features:
- Save each crawled page to disk first, then parse the local copy, so the same page is never requested twice.
- Save the parsed results to MongoDB.
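Before the full script, here is a tiny illustration of that DOM-like feel. This is a minimal sketch: the HTML fragment is made up for the example, but the .item / .title selectors mirror the ones the script below uses against the Douban page.

from pyquery import PyQuery as pq

# a made-up fragment, just to show the jQuery-style selector API
html = '<div class="item"><span class="title">Some Movie</span></div>'
e = pq(html)
print(e('.item .title').text())  # -> Some Movie

The full script: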
import os

import requests
import pymongo
from pyquery import PyQuery as pq
class Model(object):
    """
    Base class, used to display an instance's attributes.
    """
    def __repr__(self):
        name = self.__class__.__name__
        properties = ('{}=({})'.format(k, v) for k, v in self.__dict__.items())
        s = '\n<{} \n {}>'.format(name, '\n '.join(properties))
        return s
class Movie(Model):
    """
    Holds the information of one movie.
    """
    def __init__(self):
        self.name = ''
        self.score = 0
        self.quote = ''
        self.cover_url = ''
        self.ranking = 0
def cached_url(url):
    """
    Cache pages on disk to avoid downloading the same page twice.
    """
    folder = 'cached'
    filename = url.split('=', 1)[-1] + '.html'
    path = os.path.join(folder, filename)
    if os.path.exists(path):
        # cache hit: return the saved bytes
        with open(path, 'rb') as f:
            s = f.read()
            return s
    else:
        # create the cached folder on first use
        if not os.path.exists(folder):
            os.makedirs(folder)
        headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36',
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        }
        # send the request and write the response body to the cache file
        r = requests.get(url, headers=headers)
        with open(path, 'wb') as f:
            f.write(r.content)
        return r.content
def movie_from_div(div):
    """
    Extract one movie's information from a single .item div.
    """
    e = pq(div)
    # single-letter name for a small scope
    m = Movie()
    m.name = e('.title').text()
    m.score = e('.rating_num').text()
    m.quote = e('.inq').text()
    m.cover_url = e('img').attr('src')
    m.ranking = e('.pic').find('em').text()
    return m
def movies_from_url(url):
    """
    Download the page at url (via the cache) and parse out every movie on it.
    """
    page = cached_url(url)
    e = pq(page)
    # each movie lives in a div with class "item"
    items = e('.item')
    # build a Movie from every div with a list comprehension
    movies = [movie_from_div(i) for i in items]
    return movies
def download_image(url, file):
    """
    Download a cover image and save it under img/ using the movie name.
    """
    folder = 'img'
    # the name may contain "/" (alternate titles); keep only the part before it
    name = file.split('/')[0] + '.jpg'
    path = os.path.join(folder, name)
    if not os.path.exists(folder):
        os.makedirs(folder)
    # skip images that have already been downloaded
    if os.path.exists(path):
        return
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    }
    # send the request and write the image bytes to the file
    r = requests.get(url, headers=headers)
    with open(path, 'wb') as f:
        f.write(r.content)
def savemovies(movies):
    """
    Save the movies to MongoDB.
    """
    connection = pymongo.MongoClient()
    DoubanMovies_db = connection.DoubanMovies_db
    Movietable = DoubanMovies_db.movies
    for m in movies:
        movie = {}
        movie['name'] = m.name
        movie['score'] = m.score
        movie['quote'] = m.quote
        movie['ranking'] = m.ranking
        movie['cover_url'] = m.cover_url
        Movietable.insert_one(movie)
def main():
    for i in range(0, 250, 25):
        url = 'https://movie.douban.com/top250?start={}'.format(i)
        movies = movies_from_url(url)
        savemovies(movies)
        print('top250 movies', movies)
        # download the cover image for every movie on this page
        for m in movies:
            download_image(m.cover_url, str(m.name))


if __name__ == '__main__':
    main()
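To double-check what savemovies wrote, here is a quick read-back sketch. It assumes a MongoDB instance is running locally on the default port, which is also what the bare MongoClient() call in savemovies connects to; the database and collection names are the ones used above.

import pymongo

connection = pymongo.MongoClient()
# print the first few saved documents, in insertion order
for doc in connection.DoubanMovies_db.movies.find().limit(3):
    print(doc['ranking'], doc['name'], doc['score'])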