# Analysis is not covered here for now.
import base64
import re
import time
import requests
'''抓取美拍 指定分類 視頻'''
class MeiPai:
    """Download videos of one Meipai category, page by page.

    Typical use: create an instance, set ``category_name`` to one of the keys
    of ``category``, optionally set ``DEBUG = False`` to really write files,
    then call ``start()``.
    """

    def __init__(self):
        self.home_url = 'http://www.meipai.com'
        # category name -> [category id, path of the listing endpoint].
        # The id is used to build the ``/square/<id>`` page URL and as the
        # fallback ``tid``; the path is appended to ``home_url`` in ``ajax``.
        # NOTE(review): the key '運(yùn)動(dòng)' looks corrupted by a text
        # conversion tool (probably meant to be two characters) — kept as-is
        # because callers select categories by key; confirm before renaming.
        self.category = {
            '搞笑': ['13', '/squares/new_timeline'],
            '愛豆': ['16', '/squares/new_timeline'],
            '高顏值': ['474', '/squares/new_timeline'],
            '舞蹈': ['63', '/topics/hot_timeline'],
            '精選': ['488', '/squares/new_timeline'],
            '音樂': ['62', '/topics/hot_timeline'],
            '美食': ['59', '/topics/hot_timeline'],
            '美妝': ['27', '/squares/new_timeline'],
            '吃秀': ['423', '/squares/new_timeline'],
            '寶寶': ['18', '/topics/hot_timeline'],
            '寵物': ['6', '/topics/hot_timeline'],
            '手工': ['450', '/topics/hot_timeline'],
            '游戲': ['480', '/topics/hot_timeline'],
            '運(yùn)動(dòng)': ['487', '/topics/hot_timeline'],
            '穿秀': ['460', '/topics/hot_timeline'],
        }
        self.category_name = None  # must be set to a key of ``category`` before use
        self.video_id = None  # id of the video currently being processed
        self.total = 0  # number of videos handed to ``download`` so far
        self.page = 1  # page number the listing starts from
        self.DEBUG = True  # debug mode (default): print only, never write files

    def tid(self):
        """Return the ``tid`` query value for the selected category.

        Fetches the category's ``/square/<id>`` page and reads the number that
        follows ``interested_id: `` in its source. When that number is ``0``,
        or the field is not present in the page at all, the category's own id
        is returned instead.

        :raises KeyError: if ``category_name`` is not a key of ``category``.
        """
        category_id = self.category[self.category_name][0]
        url = self.home_url + '/square/{}'.format(category_id)
        html = requests.get(url).text
        match = re.search(r'interested_id: (\d+)', html)
        if match is None or match.group(1) == '0':
            return category_id
        return match.group(1)

    def ajax(self, tid, total_page=None):
        """Yield the still-encoded ``video`` field of every listed media.

        Requests the category's listing endpoint 24 entries at a time,
        starting at ``self.page``. Before each yield ``self.video_id`` is set
        to the media's id; entries without an id are skipped. Iteration stops
        after page ``total_page`` or after the first page holding fewer than
        24 entries, whichever comes first.

        :param tid: value sent as the ``tid`` query parameter (see ``tid()``).
        :param total_page: page number after which to stop; ``None`` means
            no limit.
        """
        url = self.home_url + self.category[self.category_name][1]
        while True:
            print('正在下載第{}頁...'.format(self.page))
            params = {
                'page': self.page,
                'count': 24,
                'tid': tid,
            }
            js_data = requests.get(url, params=params).json()
            # A response without a usable 'medias' list is treated as an
            # empty (and therefore final) page instead of raising TypeError.
            medias = js_data.get('medias') or []
            for media in medias:  # at most 24 entries per page
                self.video_id = media.get('id')
                if self.video_id:
                    yield media.get('video')
            if self.page == total_page or len(medias) < 24:
                # The first placeholder used to be '{(lán)}', which
                # str.format() resolves as a keyword field and so raised
                # KeyError at the end of every run.
                print('共下載了{}頁,{}個(gè)視頻'.format(self.page, self.total))
                break
            time.sleep(1)  # pause between page requests
            self.page += 1

    #################################################
    @staticmethod
    def _drop_segment(text, start, length):
        """Return ``text`` with the ``length`` characters at ``start`` removed.

        Written with ``str.replace(..., 1)`` exactly like the original inline
        expression so that out-of-range or negative offsets behave the same.
        """
        return text[:start] + text[start:].replace(text[start:start + length], '', 1)

    @staticmethod
    def decode(code):
        """Decode an obfuscated video link into the real download URL.

        The first four characters, reversed and read as hexadecimal, give a
        decimal number. Its first two digits are the offset and length of a
        junk segment counted from the start of the remaining text; the digits
        after those describe a second junk segment counted from the end. With
        both segments removed, the remainder is Base64 for the URL.

        :param code: the link as delivered by the listing endpoint.
        :return: the decoded text.
        """
        digits = str(int(code[:4][::-1], 16))
        pre = [int(ch) for ch in digits[:2]]
        tail = [int(ch) for ch in digits[2:]]
        payload = MeiPai._drop_segment(code[4:], pre[0], pre[1])
        # The second offset is relative to the end of the text, so it can only
        # be computed once the first segment has been removed.
        tail[0] = len(payload) - sum(tail)
        payload = MeiPai._drop_segment(payload, tail[0], tail[1])
        return base64.b64decode(payload).decode()

    def download(self, video_url):
        """Announce ``video_url`` and, unless ``DEBUG`` is set, save it.

        With ``DEBUG`` true (the default) nothing is fetched or written. With
        it false the content is written to ``<video_id>.mp4`` in the current
        working directory.

        :param video_url: decoded download address of the video.
        """
        print('正在下載...{}'.format(video_url))
        if not self.DEBUG:
            video = requests.get(video_url).content
            with open('{}.mp4'.format(self.video_id), 'wb') as f:
                f.write(video)

    def start(self, total_page=None):
        """Resolve the category's ``tid`` and process every listed video.

        Links that cannot be decoded are reported and skipped; every other
        link is passed to ``download`` and counted in ``self.total``.

        :param total_page: page number after which to stop (24 videos per
            page); ``None`` means no limit.
        """
        tid = self.tid()
        for encoded in self.ajax(tid, total_page):
            try:
                video = self.decode(encoded)
            except Exception as e:
                # Best effort: one undecodable link must not stop the run.
                print(e, self.video_id, '解密失敗!')
                continue
            self.download(video)
            self.total += 1
if __name__ == '__main__':
    # Really download (debug mode off) the first page of one category.
    downloader = MeiPai()
    downloader.DEBUG = False
    downloader.category_name = '舞蹈'
    downloader.start(total_page=1)  # see the docstring of start()