# Scrapy issues GET requests by default; when the target endpoint expects POST,
# override start_requests() and rebuild the initial request with the form
# parameters observed in the browser's network panel.
# When running on Windows, the following error may appear:
"""UnicodeEncodeError: 'gbk' codec can't encode character '\u2764' in position 261: illegal multibyte sequence"""
# Cause: Windows' default standard-output encoding (gbk) cannot represent
# some characters.  Fix: rewrap stdout with a wider encoding —
# gb18030 is a superset of gbk that covers all Unicode code points.
import sys,io
sys.stdout = io.TextIOWrapper(sys.stdout.buffer,encoding='gb18030')
# 可以把字符串變成xpath格式的標(biāo)簽
import json
import re

import scrapy
# Selector can turn a raw string into an xpath-queryable document
from scrapy.selector import Selector
# remove_tags strips HTML tags from text
from w3lib.html import remove_tags
# NOTE(review): ShijijiayuanItem is used in parse() but never imported here —
# add the project import, e.g. `from ..items import ShijijiayuanItem`.
class SjjySpider(scrapy.Spider):
    """Spider for jiayuan.com's member search API.

    The search endpoint only answers POST requests, so ``start_requests``
    is overridden to issue ``FormRequest``s instead of Scrapy's default GETs.
    """
    name = 'sjjy'
    allowed_domains = ['jiayuan.com']
    start_urls = ['http://search.jiayuan.com/v2/search_v2.php']

    def start_requests(self):
        """Issue the initial POST search request for every start URL."""
        form_data = {
            'sex': 'f',
            'key': '',
            'stc': '1: 11, 2: 20.28, 23: 1',
            'sn': 'default',
            'sv': '1',
            'p': '2',  # page number — the API expects it as a string
            'f': 'search',
            'listStyle': 'bigPhoto',
            'pri_uid': '0',
            'jsversion': 'v5'
        }
        for url in self.start_urls:
            # Carry the form data in meta so parse() can build the
            # next-page request; dont_filter because the URL repeats
            # across pages and would otherwise be deduplicated.
            yield scrapy.FormRequest(
                url,
                formdata=form_data,
                meta={'form_data': form_data},
                dont_filter=True)

    def parse(self, response):
        """Yield one item per user in the response and schedule the next page.

        The response body carries JSON sandwiched between ``##jiayser##``
        markers; it is extracted with a regex and parsed with ``json``.
        """
        pattern = re.compile('##jiayser##(.*?)##jiayser##', re.S)
        result = re.findall(pattern, response.text)[0]
        data = json.loads(result)
        for userinfo in data['userInfo']:
            item = ShijijiayuanItem()
            item['uid'] = userinfo['uid']
            # avatar image URL
            item['header_img'] = userinfo['image']
            item['sex'] = userinfo['sex']
            # randTag comes back as HTML; strip the markup
            item['randTag'] = remove_tags(userinfo['randTag'])
            item['age'] = userinfo['age']
            item['height'] = userinfo['height']
            # personal signature
            item['shortnote'] = userinfo['shortnote']
            # work location
            item['workAddress'] = userinfo['work_location']
            # requirements for a partner (field name kept for compatibility)
            item['mathCtion'] = userinfo['matchCondition']
            item['nickname'] = userinfo['nickname']
            yield item
        # Schedule the next page.  Copy the dict before mutating it:
        # responses may be handled while other requests carrying the same
        # meta dict are still in flight, and an in-place update of the
        # shared 'p' value would corrupt their page counter (bug fix).
        form_data = dict(response.meta['form_data'])
        next_page = int(form_data['p']) + 1
        page_total = int(data['pageTotal'])
        # <= so the final page is fetched too (was <, which skipped it).
        if next_page <= page_total:
            form_data['p'] = str(next_page)
            yield scrapy.FormRequest(
                'http://search.jiayuan.com/v2/search_v2.php',
                formdata=form_data,
                meta={'form_data': form_data},
                callback=self.parse)