Throttle the crawl: a 0.5-second download delay with 32 concurrent requests (see the settings sketch below).
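The note above maps directly onto two standard Scrapy settings. A minimal sketch of the relevant lines in the project's settings.py (the file itself is not shown in the post):

```python
# settings.py -- throttling values taken from the note above
DOWNLOAD_DELAY = 0.5      # wait 0.5 s between requests
CONCURRENT_REQUESTS = 32  # allow up to 32 requests in flight at once
```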
```python
# -*- coding: utf-8 -*-
import json

from scrapy import Spider, Request
from weibouser.items import WeibouserItem


class WeiboSpider(Spider):
    name = 'weibo2'
    allowed_domains = ['weibo.cn']
    id = "5702787827"  # uid of the seed account whose fan list we start from

    # Send the first request
    def start_requests(self):
        # Every page of the fan list shares the same URL except for the uid
        # and the since_id page index, so the request can be built directly.
        yield Request(
            url="https://m.weibo.cn/api/container/getIndex?containerid=231051_-_fans_-_"
                + str(self.id) + "&luicode=10000011&lfid=107603" + str(self.id)
                + "&featurecode=20000320&since_id=" + str(1),
            callback=self.parse
        )

    # Parse the JSON response
    def parse(self, response):
        result = json.loads(response.body.decode("utf-8"))
        try:
            # The fan entries live under data -> cards[0] -> card_group
            result = result["data"]["cards"][0]["card_group"]
        except (KeyError, IndexError):
            print("No data in this response, skipping it")
            return

        for data in result:
            # Create a fresh item per fan; reusing one instance across
            # yields would let later fans overwrite earlier ones
            item = WeibouserItem()
            # Whenever a field declared on the item also exists in the
            # user dict, copy it over -- a quick way to fill every field
            for field in item.fields:
                if field in data["user"]:
                    item[field] = data["user"][field]
                    print(item[field])
            yield item

            # Take the id just extracted and recurse into that user's own
            # fan pages with the same callback; stop at Weibo's default
            # "新手指南" account
            if data["user"]["screen_name"] == '新手指南':
                return
            num = int(data["user"]["followers_count"]) // 20  # 20 fans per page
            print("pages to request for this user:", num)
            if num > 0:  # only recurse for users with more than 20 fans
                if num > 250:  # pages beyond ~250 may no longer be readable
                    num = 250
                for i in range(num):
                    yield Request(
                        url="https://m.weibo.cn/api/container/getIndex?containerid=231051_-_fans_-_"
                            + str(item["id"]) + "&luicode=10000011&lfid=107603" + str(item["id"])
                            + "&featurecode=20000320&since_id=" + str(i),
                        callback=self.parse
                    )
```
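The spider imports WeibouserItem, but the post never shows weibouser/items.py. A minimal sketch, assuming the item declares exactly the user fields the spider actually reads (id, screen_name, followers_count); a real project would likely declare more:

```python
# weibouser/items.py -- hypothetical sketch; only the fields referenced
# by the spider are declared here
from scrapy import Field, Item


class WeibouserItem(Item):
    id = Field()
    screen_name = Field()
    followers_count = Field()
```

With the item in place the spider runs as usual, e.g. `scrapy crawl weibo2 -o fans.json` to dump every yielded item to a JSON file.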