The workflow is as follows:
#1 Set up the MongoDB dependency and configuration
#2 Simulate the request that searches for "街拍" (street-snap) content
#3 Extract the article URLs from the JSON that the request returns and crawl them
#4 Crawl each article page and extract the image URLs it contains
#5 Collect the image URLs, write the scraped data to MongoDB, and download each image as a binary stream, using an MD5 hash of the content to detect duplicate files (a minimal sketch of this dedup idea follows this list)
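The dedup trick in step #5 is simply to name each file after the MD5 digest of its bytes, so two identical downloads collapse onto the same path and the second one is skipped. A minimal sketch of the idea (save_dir is a hypothetical parameter, not part of the original code):

import os
from hashlib import md5

def save_unique(content, save_dir='.'):  # save_dir is a placeholder for illustration
    # identical bytes -> identical digest -> identical path, so duplicates are skipped
    file_path = os.path.join(save_dir, md5(content).hexdigest() + '.jpg')
    if not os.path.exists(file_path):
        with open(file_path, 'wb') as f:
            f.write(content)
    return file_path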
1. The configuration file is as follows:
config.py
MONGO_URL = 'localhost:27017'
MONGO_DB = 'toutiao'
MONGO_TABLE = 'toutiao1'

GROUP_START = 0
GROUP_END = 19
KEYWORD = '街拍'
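GROUP_START and GROUP_END are inclusive bounds on the page groups to fetch; each group advances the offset query parameter by 20, exactly as main() does in the spider below. A quick sketch of the offsets these settings produce:

from config import GROUP_START, GROUP_END

offsets = [x * 20 for x in range(GROUP_START, GROUP_END + 1)]
print(offsets)  # [0, 20, 40, ..., 380] -- twenty index pages in total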
2. The spider code is as follows:
import json
import os
import re
from urllib.parse import urlencode
from hashlib import md5  # MD5 is used to fingerprint image content for dedup
from multiprocessing import Pool

import pymongo
import requests
from requests.exceptions import RequestException
from bs4 import BeautifulSoup as bs

from config import *

# MongoDB connection
client = pymongo.MongoClient(MONGO_URL)
db = client[MONGO_DB]

# save a result document to MongoDB
def save_to_mongo(result):
    if db[MONGO_TABLE].insert_one(result):  # insert_one replaces the deprecated insert()
        print('Saved to MongoDB', result)
        return True
    return False

# 01
def get_page_index(offset, keyword):  # fetch the JSON of one index (search-result) page
    data = {
        'offset': offset,
        'format': 'json',
        'keyword': keyword,
        'autoload': 'true',
        'count': 20,
        'cur_tab': 3
    }
    url = 'https://www.toutiao.com/search_content/?' + urlencode(data)  # turn the dict into query parameters
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        print('Error requesting the index page')
        return None

# 02
def parse_page_index(html):  # yield the article URLs contained in the index-page JSON
    data = json.loads(html)  # parse the JSON string into a dict
    if 'data' in data.keys():
        for item in data.get('data'):
            yield item.get('article_url')

# 03
def get_page_detail(url):  # fetch a detail (article) page
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        print('Error requesting the detail page', url)
        return None

# 04
def parse_page_detail(html, url):  # extract the title and image URLs from a detail page
    soup = bs(html, 'lxml')
    title = soup.title.string

    match = re.search(r'gallery:.*?parse\("(.*?)"\),', html, re.S)  # the r prefix keeps backslashes literal
    if not match:  # pages without a gallery have nothing to extract
        return None
    image_pattern = re.sub(r'\\', '', match.group(1))
    try:
        data = json.loads(image_pattern)
    except json.JSONDecodeError:  # some pages carry broken JSON that needs repairing
        image_pattern = "{ " + re.search(r'("sub_images":\[\{.*?\}\]),"max', image_pattern).group(1) + "}"
        data = json.loads(image_pattern)

    if data and 'sub_images' in data.keys():
        sub_images = data.get('sub_images')
        images = [item.get('url') for item in sub_images]
        for img in images:
            download_img(img)
        return {
            'title': title,
            'url': url,
            'images': images
        }

# 05
def download_img(url):  # download an image as a binary stream
    print('Downloading', url)
    try:
        response = requests.get(url)
        if response.status_code == 200:
            save_img(response.content)  # save the binary stream as an image file
        return None
    except RequestException:
        print('Error requesting the image', url)
        return None

# 06
def save_img(content):
    os.makedirs('d:/123', exist_ok=True)  # make sure the target directory exists
    file_path = '{0}/{1}.{2}'.format('d:/123', md5(content).hexdigest(), 'jpg')
    if not os.path.exists(file_path):  # identical content hashes to the same name, so duplicates are skipped
        with open(file_path, 'wb') as f:
            f.write(content)

def main(offset):
    html = get_page_index(offset, KEYWORD)
    if not html:  # the index request failed
        return
    for url in parse_page_index(html):
        detail_html = get_page_detail(url)
        if not detail_html:
            continue
        result = parse_page_detail(detail_html, url)
        if result:  # skip pages that yielded no gallery
            save_to_mongo(result)

if __name__ == '__main__':
    group = [x * 20 for x in range(GROUP_START, GROUP_END + 1)]
    pool = Pool()
    pool.map(main, group)
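After a run, a quick way to spot-check what landed in the database is to read a few documents back; the field names below match the dict returned by parse_page_detail, and a local mongod is assumed to be running:

import pymongo
from config import MONGO_URL, MONGO_DB, MONGO_TABLE

client = pymongo.MongoClient(MONGO_URL)
for doc in client[MONGO_DB][MONGO_TABLE].find().limit(3):
    print(doc['title'], '-', len(doc['images']), 'images')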