Source Code
import json
import os.path
import time

import jsonpath
import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.114 Safari/537.36',
    'referer': 'https://www.zhihu.com/question/26297181',
    # Paste your own Zhihu cookie string here (copy it from the browser's developer tools)
    'cookie': ''
}
# Send a GET request and return the raw JSON response
def get_data(url):
    resp = requests.get(url=url, headers=headers)
    # Return the response body (JSON) as bytes
    return resp.content
# Parse the JSON payload: collect image URLs per author plus the next-page URL
def parse_data(content):
    data = json.loads(content)
    # All answer objects on this page
    answer_list = jsonpath.jsonpath(data, '$..data')
    # URL of the next page of results
    answer_next = jsonpath.jsonpath(data, '$..paging..next')
    # jsonpath returns False on a miss, so stop cleanly when there is nothing to parse
    if not answer_list or not answer_next:
        return {}, None
    answer_next = answer_next[0]
    # Map each author's name to a list of image src URLs
    answer_dict = {}
    for answer in answer_list[0]:
        # Author name of this answer
        answer_name = jsonpath.jsonpath(answer, '$..author..name')[0]
        # HTML body of the answer; str() flattens the one-element jsonpath result so it can be searched as text
        answer_content = str(jsonpath.jsonpath(answer, '$..target..content'))
        # Collected src paths for this answer
        src_list = []
        # Walk through the content, pulling out one image URL per pass
        while True:
            # Start of the src: just past the data-actualsrc=" attribute (16 chars)
            start_index = answer_content.find('data-actualsrc="') + 16
            # End of the src: the ?source= query-string marker
            end_index = answer_content.find('?source=1940ef5c')
            # find() returns -1 when the end marker is absent, so stop scanning then
            if end_index == -1:
                break
            # The real image URL sits between the two markers
            src = answer_content[start_index:end_index]
            # Skip duplicates, _r variants, and obviously broken matches
            if src not in src_list and '_r' not in src and 0 < len(src) < 100:
                # Drop the _720w suffix to get the full-size image
                src = src.replace('_720w', '')
                src_list.append(src)
            # Skip past the end marker and its closing quote, then keep scanning
            answer_content = answer_content[end_index + 17:]
        answer_dict[answer_name] = src_list
    # Return the per-author src lists and the next-page API URL
    return answer_dict, answer_next
# Download every collected image, one folder per author
def download_img(answer_dict, save_path):
    # Iterate over the dict of author name -> src list
    for name, src_list in answer_dict.items():
        # Folder for this author's images
        path = save_path + f'{name}'
        for i, src in enumerate(src_list):
            # Fetch the image bytes
            img_resp = requests.get(src, headers=headers)
            # Create the folder on first use
            if not os.path.exists(path):
                os.makedirs(path)
            # Name files with timestamp plus index so same-second downloads don't overwrite each other
            with open(f'{path}/{int(time.time())}_{i}.jpg', 'wb') as img_f:
                img_f.write(img_resp.content)
if __name__ == '__main__':
    # Starting API URL: the first page of answers for question 26297181
    url = 'https://www.zhihu.com/api/v4/questions/26297181/feeds?cursor=e9d89ca6466891518544484e84026815&include=data%5B%2A%5D.is_normal%2Cadmin_closed_comment%2Creward_info%2Cis_collapsed%2Cannotation_action%2Cannotation_detail%2Ccollapse_reason%2Cis_sticky%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cattachment%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Ccreated_time%2Cupdated_time%2Creview_info%2Crelevant_info%2Cquestion%2Cexcerpt%2Cis_labeled%2Cpaid_info%2Cpaid_info_content%2Creaction_instruction%2Crelationship.is_authorized%2Cis_author%2Cvoting%2Cis_thanked%2Cis_nothelp%2Cis_recognized%3Bdata%5B%2A%5D.mark_infos%5B%2A%5D.url%3Bdata%5B%2A%5D.author.follower_count%2Cvip_info%2Cbadge%5B%2A%5D.topics%3Bdata%5B%2A%5D.settings.table_of_content.enabled&limit=5&offset=0&order=default&platform=desktop&session_id=1657444499711409954'
    save_path = input('Enter the save path, ending with /: ')
    # Keep prompting until the path really ends with /
    while not save_path.endswith('/'):
        save_path = input('The path did not end with /, please re-enter: ')
    while True:
        try:
            # Fetch one page of JSON data
            content = get_data(url=url)
        except Exception:
            # Any request failure ends the crawl
            break
        # Parse out the image URLs and the next-page URL
        answer_dict, url = parse_data(content=content)
        # Download this page's images
        download_img(answer_dict, save_path)
        # Stop once there is no further page
        if url is None:
            break
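A note for readers new to jsonpath: the jsonpath package used here exposes a single function, jsonpath.jsonpath(obj, expr), where $..key performs a recursive-descent search. It returns a list of matches on success and False (not an empty list) on a miss, which is why the parsing code checks the result before indexing into it. A minimal sketch with made-up sample data:

import jsonpath

sample = {'data': [{'target': {'author': {'name': 'Alice'}}}]}
# '$..name' finds every 'name' key at any depth and returns the matches as a list
print(jsonpath.jsonpath(sample, '$..name'))     # ['Alice']
# A pattern with no match returns False instead of []
print(jsonpath.jsonpath(sample, '$..missing'))  # False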
This implements a simple image crawler for a Zhihu question. Simple as it is, it covers basically all the fundamentals of web scraping: sending requests, parsing JSON, extracting URLs, and saving files. Beginners can learn from it, since simple image scraping is well within reach.
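As a possible refinement, the find()-and-slice loop in parse_data could be replaced with a regular expression that captures every data-actualsrc value in one pass. The sketch below assumes the same attribute layout the original code relies on; extract_srcs is a hypothetical helper, not part of the script above:

import re

# Capture the URL inside data-actualsrc="...", stopping at the closing quote or query string
SRC_PATTERN = re.compile(r'data-actualsrc="([^"?]+)')

def extract_srcs(html):
    srcs = []
    for src in SRC_PATTERN.findall(html):
        # Same filters as the original loop: no duplicates, no _r variants, sane length
        if src not in srcs and '_r' not in src and 0 < len(src) < 100:
            srcs.append(src.replace('_720w', ''))
    return srcs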