Handling network exceptions gracefully to make a scraper more stable
Xiaomei is responsible for monitoring her company's e-commerce site data, but she kept running into network timeouts and busy servers. By adding a retry mechanism and error logging, she made her data collection noticeably more reliable.
# Xiaomei's error-handling solution
import requests
import time
import logging
from typing import Optional

class DataMonitor:
    def __init__(self):
        self.session = requests.Session()
        self.max_retries = 3
        self.retry_delay = 1
        # Configure logging
        logging.basicConfig(level=logging.INFO)
        self.logger = logging.getLogger(__name__)

    def fetch_data(self, url: str) -> Optional[dict]:
        """Fetch data with a retry mechanism."""
        for attempt in range(self.max_retries):
            try:
                response = self.session.get(url, timeout=10)
                response.raise_for_status()
                return response.json()
            except requests.exceptions.Timeout:
                self.logger.warning(f"Request timed out, retry {attempt + 1}...")
                time.sleep(self.retry_delay * (2 ** attempt))  # exponential backoff
            except requests.exceptions.HTTPError as e:
                if e.response.status_code == 503:
                    self.logger.warning("Server busy, retrying shortly...")
                    time.sleep(5)
                else:
                    self.logger.error(f"HTTP error: {e}")
                    break
            except requests.exceptions.RequestException as e:
                self.logger.error(f"Network error: {e}")
                break
        self.logger.error("Failed to fetch data")
        return None

# Usage example
monitor = DataMonitor()
data = monitor.fetch_data("https://api.example.com/products")
if data:
    print(f"Fetched {len(data)} product records")
Catch and handle the basic exceptions raised by requests
try:
    response = requests.get(url, timeout=10)
    response.raise_for_status()
except requests.exceptions.Timeout:
    print("Request timed out")
except requests.exceptions.ConnectionError:
    print("Connection failed")
except requests.exceptions.HTTPError as e:
    print(f"HTTP error: {e}")
Run different handling logic depending on the HTTP status code
if response.status_code == 404:
    print("Page not found")
elif response.status_code == 500:
    print("Internal server error")
Create a decorator that adds retry behavior (one possible implementation is sketched below)
@retry(max_attempts=3, delay=2)
def fetch_data(url):
    return requests.get(url)
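The retry decorator used above is not part of requests or the standard library, so it has to be defined somewhere in the project. A minimal sketch of one way to write it, assuming the max_attempts and delay parameters from the usage above and retrying only on requests' network errors:

import functools
import time

import requests

def retry(max_attempts=3, delay=2):
    """Retry the wrapped function when a requests network error occurs."""
    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            for attempt in range(1, max_attempts + 1):
                try:
                    return func(*args, **kwargs)
                except requests.exceptions.RequestException:
                    if attempt == max_attempts:
                        raise  # out of attempts: let the caller see the error
                    time.sleep(delay)
        return wrapper
    return decorator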
Set a reasonable timeout so requests never wait forever
try:
    # (5, 30) means a 5-second connect timeout and a 30-second read timeout
    response = requests.get(url, timeout=(5, 30))
except requests.Timeout:
    print("Request timed out")
Write error logs to a file so they can be analyzed later
import logging
logging.basicConfig(filename='errors.log', level=logging.ERROR)
logger = logging.getLogger(__name__)
logger.error("Network error")
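For a long-running monitor it also helps to add timestamps and keep the log file from growing without bound. A sketch using the standard library's RotatingFileHandler (the file name and size limits are arbitrary example values):

import logging
from logging.handlers import RotatingFileHandler

logger = logging.getLogger("scraper")
logger.setLevel(logging.ERROR)

# Rotate at roughly 1 MB and keep 3 old files
handler = RotatingFileHandler("errors.log", maxBytes=1_000_000, backupCount=3)
handler.setFormatter(logging.Formatter("%(asctime)s %(levelname)s %(message)s"))
logger.addHandler(handler)

logger.error("Network error while fetching product data")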
When the primary data source fails, switch to a backup source (a fuller sketch follows below)
def fetch_with_backup(primary_url, backup_url):
    # Fall back to the backup source when the primary fails
    ...
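A minimal sketch of such a fallback using plain requests calls (in practice the two URLs would point at equivalent primary and backup APIs):

import requests

def fetch_with_backup(primary_url, backup_url):
    """Try the primary source first; fall back to the backup on any network error."""
    for url in (primary_url, backup_url):
        try:
            response = requests.get(url, timeout=10)
            response.raise_for_status()
            return response.json()
        except requests.exceptions.RequestException:
            continue  # this source failed, try the next one
    return None  # both sources failed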
Validate that the returned data is complete and correctly formatted
def validate_data(data):
    # Check that the required fields are present
    required_fields = ['id', 'name']
    return all(field in data for field in required_fields)
After a network interruption, resume the download from where it stopped (see the sketch below)
# Use a Range header to request only the missing bytes
headers = {'Range': 'bytes=1000-2000'}
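A sketch of how resuming could work in practice: request only the bytes that are not yet on disk and append them to the local file. The URL and file name are placeholders, and the server must support Range requests (it signals this by answering 206 Partial Content):

import os
import requests

def resume_download(url, filename):
    """Continue a partial download by requesting only the missing bytes."""
    existing = os.path.getsize(filename) if os.path.exists(filename) else 0
    headers = {'Range': f'bytes={existing}-'} if existing else {}
    with requests.get(url, headers=headers, stream=True, timeout=30) as response:
        response.raise_for_status()
        # 206 means the server honored the Range request; 200 means it resent the whole file
        mode = 'ab' if response.status_code == 206 else 'wb'
        with open(filename, mode) as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)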