Implementing data caching in a JSON crawler improves efficiency and reduces the number of requests made to the target site. Here are a few approaches:
1. Use an in-memory caching library such as `cachetools` or `functools.lru_cache`. These libraries provide a simple caching mechanism you can drop straight into a crawler. For example, with `cachetools` (a sketch of the `lru_cache` alternative follows this example):
```python
from cachetools import TTLCache
import requests

# Cache up to 100 entries, each valid for 3600 seconds
cache = TTLCache(maxsize=100, ttl=3600)

def get_data(url):
    if url in cache:
        return cache[url]
    response = requests.get(url)
    data = response.json()
    cache[url] = data
    return data
```
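As an alternative, the standard-library `functools.lru_cache` mentioned above can wrap the fetch function directly. A minimal sketch (the function name `get_data_cached` is just illustrative; note that `lru_cache` has only a size limit, not time-based expiry):

```python
from functools import lru_cache
import requests

@lru_cache(maxsize=100)  # keep at most 100 distinct URLs in memory
def get_data_cached(url):
    # Repeated calls with the same URL return the memoized result
    # without issuing another HTTP request.
    response = requests.get(url)
    return response.json()
```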
2. Use file-based caching. For example, save each response to a JSON file:
```python
import json
import os
import hashlib
import requests

def save_data(data, file_path):
    with open(file_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)

def load_data(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        return json.load(f)

def get_data(url):
    # Hash the URL so the cache filename contains no characters
    # that are invalid in file paths (e.g. '/' or ':').
    file_path = f"{hashlib.md5(url.encode()).hexdigest()}.json"
    if os.path.exists(file_path):
        return load_data(file_path)
    response = requests.get(url)
    data = response.json()
    save_data(data, file_path)
    return data
```
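The file cache above never expires on its own. If stale entries are a concern, one option (a sketch, assuming a `max_age` threshold and helper name of your choosing) is to compare the file's modification time against a maximum age, and call `is_fresh(file_path)` instead of the bare `os.path.exists` check in `get_data`:

```python
import os
import time

def is_fresh(file_path, max_age=3600):
    """Return True if the cached file exists and is younger than max_age seconds."""
    return (os.path.exists(file_path)
            and time.time() - os.path.getmtime(file_path) < max_age)
```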
3. Use a database cache. For example, store the data in a SQLite database:
```python
import json
import sqlite3
import requests

def create_table():
    conn = sqlite3.connect("data.db")
    cursor = conn.cursor()
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS data (
            url TEXT PRIMARY KEY,
            data TEXT
        )
    """)
    conn.commit()
    conn.close()

def save_data(url, data):
    conn = sqlite3.connect("data.db")
    cursor = conn.cursor()
    # Upsert: overwrite any previously cached response for the same URL
    cursor.execute("INSERT OR REPLACE INTO data (url, data) VALUES (?, ?)",
                   (url, json.dumps(data)))
    conn.commit()
    conn.close()

def load_data(url):
    conn = sqlite3.connect("data.db")
    cursor = conn.cursor()
    cursor.execute("SELECT data FROM data WHERE url = ?", (url,))
    row = cursor.fetchone()
    conn.close()
    return row[0] if row else None

def get_data(url):
    cached = load_data(url)
    if cached:
        return json.loads(cached)
    response = requests.get(url)
    data = response.json()
    save_data(url, data)
    return data
```
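A minimal usage sketch for the SQLite variant (the URL is a placeholder); the table must be created once before the first fetch:

```python
create_table()  # one-time setup: creates data.db and the data table if missing
data = get_data("https://example.com/api/items.json")  # placeholder URL
print(data)
```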
Any of these approaches can serve as the cache layer of a JSON crawler. Choose whichever fits your needs, for example an in-memory cache for a single run, or a file or SQLite cache when results should persist between runs.