In Python, multithreaded crawling combined with data storage can be implemented with the following steps:
import queue
import sqlite3
import threading

import requests
from bs4 import BeautifulSoup

# A single lock serialises database writes coming from the worker threads.
db_lock = threading.Lock()
def create_connection():
    # check_same_thread=False lets this connection be used from worker threads;
    # all writes are guarded by db_lock.
    conn = sqlite3.connect('data.db', check_same_thread=False)
    return conn
def create_table(conn):
    cursor = conn.cursor()
    cursor.execute('''CREATE TABLE IF NOT EXISTS web_data (
                          id INTEGER PRIMARY KEY AUTOINCREMENT,
                          url TEXT NOT NULL,
                          title TEXT NOT NULL,
                          content TEXT NOT NULL
                      )''')
    conn.commit()
def process_data(url, title, content):
    # Clean and parse the raw data here as needed.
    return {
        'url': url,
        'title': title,
        'content': content
    }
def save_data(conn, data):
    # Serialise writes: a shared sqlite3 connection must not be used
    # by several threads at the same time.
    with db_lock:
        cursor = conn.cursor()
        cursor.execute('''INSERT INTO web_data (url, title, content)
                          VALUES (?, ?, ?)''',
                       (data['url'], data['title'], data['content']))
        conn.commit()
def crawl(url, conn):
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        # Use the page's <title> if present, otherwise fall back to the URL.
        title = soup.title.string.strip() if soup.title and soup.title.string else url
        content = soup.get_text()
        data = process_data(url, title, content)
        save_data(conn, data)
    except Exception as e:
        print(f"Error while processing {url}: {e}")
def start_threads(urls, num_threads):
    conn = create_connection()
    create_table(conn)
    # Queue the URLs so each one is crawled exactly once, however many workers run.
    url_queue = queue.Queue()
    for url in urls:
        url_queue.put(url)

    def worker():
        while True:
            try:
                url = url_queue.get_nowait()
            except queue.Empty:
                break
            crawl(url, conn)

    threads = []
    for _ in range(num_threads):
        thread = threading.Thread(target=worker)
        threads.append(thread)
        thread.start()
    for thread in threads:
        thread.join()
    conn.close()
urls = [
'https://example.com/page1',
'https://example.com/page2',
# ...
]
num_threads = 10
start_threads(urls, num_threads)
This example stores the data in an SQLite database. You can swap in another database such as MySQL or PostgreSQL, and adjust the data-processing and storage logic to suit your needs; a rough PostgreSQL variant of the storage helpers is sketched below.
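As a minimal sketch of what that swap might look like, here are the three storage helpers rewritten for PostgreSQL using the psycopg2 driver. The database name, user, password, and host below are placeholder assumptions, and db_lock refers to the lock defined earlier; the crawling and threading code can stay unchanged.

import psycopg2

def create_connection():
    # Placeholder credentials; adjust to your own PostgreSQL setup.
    return psycopg2.connect(dbname='crawler', user='crawler',
                            password='secret', host='localhost')

def create_table(conn):
    cursor = conn.cursor()
    # SERIAL replaces SQLite's AUTOINCREMENT in PostgreSQL.
    cursor.execute('''CREATE TABLE IF NOT EXISTS web_data (
                          id SERIAL PRIMARY KEY,
                          url TEXT NOT NULL,
                          title TEXT NOT NULL,
                          content TEXT NOT NULL
                      )''')
    conn.commit()

def save_data(conn, data):
    # Keep using db_lock so concurrent inserts from the worker threads stay serialised.
    with db_lock:
        cursor = conn.cursor()
        cursor.execute('INSERT INTO web_data (url, title, content) VALUES (%s, %s, %s)',
                       (data['url'], data['title'], data['content']))
        conn.commit()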