# 怎么使用Python提取Chrome浏览器的历史记录及收藏夹
## 引言
在日常使用浏览器时,我们会积累大量的历史记录和收藏夹数据。这些数据不仅反映了我们的浏览习惯,还可能包含重要的工作或学习资料。本文将详细介绍如何使用Python提取Chrome浏览器的历史记录和收藏夹数据,并对其进行处理和分析。
## 准备工作
### 1. 安装必要的Python库
我们需要安装以下几个Python库:
```bash
# 注意:sqlite3 是 Python 标准库自带模块,无需安装
pip install pandas
pip install pywin32 # Windows系统需要
pip install pyobjc # macOS系统需要
Chrome浏览器的历史记录存储在SQLite数据库中,收藏夹则是JSON文件,不同操作系统下的存储位置如下:
- Windows:C:\Users\<用户名>\AppData\Local\Google\Chrome\User Data\Default
- macOS:/Users/<用户名>/Library/Application Support/Google/Chrome/Default
- Linux:/home/<用户名>/.config/google-chrome/Default
主要关注以下两个文件:
- History:包含浏览历史记录的SQLite数据库
- Bookmarks:收藏夹数据(JSON格式)
Chrome的历史记录存储在SQLite数据库中,我们可以使用Python的 sqlite3 模块来访问:
import sqlite3
import pandas as pd
# Path to Chrome's History database (replace <用户名> with the actual user name)
history_path = "C:\\Users\\<用户名>\\AppData\\Local\\Google\\Chrome\\User Data\\Default\\History"
# Work on a copy: Chrome keeps the live file locked while the browser is
# running, and querying it directly fails with "database is locked"
import shutil
import os
temp_path = "temp_history"
shutil.copy2(history_path, temp_path)
# Connect to the copied SQLite database
conn = sqlite3.connect(temp_path)
cursor = conn.cursor()
历史记录主要存储在 urls 表中:
# Query every stored URL, newest first (last_visit_time is Chrome's WebKit
# epoch: microseconds since 1601-01-01 UTC, converted later)
cursor.execute("SELECT title, url, last_visit_time FROM urls ORDER BY last_visit_time DESC")
# Fetch all rows at once
history = cursor.fetchall()
# Wrap the rows in a DataFrame (column labels are user-facing, in Chinese)
history_df = pd.DataFrame(history, columns=["标题", "URL", "最后访问时间"])
# Close the connection before removing the temporary copy
conn.close()
os.remove(temp_path)  # delete the temporary file
Chrome使用特殊的时间格式(1601年1月1日以来的微秒数),需要转换:
def chrome_time_to_datetime(chrometime):
    """Convert a Chrome/WebKit timestamp to a pandas Timestamp.

    Chrome stores times as microseconds since 1601-01-01 UTC; the Unix
    epoch (1970-01-01) is exactly 11644473600 seconds later.

    Args:
        chrometime: Timestamp in Chrome's format (microseconds, int/float).

    Returns:
        pd.Timestamp, or None when the value is missing (NaN or 0 —
        Chrome uses 0 for "never visited"), instead of a bogus 1601 date.
    """
    if pd.isna(chrometime) or chrometime == 0:
        return None
    return pd.to_datetime(chrometime / 1e6 - 11644473600, unit='s')
# Replace the raw Chrome timestamps with a readable datetime column
history_df["访问时间"] = history_df["最后访问时间"].apply(chrome_time_to_datetime)
history_df.drop("最后访问时间", axis=1, inplace=True)
# utf-8-sig adds a BOM so Excel opens the CSV with correct Chinese characters
history_df.to_csv("chrome_history.csv", index=False, encoding='utf-8-sig')
Chrome的收藏夹存储为JSON格式:
import json
# Bookmarks is a plain JSON file (no SQLite involved); it is not locked,
# so it can be read in place
bookmarks_path = "C:\\Users\\<用户名>\\AppData\\Local\\Google\\Chrome\\User Data\\Default\\Bookmarks"
with open(bookmarks_path, "r", encoding="utf-8") as f:
    bookmarks_data = json.load(f)
收藏夹采用树形结构存储,我们需要递归遍历:
def extract_bookmarks(bookmark_item, folder_path=""):
    """Recursively flatten Chrome's bookmark tree into a list of dicts.

    Args:
        bookmark_item: A node from the Bookmarks JSON — either a folder
            (has a "children" list) or a leaf bookmark (has "url").
        folder_path: Slash-separated path of the ancestor folders.

    Returns:
        list[dict] with keys 标题 / URL / 文件夹路径 / 添加日期.
    """
    bookmarks = []
    if "children" in bookmark_item:
        # Folder node: descend, extending the path without a leading "/"
        # for top-level folders (original produced "/Bookmarks bar")
        name = bookmark_item["name"]
        new_folder = f"{folder_path}/{name}" if folder_path else name
        for child in bookmark_item["children"]:
            bookmarks.extend(extract_bookmarks(child, new_folder))
    elif "url" in bookmark_item:
        # Leaf bookmark. "date_added" is stored as a *string* of microseconds
        # since 1601-01-01 in the JSON, so it must go through int() before
        # any arithmetic (dividing the string raised TypeError before).
        raw = bookmark_item.get("date_added")
        micros = int(raw) if raw is not None else 0
        bookmarks.append({
            "标题": bookmark_item["name"],
            "URL": bookmark_item["url"],
            "文件夹路径": folder_path,
            "添加日期": (pd.to_datetime(micros / 1e6 - 11644473600, unit='s')
                         if micros else None),
        })
    return bookmarks
# Walk each of the three top-level roots Chrome maintains: the bookmarks
# bar, "Other bookmarks", and mobile/synced bookmarks
all_bookmarks = []
for root in ["bookmark_bar", "other", "synced"]:
    if root in bookmarks_data["roots"]:
        all_bookmarks.extend(extract_bookmarks(bookmarks_data["roots"][root]))
bookmarks_df = pd.DataFrame(all_bookmarks)
# utf-8-sig keeps Chinese text readable when the CSV is opened in Excel
bookmarks_df.to_csv("chrome_bookmarks.csv", index=False, encoding='utf-8-sig')
# Analyze browsing habits by hour of day
history_df["小时"] = history_df["访问时间"].dt.hour
hourly_activity = history_df["小时"].value_counts().sort_index()
# Bar chart of visit counts per hour (labels stay in Chinese for the article)
import matplotlib.pyplot as plt
hourly_activity.plot(kind="bar")
plt.title("按小时分布的浏览活动")
plt.xlabel("小时")
plt.ylabel("访问次数")
plt.savefig("hourly_activity.png")
# 提取域名
import urllib.parse
def extract_domain(url):
    """Return the network location (domain) of *url*.

    Args:
        url: URL string; None/NaN values coming from the DataFrame are
            tolerated and mapped to None.

    Returns:
        The domain part of the URL, or None when the value is not a
        string or cannot be parsed.
    """
    # Non-str values (None/NaN from pandas) would otherwise be routed
    # through urlparse's bytes path and yield b'' instead of None
    if not isinstance(url, str):
        return None
    try:
        return urllib.parse.urlparse(url).netloc
    except ValueError:
        # Narrow except instead of the original bare one: urlparse raises
        # ValueError for malformed URLs (e.g. an unclosed IPv6 bracket)
        return None
# Map every history row to its domain and chart the 20 most-visited sites
history_df["域名"] = history_df["URL"].apply(extract_domain)
top_domains = history_df["域名"].value_counts().head(20)
top_domains.plot(kind="barh")
plt.title("访问最频繁的网站")
plt.savefig("top_domains.png")
# Find URLs bookmarked more than once (keep=False marks every duplicate
# row, not just the later occurrences)
duplicate_urls = bookmarks_df[bookmarks_df.duplicated("URL", keep=False)]
duplicate_urls.to_csv("duplicate_bookmarks.csv", index=False)
import platform
def get_chrome_data_path():
    """Return Chrome's default profile directory for the current OS.

    Returns:
        Absolute path of the "Default" profile directory.

    Raises:
        Exception: If the OS is not Windows, macOS, or Linux.
    """
    system = platform.system()
    # expanduser("~") is more robust than os.getlogin(), which raises
    # OSError when there is no controlling terminal (cron, CI, services)
    home = os.path.expanduser("~")
    if system == "Windows":
        # Prefer %LOCALAPPDATA% so relocated profiles still resolve;
        # fall back to the conventional location under the home directory
        local = os.environ.get("LOCALAPPDATA",
                               os.path.join(home, "AppData", "Local"))
        return os.path.join(local, "Google", "Chrome", "User Data", "Default")
    elif system == "Darwin":  # macOS
        return os.path.join(home, "Library", "Application Support",
                            "Google", "Chrome", "Default")
    elif system == "Linux":
        return os.path.join(home, ".config", "google-chrome", "Default")
    else:
        raise Exception("Unsupported operating system")
def get_chrome_history():
    """Return a sqlite3 connection to a temporary copy of Chrome's History DB.

    sqlite3.connect() opens the file lazily, so connecting to the live
    (locked) database never raises OperationalError at connect time — the
    original try/except branch could not fire until the first query.
    Always copying first avoids the lock entirely.

    Returns:
        sqlite3.Connection to a private temporary copy.  The caller is
        responsible for closing it (and may unlink the copy afterwards).
    """
    import tempfile  # local import: only needed here
    data_path = get_chrome_data_path()
    history_path = os.path.join(data_path, "History")
    # Unique temp file instead of a fixed "temp_history" in the CWD,
    # so concurrent runs cannot clobber each other's copies
    fd, temp_path = tempfile.mkstemp(prefix="chrome_history_")
    os.close(fd)
    shutil.copy2(history_path, temp_path)
    return sqlite3.connect(temp_path)
# chrome_data_extractor.py
import sqlite3
import pandas as pd
import json
import os
import shutil
import urllib.parse
import platform
from datetime import datetime
class ChromeDataExtractor:
def __init__(self):
self.data_path = self._get_chrome_data_path()
def _get_chrome_data_path(self):
system = platform.system()
username = os.getlogin()
if system == "Windows":
return f"C:\\Users\\{username}\\AppData\\Local\\Google\\Chrome\\User Data\\Default"
elif system == "Darwin": # macOS
return f"/Users/{username}/Library/Application Support/Google/Chrome/Default"
elif system == "Linux":
return f"/home/{username}/.config/google-chrome/Default"
else:
raise Exception("Unsupported operating system")
def _chrome_time_to_datetime(self, chrometime):
if pd.isna(chrometime) or chrometime == 0:
return None
return datetime(1601, 1, 1) + pd.Timedelta(microseconds=chrometime)
def get_history(self, save_to_csv=True):
history_path = os.path.join(self.data_path, "History")
temp_path = "temp_history"
try:
shutil.copy2(history_path, temp_path)
conn = sqlite3.connect(temp_path)
cursor = conn.cursor()
cursor.execute("""
SELECT urls.title, urls.url, urls.visit_count,
urls.last_visit_time, visits.visit_time, visits.from_visit
FROM urls
LEFT JOIN visits ON urls.id = visits.url
ORDER BY visits.visit_time DESC
""")
history = cursor.fetchall()
history_df = pd.DataFrame(history, columns=[
"标题", "URL", "访问次数", "最后访问时间", "访问时间", "来源访问"
])
# 转换时间格式
history_df["访问时间"] = history_df["访问时间"].apply(self._chrome_time_to_datetime)
history_df["最后访问时间"] = history_df["最后访问时间"].apply(self._chrome_time_to_datetime)
# 提取域名
history_df["域名"] = history_df["URL"].apply(
lambda x: urllib.parse.urlparse(x).netloc if pd.notna(x) else None
)
if save_to_csv:
history_df.to_csv("chrome_history.csv", index=False, encoding='utf-8-sig')
return history_df
finally:
conn.close()
if os.path.exists(temp_path):
os.remove(temp_path)
def get_bookmarks(self, save_to_csv=True):
bookmarks_path = os.path.join(self.data_path, "Bookmarks")
with open(bookmarks_path, "r", encoding="utf-8") as f:
bookmarks_data = json.load(f)
def extract_bookmarks(bookmark_item, folder_path=""):
bookmarks = []
if "children" in bookmark_item:
new_folder = f"{folder_path}/{bookmark_item['name']}" if folder_path else bookmark_item['name']
for child in bookmark_item["children"]:
bookmarks.extend(extract_bookmarks(child, new_folder))
elif "url" in bookmark_item:
bookmarks.append({
"标题": bookmark_item["name"],
"URL": bookmark_item["url"],
"文件夹路径": folder_path,
"添加日期": self._chrome_time_to_datetime(bookmark_item.get("date_added"))
})
return bookmarks
all_bookmarks = []
for root in ["bookmark_bar", "other", "synced"]:
if root in bookmarks_data["roots"]:
all_bookmarks.extend(extract_bookmarks(bookmarks_data["roots"][root]))
bookmarks_df = pd.DataFrame(all_bookmarks)
if save_to_csv:
bookmarks_df.to_csv("chrome_bookmarks.csv", index=False, encoding='utf-8-sig')
return bookmarks_df
if __name__ == "__main__":
    # Simple CLI driver: dump history and bookmarks to CSV files in the
    # current directory and report the row counts
    extractor = ChromeDataExtractor()
    print("正在提取Chrome历史记录...")
    history_df = extractor.get_history()
    print(f"提取到 {len(history_df)} 条历史记录")
    print("\n正在提取Chrome收藏夹...")
    bookmarks_df = extractor.get_bookmarks()
    print(f"提取到 {len(bookmarks_df)} 个收藏项")
    print("\n操作完成!")
通过本文介绍的方法,你可以轻松地使用Python提取和分析Chrome浏览器的历史记录和收藏夹数据。这些技术可以应用于个人数据分析、工作自动化等多个场景。记得在使用这些数据时要尊重隐私,合理合法地使用这些信息。
```
亿速云「云服务器」,即开即用、新一代英特尔至强铂金CPU、三副本存储NVMe SSD云盘,价格低至29元/月。点击查看>>
免责声明:本站发布的内容(图片、视频和文字)以原创、转载和分享为主,文章观点不代表本网站立场,如果涉及侵权请联系站长邮箱:is@yisu.com进行举报,并提供相关证据,一经查实,将立刻删除涉嫌侵权内容。