diff --git a/.gitignore b/.gitignore
deleted file mode 100644
index c6bba591..00000000
--- a/.gitignore
+++ /dev/null
@@ -1,130 +0,0 @@
-# Logs
-logs
-*.log
-npm-debug.log*
-yarn-debug.log*
-yarn-error.log*
-lerna-debug.log*
-.pnpm-debug.log*
-
-# Diagnostic reports (https://nodejs.org/api/report.html)
-report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json
-
-# Runtime data
-pids
-*.pid
-*.seed
-*.pid.lock
-
-# Directory for instrumented libs generated by jscoverage/JSCover
-lib-cov
-
-# Coverage directory used by tools like istanbul
-coverage
-*.lcov
-
-# nyc test coverage
-.nyc_output
-
-# Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files)
-.grunt
-
-# Bower dependency directory (https://bower.io/)
-bower_components
-
-# node-waf configuration
-.lock-wscript
-
-# Compiled binary addons (https://nodejs.org/api/addons.html)
-build/Release
-
-# Dependency directories
-node_modules/
-jspm_packages/
-
-# Snowpack dependency directory (https://snowpack.dev/)
-web_modules/
-
-# TypeScript cache
-*.tsbuildinfo
-
-# Optional npm cache directory
-.npm
-
-# Optional eslint cache
-.eslintcache
-
-# Optional stylelint cache
-.stylelintcache
-
-# Microbundle cache
-.rpt2_cache/
-.rts2_cache_cjs/
-.rts2_cache_es/
-.rts2_cache_umd/
-
-# Optional REPL history
-.node_repl_history
-
-# Output of 'npm pack'
-*.tgz
-
-# Yarn Integrity file
-.yarn-integrity
-
-# dotenv environment variable files
-.env
-.env.development.local
-.env.test.local
-.env.production.local
-.env.local
-
-# parcel-bundler cache (https://parceljs.org/)
-.cache
-.parcel-cache
-
-# Next.js build output
-.next
-out
-
-# Nuxt.js build / generate output
-.nuxt
-dist
-
-# Gatsby files
-.cache/
-# Comment in the public line in if your project uses Gatsby and not Next.js
-# https://nextjs.org/blog/next-9-1#public-directory-support
-# public
-
-# vuepress build output
-.vuepress/dist
-
-# vuepress v2.x temp and cache directory
-.temp
-.cache
-
-# Docusaurus cache and generated files
-.docusaurus
-
-# Serverless directories
-.serverless/
-
-# FuseBox cache
-.fusebox/
-
-# DynamoDB Local files
-.dynamodb/
-
-# TernJS port file
-.tern-port
-
-# Stores VSCode versions used for testing VSCode extensions
-.vscode-test
-
-# yarn v2
-.yarn/cache
-.yarn/unplugged
-.yarn/build-state.yml
-.yarn/install-state.gz
-.pnp.*
diff --git a/LICENSE b/LICENSE
deleted file mode 100644
index 92936038..00000000
--- a/LICENSE
+++ /dev/null
@@ -1,21 +0,0 @@
-MIT License
-
-Copyright (c) 2024 Pintree.io
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
diff --git a/README.md b/README.md
deleted file mode 100644
index dbe1eaa4..00000000
--- a/README.md
+++ /dev/null
@@ -1,2 +0,0 @@
-# pintree
-An intuitive tool for managing and exporting your local bookmarks into a shareable website.
diff --git a/convert.py b/convert.py
new file mode 100644
index 00000000..83f9615f
--- /dev/null
+++ b/convert.py
@@ -0,0 +1,387 @@
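+"""Convert an exported browser bookmarks HTML file into a categorized JSON
+bookmark tree (folders of links), using the DeepSeek chat API to pick a
+category for each bookmark.
+
+Example usage: python convert.py --input_file bookmarks.html --output_dir ./
+"""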
+import os
+import json
+import time
+import asyncio
+import aiohttp
+import logging
+import argparse
+from bs4 import BeautifulSoup
+from urllib.parse import urlparse
+from collections import defaultdict
+import hashlib
+# Configure logging
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+logger = logging.getLogger(__name__)
+
+# Default paths
+DEFAULT_INPUT_FILE = "c:\\Users\\smartisan\\Desktop\\bookmarks.html"
+DEFAULT_OUTPUT_DIR = "c:\\Users\\smartisan\\Desktop\\"
+
+class PerformanceMonitor:
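+    """Tracks per-request latency and success/failure counts for API calls."""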
+ def __init__(self):
+ self.request_times = []
+ self.success_count = 0
+ self.total_requests = 0
+
+    def add_request(self, elapsed, success):
+        self.request_times.append(elapsed)
+ self.total_requests += 1
+ if success:
+ self.success_count += 1
+
+ def get_stats(self):
+ avg_time = sum(self.request_times) / len(self.request_times) if self.request_times else 0
+ success_rate = self.success_count / self.total_requests if self.total_requests > 0 else 0
+ return {
+ "average_request_time": avg_time,
+ "success_rate": success_rate,
+ "total_requests": self.total_requests
+ }
+
+class BookmarkProcessor:
+ @staticmethod
+ def parse_bookmarks(file_path):
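+        """Parse a browser bookmarks HTML export and return the parsed soup
+        plus a flat list of bookmark dicts with 'title', 'url', and optional 'icon'."""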
+ with open(file_path, 'r', encoding='utf-8') as file:
+ content = file.read()
+ soup = BeautifulSoup(content, 'html.parser')
+ bookmarks = []
+ for a in soup.find_all('a'):
+ title = a.string
+ url = a.get('href')
+ icon = a.get('icon')
+ if title and url:
+ bookmark = {'title': title, 'url': url}
+ if icon:
+ bookmark['icon'] = icon
+ bookmarks.append(bookmark)
+ return soup, bookmarks
+
+def analyze_bookmarks(bookmarks):
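+    """Return the dominant title language and a tally of notable domains
+    (e.g. GitHub, StackOverflow) seen in the bookmark URLs."""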
+ languages = defaultdict(int)
+ domain_features = defaultdict(int)
+
+ for bookmark in bookmarks:
+        title = bookmark.get('title', '')  # Use get so a missing title falls back to ''
+        url = bookmark.get('url', '')  # Likewise for a missing url
+
+        # Detect the language of the title
+        if title:  # Only check when the title is non-empty
+ if any('\u4e00' <= char <= '\u9fff' for char in title):
+ languages['Chinese'] += 1
+ elif any('\u3040' <= char <= '\u30ff' for char in title):
+ languages['Japanese'] += 1
+ elif any('\uac00' <= char <= '\ud7a3' for char in title):
+ languages['Korean'] += 1
+ else:
+ languages['Other'] += 1
+
+        # Tally notable domains
+        if url:  # Only analyze when the url is non-empty
+ domain = urlparse(url).netloc
+ if 'github' in domain:
+ domain_features['GitHub'] += 1
+ elif 'stackoverflow' in domain:
+ domain_features['StackOverflow'] += 1
+            # ... other domain features ...
+
+ main_language = max(languages, key=languages.get) if languages else 'Unknown'
+ return main_language, dict(domain_features)
+
+
+class BookmarkCategorizer:
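+    """Categorizes bookmarks via the DeepSeek chat API, caching results on disk
+    keyed by a hash of the prompt so reruns skip already-classified bookmarks."""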
+ def __init__(self, api_key, api_base):
+ self.api_key = api_key
+ self.api_base = api_base
+ self.cache = {}
+ self.performance_monitor = PerformanceMonitor()
+ self.prompt = self.get_prompt()
+ self.prompt_hash = self.hash_prompt(self.prompt)
+ self.load_cache()
+
+ def get_prompt(self):
+ return """请为以下书签选择一个最合适的分类,包括大分类和子分类(如果适用)。输出格式应为大分类和多个子分类的目录树结构。参考以下示例进行分类,并根据书签标题和URL的关键词进行分析:
+ 示例1:
+ 书签: Snailclimb/JavaGuide: 【Java学习+面试指南】 一份涵盖大部分Java程序员所需要掌握的核心知识。
+ 输出:
+ 编程与技术
+ - Java技术
+
+ 示例2:
+ 书签: React - A JavaScript library for building user interfaces
+ 输出:
+ 编程与技术
+ - 前端技术
+
+ 示例3:
+ 书签: Wireshark · Go Deep.
+ 输出:
+ 网络与安全
+ - 网络技术
+
+ 示例4:
+ 书签: Kotlin for Android Developers
+ 输出:
+ 编程与技术
+ - Kotlin语言
+
+ 现在,请为下面的书签选择一个最合适的分类,遵循以下原则:
+
+ - 输出格式为大分类和多个子分类的目录树结构
+ - 如果有多个子分类,请按示例格式列出所有子分类
+    - 不要包含"分类:"或任何其他前缀/后缀,只返回分类名称的文本内容
+
+ 大分类及其子分类包括但不限于:
+
+ 编程与技术
+ - Java技术
+ - 前端技术
+ - 后端技术
+ - Web开发
+ - Go语言技术
+ - Kotlin语言
+ - 运维技术
+ - 区块链技术
+ - 系统设计
+ - 算法
+ - Python技术
+ - 数据可视化
+ - 嵌入式技术
+ - 软件开发工具
+ - 数据库技术
+ - 硬件开发
+ - ......
+
+ 设计与艺术
+ - 图标设计
+ - 字体设计
+ - UI设计
+ - 数字艺术
+ - 网页设计
+ - 视频编辑
+ - 音乐制作
+ - 影视后期特效
+ - 视频调色
+ - Photoshop资源
+ - ......
+ 教育资源
+ - 计算机书籍
+ - 教育网站
+ - 电子书
+ - ......
+ 软件与工具
+ - 桌面增强
+ - 在线工具
+ - 在线编辑器
+ - 用户脚本
+ - 社交平台工具
+ - 开发者平台
+ - 移动应用
+ - 可视化工具
+ - 刷机包
+ - Windows工具
+ - MacOS工具
+ - ......
+ - ......
+ 图片与视频
+ - 摄影教程
+ - 壁纸与桌面美化
+ - 照片编辑
+ - ......
+ 数据与分析
+ - 数据可视化
+ - 数据分析
+ - ......
+ 媒体与娱乐
+ - 音乐
+ - 电影资源
+ - 电视剧
+ - 摄影教程
+ - 地理与旅游
+ - 音乐制作
+ - 电影资源
+ - 电视资源
+ - ......
+ 网络与安全
+ - 网络技术
+ - 逆向工程
+ - 漏洞挖掘
+ - 网络安全
+ - ......
+ AI与技术
+ - 机器学习
+ - ......
+ 生活与兴趣
+ - 模拟游戏
+ - 电子产品
+ - 园艺
+ - 电子商务
+ - 成人内容
+ - ......
+ 请开始分类,记住只返回分类名称的文本内容,不要其他任何内容:
+ """
+
+ def hash_prompt(self, prompt):
+ return hashlib.md5(prompt.encode()).hexdigest()
+
+ def load_cache(self):
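+        # Reuse cached categories only if they were produced with the current prompt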
+ try:
+ with open('category_cache.json', 'r') as f:
+ cached_data = json.load(f)
+ if cached_data.get('prompt_hash') == self.prompt_hash:
+ self.cache = cached_data.get('categories', {})
+ else:
+ self.cache = {}
+ except FileNotFoundError:
+ self.cache = {}
+
+ def save_cache(self):
+ with open('category_cache.json', 'w') as f:
+ json.dump({
+ 'prompt_hash': self.prompt_hash,
+ 'categories': self.cache
+ }, f)
+
+ async def categorize_bookmark(self, title, url):
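+        """Ask the DeepSeek API for a category; falls back to "未分类" (uncategorized) on failure."""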
+ cache_key = f"{title}:{url}"
+ if cache_key in self.cache:
+ return self.cache[cache_key]
+
+ full_prompt = f"{self.prompt}\n标题: {title}\n网址: {url}"
+ headers = {
+ "Authorization": f"Bearer {self.api_key}",
+ "Content-Type": "application/json"
+ }
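+        # Chat-completions payload; the newline stop and small max_tokens keep the reply to a single line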
+ data = {
+ "model": "deepseek-chat",
+ "messages": [{"role": "user", "content": full_prompt}],
+ "temperature": 0.5,
+ "max_tokens": 32,
+ "stop": ["\n"]
+ }
+
+ start_time = time.time()
+ try:
+ async with aiohttp.ClientSession() as session:
+ async with session.post(f"{self.api_base}/chat/completions", headers=headers, json=data) as response:
+ if response.status == 200:
+ result = await response.json()
+ category = result['choices'][0]['message']['content'].strip()
+ self.cache[cache_key] = category
+ self.performance_monitor.add_request(time.time() - start_time, True)
+ return category
+ else:
+ logger.error(f"API request failed with status {response.status}")
+ self.performance_monitor.add_request(time.time() - start_time, False)
+ return "未分类"
+ except Exception as e:
+ logger.error(f"Error during API request: {str(e)}")
+ self.performance_monitor.add_request(time.time() - start_time, False)
+ return "未分类"
+
+ async def process_bookmarks(self, file_path):
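+        """Parse, categorize, and group all bookmarks into a single root folder dict."""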
+        # Invalidate the cache if the prompt has changed since it was saved
+ current_prompt = self.get_prompt()
+ current_prompt_hash = self.hash_prompt(current_prompt)
+ if current_prompt_hash != self.prompt_hash:
+ logger.info("Prompt has changed. Clearing cache.")
+ self.cache = {}
+ self.prompt = current_prompt
+ self.prompt_hash = current_prompt_hash
+
+        # Parse the export and analyze overall language/domain features
+ soup, bookmarks = BookmarkProcessor.parse_bookmarks(file_path)
+
+ if bookmarks:
+ main_language, domain_features = analyze_bookmarks(bookmarks)
+ else:
+ main_language, domain_features = 'Unknown', {}
+
+ total_bookmarks = len(bookmarks)
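+        # Top-level container for the exported JSON: one subfolder per category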
+ root_folder = {
+ "type": "folder",
+ "addDate": int(time.time()),
+ "lastModified": int(time.time()),
+ "title": "Navigation Hub",
+ "children": []
+ }
+
+ categorized_bookmarks = defaultdict(list)
+
+ for i, bookmark in enumerate(bookmarks, 1):
+ title = bookmark['title']
+ url = bookmark['url']
+ category = await self.categorize_bookmark(title, url)
+ categorized_bookmarks[category].append(bookmark)
+
+ logger.info(f"进度: {i}/{total_bookmarks} - 已分类: {title} -> {category}")
+
+        for category, links in categorized_bookmarks.items():
+ folder = {
+ "type": "folder",
+ "addDate": int(time.time()),
+ "lastModified": int(time.time()),
+ "title": category,
+ "children": [{
+ "type": "link",
+ "url": bookmark['url'],
+ "title": bookmark['title'],
+ "addDate": int(time.time()),
+ "lastModified": int(time.time()),
+ **({"icon": bookmark['icon']} if 'icon' in bookmark else {})
+                } for bookmark in links]
+ }
+ root_folder["children"].append(folder)
+
+ self.save_cache()
+
+ return root_folder
+
+
+async def main():
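+    """CLI entry point: parse args, categorize bookmarks, write JSON, and log stats."""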
+ try:
+        parser = argparse.ArgumentParser(description="Categorize bookmarks")
+        parser.add_argument("--input_file", help="Path to the input bookmarks HTML file", default=DEFAULT_INPUT_FILE)
+        parser.add_argument("--output_dir", help="Directory in which to save the output JSON file", default=DEFAULT_OUTPUT_DIR)
+ args = parser.parse_args()
+
+        DEEPSEEK_API_KEY = os.getenv("DEEPSEEK_API_KEY", "")
+        if not DEEPSEEK_API_KEY:
+            raise ValueError("The DEEPSEEK_API_KEY environment variable is not set")
+
+ categorizer = BookmarkCategorizer(DEEPSEEK_API_KEY, "https://api.deepseek.com/v1")
+
+ file_path = args.input_file
+ output_dir = args.output_dir
+
+        # Build the categorized bookmark tree
+ root_folder = await categorizer.process_bookmarks(file_path)
+
+        # Construct the output JSON file path
+ input_filename = os.path.splitext(os.path.basename(file_path))[0]
+ json_file_path = os.path.join(output_dir, f"{input_filename}.json")
+
+        # Save the JSON file, wrapping root_folder in a list
+ with open(json_file_path, 'w', encoding='utf-8') as file:
+ json.dump([root_folder], file, ensure_ascii=False, indent=2)
+
+ logger.info(f"分类后的书签JSON文件已保存至: {json_file_path}")
+
+        # Compute and log per-category counts
+ category_stats = {folder["title"]: len(folder["children"]) for folder in root_folder["children"]}
+ logger.info("Category statistics:")
+ for category, count in category_stats.items():
+ logger.info(f"{category}: {count}")
+
+ performance_stats = categorizer.performance_monitor.get_stats()
+
+ logger.info("Performance statistics:")
+ logger.info(f"Average request time: {performance_stats['average_request_time']:.2f} seconds")
+ logger.info(f"Success rate: {performance_stats['success_rate']:.2%}")
+ logger.info(f"Total requests: {performance_stats['total_requests']}")
+
+ except Exception as e:
+        logger.critical(f"An unexpected error occurred: {str(e)}", exc_info=True)
+
+
+if __name__ == "__main__":
+ asyncio.run(main())