|
| 1 | +from openai import OpenAI |
| 2 | +from bs4 import BeautifulSoup |
| 3 | +import requests |
| 4 | +import random |
| 5 | +import time |
| 6 | +import datetime |
| 7 | +import os |
| 8 | + |
# One article per day: the output directory is keyed by today's date, so its
# mere existence means an article was already generated today.
path_to = f'src/content/blog/{datetime.datetime.now().strftime("%Y-%m-%d")}'

if os.path.exists(path_to):
    print("Article already generated today.")
    # raise SystemExit instead of the site-module exit() helper, which is
    # not guaranteed to exist in every interpreter context (e.g. under -S).
    raise SystemExit(0)

os.makedirs(path_to, exist_ok=True)
print(f"Created directory {path_to}")
| 17 | + |
start = time.time()
print("Connecting to LLM API ...")
# Bug fix: the env-var name contained a stray "$" ("DS_APIKEY$"), so the
# lookup always returned None and the client was created without a key.
deepseek = OpenAI(base_url="https://api.deepseek.com", api_key=os.environ.get("DS_APIKEY"))
print(f"Initialized LLM API. ({time.time() - start:.1f}s)")
| 22 | + |
def generate(context, provider, model):
    """Run a single chat completion and return the stripped reply text.

    Args:
        context: Chat history as a list of {"role", "content"} dicts.
        provider: An OpenAI-compatible client instance.
        model: Model identifier to request.

    Returns:
        The first choice's message content with surrounding whitespace removed.
    """
    response = provider.chat.completions.create(model=model, messages=context)
    first_choice = response.choices[0]
    return first_choice.message.content.strip()
| 29 | + |
def scrape_website(url, css_selector, timeout=10):
    """Download *url* and return the elements matching *css_selector*.

    Args:
        url: Page to fetch.
        css_selector: CSS selector passed to BeautifulSoup.select().
        timeout: Seconds to wait for the server. New keyword (default 10)
            so a hung request can no longer stall the whole run forever.

    Returns:
        A list of matching elements; empty on network failure or any
        non-200 response.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Degrade network errors to "nothing found", consistent with the
        # existing non-200 branch; callers already handle an empty list.
        return []
    if response.status_code != 200:
        return []
    soup = BeautifulSoup(response.content, "html.parser")
    return soup.select(css_selector)
| 37 | + |
# Scrape the current Hacker News front-page titles to seed topic generation.
topics = [topic.get_text(strip=True)
          for topic in scrape_website("https://news.ycombinator.com/", ".titleline")]
if not topics:
    # Previously this crashed later with random.randint(5, 0); fail clearly.
    raise SystemExit("Failed to scrape any topics from Hacker News.")
# Bug fix: random.choices samples WITH replacement (duplicate headlines), and
# randint(5, len(topics)) raises ValueError when fewer than 5 titles exist.
sample_size = random.randint(min(5, len(topics)), len(topics))
topics_text = "\n".join(random.sample(topics, k=sample_size))
print(f"Scraped {len(topics)} topics from Hacker News.")
| 41 | + |
def extract_topic(topics):
    """Distill the scraped headlines into a single blog topic via the LLM."""
    messages = [
        {"role": "system", "content": "你在为一篇技术博客确定一个主题。直接用中文输出主题。"},
        {"role": "user", "content": f"阅读以下是 HackerNews 的热门文章,然后写一个可以用于技术博客的主题。这个主题应当是一个通用、普通的技术,不能是一个事件或其它东西。\n\n{topics}\n\n只需要一个主题,直接输出。"},
    ]
    return generate(messages, deepseek, "deepseek-chat")
| 48 | + |
def outline(topic):
    """Ask the reasoning model for a detailed article outline on *topic*."""
    messages = [
        {"role": "user", "content": f"我要写一篇关于「{topic}」的博客文章。帮我列一个详细的文章提纲。"},
    ]
    return generate(messages, deepseek, "deepseek-reasoner")
| 54 | + |
def write_from_outline(outline):
    """Expand an outline into a full markdown article via the reasoning model."""
    messages = [
        {"role": "user", "content": f"{outline}\n\n根据这个提纲中关于技术知识的部分,写出一篇技术博客文章。文章中避免出现图片,避免使用列表。每一段出现的代码都进行较为详细的解读。在讲述内容时尽量使用段落的语言,语言风格可以略偏专业,但保持清晰。使用markdown输出,使用latex公式,标题尽量只用一级标题 `#` 和二级标题 `##`,不要用分割线。直接输出正文。"},
    ]
    return generate(messages, deepseek, "deepseek-reasoner")
| 60 | + |
def summary(article):
    """Produce a ~15-character Chinese teaser line for the finished article."""
    messages = [
        {"role": "system", "content": "你是一个技术博客简介写作者,简介不一定需要涵盖文章的全部内容,能起到一定的提示作用即可。直接输出简介。"},
        {"role": "user", "content": f"给这篇文章写一个15字的简短介绍:\n\n{article}"},
    ]
    return generate(messages, deepseek, "deepseek-chat")
| 67 | + |
# Run the generation pipeline stage by stage, reporting wall-clock timings.

def _stopwatch():
    """Return a closure that yields seconds elapsed since this call."""
    begin = time.time()
    return lambda: time.time() - begin

elapsed = _stopwatch()
print("Generating topic ...")
topic = extract_topic(topics_text)
print(f"Determined topic ({elapsed():.1f}s): {topic}")

elapsed = _stopwatch()
print("Generating outline ...")
outline_result = outline(topic)
print(f"Outline generated ({elapsed():.1f}s).")

elapsed = _stopwatch()
print("Generating article ...")
article = write_from_outline(outline_result)
print(f"Article generated ({elapsed():.1f}s).")

elapsed = _stopwatch()
print("Generating summary ...")
summary_result = summary(article)
print(f"Summary ({elapsed():.1f}s): {summary_result}")
| 87 | + |
# ---------------------------------------------------------------------------
# Assemble the final markdown file: find the article's H1 title, inject YAML
# front matter before it, then copy the remaining article body.
# ---------------------------------------------------------------------------
lines = iter(article.split("\n"))
markdown_file = ""
author = random.choice(["杨其臻", "杨子凡", "叶家炜", "黄京"])

def _escape_yaml(value):
    """Escape backslashes and double quotes so *value* stays a valid
    double-quoted YAML scalar (model output may contain quotes)."""
    return value.replace("\\", "\\\\").replace('"', '\\"')

title = None
for line in lines:
    if line.startswith("# "):
        title = line[1:].strip()
        print(f"Detected title: {title}")
        break

if title is None:
    # Robustness: previously, an article without an H1 heading silently
    # produced a file with no front matter at all. Fall back to the topic
    # and replay the article from the start so no body lines are lost.
    title = topic.strip()
    lines = iter(article.split("\n"))
    print(f"No H1 found; falling back to topic as title: {title}")

metadata = "\n".join([
    "---",
    f'title: "{_escape_yaml(title)}"',
    f'author: "{author}"',
    f'date: "{datetime.datetime.now().strftime("%b %d, %Y")}"',
    f'description: "{_escape_yaml(summary_result)}"',
    'latex: true',  # no placeholders: plain strings, not f-strings
    'pdf: true',
    "---",
]) + "\n"
print(f"Injecting metadata:\n{metadata.strip()}")
markdown_file += metadata

for line in lines:
    # Drop horizontal rules; the prompt asks the model to avoid them, but it
    # occasionally emits one anyway (and "---" would look like front matter).
    if line.startswith("---"):
        continue
    markdown_file += line + "\n"

with open(f"{path_to}/index.md", "w", encoding="utf-8") as f:
    f.write(markdown_file)

print(f"Markdown file generated at {path_to}/index.md")
0 commit comments