Skip to content

Commit 03b8543

Browse files
authored
🏎️ Make content refresher async (#1184)
* 🏎️ Make content refresher async * 🏎️ Make content refresher async * 🏎️ Make content refresher async
1 parent 9e8e49e commit 03b8543

File tree

3 files changed

+94
-45
lines changed

3 files changed

+94
-45
lines changed

.gitignore

+1
Original file line number | Diff line number | Diff line change
@@ -52,3 +52,4 @@ yarn-error.log*
5252
.sentryclirc
5353
/volumes/
5454
schema.prismae
55+
*.sql

platform/reworkd_platform/schemas/workflow/blocks/agents/content_refresher_agent.py

+51-45
Original file line number | Diff line number | Diff line change
@@ -1,13 +1,13 @@
11
import re
22
from typing import Any
33

4-
import anthropic
54
import requests
65
from bs4 import BeautifulSoup
76
from loguru import logger
87
from scrapingbee import ScrapingBeeClient
98

109
from reworkd_platform.schemas.workflow.base import Block, BlockIOBase
10+
from reworkd_platform.services.anthropic import ClaudeService, HumanAssistantPrompt
1111
from reworkd_platform.settings import settings
1212

1313

@@ -29,10 +29,10 @@ async def run(self, workflow_id: str, **kwargs: Any) -> ContentRefresherOutput:
2929
logger.info(f"Starting {self.type}")
3030
target_url = self.input.url
3131

32-
target_content = get_page_content(target_url)
32+
target_content = await get_page_content(target_url)
3333
logger.info(target_content)
3434

35-
keywords = find_content_kws(target_content)
35+
keywords = await find_content_kws(target_content)
3636
logger.info(keywords)
3737

3838
source_urls = search_results(keywords)
@@ -41,23 +41,24 @@ async def run(self, workflow_id: str, **kwargs: Any) -> ContentRefresherOutput:
4141
logger.info(source_urls)
4242

4343
source_contents = [
44-
get_page_content(url)
44+
await get_page_content(url)
4545
for url in source_urls[:3] # TODO: remove limit of 3 sources
46-
] # TODO: async/multithread the LLM calls
46+
]
47+
4748
source_contents = [
4849
content for content in source_contents if content is not None
4950
]
51+
5052
logger.info(source_contents)
53+
new_info = [
54+
await find_new_info(target_content, source_content)
55+
for source_content in source_contents
56+
]
5157

52-
new_infos = "\n\n".join(
53-
[
54-
find_new_info(target_content, source_content)
55-
for source_content in source_contents
56-
]
57-
)
58+
new_infos = "\n\n".join(new_info)
5859
logger.info(new_infos)
5960

60-
updated_target_content = add_info(target_content, new_infos)
61+
updated_target_content = await add_info(target_content, new_infos)
6162
logger.info(updated_target_content)
6263

6364
return ContentRefresherOutput(
@@ -70,12 +71,13 @@ async def run(self, workflow_id: str, **kwargs: Any) -> ContentRefresherOutput:
7071
scraper = ScrapingBeeClient(
7172
api_key=settings.scrapingbee_api_key,
7273
)
73-
claude = anthropic.Anthropic(
74+
75+
claude = ClaudeService(
7476
api_key=settings.anthropic_api_key,
7577
)
7678

7779

78-
def get_page_content(url: str) -> str:
80+
async def get_page_content(url: str) -> str:
7981
page = requests.get(url)
8082
if page.status_code != 200:
8183
page = scraper.get(url)
@@ -90,14 +92,17 @@ def get_page_content(url: str) -> str:
9092
]
9193
)
9294

93-
prompt = f"Below is a numbered list of the text in all the <p> tags on a web page:\n{pgraphs}\nSome of these lines may not be part of the main content of the page (e.g. footer text, ads, etc). Please list the line numbers that *are* part of the main content (i.e. the article's paragraphs) of the page. You can list consecutive line numbers as a range (e.g. 23-27) and separated by a comma."
94-
response = claude.completions.create(
95-
model="claude-2",
96-
prompt=f"\n\nHuman: {prompt}\n\nAssistant: Here are the line numbers of the main content:",
95+
prompt = HumanAssistantPrompt(
96+
human_prompt=f"Below is a numbered list of the text in all the <p> tags on a web page:\n{pgraphs}\nSome of these lines may not be part of the main content of the page (e.g. footer text, ads, etc). Please list the line numbers that *are* part of the main content (i.e. the article's paragraphs) of the page. You can list consecutive line numbers as a range (e.g. 23-27) and separated by a comma.",
97+
assistant_prompt="Here are the line numbers of the main content:",
98+
)
99+
100+
line_nums = await claude.completion(
101+
prompt=prompt,
97102
max_tokens_to_sample=500,
98103
temperature=0,
99104
)
100-
line_nums = response.completion.strip()
105+
101106
if len(line_nums) == 0:
102107
return ""
103108

@@ -116,17 +121,17 @@ def get_page_content(url: str) -> str:
116121
return "\n".join(content)
117122

118123

119-
def find_content_kws(content: str) -> str:
124+
async def find_content_kws(content: str) -> str:
120125
# Claude: find search keywords that content focuses on
121-
prompt = f"Below is content from a web article:\n{content}\nPlease list the keywords that best describe the content of the article. Format them so we can use them to query a search engine effectively."
122-
response = claude.completions.create(
123-
model="claude-2",
124-
prompt=f"\n\nHuman: {prompt}\n\nAssistant: Here is a short search query that best matches the content of the article:",
126+
prompt = HumanAssistantPrompt(
127+
human_prompt=f"Below is content from a web article:\n{content}\nPlease list the keywords that best describe the content of the article. Format them so we can use them to query a search engine effectively.",
128+
assistant_prompt="Here is a short search query that best matches the content of the article:",
129+
)
130+
131+
return await claude.completion(
132+
prompt=prompt,
125133
max_tokens_to_sample=20,
126-
temperature=0,
127134
)
128-
response_message = response.completion.strip()
129-
return response_message
130135

131136

132137
def search_results(search_query: str) -> list[str]:
@@ -142,33 +147,34 @@ def search_results(search_query: str) -> list[str]:
142147
},
143148
)
144149
response.raise_for_status()
145-
search_results = response.json()
146-
urls = [result["link"] for result in search_results["organic"]]
150+
urls = [result["link"] for result in response.json()["organic"]]
147151
return urls
148152

149153

150-
def find_new_info(target: str, source: str) -> str:
154+
async def find_new_info(target: str, source: str) -> str:
151155
# Claude: info mentioned in source that is not mentioned in target
152-
prompt = f"Below is the TARGET article:\n{target}\n----------------\nBelow is the SOURCE article:\n{source}\n----------------\nIn a bullet point list, identify all facts, figures, or ideas that are mentioned in the SOURCE article but not in the TARGET article."
153-
response = claude.completions.create(
154-
model="claude-2",
155-
prompt=f"\n\nHuman: {prompt}\n\nAssistant: Here is a list of claims in the SOURCE that are not in the TARGET:",
156+
prompt = HumanAssistantPrompt(
157+
human_prompt=f"Below is the TARGET article:\n{target}\n----------------\nBelow is the SOURCE article:\n{source}\n----------------\nIn a bullet point list, identify all facts, figures, or ideas that are mentioned in the SOURCE article but not in the TARGET article.",
158+
assistant_prompt="Here is a list of claims in the SOURCE that are not in the TARGET:",
159+
)
160+
161+
response = await claude.completion(
162+
prompt=prompt,
156163
max_tokens_to_sample=5000,
157-
temperature=0,
158164
)
159-
response_message = response.completion.strip()
160-
new_info = "\n".join(response_message.split("\n\n"))
165+
166+
new_info = "\n".join(response.split("\n\n"))
161167
return new_info
162168

163169

164-
def add_info(target: str, info: str) -> str:
170+
async def add_info(target: str, info: str) -> str:
165171
# Claude: rewrite target to include the info
166-
prompt = f"Below are notes from some SOURCE articles:\n{info}\n----------------\nBelow is the TARGET article:\n{target}\n----------------\nPlease rewrite the TARGET article to include the information from the SOURCE articles."
167-
response = claude.completions.create(
168-
model="claude-2",
169-
prompt=f"\n\nHuman: {prompt}\n\nAssistant: Here is a rewritten version of the target article that incorporates relevant information from the source articles:",
172+
prompt = HumanAssistantPrompt(
173+
human_prompt=f"Below are notes from some SOURCE articles:\n{info}\n----------------\nBelow is the TARGET article:\n{target}\n----------------\nPlease rewrite the TARGET article to include the information from the SOURCE articles.",
174+
assistant_prompt="Here is a rewritten version of the target article that incorporates relevant information from the source articles:",
175+
)
176+
177+
return await claude.completion(
178+
prompt=prompt,
170179
max_tokens_to_sample=5000,
171-
temperature=0,
172180
)
173-
response_message = response.completion.strip()
174-
return response_message
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,42 @@
1+
from typing import Any, Optional
2+
3+
from anthropic import AsyncAnthropic
4+
from pydantic import BaseModel
5+
6+
7+
class AbstractPrompt(BaseModel):
8+
def to_string(self) -> str:
9+
raise NotImplementedError
10+
11+
12+
class HumanAssistantPrompt(AbstractPrompt):
13+
assistant_prompt: str
14+
human_prompt: str
15+
16+
def to_string(self) -> str:
17+
return (
18+
f"""\n\nHuman: {self.human_prompt}\n\nAssistant: {self.assistant_prompt}"""
19+
)
20+
21+
22+
class ClaudeService:
23+
def __init__(self, api_key: Optional[str], model: str = "claude-2"):
24+
self.claude = AsyncAnthropic(api_key=api_key)
25+
self.model = model
26+
27+
async def completion(
28+
self,
29+
prompt: AbstractPrompt,
30+
max_tokens_to_sample: int,
31+
temperature: int = 0,
32+
**kwargs: Any,
33+
) -> str:
34+
return (
35+
await self.claude.completions.create(
36+
model=self.model,
37+
prompt=prompt.to_string(),
38+
max_tokens_to_sample=max_tokens_to_sample,
39+
temperature=temperature,
40+
**kwargs,
41+
)
42+
).completion.strip()

0 commit comments

Comments
(0)