Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

# Python cache and compiled files
__pycache__/
**/*.pyc
*.py[cod]
*$py.class
*.so
Expand Down
32 changes: 23 additions & 9 deletions app/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,11 @@
)
from pipeline.batch_analyze import fetch_all_fulltexts, analyze_texts
from pipeline.csv_export import flatten_to_rows
from pipeline.pubmed_query_generation import generate_pubmed_query

# Import prompts for editing
from llm.prompts import PROMPTS
from llm.pubmed_query import PUBMED_QUERY


def _persist(key, value):
Expand Down Expand Up @@ -369,8 +371,26 @@ def main():
st.caption("💡 This is what the LLM receives. The editable section is embedded in the middle.")

# ===== Search Section =====
st.subheader("1) Enter your PubMed query")

st.subheader("1) Construct your PubMed query (reviews only)")

st.write(
"Describe which papers you want to fetch from PubMed. We'll use the description to construct a PubMed query.")
pubmed_query_instruction = st.text_area("PubMed Query Description", height=100,
placeholder='e.g., Fetch all papers about dengue containing terms about protein, mutation, and activation site.')

if st.button("Generate PubMed Query"):
pubmed_query = generate_pubmed_query(pubmed_query_instruction)
PUBMED_QUERY.pubmed_query = pubmed_query

# Editor for editing the generated pubmed query section
query = st.text_area(
"Editable PubMedQuery Section",
value=PUBMED_QUERY.pubmed_query,
height=100,
help="This section contains the PubMed query to fetch the desired publications. Edit the query here to customize extraction behavior."
)
PUBMED_QUERY.pubmed_query = query

# Show NCBI API status in main area
if not os.getenv("NCBI_API_KEY"):
st.info("💡 **Tip:** Add your NCBI API key in the sidebar (👈) to increase rate limits from 3 to 10 requests/second.")
Expand All @@ -381,13 +401,6 @@ def main():
value=True,
help="If checked, only search for Review articles. If unchecked, search all article types."
)

if reviews_only:
st.write("Paste a PubMed query (will restrict to **Review** articles automatically).")
else:
st.write("Paste a PubMed query (will search **all article types**).")

query = st.text_area("Query", height=100, placeholder='e.g., dengue[MeSH Terms] AND mutation[Text Word]')

st.subheader("2) Choose publication date range & search")
colA, colB, colC, colD = st.columns([1, 1, 1, 1])
Expand All @@ -413,6 +426,7 @@ def main():
search_button_text = "🔎 Search PubMed (reviews)" if reviews_only else "🔎 Search PubMed (all articles)"
go = st.button(search_button_text)
if go:
query = PUBMED_QUERY.pubmed_query
if not query.strip():
st.warning("Please enter a query.")
else:
Expand Down
71 changes: 71 additions & 0 deletions llm/pubmed_query.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
from dataclasses import dataclass


@dataclass
class PubMedQuery:
    """Prompt template and current query text for PubMed query generation.

    The prompt sent to the LLM is assembled from several fixed ("locked")
    sections plus one instruction template that receives the user's
    free-text description.  The instance also stores the most recently
    generated or edited query string, exposed through ``pubmed_query``.
    """

    # Locked sections (cannot be edited - critical for app functionality)
    _prompt_header: str = """Respond only in JSON. If you cannot construct a pubmed query, respond {}.
All fields are required; if unknown, use null.
"""

    _prompt_output_rules: str = """OUTPUT RULES (format-lock)
• Respond ONLY with a valid JSON that passes json.loads().
• No prose, no markdown, no trailing commas, no comments.
• One JSON object containing the pubmed query in "pubmed_query" parameter.
"""

    _prompt_schema: str = """SCHEMA
The output must follow exactly:
{
"pubmed_query": "<Pubmed query string or null>"
}
"""

    # The example must itself be valid JSON (no trailing comma); otherwise it
    # contradicts the OUTPUT RULES above and invites unparseable responses.
    _prompt_examples: str = """AN EXAMPLE (keep these)
{
"pubmed_query": "((Dengue[Title]) AND (protein)) AND ((active site[Text Word]) OR (mutation[Text Word]))"
}
"""

    _prompt_footer: str = """**CRITICAL**: The generated query must follow the Pubmed query syntax.
"""

    # Editable section (can be modified by users).
    # The ``{_analyst_pubmed_query_instruction}`` placeholder is filled in by
    # ``generate_prompt_for_pubmed_query`` via ``str.format``; any other literal
    # braces added to this template must be doubled (``{{`` / ``}}``).
    _analyst_prompt_instruction: str = """SYSTEM / INSTRUCTION
You are a biomedical text-mining specialist. Write a query to search Pubmed for the given specification: '{_analyst_pubmed_query_instruction}'.
"""

    # Backing storage for the ``pubmed_query`` property (last generated/edited query).
    _pubmed_query: str = ""

    def generate_prompt_for_pubmed_query(self, pubmed_query_instruction) -> str:
        """Assemble the full LLM prompt for a query description.

        Args:
            pubmed_query_instruction: Free-text description of the papers to
                fetch; substituted into the instruction template.

        Returns:
            The prompt sections in this order, separated by a newline:
            header, instruction, output rules, schema, example, footer.
        """
        instruction = self._analyst_prompt_instruction.format(
            _analyst_pubmed_query_instruction=pubmed_query_instruction
        )
        sections = (
            self._prompt_header,
            instruction,
            self._prompt_output_rules,
            self._prompt_schema,
            self._prompt_examples,
            self._prompt_footer,
        )
        return "\n".join(sections)

    @property
    def pubmed_query(self) -> str:
        """Return the currently stored PubMed query string."""
        return self._pubmed_query

    @pubmed_query.setter
    def pubmed_query(self, value: str):
        """Store ``value`` as the current PubMed query string."""
        self._pubmed_query = value


# Module-level shared instance: imported by app/app.py (to read and write the
# current query) and by pipeline/pubmed_query_generation.py (to build the prompt).
# NOTE(review): this is process-wide mutable state; presumably shared by every
# Streamlit session in the same process — confirm, and consider st.session_state.
PUBMED_QUERY = PubMedQuery()

7 changes: 7 additions & 0 deletions pipeline/pubmed_query_generation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
from llm.pubmed_query import PUBMED_QUERY
from llm import gemini, utils

def generate_pubmed_query(pubmed_query_instruction):
    """Generate a PubMed query string from a free-text description via the LLM.

    Args:
        pubmed_query_instruction: Free-text description of the papers to fetch.

    Returns:
        The query string found under the ``"pubmed_query"`` key of the model's
        JSON answer, or ``""`` when the description is blank or the model did
        not supply a usable query.
    """
    # A blank description cannot yield a meaningful query; skip the LLM call.
    if not pubmed_query_instruction or not pubmed_query_instruction.strip():
        return ""

    prompt = PUBMED_QUERY.generate_prompt_for_pubmed_query(pubmed_query_instruction)
    raw_output = gemini._gemini_complete(prompt)
    # NOTE(review): assumes safe_json_value returns the parsed JSON object (a
    # dict) on success — confirm its behavior on unparseable output.
    parsed = utils.safe_json_value(raw_output)

    # The prompt tells the model to answer `{}` when it cannot build a query
    # and to use null for unknown fields, so the key may be absent or None.
    if not isinstance(parsed, dict):
        return ""
    query = parsed.get("pubmed_query")
    return query if isinstance(query, str) else ""