diff --git a/.gitignore b/.gitignore
index 5486b1a..bf7f413 100644
--- a/.gitignore
+++ b/.gitignore
@@ -7,6 +7,7 @@
 
 # Python cache and compiled files
 __pycache__/
+**/*.pyc
 *.py[cod]
 *$py.class
 *.so
diff --git a/app/app.py b/app/app.py
index a923fa7..419653e 100644
--- a/app/app.py
+++ b/app/app.py
@@ -15,9 +15,11 @@
 )
 from pipeline.batch_analyze import fetch_all_fulltexts, analyze_texts
 from pipeline.csv_export import flatten_to_rows
+from pipeline.pubmed_query_generation import generate_pubmed_query
 
 # Import prompts for editing
 from llm.prompts import PROMPTS
+from llm.pubmed_query import PUBMED_QUERY
 
 
 def _persist(key, value):
@@ -369,8 +371,29 @@ def main():
     st.caption("💡 This is what the LLM receives. The editable section is embedded in the middle.")
 
     # ===== Search Section =====
-    st.subheader("1) Enter your PubMed query")
-    
+    st.subheader("1) Construct your PubMed query")
+
+    st.write(
+        "Describe which papers you want to fetch from PubMed. We'll use the description to construct a PubMed query.")
+    pubmed_query_instruction = st.text_area("PubMed Query Description", height=100,
+                                            placeholder='e.g., Fetch all papers about dengue containing terms about protein, mutation, and activation site.')
+
+    if st.button("Generate PubMed Query"):
+        # Skip the LLM round trip when there is nothing to describe.
+        if pubmed_query_instruction.strip():
+            PUBMED_QUERY.pubmed_query = generate_pubmed_query(pubmed_query_instruction)
+        else:
+            st.warning("Please describe the papers you want to fetch.")
+
+    # Editor for the generated PubMed query
+    query = st.text_area(
+        "Editable PubMedQuery Section",
+        value=PUBMED_QUERY.pubmed_query,
+        height=100,
+        help="This section contains the PubMed query to fetch the desired publications. Edit the query here before searching."
+    )
+    PUBMED_QUERY.pubmed_query = query
+
     # Show NCBI API status in main area
     if not os.getenv("NCBI_API_KEY"):
         st.info("💡 **Tip:** Add your NCBI API key in the sidebar (👈) to increase rate limits from 3 to 10 requests/second.")
@@ -381,13 +404,6 @@ def main():
         value=True,
         help="If checked, only search for Review articles. If unchecked, search all article types."
     )
-
-    if reviews_only:
-        st.write("Paste a PubMed query (will restrict to **Review** articles automatically).")
-    else:
-        st.write("Paste a PubMed query (will search **all article types**).")
-
-    query = st.text_area("Query", height=100, placeholder='e.g., dengue[MeSH Terms] AND mutation[Text Word]')
 
     st.subheader("2) Choose publication date range & search")
     colA, colB, colC, colD = st.columns([1, 1, 1, 1])
@@ -413,6 +429,7 @@ def main():
     search_button_text = "🔎 Search PubMed (reviews)" if reviews_only else "🔎 Search PubMed (all articles)"
     go = st.button(search_button_text)
     if go:
+        query = PUBMED_QUERY.pubmed_query
         if not query.strip():
             st.warning("Please enter a query.")
         else:
diff --git a/llm/pubmed_query.py b/llm/pubmed_query.py
new file mode 100644
index 0000000..6cdfa36
--- /dev/null
+++ b/llm/pubmed_query.py
@@ -0,0 +1,88 @@
+"""Prompt template and shared state for LLM-generated PubMed queries."""
+
+from dataclasses import dataclass
+
+
+@dataclass
+class PubMedQuery:
+    """Prompt sections for PubMed query generation plus the current query."""
+
+    # Locked sections (cannot be edited - critical for app functionality)
+    _prompt_header: str = """Respond only in JSON. If you cannot construct a pubmed query, respond {}.
+All fields are required; if unknown, use null.
+"""
+
+    _prompt_output_rules: str = """OUTPUT RULES (format-lock)
+• Respond ONLY with a valid JSON that passes json.loads().
+• No prose, no markdown, no trailing commas, no comments.
+• One JSON object containing the pubmed query in "pubmed_query" parameter.
+"""
+
+    _prompt_schema: str = """SCHEMA
+The output must follow exactly:
+{
+  "pubmed_query": ""
+}
+"""
+
+    # The example must itself be valid JSON (no trailing comma) so it does not
+    # contradict the OUTPUT RULES section.
+    _prompt_examples: str = """AN EXAMPLE (keep these)
+{
+  "pubmed_query": "((Dengue[Title]) AND (protein)) AND ((active site[Text Word]) OR (mutation[Text Word]))"
+}
+"""
+
+    _prompt_footer: str = """**CRITICAL**: The generated query must follow the Pubmed query syntax.
+"""
+
+    # Editable section (can be modified by users).
+    # The {_analyst_pubmed_query_instruction} placeholder is replaced with the
+    # user's description when the prompt is assembled.
+    _analyst_prompt_instruction: str = """SYSTEM / INSTRUCTION
+You are a biomedical text-mining specialist. Write a query to search Pubmed for the given specification: '{_analyst_pubmed_query_instruction}'.
+"""
+
+    # Most recently generated or user-edited PubMed query.
+    _pubmed_query: str = ""
+
+    def generate_prompt_for_pubmed_query(self, pubmed_query_instruction: str) -> str:
+        """Assemble the full prompt from locked and editable sections.
+
+        Args:
+            pubmed_query_instruction: Free-text description of the papers
+                to fetch; substituted into the instruction section.
+
+        Returns:
+            The prompt text, with sections separated by blank lines.
+        """
+        # Order: Header → SYSTEM / INSTRUCTION → OUTPUT RULES → SCHEMA →
+        # EXAMPLE → Footer
+        return (
+            self._prompt_header
+            + "\n"
+            + self._analyst_prompt_instruction.format(_analyst_pubmed_query_instruction=pubmed_query_instruction)
+            + "\n"
+            + self._prompt_output_rules
+            + "\n"
+            + self._prompt_schema
+            + "\n"
+            + self._prompt_examples
+            + "\n"
+            + self._prompt_footer
+        )
+
+    @property
+    def pubmed_query(self) -> str:
+        """Return the current PubMed query text."""
+        return self._pubmed_query
+
+    @pubmed_query.setter
+    def pubmed_query(self, value: str) -> None:
+        self._pubmed_query = value
+
+
+# NOTE(review): this module-level instance is shared by everything that imports
+# it; under Streamlit that likely means all user sessions in the process see
+# the same query. Consider st.session_state for per-session state — confirm.
+PUBMED_QUERY = PubMedQuery()
diff --git a/pipeline/pubmed_query_generation.py b/pipeline/pubmed_query_generation.py
new file mode 100644
index 0000000..8944170
--- /dev/null
+++ b/pipeline/pubmed_query_generation.py
@@ -0,0 +1,21 @@
+"""Generate a PubMed query from a free-text description via the LLM."""
+
+from llm.pubmed_query import PUBMED_QUERY
+from llm import gemini, utils
+
+
+def generate_pubmed_query(pubmed_query_instruction: str) -> str:
+    """Return a PubMed query generated from *pubmed_query_instruction*.
+
+    The prompt allows the model to answer ``{}`` when it cannot construct a
+    query, so a missing or null "pubmed_query" yields "" instead of raising
+    ``KeyError``.
+    """
+    prompt = PUBMED_QUERY.generate_prompt_for_pubmed_query(pubmed_query_instruction)
+    gemini_output = gemini._gemini_complete(prompt)
+    parsed = utils.safe_json_value(gemini_output)
+    # NOTE(review): safe_json_value is defined elsewhere; guard against a
+    # non-dict result in case it returns one for unparseable output — confirm.
+    if not isinstance(parsed, dict):
+        return ""
+    return parsed.get("pubmed_query") or ""