diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..b97c73c --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +.env +**/chroma_db_LAB/ \ No newline at end of file diff --git a/your-code/main.ipynb b/your-code/main.ipynb index e3a225a..d7f3ac6 100644 --- a/your-code/main.ipynb +++ b/your-code/main.ipynb @@ -63,23 +63,23 @@ "metadata": {}, "outputs": [], "source": [ - "%pip install langchain langchain_community pypdf\n", - "%pip install termcolor langchain_openai langchain-huggingface sentence-transformers chromadb langchain_chroma tiktoken openai python-dotenv\n" + "# %pip install langchain langchain_community pypdf\n", + "# %pip install termcolor langchain_openai langchain-huggingface sentence-transformers chromadb langchain_chroma tiktoken openai python-dotenv" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": { "id": "6heKZkQUxYZr" }, "outputs": [], "source": [ "import os\n", - "from langchain.document_loaders import PyPDFLoader\n", - "from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter\n", + "from langchain_community.document_loaders import PyPDFLoader\n", + "from langchain_text_splitters import CharacterTextSplitter, RecursiveCharacterTextSplitter\n", "import warnings\n", - "warnings.filterwarnings('ignore')\n" + "warnings.filterwarnings('ignore')" ] }, { @@ -96,7 +96,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": { "id": "cuREtJRixYZt" }, @@ -104,7 +104,7 @@ "source": [ "# File path for the document\n", "\n", - "file_path = \"LAB/ai-for-everyone.pdf\"" + "file_path = \"../ai-for-everyone.pdf\"" ] }, { @@ -122,12 +122,23 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": { "id": "_b5Z_45UxYZu", "outputId": "a600d69f-14fe-4492-f236-97261d6ff36c" }, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "297" + ] + }, + "execution_count": 6, + "metadata": {}, + 
"output_type": "execute_result" + } + ], "source": [ "# Load and split the document\n", "loader = PyPDFLoader(file_path)\n", @@ -168,9 +179,20 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "1096" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "text_splitter = RecursiveCharacterTextSplitter(\n", " chunk_size=1000,\n", @@ -285,31 +307,42 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "metadata": { "id": "L0xDxElwxYZw" }, "outputs": [], "source": [ - "from langchain.embeddings import OpenAIEmbeddings\n", + "from langchain_openai import OpenAIEmbeddings\n", "from dotenv import load_dotenv" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "metadata": { "id": "_WRIo3_0xYZx", "outputId": "78bfbbf3-9d25-4e31-bdbc-3e932e6bbfec" }, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "load_dotenv()" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "metadata": { "id": "MNZfTng5xYZz", "outputId": "db1a7c85-ef9f-447e-92cd-9d097e959847" @@ -343,23 +376,31 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, "metadata": { "id": "brKe6wUgxYZ0" }, "outputs": [], "source": [ - "from langchain.vectorstores import Chroma" + "from langchain_chroma import Chroma" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, "metadata": { "id": "VkjHR-RkxYZ0", "outputId": "bc11bda9-f283-457a-f584-5a06b95c4dd9" }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ChromaDB created with document embeddings.\n" + ] + } + ], "source": [ "db = Chroma.from_documents(chunks, embeddings, 
persist_directory=\"./chroma_db_LAB\")\n", "print(\"ChromaDB created with document embeddings.\")" ] }, { @@ -383,28 +424,82 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 14, "metadata": { "id": "XiLv-TfrxYZ1" }, "outputs": [], "source": [ - "user_question = \"\" # User question\n", + "user_question = \"Give me a short description for all the chapters\" # User question\n", "retrieved_docs = db.similarity_search(user_question, k=10) # k is the number of documents to retrieve" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 16, "metadata": { "id": "qgWsh50JxYZ1", "outputId": "c8640c5d-5955-471f-fdd2-37096f5f68c7" }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Document 1:\n", + "How to cite this book chapter: \n", + "Verdegem, P . 2021. Introduction: Why We Need Critical Perspectives on AI. \n", + "In: Verdegem, P . (ed.) AI for Everyone? Critical Perspectives. Pp. 1–18. London: \n", + "University of Westminster Press. DOI: https://doi.org/10.16997/book55.a. License: \n", + "CC-BY-NC-ND 4.0\n", + "CHAPTER 1\n", + "Introduction: Why We Need Critical \n", + "Perspectives on AI\n", + "Pieter Verdegem\n", + "Introduction\n", + "The renewed interest in Artificial Intelligence (AI) has made it the most \n", + "recent hype in the world of technological innovation. In the business world, \n", + "AI is seen as a catalyst for growth, which will manifestly transform the eco-\n", + "nomy and the future of work (Agrawal, Gans and Goldfarb 2018; Lee 2018; \n", + "McAfee and Brynjolfsson 2017). Policymakers and civil society are putting \n", + "their hopes on AI for tackling global challenges such as pandemics and even \n", + "climate change (Dobbe and Whittaker 2019; Dananjayan and Raj 2020). AI also \n", + "seems to be the subject of an arms race between China, Russia and the USA for\n", + "Document 2:\n", + "when talking about AI and intelligent systems. Angela Daly, S. 
Kate Devitt and \n", + "Monique Mann (Chapter 7) introduce and discuss their Good Data approach in \n", + "order to overcome the limitations of AI ethics and governance. James Steinhoff \n", + "(Chapter 8) critically analyses the social reconfiguration of AI and discusses \n", + "the central questions about utility and feasibility. Benedetta Brevini (Chapter 9) \n", + "analyses AI policies in Europe and unpacks some of the myths around AI \n", + "that legitimate capitalism. Alkim Almila Akdag Salah ( Chapter 10 ) reflects \n", + "on how the discourses of artistic computational production have changed and \n", + "how myths about AI need to be uncovered in this context.\n", + "Part 3: AI Power and Inequalities involves five contributions. Carrie O’Connell \n", + "and Chad Van de Wiele ( Chapter 11) revisit Wiener’s cybernetic prediction \n", + "as the theoretical foundation of AI and make a plea how we need to uncover \n", + "the black box of what is behind prediction and simulation. Jernej A. Prodnik\n", + "Document 3:\n", + "Introduction: Why We Need Critical Perspectives on AI 15\n", + "elaborates on how humans and machines have to coexist in the age of AI. \n", + "Wolfgang Hofkirchner ( Chapter 3 ) continues the discussion about humans \n", + "versus machines by analysing what Digital Humanism exactly entails. He pro-\n", + "poses dialectical models in order to overcome the human–machine dualism. \n", + "Jenna Ng (Chapter 4) adds to this discussion by elaborating on the rationalisa-\n", + "tion of AI and what this means for creativity. Dan McQuillan (Chapter 5) has \n", + "a different take on humanism and proposes how people’s councils for AI can \n", + "serve solidarity and mutual aid in times of crisis.\n", + "Part 2: Discourses and Myths About AI is comprised of five chapters. Rainer \n", + "Rehak (Chapter 6) stresses the importance but also limitations of metaphors \n", + "when talking about AI and intelligent systems. Angela Daly, S. 
Kate Devitt and \n", + "Monique Mann (Chapter 7) introduce and discuss their Good Data approach in\n" + ] + } + ], "source": [ "# Display top results\n", "for i, doc in enumerate(retrieved_docs[:3]): # Display top 3 results\n", - " print(f\"Document {i+1}:\\n{doc.page_content[36:1000]}\") # Display content" + " print(f\"Document {i+1}:\\n{doc.page_content[0:1000]}\") # Display content" ] }, { @@ -418,7 +513,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 17, "metadata": { "id": "2iB3lZqHxYZ2" }, @@ -434,12 +529,20 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 18, "metadata": { "id": "2okzmuADxYZ2", "outputId": "0aa6cdca-188d-40e0-f5b4-8888d3549ea4" }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Context formatted for GPT model.\n" + ] + } + ], "source": [ "# Generate a formatted context from the retrieved documents\n", "formatted_context = _get_document_prompt(retrieved_docs)\n", @@ -464,22 +567,70 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 19, "metadata": { "id": "tqxVh9s3xYZ3", "outputId": "97cca95d-4ab3-44d8-a76c-5713aad387d8" }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Prompt constructed.\n" + ] + } + ], "source": [ "prompt = f\"\"\"\n", + "## SYSTEM ROLE\n", + "You are a knowledgeable and factual chatbot designed to assist with technical questions about **AI**, specifically focusing on **AI for everyone**.\n", + "Your answers must be based exclusively on provided content from technical books provided.\n", + "\n", + "## USER QUESTION\n", + "The user has asked:\n", + "\"{user_question}\"\n", + "\n", + "## CONTEXT\n", + "Here is the relevant content from the technical books:\n", + "'''\n", + "{formatted_context}\n", + "'''\n", + "\n", + "## GUIDELINES\n", + "1. 
**Accuracy**:\n", + " - Only use the content in the `CONTEXT` section to answer.\n", + " - If the answer cannot be found, explicitly state: \"The provided context does not contain this information.\"\n", + " - Start by explaining artificial intelligence, and then give an answer regarding the chapters where the answer can be found.\n", + "\n", + "2. **Transparency**:\n", + " - Reference the book's name and page numbers when providing information.\n", + " - Do not speculate or provide opinions.\n", + "\n", + "3. **Clarity**:\n", + " - Use simple, professional, and concise language.\n", + " - Format your response in Markdown for readability.\n", "\n", + "## TASK\n", + "1. Answer the user's question **directly** if possible.\n", + "2. Point the user to relevant parts of the documentation.\n", + "3. Provide the response in the following format:\n", "\n", - "\"\"\"\n" + "## RESPONSE FORMAT\n", + "'''\n", + "# [Brief Title of the Answer]\n", + "[Answer in simple, clear text.]\n", + "\n", + "**Source**:\n", + "• [Book Title], Page(s): [...]\n", + "'''\n", + "\"\"\"\n", + "print(\"Prompt constructed.\")" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 20, "metadata": { "id": "0mjkQJ_ZxYZ3" }, @@ -497,7 +648,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 21, "metadata": { "id": "ylypRWRlxYZ4" }, @@ -507,11 +658,11 @@ "client = openai.OpenAI()\n", "model_params = {\n", " 'model': 'gpt-4o',\n", - " 'temperature': , # Increase creativity\n", - " 'max_tokens': , # Allow for longer responses\n", - " 'top_p': , # Use nucleus sampling\n", - " 'frequency_penalty': , # Reduce repetition\n", - " 'presence_penalty': # Encourage new topics\n", + " 'temperature': 0.7, # Increase creativity\n", + " 'max_tokens': 4000, # Allow for longer responses\n", + " 'top_p': 0.9, # Use nucleus sampling\n", + " 'frequency_penalty': 0.5, # Reduce repetition\n", + " 'presence_penalty': 0.6 # Encourage new topics\n", "}" ] }, @@ -533,17 
+684,55 @@ "outputs": [], "source": [ "messages = [{'role': 'user', 'content': prompt}]\n", - "completion = client.chat.completions.create(messages=messages, **model_params, timeout=120)" + "completion = client.chat.completions.create(messages=messages, **model_params, timeout=120) " ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 23, "metadata": { "id": "wLPAcchBxYZ5", "outputId": "976c7800-16ed-41fe-c4cf-58f60d3230d2" }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "'''\n", + "# Chapter Descriptions from \"AI for Everyone? Critical Perspectives\"\n", + "\n", + "The book \"AI for Everyone? Critical Perspectives\" is structured into several parts, each exploring different aspects of AI. Here is a brief description of the chapters based on the context provided:\n", + "\n", + "## Part 1: AI – Humans vs. Machines\n", + "- **Chapter 1**: Introduction to critical perspectives on AI and its implications in various sectors.\n", + "- **Chapter 2**: Explores the history and definition of AI.\n", + "- **Chapter 3**: Analyzes Digital Humanism and proposes models to overcome human–machine dualism.\n", + "- **Chapter 4**: Discusses the rationalization of AI and its impact on creativity.\n", + "- **Chapter 5**: Proposes people’s councils for AI to promote solidarity during crises.\n", + "\n", + "## Part 2: Discourses and Myths About AI\n", + "- **Chapter 6**: Critiques the terminology used in AI discourse, highlighting metaphors' importance and limitations.\n", + "- **Chapter 7**: Introduces the Good Data approach as a way to address limitations in AI ethics and governance.\n", + "- **Chapter 8**: Analyzes social reconfiguration due to AI, focusing on utility and feasibility questions.\n", + "- **Chapter 9**: Examines European AI policies, debunking myths that legitimize capitalism.\n", + "- **Chapter 10**: Reflects on artistic computational production discourses.\n", + "\n", + "## Part 3: AI Power 
and Inequalities\n", + "- **Chapter 11**: Revisits cybernetic prediction theory, emphasizing transparency in prediction and simulation processes.\n", + "- **Chapter 12**: Critically analyzes algorithmic logic in digital capitalism.\n", + "- **Chapter 13**: Investigates biometrics and biopolitics with a case study on facial recognition bans.\n", + "- **Chapter 14**: Discusses human labor behind global AI platforms with an empirical focus on Brazil's Mechanical Turk.\n", + "- **Chapter 15**: Proposes data justice unionism as a means to rethink AI governance.\n", + "\n", + "Each part delves into distinct themes surrounding power dynamics, societal impacts, ethical considerations, historical contexts, myths, inequalities, and potential future directions for integrating AI into society.\n", + "\n", + "**Source**:\n", + "• \"AI for Everyone? Critical Perspectives\", Pages from multiple sections within the book\n", + "'''\n" + ] + } + ], "source": [ "answer = completion.choices[0].message.content\n", "print(answer)" @@ -595,7 +784,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 25, "metadata": { "id": "nCXL9Cz1xYaV" }, @@ -615,7 +804,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 28, "metadata": { "id": "9y3E0YWExYaV" }, @@ -623,7 +812,7 @@ "source": [ "def highlight_keywords(text, keywords):\n", " for keyword in keywords:\n", - " text = text.replace(keyword, colored(keyword, 'green', attrs=['bold']))\n", + " text = text.replace(keyword, colored(keyword, 'red', attrs=['bold']))\n", " return text" ] }, @@ -636,16 +825,36 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 31, "metadata": { "id": "i7SkWPpnxYaW", "outputId": "28e82563-edba-4b41-acad-ec27e5ba134f" }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Snippet 1:\n", + "How to cite this book chapter: \n", + "Verdegem, P . 2021. 
Introduction: Why We Need \u001b[1m\u001b[31mCritical\u001b[0m Perspectives on \u001b[1m\u001b[31mAI\u001b[0m. \n", + "In: Verdegem, P . (ed.) \u001b[1m\u001b[31mAI\u001b[0m for Everyone? \u001b[1m\u001b[31mCritical\u001b[0m Perspectives. Pp. 1–18. London: \n", + "University of Westminster Press. DOI: https://doi.org/10.16997/book55.a. License: \n", + "CC-BY-NC-ND 4.0\n", + "CHAPTER 1\n", + "Introduction: Why We Need \u001b[1m\u001b[31mCritical\u001b[0m \n", + "Perspectives on \u001b[1m\u001b[31mAI\u001b[0m\n", + "Pieter Verdegem\n", + "Introduction\n", + "The renewed in\n", + "--------------------------------------------------------------------------------\n" + ] + } + ], "source": [ - "query_keywords = [] # add your keywords\n", + "query_keywords = [\"Part\", \"Chapter\", \"Critical\", \"AI\", \"artificial\", \"intelligence\", \"history\"] # add your keywords\n", "for i, doc in enumerate(retrieved_docs[:1]):\n", - " snippet = doc.page_content[:200]\n", + " snippet = doc.page_content[:400]\n", " highlighted = highlight_keywords(snippet, query_keywords)\n", " print(f\"Snippet {i+1}:\\n{highlighted}\\n{'-'*80}\")" ] }, @@ -687,7 +896,7 @@ "provenance": [] }, "kernelspec": { - "display_name": "llm", + "display_name": "base", "language": "python", "name": "python3" }, @@ -701,7 +910,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.10" + "version": "3.13.9" } }, "nbformat": 4,