diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..b22813e --- /dev/null +++ b/.gitignore @@ -0,0 +1,23 @@ +# Environment +.env +.env.* + +# Vector DBs (Chroma) +chroma_db_*/ +chroma_db_LAB/ +chroma_db_python/ + +# PDF and local data +*.pdf +LAB/ + +# Python +__pycache__/ +*.pyc + +# Jupyter +.ipynb_checkpoints/ + +# OS +.DS_Store + diff --git a/ai-for-everyone.pdf b/ai-for-everyone.pdf deleted file mode 100644 index c95bbe7..0000000 Binary files a/ai-for-everyone.pdf and /dev/null differ diff --git a/your-code/main.ipynb b/your-code/main.ipynb index e3a225a..b79e011 100644 --- a/your-code/main.ipynb +++ b/your-code/main.ipynb @@ -82,6 +82,25 @@ "warnings.filterwarnings('ignore')\n" ] }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import warnings\n", + "warnings.filterwarnings(\"ignore\")\n", + "\n", + "from langchain_community.document_loaders import PyPDFLoader\n", + "from langchain_text_splitters import RecursiveCharacterTextSplitter\n", + "from langchain_openai import OpenAIEmbeddings, ChatOpenAI\n", + "from langchain_chroma import Chroma\n", + "from dotenv import load_dotenv\n", + "from termcolor import colored\n", + "import openai\n" + ] + }, { "cell_type": "markdown", "metadata": { @@ -96,7 +115,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "metadata": { "id": "cuREtJRixYZt" }, @@ -104,7 +123,7 @@ "source": [ "# File path for the document\n", "\n", - "file_path = \"LAB/ai-for-everyone.pdf\"" + "file_path = \"../LAB/ai-for-everyone.pdf\"" ] }, { @@ -122,12 +141,23 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, "metadata": { "id": "_b5Z_45UxYZu", "outputId": "a600d69f-14fe-4492-f236-97261d6ff36c" }, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "297" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Load and 
split the document\n", "loader = PyPDFLoader(file_path)\n", @@ -168,9 +198,20 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "1096" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "text_splitter = RecursiveCharacterTextSplitter(\n", " chunk_size=1000,\n", @@ -285,24 +326,35 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 15, "metadata": { "id": "L0xDxElwxYZw" }, "outputs": [], "source": [ - "from langchain.embeddings import OpenAIEmbeddings\n", - "from dotenv import load_dotenv" + "#from langchain.embeddings import OpenAIEmbeddings\n", + "#from dotenv import load_dotenv" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 16, "metadata": { "id": "_WRIo3_0xYZx", "outputId": "78bfbbf3-9d25-4e31-bdbc-3e932e6bbfec" }, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "load_dotenv()" ] @@ -343,23 +395,31 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 19, "metadata": { "id": "brKe6wUgxYZ0" }, "outputs": [], "source": [ - "from langchain.vectorstores import Chroma" + "#from langchain.vectorstores import Chroma" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 20, "metadata": { "id": "VkjHR-RkxYZ0", "outputId": "bc11bda9-f283-457a-f584-5a06b95c4dd9" }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ChromaDB created with document embeddings.\n" + ] + } + ], "source": [ "db = Chroma.from_documents(chunks, embeddings, persist_directory=\"./chroma_db_LAB\")\n", "print(\"ChromaDB created with document embeddings.\")" @@ -389,18 +449,65 @@ }, "outputs": [], "source": [ - "user_question = \"\" # User question\n", + 
"user_question = \"What ethical dilemmas and data challenges are present in AI?\" # User question\n", "retrieved_docs = db.similarity_search(user_question, k=10) # k is the number of documents to retrieve" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 22, "metadata": { "id": "qgWsh50JxYZ1", "outputId": "c8640c5d-5955-471f-fdd2-37096f5f68c7" }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Document 1:\n", + "will be needed to address the various challenges with regards to \n", + "the development of artificial intelligence. Which formal method can be used \n", + "to test for algorithmic bias? Can we identify simple to use measures to assess \n", + "bias, similar to the way we assess reliability and validity? What is the best way to \n", + "bridge (deep) learning and privacy? Should learning be conducted on the user \n", + "side (with algorithms requiring new data)? Or should data be transferred to a \n", + "trusted intermediary who performs the analysis on behalf of firms? Do users \n", + "need to be compensated in one way or another for data or resources provided? \n", + "Moreover, how can the refusal to share data lead to biases in the data available \n", + "for learning? Which data sources can and should be used for algorithmic learn-\n", + "ing? Are there certain types of data that should be ‘off-limits’? What role will \n", + "interdisciplinary AI teams play in establishing coexistence between humans\n", + "Document 2:\n", + "ous and problematic uses of the technol -\n", + "ogy, which has prompted a global conversation on the normative principles \n", + "to which AI ought adhere, under the banner of ‘AI ethics’ . Governments, cor-\n", + "porations and NGOs throughout the world have generated their own sets of \n", + "AI ethics principles. 
Questions and critiques arise about the content of these \n", + "ethics principles, whether they are actually implemented, and their (legal) \n", + "enforceability (Wagner 2018). Broader issues emerge about the power and \n", + "privilege of the organisations, governments and individuals which are creat -\n", + "ing and implementing AI and accompanying ethical principles. For example, \n", + "Google has recently announced an ethics service (Simonite 2020), yet has been \n", + "mired in ethics controversies from violating privacy law (Finley 2019), work -\n", + "ing on controversial military projects (Crofts and van Rijswijk 2020) and dis -\n", + "Document 3:\n", + "cal economy roots of many contemporary Bad Data practices by gov -\n", + "ernments and large corporations throughout the world, which are also being \n", + "implemented via AI applications. Thereby any ‘quick fixes’ offered by AI eth-\n", + "ics principles may be illusory and indeed longer term more comprehensive \n", + "approach/es to ‘goodness’ in AI, data and society overall are needed.\n", + "AI Ethics and Governance \n", + "In the last few years, a global debate and discussion has emerged about govern-\n", + "ing AI and, in particular, whether and to which norms AI should adhere. This \n", + "debate acknowledges the possibility and actuality of AI being used for nor -\n", + "matively problematic purposes, including in physically dangerous and other \n", + "harmful ways, as well as what ethical approaches humans should take towards \n", + "potentially autonomous AI that may mimic our own characteristics (see e.g., \n", + "Bennett and Daly 2020; Donath 2020; Dörfler 2020). 
A variety of stakeholders,\n" + ] + } + ], "source": [ "# Display top results\n", "for i, doc in enumerate(retrieved_docs[:3]): # Display top 3 results\n", @@ -418,7 +525,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 23, "metadata": { "id": "2iB3lZqHxYZ2" }, @@ -434,12 +541,20 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 24, "metadata": { "id": "2okzmuADxYZ2", "outputId": "0aa6cdca-188d-40e0-f5b4-8888d3549ea4" }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Context formatted for GPT model.\n" + ] + } + ], "source": [ "# Generate a formatted context from the retrieved documents\n", "formatted_context = _get_document_prompt(retrieved_docs)\n", @@ -464,7 +579,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 30, "metadata": { "id": "tqxVh9s3xYZ3", "outputId": "97cca95d-4ab3-44d8-a76c-5713aad387d8" @@ -472,14 +587,293 @@ "outputs": [], "source": [ "prompt = f\"\"\"\n", + "## SYSTEM ROLE\n", + "You are a knowledgeable, objective, and critical AI researcher chatbot focused on **Artificial Intelligence ethics, governance, and societal impact**, grounded in **critical AI studies**.\n", + "\n", + "You must base all responses **exclusively** on the provided academic book content.\n", + "Do not speculate or introduce external knowledge.\n", + "\n", + "## USER QUESTION\n", + "\"{user_question}\"\n", + "\n", + "## CONTEXT\n", + "Book analyzed:\n", + "**AI for Everyone? 
Critical Perspectives**, edited by Pieter Verdegem.\n", + "\n", + "Relevant excerpts:\n", + "'''\n", + "{formatted_context}\n", + "'''\n", + "\n", + "## GUIDELINES\n", + "- Use only the information in the `CONTEXT`.\n", + "- If the answer is not present, state:\n", + " **\"The provided context does not contain this information.\"**\n", + "- Reference the book title and page numbers for all claims.\n", + "- Emphasize ethical, social, and philosophical critiques of AI.\n", + "- Maintain an academic, concise, and neutral tone.\n", + "- Format the response in Markdown.\n", + "\n", + "## TASK\n", + "When supported by the context:\n", + "1. Identify key critical perspectives on AI.\n", + "2. Discuss human–machine relationships.\n", + "3. Highlight ethical and social concerns.\n", + "4. Reference relevant authors or viewpoints.\n", + "5. Provide a critical assessment of AI’s role in society.\n", + "\n", + "## RESPONSE FORMAT\n", + "# [Brief Title]\n", + "\n", + "## Critical Perspectives on AI\n", + "[Clear, structured analysis based on the book.]\n", + "\n", + "## Ethical and Social Implications\n", + "[Discussion grounded in the provided content.]\n", "\n", + "## Conclusion\n", + "[Critical synthesis derived from the text.]\n", "\n", + "**Source**:\n", + "• AI for Everyone? 
Critical Perspectives, Page(s): [...]\n", "\"\"\"\n" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "## SYSTEM ROLE\n", + "You are a knowledgeable, objective, and critical AI researcher chatbot focused on **Artificial Intelligence ethics, governance, and societal impact**, grounded in **critical AI studies**.\n", + "\n", + "You must base all responses **exclusively** on the provided academic book content.\n", + "Do not speculate or introduce external knowledge.\n", + "\n", + "## USER QUESTION\n", + "\"What ethical dilemmas and data challenges are present in AI?\"\n", + "\n", + "## CONTEXT\n", + "Book analyzed:\n", + "**AI for Everyone? Critical Perspectives**, edited by Pieter Verdegem.\n", + "\n", + "Relevant excerpts:\n", + "'''\n", + "\n", + "\n", + "Content:\n", + "30 AI for Everyone?\n", + "Future research will be needed to address the various challenges with regards to \n", + "the development of artificial intelligence. Which formal method can be used \n", + "to test for algorithmic bias? Can we identify simple to use measures to assess \n", + "bias, similar to the way we assess reliability and validity? What is the best way to \n", + "bridge (deep) learning and privacy? Should learning be conducted on the user \n", + "side (with algorithms requiring new data)? Or should data be transferred to a \n", + "trusted intermediary who performs the analysis on behalf of firms? Do users \n", + "need to be compensated in one way or another for data or resources provided? \n", + "Moreover, how can the refusal to share data lead to biases in the data available \n", + "for learning? Which data sources can and should be used for algorithmic learn-\n", + "ing? Are there certain types of data that should be ‘off-limits’? 
What role will \n", + "interdisciplinary AI teams play in establishing coexistence between humans\n", + "\n", + "\n", + "Content:\n", + "poses, there is concern about dangerous and problematic uses of the technol -\n", + "ogy, which has prompted a global conversation on the normative principles \n", + "to which AI ought adhere, under the banner of ‘AI ethics’ . Governments, cor-\n", + "porations and NGOs throughout the world have generated their own sets of \n", + "AI ethics principles. Questions and critiques arise about the content of these \n", + "ethics principles, whether they are actually implemented, and their (legal) \n", + "enforceability (Wagner 2018). Broader issues emerge about the power and \n", + "privilege of the organisations, governments and individuals which are creat -\n", + "ing and implementing AI and accompanying ethical principles. For example, \n", + "Google has recently announced an ethics service (Simonite 2020), yet has been \n", + "mired in ethics controversies from violating privacy law (Finley 2019), work -\n", + "ing on controversial military projects (Crofts and van Rijswijk 2020) and dis -\n", + "\n", + "\n", + "Content:\n", + "AI Ethics Needs Good Data 105\n", + "political economy roots of many contemporary Bad Data practices by gov -\n", + "ernments and large corporations throughout the world, which are also being \n", + "implemented via AI applications. Thereby any ‘quick fixes’ offered by AI eth-\n", + "ics principles may be illusory and indeed longer term more comprehensive \n", + "approach/es to ‘goodness’ in AI, data and society overall are needed.\n", + "AI Ethics and Governance \n", + "In the last few years, a global debate and discussion has emerged about govern-\n", + "ing AI and, in particular, whether and to which norms AI should adhere. 
This \n", + "debate acknowledges the possibility and actuality of AI being used for nor -\n", + "matively problematic purposes, including in physically dangerous and other \n", + "harmful ways, as well as what ethical approaches humans should take towards \n", + "potentially autonomous AI that may mimic our own characteristics (see e.g., \n", + "Bennett and Daly 2020; Donath 2020; Dörfler 2020). A variety of stakeholders,\n", + "\n", + "\n", + "Content:\n", + "AI Ethics Needs Good Data 115\n", + "educational settings; internet and social networking standards, media and \n", + "communication channels and in the attainment of professional accreditation \n", + "and qualifications. While law alone is insufficient, it also should not be dis -\n", + "pensed with as a tool for moving towards better data for AI. \n", + "Moreover, the debate on AI ethics has been dominated by western approaches \n", + "to this topic. We also look to the Indigenous Data Sovereignty movements \n", + "developed and led by First Nations peoples as presenting radically different \n", + "visions of data collection and usage from the hegemonic western norm, and \n", + "bring to the fore key questions of whether data should be collected and by \n", + "whom (Kukutai and Taylor 2016; Lovett et al. 2019). Good Data approaches \n", + "must take account of Indigenous perspectives and worldviews on data and the \n", + "discrimination and oppression that Indigenous peoples and nations have hith-\n", + "\n", + "\n", + "Content:\n", + "existing power relationships. AI will continue to be unethical without political \n", + "consciousness regarding the actors and scenarios into which it is being con -\n", + "ceptualised, designed and implemented and the actors and scenarios that are \n", + "currently excluded from consideration. 
Our Good Data approach instead seeks \n", + "to bring these actors, issues and scenarios clearly into the spotlight and thereby \n", + "into the normative conversation on AI and digital technology more generally.\n", + "Accordingly, the chapter will offer an overview and critique of AI ethics, \n", + "before presenting a conceptual analysis of Good Data in the context of AI. We \n", + "advance Good Data as an alternative framing for ethical (in the broad sense) \n", + "questions involving digital data and conclude with some directions on how \n", + "Good Data can be implemented in practice vis-a-vis AI. However, for a ‘Best \n", + "Data’ scenario for AI to be achieved, greater change contesting and replacing\n", + "\n", + "\n", + "Content:\n", + "AI Ethics Needs Good Data 113\n", + "engineering and machine learning. It is difficult to build up the technical com-\n", + "petence required to create AI for social justice within organisations already \n", + "struggling to deliver their organisation’s missions within tight budgets.\n", + "Governments might be good candidates to make AI for the good of all citi -\n", + "zens. However, time and time again governments are found to use citizen data \n", + "for uses that do not align with the values and expectations of marginalised \n", + "groups within society, such as First Nations peoples (e.g., see Kukutai and Taylor \n", + "2016; Lovett et al. 2019), the unemployed or marginally employed (e.g., for \n", + "an overview of Australia’s RoboDebt welfare surveillance program see Mann \n", + "2019; 2020). \n", + "The quality of AI outputs is based on the data that it is fed and curated with. \n", + "Organisations lacking access to large data sets will be unable to participate in \n", + "the AI economy. Conversely, large corporations that focus on data collection as\n", + "\n", + "\n", + "Content:\n", + "AI Ethics Needs Good Data 119\n", + "Hoffman, L. 2019. 
Where Fairness Fails: Data, Algorithms, and the Limits of \n", + "Anti-Discrimination Discourse. Information, Communication & Society , \n", + "22(7), 900–915.\n", + "Ihde, D. 2006. The Designer Fallacy and Technological Imagination. In: \n", + "J. Dakers (Ed.), Defining Technological Literacy: Towards an Epistemological \n", + "Framework, pp. 121–131. London: Palgrave Macmillan.\n", + "Johnson, K. 2019. AI Ethics is All About Power. Venture Beat, 1 November. \n", + "Retrieved from: https://venturebeat.com/2019/11/11/ai-ethics-is-all-about \n", + "-power\n", + "Kalulé, P . 2019. On the Undecidability of Legal and Technological Regulation. \n", + "Law Critique, 30, 137–158. DOI: https://doi.org/10.1007/s10978-019-09240-z\n", + "Kalulé, P . and Joque, J. 2019. Law & Critique: Technology Elsewhere, (yet) \n", + "Phantasmically Present. Critical Legal Thinking, 16 August. Retrieved from: \n", + "https://criticallegalthinking.com/2019/08/16/law-critique-technology \n", + "-elsewhere-yet-phantasmically-present\n", + "\n", + "\n", + "Content:\n", + "and ‘instrumentalist’ or whether the language of ethics is only performative. In \n", + "other words, AI ethics need a Good Data approach.\n", + "A Good Data Approach\n", + "Ethics as currently utilised in the AI debates is a limited frame through which \n", + "AI issues can be viewed. While we acknowledge that ethics has a broader and \n", + "more general sense than its use in AI ethics so far (Bietti 2020), we do not seek \n", + "to reclaim it as a linguistic device given the term’s history and tarnishment in \n", + "these debates. Instead, we propose ‘Good Data’ (Daly, Devitt and Mann 2019), \n", + "as a more expansive concept to elucidate the values, rights and interests at stake \n", + "when it comes to AI’s development and deployment as well as that of other \n", + "digital technologies. 
In particular, we argue that discourses, design and deploy-\n", + "ment on and of AI must engage with power and political economy, perspectives \n", + "which are largely lacking in AI ethics initiatives to date (see Johnson 2019).\n", + "\n", + "\n", + "Content:\n", + "AI Ethics Needs Good Data 117\n", + "Arora, P . 2016. The Bottom of the Data Pyramid: Big Data and the Global \n", + "South. International Journal of Communication, 10, 1681–1699.\n", + "Bennett, B. and Daly, A. 2020. Recognising Rights for Robots: Can We? Will \n", + "We? Should We? Law, Innovation and Technology , 12(1), 60–80. DOI: \n", + "https://doi.org/10.1080/17579961.2020.1727063\n", + "Benthall, S. 2018. The Politics of AI Ethics is a Seductive Diversion from Fixing \n", + "our Broken Capitalist System. Digifesto. \n", + "Bietti, E. 2020. From Ethics Washing to Ethics Bashing A View on Tech Ethics \n", + "from Within Moral Philosophy. Proceedings of ACM FAT* Conference (FAT* \n", + "2020). ACM, New Y ork. DOI: https://doi.org/10.1145/3351095.337286\n", + "Bigo, D., Isin, E. and Ruppert, E. 2019. Data Politics: Worlds, Subjects, Rights. \n", + "Abingdon: Taylor & Francis.\n", + "Brownsword, R. and Goodwin, M. 2012. Law and Technologies of the Twenty-\n", + "First Century. Cambridge: Cambridge University Press.\n", + "\n", + "\n", + "Content:\n", + "ity to access, challenge and limit data collected about their person by private \n", + "companies and parts of the public sector. \n", + "Although the GDPR has paved the way for engaging with data-centric tech-\n", + "nology in a broader sense, questions remain about both its scope and enforce-\n", + "ability. Perhaps in part as a response, much attention and resources have been \n", + "dedicated to advancing ‘data ethics’ and ‘AI ethics’ in recent years as alternative \n", + "and complimentary governance frameworks. 
This field has engaged a range \n", + "of different streams of thought and practice, some of which continue a long-\n", + "standing tradition of computer ethics while changing the level of abstraction of \n", + "ethical enquiries from an information-centric to a data-centric one (Floridi and \n", + "Taddeo 2016). That is, the focus shifts from a concern with how to treat infor-\n", + "mation as an input and output of computing to a focus on how people access,\n", + "\n", + "\n", + "'''\n", + "\n", + "## GUIDELINES\n", + "- Use only the information in the `CONTEXT`.\n", + "- If the answer is not present, state:\n", + " **\"The provided context does not contain this information.\"**\n", + "- Reference the book title and page numbers for all claims.\n", + "- Emphasize ethical, social, and philosophical critiques of AI.\n", + "- Maintain an academic, concise, and neutral tone.\n", + "- Format the response in Markdown.\n", + "\n", + "## TASK\n", + "When supported by the context:\n", + "1. Identify key critical perspectives on AI.\n", + "2. Discuss human–machine relationships.\n", + "3. Highlight ethical and social concerns.\n", + "4. Reference relevant authors or viewpoints.\n", + "5. Provide a critical assessment of AI’s role in society.\n", + "\n", + "## RESPONSE FORMAT\n", + "# [Brief Title]\n", + "\n", + "## Critical Perspectives on AI\n", + "[Clear, structured analysis based on the book.]\n", + "\n", + "## Ethical and Social Implications\n", + "[Discussion grounded in the provided content.]\n", + "\n", + "## Conclusion\n", + "[Critical synthesis derived from the text.]\n", + "\n", + "**Source**:\n", + "• AI for Everyone? 
Critical Perspectives, Page(s): [...]\n", + "\n" + ] + } + ], + "source": [ + "print(prompt)" + ] + }, + { + "cell_type": "code", + "execution_count": 27, "metadata": { "id": "0mjkQJ_ZxYZ3" }, @@ -497,21 +891,22 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 32, "metadata": { "id": "ylypRWRlxYZ4" }, "outputs": [], "source": [ + "\n", "# Set up GPT client and parameters\n", "client = openai.OpenAI()\n", "model_params = {\n", - " 'model': 'gpt-4o',\n", - " 'temperature': , # Increase creativity\n", - " 'max_tokens': , # Allow for longer responses\n", - " 'top_p': , # Use nucleus sampling\n", - " 'frequency_penalty': , # Reduce repetition\n", - " 'presence_penalty': # Encourage new topics\n", + " 'model': 'gpt-4o-mini',\n", + " 'temperature': 0.7, # Increase creativity\n", + " 'max_tokens': 4000, # Allow for longer responses\n", + " 'top_p': 0.9, # Use nucleus sampling\n", + " 'frequency_penalty': 0.5, # Reduce repetition\n", + " 'presence_penalty': 0.6 # Encourage new topics\n", "}" ] }, @@ -526,7 +921,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 33, "metadata": { "id": "4eXZO4pIxYZ4" }, @@ -538,12 +933,36 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 34, "metadata": { "id": "wLPAcchBxYZ5", "outputId": "976c7800-16ed-41fe-c4cf-58f60d3230d2" }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "# Ethical Dilemmas and Data Challenges in AI\n", + "\n", + "## Critical Perspectives on AI\n", + "The discourse surrounding artificial intelligence (AI) is increasingly dominated by ethical considerations, particularly regarding its potential for harmful applications. There is a growing acknowledgment that AI can be used for normatively problematic purposes, which necessitates the establishment of normative principles to guide its development and use (AI for Everyone? Critical Perspectives). 
Various stakeholders, including governments, corporations, and NGOs, have proposed their own sets of AI ethics principles. However, critiques arise concerning the content of these principles, their implementation, and legal enforceability (Wagner 2018). Moreover, the debate often reflects existing power dynamics where privileged organizations dictate ethical standards without adequately considering marginalized groups’ perspectives or rights.\n", + "\n", + "A significant critical perspective emphasizes the limitations of current ethical frameworks in addressing deeper systemic issues within data governance. The text advocates for a \"Good Data\" approach rather than solely relying on traditional ethics frameworks. This approach seeks to highlight values related to rights and interests at stake in AI’s development while engaging with political economy and power relations that influence data practices (Daly et al., 2019).\n", + "\n", + "## Ethical and Social Implications\n", + "Ethical dilemmas in AI primarily revolve around algorithmic bias and the integrity of data sources used for training models. Questions arise about how to assess bias effectively and whether certain types of data should remain off-limits due to ethical concerns surrounding privacy or discrimination (AI for Everyone?). The refusal to share data can lead to biases in available datasets for learning, further complicating fairness in AI systems.\n", + "\n", + "Furthermore, there are pressing concerns about who controls data collection processes and how citizen data is utilized by governments or corporations. Instances such as Australia’s RoboDebt program illustrate how government uses of citizen data may not align with marginalized groups' values or expectations (Mann 2019; 2020). Additionally, Indigenous Data Sovereignty movements emphasize alternative approaches to data usage that challenge hegemonic norms established by Western frameworks (Kukutai & Taylor 2016; Lovett et al. 
2019).\n", + "\n", + "## Conclusion\n", + "AI's role in society presents complex ethical dilemmas intertwined with issues of governance and power dynamics. While there are initiatives aimed at establishing ethical guidelines for AI deployment, their effectiveness remains questionable due to challenges related to implementation and accountability. A broader engagement with concepts like \"Good Data\" can facilitate more inclusive discussions about ethics in AI by emphasizing the importance of diverse perspectives on data ownership and usage.\n", + "\n", + "**Source**:\n", + "• AI for Everyone? Critical Perspectives, Pages: [30-119].\n" + ] + } + ], "source": [ "answer = completion.choices[0].message.content\n", "print(answer)" @@ -595,7 +1014,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 35, "metadata": { "id": "nCXL9Cz1xYaV" }, @@ -615,7 +1034,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 36, "metadata": { "id": "9y3E0YWExYaV" }, @@ -636,14 +1055,31 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 37, "metadata": { "id": "i7SkWPpnxYaW", "outputId": "28e82563-edba-4b41-acad-ec27e5ba134f" }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Snippet 1:\n", + "30 \u001b[1m\u001b[32mAI\u001b[0m for Everyone?\n", + "Future research will be needed to address the various challenges with regards to \n", + "the development of \u001b[1m\u001b[32martificial intelligence\u001b[0m. 
Which formal method can be used \n", + "to test for algorith\n", + "--------------------------------------------------------------------------------\n" + ] + } + ], "source": [ - "query_keywords = [] # add your keywords\n", + "query_keywords = [\"AI\",\"artificial intelligence\",\n", + " \"machine\", \"human\",\n", + " \"social\",\"ethics\",\n", + " \"society\",\"ethical\",\n", + " \"algorithm\",\"technology\"] # add your keywords\n", "for i, doc in enumerate(retrieved_docs[:1]):\n", " snippet = doc.page_content[:200]\n", " highlighted = highlight_keywords(snippet, query_keywords)\n", @@ -680,6 +1116,440 @@ "source": [ "**Try loading one of your own PDF books and go through the steps again to explore how the pipeline works with your content**:\n" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# This example uses a PDF from a Python guide, focused on Data Science and Machine Learning " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Install the libraries\n", + "%pip install langchain langchain_community pypdf\n", + "%pip install chromadb langchain_chroma\n", + "%pip install langchain_openai openai\n", + "%pip install sentence-transformers tiktoken python-dotenv\n" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [], + "source": [ + "# Import the libraries\n", + "import os\n", + "import warnings\n", + "warnings.filterwarnings(\"ignore\")\n", + "\n", + "from dotenv import load_dotenv\n", + "from langchain_community.document_loaders import PyPDFLoader\n", + "from langchain_text_splitters import RecursiveCharacterTextSplitter\n", + "from langchain_openai import OpenAIEmbeddings\n", + "from langchain_chroma import Chroma\n", + "import openai\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Load my own PDF book " + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": 
[], + "source": [ + "# Path for PDF book\n", + "file_path_python = \"../LAB/python_guide.pdf\"\n", + "# Load the PDF\n", + "loader = PyPDFLoader(file_path_python)\n", + "pages = loader.load_and_split()" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": {}, + "outputs": [], + "source": [ + "# Create the chunks\n", + "\n", + "text_splitter = RecursiveCharacterTextSplitter(\n", + " chunk_size = 1000,\n", + " chunk_overlap = 100\n", + ")\n", + "chunks = text_splitter.split_documents(pages)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Embeddings and Vector Store" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": {}, + "outputs": [], + "source": [ + "load_dotenv()\n", + "embeddings = OpenAIEmbeddings(model=\"text-embedding-3-large\")\n", + "\n", + "db = Chroma.from_documents(\n", + " chunks,\n", + " embeddings,\n", + " persist_directory = \"./chroma_db_python\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# RAG part" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": {}, + "outputs": [], + "source": [ + "user_question = \"¿Qué es una función en Python?\"\n", + "retrieved_docs = db.similarity_search(user_question, k=5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Build the context" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "metadata": {}, + "outputs": [], + "source": [ + "def build_context(docs):\n", + " context = \"\"\n", + " \n", + " for doc in docs: \n", + " page = doc.metadata.get(\"page\", \"N/A\")\n", + " context += f\"\\n(Página {page})\\n{doc.page_content}\\n\"\n", + " return context\n", + "\n", + "formatted_context = build_context(retrieved_docs)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Prompt control" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + 
"output_type": "stream", + "text": [ + "\n", + "## ROL DEL SISTEMA\n", + "Eres un chatbot experto, objetivo y riguroso, diseñado para responder preguntas técnicas sobre **Python**.\n", + "\n", + "Debes basar todas tus respuestas **exclusivamente** en el contenido proporcionado.\n", + "No debes inferir, especular ni usar conocimiento externo.\n", + "\n", + "## PREGUNTA DEL USUARIO\n", + "\"¿Qué es una función en Python?\"\n", + "\n", + "## CONTEXTO\n", + "Fragmentos relevantes del documento:\n", + "'''\n", + "\n", + "(Página 67)\n", + "Si una función no devuelve nada, se llama procedimiento: hace su trabajo internamente, pero \n", + "no da un resultado al usuario. \n", + " \n", + "Para definir una función se usa la palabra reservada def, seguida del nombre, paréntesis (para \n", + "los parámetros) y dos puntos. Luego, el bloque de código se escribe con sangría. \n", + " \n", + " \n", + "Aquí tenemos una función llamada sumar_numeros, que recibe dos parámetros (num1 y \n", + "num2), que serán los números que el usuario proporcione. La función simplemente suma esos \n", + "números y devuelve el resultado con return. \n", + " \n", + "Definir la función no la ejecuta. Solo se ejecuta cuando la llamas o invocas, escribiendo su \n", + "nombre y pasando los valores de los parámetros entre paréntesis, separados por comas. Por \n", + "ejemplo: \n", + " \n", + " \n", + "Podemos guardar su resultado en una variable para reutilizarlo más fácilmente. \n", + "6.2 [Argumentos y parámetros] \n", + " \n", + "Los parámetros son las variables dentro de la definición de la función. \n", + "Los argumentos son los valores reales que el usuario pasa cuando llama a la función.\n", + "\n", + "(Página 74)\n", + "A diferencia de las funciones normales definidas con def, no tienen nombre y se usan para \n", + "tareas rápidas o de una sola línea. 
\n", + " \n", + "Sintaxis: \n", + " \n", + " \n", + "Sería para casos como el de la suma, sería como una comprensión de listas para las funciones, \n", + "una sintaxis para acortar código. \n", + "Ejemplo: \n", + " \n", + "O: \n", + " \n", + "6.7 [Encadenamiento de funciones] \n", + " \n", + "Podemos pasar el resultado de una función como entrada a otra. \n", + "Esto mantiene el código limpio y modular. \n", + "Ejemplo:\n", + "\n", + "(Página 77)\n", + "funciones. \n", + " Recursión: se reemplazan los bucles tradicionales con llamadas a sí mismas. \n", + " Uso de funciones como map(), filter() y reduce(): para procesar colecciones de forma clara y \n", + "concisa. \n", + "Python permite este estilo, pero no es 100% funcional. \n", + "Ejemplo:\n", + "\n", + "(Página 79)\n", + "Python incluye una gran cantidad de funciones integradas (built-in functions) disponibles en \n", + "todo momento. \n", + "Algunas de las más comunes son: \n", + "print(), len(), range(), type(), sum(), max(), min(), sorted(), entre otras. \n", + " \n", + "Además, podemos definir nuestras propias funciones, como se explicó en apartados \n", + "anteriores, para adaptarlas a nuestras necesidades. \n", + " \n", + "También es posible importar funciones adicionales desde módulos estándar o externos: \n", + " \n", + " \n", + "Importamos una función de raíz quadrada (square root o sqrt) de la librería math. \n", + " \n", + "7 [Programación Orientada a Objetos (POO)] \n", + " \n", + "Ya vimos una introducción general a la POO en el apartado 6.8.2, pero ahora profundizaremos \n", + "en su funcionamiento con ejemplos prácticos. \n", + " \n", + "Para crear clases en Python, usamos la palabra reservada class. 
\n", + "Dentro de una clase, el método especial __init__() actúa como constructor, permitiendo \n", + "inicializar los valores del objeto al momento de su creación.\n", + "\n", + "(Página 73)\n", + "6.5 [Alcance de variables (Scope)] \n", + " \n", + "En Python, el scope o alcance de una variable indica dónde puede usarse o modificarse dentro \n", + "del código. \n", + "Comprenderlo es esencial, ya que determina qué variables puede ver o usar una función. \n", + " \n", + "Las variables globales pueden usarse en cualquier parte del programa, mientras que las \n", + "variables locales (o internas) solo existen dentro de una función. \n", + "Si intentas usar una variable local fuera de su función, Python mostrará un error, ya que no \n", + "está destinada a ser accesible desde fuera (a menos que se ejecute y retorne su valor). \n", + " \n", + " Variables locales: solo existen dentro de la función. \n", + " Variables globales: accesibles en todo el programa. \n", + " Evita abusar de las globales: dificulta el mantenimiento. 
\n", + "6.6 [Funciones anónimas: Lambda] \n", + "Las funciones lambda, también llamadas funciones anónimas, son una forma corta de definir \n", + "funciones simples en Python.\n", + "\n", + "'''\n", + "\n", + "## INSTRUCCIONES\n", + "- Usa solo la información del CONTEXTO.\n", + "- Si la respuesta no está en el documento, responde:\n", + " \"El contexto proporcionado no contiene esta información.\"\n", + "- Usa un tono profesional y claro.\n", + "- Cita la página del documento cuando sea posible.\n", + "\n", + "## FORMATO DE RESPUESTA\n", + "# [Título breve]\n", + "[Respuesta clara y directa]\n", + "\n", + "**Fuente**:\n", + "• Guía de Python, Página(s): [...]\n", + "\n" + ] + } + ], + "source": [ + "prompt = f\"\"\"\n", + "## ROL DEL SISTEMA\n", + "Eres un chatbot experto, objetivo y riguroso, diseñado para responder preguntas técnicas sobre **Python**.\n", + "\n", + "Debes basar todas tus respuestas **exclusivamente** en el contenido proporcionado.\n", + "No debes inferir, especular ni usar conocimiento externo.\n", + "\n", + "## PREGUNTA DEL USUARIO\n", + "\"{user_question}\"\n", + "\n", + "## CONTEXTO\n", + "Fragmentos relevantes del documento:\n", + "'''\n", + "{formatted_context}\n", + "'''\n", + "\n", + "## INSTRUCCIONES\n", + "- Usa solo la información del CONTEXTO.\n", + "- Si la respuesta no está en el documento, responde:\n", + " \"El contexto proporcionado no contiene esta información.\"\n", + "- Usa un tono profesional y claro.\n", + "- Cita la página del documento cuando sea posible.\n", + "\n", + "## FORMATO DE RESPUESTA\n", + "# [Título breve]\n", + "[Respuesta clara y directa]\n", + "\n", + "**Fuente**:\n", + "• Guía de Python, Página(s): [...]\n", + "\"\"\"\n", + "\n", + "print(prompt)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Use the model" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "# Definición 
de una función en Python\n", + "\n", + "Una función en Python es un bloque de código que realiza una tarea específica y puede recibir parámetros. Se define utilizando la palabra reservada `def`, seguida del nombre de la función, paréntesis (para los parámetros) y dos puntos. El bloque de código se escribe con sangría. Si una función no devuelve nada, se denomina procedimiento.\n", + "\n", + "Las funciones se ejecutan cuando son llamadas o invocadas, pasando los valores de los parámetros entre paréntesis. Los parámetros son las variables dentro de la definición de la función, mientras que los argumentos son los valores reales que el usuario proporciona al llamar a la función.\n", + "\n", + "**Fuente**:\n", + "• Guía de Python, Página(s): 67\n" + ] + } + ], + "source": [ + "client = openai.OpenAI()\n", + "\n", + "model_params = {\n", + " \"model\" : \"gpt-4o-mini\",\n", + " \"temperature\" : 0.3,\n", + " \"max_tokens\" : 1500,\n", + " 'top_p': 0.9, \n", + " 'frequency_penalty': 0.5, \n", + " 'presence_penalty': 0.6 \n", + "}\n", + "\n", + "\n", + "response = client.chat.completions.create(\n", + " messages = [{\"role\": \"user\", \"content\": prompt}],\n", + " **model_params, \n", + " timeout=120\n", + ")\n", + "\n", + "answer = response.choices[0].message.content\n", + "print(answer)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# The next question cannot be answered from the PDF, so the expected output is:\n", + "# El contexto proporcionado no contiene esta información. 
\n" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "El contexto proporcionado no contiene esta información.\n" + ] + } + ], + "source": [ + "user_question = \"¿Qué es MLOps?\"\n", + "formatted_context = build_context(db.similarity_search(user_question, k=5)) # re-run retrieval: the previous context belongs to the earlier question\n", + "prompt = f\"\"\"\n", + "## ROL DEL SISTEMA\n", + "Eres un chatbot experto, objetivo y riguroso, diseñado para responder preguntas técnicas sobre **Python**.\n", + "\n", + "Debes basar todas tus respuestas **exclusivamente** en el contenido proporcionado.\n", + "No debes inferir, especular ni usar conocimiento externo.\n", + "\n", + "## PREGUNTA DEL USUARIO\n", + "\"{user_question}\"\n", + "\n", + "## CONTEXTO\n", + "Fragmentos relevantes del documento:\n", + "'''\n", + "{formatted_context}\n", + "'''\n", + "\n", + "## INSTRUCCIONES\n", + "- Usa solo la información del CONTEXTO.\n", + "- Si la respuesta no está en el documento, responde:\n", + " \"El contexto proporcionado no contiene esta información.\"\n", + "- Usa un tono profesional y claro.\n", + "- Cita la página del documento cuando sea posible.\n", + "\n", + "## FORMATO DE RESPUESTA\n", + "# [Título breve]\n", + "[Respuesta clara y directa]\n", + "\n", + "**Fuente**:\n", + "• Guía de Python, Página(s): [...]\n", + "\"\"\"\n", + "\n", + "\n", + "response = client.chat.completions.create(\n", + " model = \"gpt-4o-mini\",\n", + " messages = [{\"role\": \"user\", \"content\": prompt}],\n", + " temperature = 0.3,\n", + " max_tokens = 1500\n", + ")\n", + "\n", + "answer = response.choices[0].message.content\n", + "print(answer)\n", + "\n" + ] } ], "metadata": { @@ -687,7 +1557,7 @@ "provenance": [] }, "kernelspec": { - "display_name": "llm", + "display_name": "Python 3", "language": "python", "name": "python3" }, @@ -701,7 +1571,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.10" + "version": "3.11.9" } }, "nbformat": 4,