Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

can I scrape a local html file? #929

Open
nyck33 opened this issue Feb 21, 2025 · 4 comments
Open

can I scrape a local html file? #929

nyck33 opened this issue Feb 21, 2025 · 4 comments
Labels
bug Something isn't working

Comments

@nyck33
Copy link

nyck33 commented Feb 21, 2025

# Reproduction script from the issue report: reads a locally saved Slack HTML
# page, hands it to SmartScraperGraph, and writes the returned result as JSON.
import json
import os
from scrapegraphai.graphs import SmartScraperGraph

# The API key is taken from the environment; os.getenv returns None if unset.
openai_api_key = os.getenv("OPENAI_API_KEY")

# Configuration for the scraping pipeline
graph_config = {
    "llm": {
        "api_key": openai_api_key,
        "model": "openai/gpt-4-0125-preview",
    },
    "verbose": True,
    "headless": True,
}

# Path to your local HTML file
html_file_path = "slack_html/0221-onboarding-bitbucket.html"

# Read the file and JSON-encode its text. json.dumps returns a JSON string
# literal (wrapped in double quotes, with newlines and inner quotes
# backslash-escaped), so html_content is no longer the raw HTML markup.
with open(html_file_path, 'r', encoding='utf-8') as file:
    html_content = json.dumps(file.read(), ensure_ascii=False)

# Dictionary that is passed as `prompt` below. Note that the first key
# literally contains the double-quote characters as part of the key string.
inputs = {
    '"content"': html_content,  # Double-quoted key with JSON-escaped content
    'question': "Extract all messages with their timestamps and usernames from the Slack conversation."
}

# Initialize the SmartScraperGraph
# NOTE(review): `prompt` receives the dict above rather than a plain string and
# `source` receives the JSON-encoded text — presumably the library expects a
# string prompt and raw HTML or a URL; confirm against the ScrapeGraphAI docs.
smart_scraper = SmartScraperGraph(
    prompt=inputs,
    source=html_content,
    config=graph_config  # LLM, verbosity and headless settings defined above
)

# Create output directory if it doesn't exist
output_folder_path = "cleaned_slack_html"
os.makedirs(output_folder_path, exist_ok=True)

# Execute the scraping pipeline
# Everything from running the graph to writing the file sits in one try block,
# so any exception raised along the way is reduced to a single printed message.
try:
    result = smart_scraper.run()
    
    # Prepare output file path: "cleaned_" + input base name, with ".html"
    # replaced by ".json", inside the output folder.
    output_file_name = "cleaned_" + os.path.basename(html_file_path).replace(".html", ".json")
    output_file_path = os.path.join(output_folder_path, output_file_name)
    
    # Save results (ensure_ascii=False keeps non-ASCII text unescaped)
    print("Extraction completed. Saving results...")
    with open(output_file_path, "w", encoding='utf-8') as f:
        json.dump(result, f, indent=4, ensure_ascii=False)
    
    print(f"Results saved to: {output_file_path}")

except Exception as e:
    # Only the exception message is printed; the traceback is discarded.
    print(f"An error occurred: {str(e)}")

The script above keeps throwing an error about a missing "content" variable:

--- Executing Fetch Node ---
--- (Fetching HTML from: "<div\n  role=\"presentation\"\n  class=\"c-message_kit__background p-message_pane_message__message c-message_kit__message p-message_pane_message__message--last\"\n  data-qa=\"message_container\"\n  data-qa-unprocessed=\"false\"\n  data-qa-placeholder=\"false\"\n>\n  <div\n    role=\"document\"\n    aria-roledescription=\"message\"\n    class=\"c-message_kit__hover\"\n    data-qa-hover=\"true\"\n  >\n    <div\n      class=\"c-message_kit__actions c-message_kit__actions--default\"\n      style=\"position: relative\"\n    >\n      <div class=\"c-message_kit__gutter\">\n        <div\n          role=\"presentation\"\n          class=\"c-message_kit__gutter__left\"\n          data-stringify-ignore=\"true\"\n        >\n          <span class=\"p-member_profile_hover_card\" role=\"presentation\"\n            ><button\n              class=\"c-button-unstyled c-message_kit__avatar c-avatar c-avatar--interactive\"\n              aria-hidden=\"true\"\n              aria-label=\"View Nobu’s Profile\"\n              tabindex=\"-1\"\n              type=\"button\"\n              style=\"height: 36px; width: 36px\"\n            >\n              <span\n                class=\"c-base_icon__width_only_container\"\n                style=\"height: 36px; width: 36px\"\n                ><img\n                  src=\"https://ca.slack-edge.com/TTMCKNRGW-U0807D9TGNM-91378e27e35d-48\"\n                  srcset=\"\n                    https://ca.slack-edge.com/TTMCKNRGW-U0807D9TGNM-91378e27e35d-72 2x\n                  \"\n                  class=\"c-base_icon c-base_icon--image\"\n                  aria-hidden=\"true\"\n                  role=\"img\"\n                  alt=\"\"\n                  style=\"width: 36px\"\n              /></span></button\n          ></span>\n        </div>\n        <div\n          role=\"presentation\"\n          class=\"c-message_kit__gutter__right\"\n          data-qa=\"message_content\"\n        >\n          <span\n            
class=\"c-message__sender c-message_kit__sender\"\n            data-qa=\"message_sender\"\n            data-stringify-type=\"replace\"\n            data-stringify-text=\"Nobu\"\n            ><span class=\"p-member_profile_hover_card\" role=\"presentation\"\n              ><button\n                data-message-sender=\"U0807D9TGNM\"\n                data-qa=\"message_sender_name\"\n                class=\"c-link--button c-message__sender_button\"\n                type=\"button\"\n                tabindex=\"0\"\n              >\n                Nobu\n              </button></span\n            ><span\n              id=\"primary-C07UVPLJW4E-1740101333.214529-sender\"\n              class=\"offscreen\"\n              aria-hidden=\"true\"\n              data-qa=\"aria-labelledby-primary-C07UVPLJW4E-1740101333.214529-sender\"\n              >Nobu</span\n            ></span\n          >&nbsp;&nbsp;<a\n            aria-label=\"Today at 10:28:53 AM\"\n            data-stringify-type=\"replace\"\n            data-stringify-text=\"[10:28 AM]\"\n            data-stringify-requires-siblings=\"true\"\n            data-ts=\"1740101333.214529\"\n            delay=\"300\"\n            data-sk=\"tooltip_parent\"\n            class=\"c-link c-timestamp\"\n            href=\"https://c-2-c-group.slack.com/archives/C07UVPLJW4E/p1740101333214529\"\n            ><span class=\"c-timestamp__label\" data-qa=\"timestamp_label\"\n              >10:28 AM</span\n            ></a\n          ><br />\n          <div class=\"c-message_kit__blocks c-message_kit__blocks--rich_text\">\n            <div\n              class=\"c-message__message_blocks c-message__message_blocks--rich_text\"\n              data-qa=\"message-text\"\n            >\n              <div class=\"p-block_kit_renderer\" data-qa=\"block-kit-renderer\">\n                <div\n                  class=\"p-block_kit_renderer__block_wrapper p-block_kit_renderer__block_wrapper--first\"\n                >\n                  <div 
class=\"p-rich_text_block\" dir=\"auto\">\n                    <div class=\"p-rich_text_section\">\n                      件名:\n                      Bitbucketアクセス制限についての正式な説明を求めます<span\n                        aria-label=\"\"\n                        class=\"c-mrkdwn__br\"\n                        data-stringify-type=\"paragraph-break\"\n                      ></span\n                      >CFO羽嶋様<br />お世話になっております。Nobuです。<span\n                        aria-label=\"\"\n                        class=\"c-mrkdwn__br\"\n                        data-stringify-type=\"paragraph-break\"\n                      ></span\n                   ****filler********                data-qa=\"file_image_thumbnail_img\"\n                          /></div\n                      ></a>\n                    </div>\n                  </div>\n                </div>\n              </div>\n            </div>\n            <div class=\"resize-triggers\">\n              <div class=\"expand-trigger\">\n                <div style=\"width: 494px; height: 229px\"></div>\n              </div>\n              <div class=\"contract-trigger\"></div>\n            </div>\n          </div>\n        </div>\n      </div>\n    </div>\n  </div>\n</div>\n") ---
--- Executing ParseNode Node ---
--- Executing GenerateAnswer Node ---
Error during chain execution: 'Input to PromptTemplate is missing variables {\'"content"\'}.  Expected: [\'"content"\', \'question\'] Received: [\'question\']\nNote: if you intended {"content"} to be part of the string and not a variable, please escape it with double curly braces like: \'{{"content"}}\'.\nFor troubleshooting, visit: https://python.langchain.com/docs/troubleshooting/errors/INVALID_PROMPT_INPUT '
An error occurred: 'Input to PromptTemplate is missing variables {\'"content"\'}.  Expected: [\'"content"\', \'question\'] Received: [\'question\']\nNote: if you intended {"content"} to be part of the string and not a variable, please escape it with double curly braces like: \'{{"content"}}\'.\nFor troubleshooting, visit: https://python.langchain.com/docs/troubleshooting/errors/INVALID_PROMPT_INPUT '
Copy link

dosubot bot commented Feb 21, 2025

To resolve the error related to the missing "content" variable when using SmartScraperGraph, you should pass the local HTML content directly to the source parameter without using JSON escaping. Here's how you can adjust your code:

# Reads a locally saved Slack HTML page, asks SmartScraperGraph to extract the
# messages from it, and stores the result as JSON under cleaned_slack_html/.
import json  # needed for json.dump when saving the result
import os
from scrapegraphai.graphs import SmartScraperGraph

# The API key is taken from the environment; os.getenv returns None if unset.
openai_api_key = os.getenv("OPENAI_API_KEY")

# Configuration for the scraping pipeline
graph_config = {
    "llm": {
        "api_key": openai_api_key,
        "model": "openai/gpt-4-0125-preview",
    },
    "verbose": True,
    "headless": True,
}

# Path to your local HTML file
html_file_path = "slack_html/0221-onboarding-bitbucket.html"

# Read the file locally; the raw markup is passed on without any escaping.
with open(html_file_path, 'r', encoding='utf-8') as file:
    html_content = file.read()

# Initialize the SmartScraperGraph with a plain-string prompt and the raw HTML
# as the source.
smart_scraper = SmartScraperGraph(
    prompt="Extract all messages with their timestamps and usernames from the Slack conversation.",
    source=html_content,
    config=graph_config
)

# Create output directory if it doesn't exist
output_folder_path = "cleaned_slack_html"
os.makedirs(output_folder_path, exist_ok=True)

# Execute the scraping pipeline
# Running the graph and writing the file share one try block, so any failure
# in either step is reported as a single printed message.
try:
    result = smart_scraper.run()

    # Prepare output file path: "cleaned_" + input base name, with ".html"
    # replaced by ".json", inside the output folder.
    output_file_name = "cleaned_" + os.path.basename(html_file_path).replace(".html", ".json")
    output_file_path = os.path.join(output_folder_path, output_file_name)

    # Save results (ensure_ascii=False keeps non-ASCII text unescaped)
    print("Extraction completed. Saving results...")
    with open(output_file_path, "w", encoding='utf-8') as f:
        json.dump(result, f, indent=4, ensure_ascii=False)

    print(f"Results saved to: {output_file_path}")

except Exception as e:
    # Only the exception message is printed; the traceback is discarded.
    print(f"An error occurred: {str(e)}")

This approach ensures that the HTML content is passed correctly to the source parameter, and the prompt is specified directly as a string, which should resolve the error you're encountering [1].

To continue talking to Dosu, mention @dosu.


Help Dosu learn! Give it feedback: Great Response | Irrelevant Answer | Incorrect Sources | Too Verbose | Hallucination | Bug Report | Other

@dosubot dosubot bot added the bug Something isn't working label Feb 21, 2025
@nyck33
Copy link
Author

nyck33 commented Feb 21, 2025

@dosu

The code suggested by dosubot throws the same error:

Output is truncated. View as a [scrollable element](command:cellOutput.enableScrolling?36429495-b4aa-4b97-adb3-2a5416de6d76) or open in a [text editor](command:workbench.action.openLargeOutput?36429495-b4aa-4b97-adb3-2a5416de6d76). Adjust cell output [settings](command:workbench.action.openSettings?%5B%22%40tag%3AnotebookOutputLayout%22%5D)...
Error during chain execution: 'Input to PromptTemplate is missing variables {\'"content"\'}.  Expected: [\'"content"\', \'question\'] Received: [\'question\']\nNote: if you intended {"content"} to be part of the string and not a variable, please escape it with double curly braces like: \'{{"content"}}\'.\nFor troubleshooting, visit: https://python.langchain.com/docs/troubleshooting/errors/INVALID_PROMPT_INPUT '
An error occurred: 'Input to PromptTemplate is missing variables {\'"content"\'}.  Expected: [\'"content"\', \'question\'] Received: [\'question\']\nNote: if you intended {"content"} to be part of the string and not a variable, please escape it with double curly braces like: \'{{"content"}}\'.\nFor troubleshooting, visit: https://python.langchain.com/docs/troubleshooting/errors/INVALID_PROMPT_INPUT '

So does my attempt to use DocumentScraperGraph:

# Second reproduction script from the issue report: same local Slack HTML file,
# but processed with DocumentScraperGraph instead of SmartScraperGraph.
import json
import os
from dotenv import load_dotenv
from scrapegraphai.graphs import DocumentScraperGraph

# Called before the key lookup below — presumably loads variables from a local
# .env file into the environment (python-dotenv); confirm for your setup.
load_dotenv()

# os.getenv returns None if OPENAI_API_KEY is not set.
openai_api_key = os.getenv("OPENAI_API_KEY")

# Configuration for the scraping pipeline
graph_config = {
    "llm": {
        "api_key": openai_api_key,
        "model": "openai/gpt-4o-2024-08-06",
    },
    "verbose": True,
    "headless": True,
}

# Path to your local HTML file
html_file_path = "slack_html/0221-onboarding-bitbucket.html"

# Read the HTML content (raw text, no escaping applied)
with open(html_file_path, 'r', encoding='utf-8') as file:
    html_content = file.read()

# Initialize the DocumentScraperGraph with a plain-string prompt and the raw
# HTML text as the source.
document_scraper = DocumentScraperGraph(
    prompt="Extract all messages with their timestamps and usernames from the Slack conversation",
    source=html_content,
    config=graph_config
)

# Create output directory if it doesn't exist
output_folder_path = "cleaned_slack_html"
os.makedirs(output_folder_path, exist_ok=True)

# Execute the scraping pipeline
# Running the graph, printing, and writing the file share one try block, so any
# exception raised along the way is reduced to a single printed message.
try:
    result = document_scraper.run()
    
    # Prepare output file path: "cleaned_" + input base name, with ".html"
    # replaced by ".json", inside the output folder.
    output_file_name = "cleaned_" + os.path.basename(html_file_path).replace(".html", ".json")
    output_file_path = os.path.join(output_folder_path, output_file_name)
    
    # Save results
    print("Extraction completed. Saving results...")
    # Echo the result to stdout. ensure_ascii is left at its default here, so
    # non-ASCII characters are printed as \u escapes, unlike the file dump below.
    print(json.dumps(result, indent=4))
    
    with open(output_file_path, "w", encoding='utf-8') as f:
        json.dump(result, f, indent=4, ensure_ascii=False)
    
    print(f"Results saved to: {output_file_path}")

except Exception as e:
    # Only the exception message is printed; the traceback is discarded.
    print(f"An error occurred: {str(e)}")

@VinciGit00
Copy link
Collaborator

Can you send me the errors?

Copy link

dosubot bot commented Mar 1, 2025

To resolve the 'Input to PromptTemplate is missing variables {"content"}' error, ensure that you are using a compatible version of the scrapegraphai package. Update to version 1.40.1 or later, as this version addresses the issue with missing variables in the prompt template. You can update the package using the following command:

pip install scrapegraphai==1.40.1

If you are using version 1.39.0 and still encounter this error, consider downgrading to version 1.37.0:

pip install scrapegraphai==1.37.0

Additionally, ensure that the source parameter is correctly set to the path of your local HTML file, and not the HTML content itself. The source should be a file path or URL, depending on the context of use [1][2].

To continue talking to Dosu, mention @dosu.


Help Dosu learn! Give it feedback: Great Response | Irrelevant Answer | Incorrect Sources | Too Verbose | Hallucination | Bug Report | Other

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
bug Something isn't working
Projects
None yet
Development

No branches or pull requests

2 participants