Skip to content

can I scrape a local html file? #929

Closed as not planned
Closed as not planned
@nyck33

Description

@nyck33
import json
import os
from scrapegraphai.graphs import SmartScraperGraph

openai_api_key = os.getenv("OPENAI_API_KEY")

# Configuration for the scraping pipeline
graph_config = {
    "llm": {
        "api_key": openai_api_key,
        "model": "openai/gpt-4-0125-preview",
    },
    "verbose": True,
    "headless": True,
}

# Path to your local HTML file
html_file_path = "slack_html/0221-onboarding-bitbucket.html"

# Read and properly escape HTML content
with open(html_file_path, 'r', encoding='utf-8') as file:
    html_content = json.dumps(file.read(), ensure_ascii=False)

# Create input dictionary with EXACT variable names from error message
inputs = {
    '"content"': html_content,  # Double-quoted key with JSON-escaped content
    'question': "Extract all messages with their timestamps and usernames from the Slack conversation."
}

# Initialize the SmartScraperGraph
smart_scraper = SmartScraperGraph(
    prompt=inputs,
    source=html_content,
    config=graph_config  # Explicitly pass variables dictionary
)

# Create output directory if it doesn't exist
output_folder_path = "cleaned_slack_html"
os.makedirs(output_folder_path, exist_ok=True)

# Execute the scraping pipeline
try:
    result = smart_scraper.run()
    
    # Prepare output file path
    output_file_name = "cleaned_" + os.path.basename(html_file_path).replace(".html", ".json")
    output_file_path = os.path.join(output_folder_path, output_file_name)
    
    # Save results
    print("Extraction completed. Saving results...")
    with open(output_file_path, "w", encoding='utf-8') as f:
        json.dump(result, f, indent=4, ensure_ascii=False)
    
    print(f"Results saved to: {output_file_path}")

except Exception as e:
    print(f"An error occurred: {str(e)}")

keeps throwing something about content

--- Executing Fetch Node ---
--- (Fetching HTML from: "<div\n  role=\"presentation\"\n  class=\"c-message_kit__background p-message_pane_message__message c-message_kit__message p-message_pane_message__message--last\"\n  data-qa=\"message_container\"\n  data-qa-unprocessed=\"false\"\n  data-qa-placeholder=\"false\"\n>\n  <div\n    role=\"document\"\n    aria-roledescription=\"message\"\n    class=\"c-message_kit__hover\"\n    data-qa-hover=\"true\"\n  >\n    <div\n      class=\"c-message_kit__actions c-message_kit__actions--default\"\n      style=\"position: relative\"\n    >\n      <div class=\"c-message_kit__gutter\">\n        <div\n          role=\"presentation\"\n          class=\"c-message_kit__gutter__left\"\n          data-stringify-ignore=\"true\"\n        >\n          <span class=\"p-member_profile_hover_card\" role=\"presentation\"\n            ><button\n              class=\"c-button-unstyled c-message_kit__avatar c-avatar c-avatar--interactive\"\n              aria-hidden=\"true\"\n              aria-label=\"View Nobu’s Profile\"\n              tabindex=\"-1\"\n              type=\"button\"\n              style=\"height: 36px; width: 36px\"\n            >\n              <span\n                class=\"c-base_icon__width_only_container\"\n                style=\"height: 36px; width: 36px\"\n                ><img\n                  src=\"https://ca.slack-edge.com/TTMCKNRGW-U0807D9TGNM-91378e27e35d-48\"\n                  srcset=\"\n                    https://ca.slack-edge.com/TTMCKNRGW-U0807D9TGNM-91378e27e35d-72 2x\n                  \"\n                  class=\"c-base_icon c-base_icon--image\"\n                  aria-hidden=\"true\"\n                  role=\"img\"\n                  alt=\"\"\n                  style=\"width: 36px\"\n              /></span></button\n          ></span>\n        </div>\n        <div\n          role=\"presentation\"\n          class=\"c-message_kit__gutter__right\"\n          data-qa=\"message_content\"\n        >\n          <span\n            class=\"c-message__sender c-message_kit__sender\"\n            data-qa=\"message_sender\"\n            data-stringify-type=\"replace\"\n            data-stringify-text=\"Nobu\"\n            ><span class=\"p-member_profile_hover_card\" role=\"presentation\"\n              ><button\n                data-message-sender=\"U0807D9TGNM\"\n                data-qa=\"message_sender_name\"\n                class=\"c-link--button c-message__sender_button\"\n                type=\"button\"\n                tabindex=\"0\"\n              >\n                Nobu\n              </button></span\n            ><span\n              id=\"primary-C07UVPLJW4E-1740101333.214529-sender\"\n              class=\"offscreen\"\n              aria-hidden=\"true\"\n              data-qa=\"aria-labelledby-primary-C07UVPLJW4E-1740101333.214529-sender\"\n              >Nobu</span\n            ></span\n          >&nbsp;&nbsp;<a\n            aria-label=\"Today at 10:28:53 AM\"\n            data-stringify-type=\"replace\"\n            data-stringify-text=\"[10:28 AM]\"\n            data-stringify-requires-siblings=\"true\"\n            data-ts=\"1740101333.214529\"\n            delay=\"300\"\n            data-sk=\"tooltip_parent\"\n            class=\"c-link c-timestamp\"\n            href=\"https://c-2-c-group.slack.com/archives/C07UVPLJW4E/p1740101333214529\"\n            ><span class=\"c-timestamp__label\" data-qa=\"timestamp_label\"\n              >10:28 AM</span\n            ></a\n          ><br />\n          <div class=\"c-message_kit__blocks c-message_kit__blocks--rich_text\">\n            <div\n              class=\"c-message__message_blocks c-message__message_blocks--rich_text\"\n              data-qa=\"message-text\"\n            >\n              <div class=\"p-block_kit_renderer\" data-qa=\"block-kit-renderer\">\n                <div\n                  class=\"p-block_kit_renderer__block_wrapper p-block_kit_renderer__block_wrapper--first\"\n                >\n                  <div class=\"p-rich_text_block\" dir=\"auto\">\n                    <div class=\"p-rich_text_section\">\n                      件名:\n                      Bitbucketアクセス制限についての正式な説明を求めます<span\n                        aria-label=\"\"\n                        class=\"c-mrkdwn__br\"\n                        data-stringify-type=\"paragraph-break\"\n                      ></span\n                      >CFO羽嶋様<br />お世話になっております。Nobuです。<span\n                        aria-label=\"\"\n                        class=\"c-mrkdwn__br\"\n                        data-stringify-type=\"paragraph-break\"\n                      ></span\n                   ****filler********                data-qa=\"file_image_thumbnail_img\"\n                          /></div\n                      ></a>\n                    </div>\n                  </div>\n                </div>\n              </div>\n            </div>\n            <div class=\"resize-triggers\">\n              <div class=\"expand-trigger\">\n                <div style=\"width: 494px; height: 229px\"></div>\n              </div>\n              <div class=\"contract-trigger\"></div>\n            </div>\n          </div>\n        </div>\n      </div>\n    </div>\n  </div>\n</div>\n") ---
--- Executing ParseNode Node ---
--- Executing GenerateAnswer Node ---
Error during chain execution: 'Input to PromptTemplate is missing variables {\'"content"\'}.  Expected: [\'"content"\', \'question\'] Received: [\'question\']\nNote: if you intended {"content"} to be part of the string and not a variable, please escape it with double curly braces like: \'{{"content"}}\'.\nFor troubleshooting, visit: https://python.langchain.com/docs/troubleshooting/errors/INVALID_PROMPT_INPUT '
An error occurred: 'Input to PromptTemplate is missing variables {\'"content"\'}.  Expected: [\'"content"\', \'question\'] Received: [\'question\']\nNote: if you intended {"content"} to be part of the string and not a variable, please escape it with double curly braces like: \'{{"content"}}\'.\nFor troubleshooting, visit: https://python.langchain.com/docs/troubleshooting/errors/INVALID_PROMPT_INPUT '

Metadata

Metadata

Assignees

No one assigned

    Labels

    bugSomething isn't working

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions