-
Notifications
You must be signed in to change notification settings - Fork 122
/
Copy pathtest_crawl4ai.py
245 lines (212 loc) · 8.95 KB
/
test_crawl4ai.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
import requests
import time
import sys
import os
import socket
import json
# API token used for the secured-endpoint tests; read from the environment,
# falling back to a demo key (presumably the DevDocs compose default - confirm).
API_TOKEN = os.environ.get("CRAWL4AI_API_TOKEN", "devdocs-demo-key")
# Determine if we're running inside a container
def is_running_in_container():
    """Return True when this process appears to be running inside a container.

    Detection reads /proc/1/cgroup and looks for the 'docker' or 'kubepods'
    markers (present on cgroup-v1 hosts). On non-Linux systems, or when the
    file is unreadable, this returns False.

    Returns:
        bool: True if a container marker is found, False otherwise.
    """
    try:
        with open('/proc/1/cgroup', 'r') as f:
            contents = f.read()  # read once; the original read the file twice
    except OSError:
        # Narrowed from a bare `except:` so Ctrl-C / SystemExit aren't swallowed.
        return False
    return 'docker' in contents or 'kubepods' in contents
# Set the appropriate host based on environment: inside a container the
# service is addressed by name (presumably the compose service name -
# confirm against docker-compose.yml), otherwise via localhost.
CRAWL4AI_HOST = "crawl4ai" if is_running_in_container() else "localhost"
# Base URL for all Crawl4AI API calls below; 11235 is the service port.
CRAWL4AI_URL = f"http://{CRAWL4AI_HOST}:11235"
print(f"Using Crawl4AI URL: {CRAWL4AI_URL}")
def test_health():
    """Probe the Crawl4AI /health endpoint and report the outcome.

    Returns:
        bool: True when the endpoint responded and returned parseable JSON
        (any status code), False when the request or JSON decode failed.
    """
    try:
        print("Testing health endpoint...")
        resp = requests.get(f"{CRAWL4AI_URL}/health", timeout=5)
        print(f"Health check status code: {resp.status_code}")
        print(f"Health check response: {resp.json()}")
    except Exception as exc:
        # Best-effort probe: report the failure instead of raising.
        print(f"Health check failed: {str(exc)}")
        return False
    return True
def _run_crawl_test(label, headers=None):
    """Submit a crawl request and poll the task until it completes.

    Shared implementation for test_unsecured() and test_secured(), which
    previously duplicated ~40 lines of identical submit-and-poll logic.

    Args:
        label: prefix used in the failure message ("Unsecured" / "Secured").
        headers: optional dict of HTTP headers (e.g. the Authorization
            header); None sends no extra headers, matching the old
            unsecured path.

    Returns:
        bool: True when the task completed and its result was saved,
        False on a non-200 submit, task failure, poll timeout, or any
        request/JSON error.
    """
    try:
        response = requests.post(
            f"{CRAWL4AI_URL}/crawl",
            headers=headers,
            json={
                "urls": "https://www.nbcnews.com/business",
                "priority": 10
            },
            timeout=10
        )
        print(f"Crawl request status code: {response.status_code}")
        print(f"Crawl request response: {response.json()}")

        if response.status_code != 200:
            print("Crawl request failed")
            return False

        task_id = response.json()["task_id"]
        print(f"Task ID: {task_id}")

        # Poll for result: up to 10 attempts, 2 seconds apart.
        print("\nPolling for task result...")
        for i in range(10):
            print(f"Poll attempt {i+1}/10")
            status_response = requests.get(
                f"{CRAWL4AI_URL}/task/{task_id}",
                headers=headers,
                timeout=5
            )
            status = status_response.json()
            print(f"Task status: {status['status']}")
            if status["status"] == "completed":
                print("Task completed!")
                # Save the result to a file
                save_result_to_file(task_id, status["result"])
                return True
            elif status["status"] == "failed":
                print(f"Task failed: {status.get('error', 'Unknown error')}")
                return False
            time.sleep(2)

        print("Task did not complete within timeout")
        return False
    except Exception as e:
        print(f"{label} test failed: {str(e)}")
        return False


def test_unsecured():
    """Test the Crawl4AI service without authentication"""
    print("\nTesting unsecured crawl...")
    return _run_crawl_test("Unsecured")


def test_secured():
    """Test the Crawl4AI service with authentication"""
    print("\nTesting secured crawl with token:", API_TOKEN)
    headers = {"Authorization": f"Bearer {API_TOKEN}"}
    return _run_crawl_test("Secured", headers)
def save_result_to_file(task_id, result):
    """Persist a crawl result under the 'crawl_results' directory.

    Writes the full result as crawl_results/<task_id>.json and, when
    markdown content is present, crawl_results/<task_id>.md, then prints a
    short summary (markdown preview, page title, link counts).

    Args:
        task_id: identifier used as the base filename.
        result: dict with the crawl result; may contain 'markdown',
            'title' and 'links' ({'internal': [...], 'external': [...]}).

    Returns:
        bool: True on success, False if anything went wrong (errors are
        printed, not raised - this is a best-effort test-script helper).
    """
    try:
        # Create a directory for results if it doesn't exist
        os.makedirs("crawl_results", exist_ok=True)

        # Explicit UTF-8 so crawled non-ASCII content doesn't raise
        # UnicodeEncodeError under a non-UTF-8 default locale.
        result_file = f"crawl_results/{task_id}.json"
        with open(result_file, 'w', encoding='utf-8') as f:
            json.dump(result, f, indent=2)
        print(f"Full result saved to {result_file}")

        # Extract and save the markdown content if available
        markdown = result.get("markdown")
        if markdown:
            markdown_file = f"crawl_results/{task_id}.md"
            with open(markdown_file, 'w', encoding='utf-8') as f:
                f.write(markdown)
            print(f"Markdown content saved to {markdown_file}")

            # Print a preview of the markdown content
            preview_length = min(500, len(markdown))
            print(f"\nMarkdown content preview (first {preview_length} characters):")
            print(markdown[:preview_length] + "...")
        else:
            print("No markdown content available in the result")

        # Print information about the result
        if "title" in result:
            print(f"Page title: {result['title']}")
        if "links" in result:
            internal_links = result["links"].get("internal", [])
            external_links = result["links"].get("external", [])
            print(f"Found {len(internal_links)} internal links and {len(external_links)} external links")
        return True
    except Exception as e:
        print(f"Error saving result to file: {str(e)}")
        return False
def test_ping():
    """Check basic TCP reachability of the Crawl4AI host.

    First resolves the hostname via DNS, then attempts a TCP connection to
    port 11235 with a 2-second timeout.

    Returns:
        bool: True only when the TCP connect succeeds; False on resolution
        failure, connection failure, or any unexpected error.
    """
    try:
        print(f"\nTesting ping to {CRAWL4AI_HOST}...")
        # Try to resolve the hostname
        try:
            ip = socket.gethostbyname(CRAWL4AI_HOST)
            print(f"Resolved {CRAWL4AI_HOST} to IP: {ip}")
        except socket.gaierror:
            print(f"Could not resolve hostname: {CRAWL4AI_HOST}")
            return False

        # Try to connect to the port. The context manager guarantees the
        # socket is closed even if settimeout/connect_ex raises (the
        # original leaked the fd on that path).
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
            s.settimeout(2)
            result = s.connect_ex((CRAWL4AI_HOST, 11235))

        if result == 0:
            print(f"Successfully connected to {CRAWL4AI_HOST}:11235")
            return True
        print(f"Could not connect to {CRAWL4AI_HOST}:11235, error code: {result}")
        return False
    except Exception as e:
        print(f"Ping test failed: {str(e)}")
        return False
if __name__ == "__main__":
    print("=== Crawl4AI Test Script ===")
    print(f"API Token: {API_TOKEN}")
    print(f"Running in container: {is_running_in_container()}")
    print(f"Using Crawl4AI host: {CRAWL4AI_HOST}")

    # Gate 1: without basic TCP reachability the API tests are pointless.
    ping_ok = test_ping()
    if not ping_ok:
        print("\n=== Ping test failed, cannot proceed with tests ===")
        sys.exit(1)

    print("\n=== Ping test passed, proceeding with API tests ===")

    # Gate 2: the service must report healthy before we submit crawls.
    health_ok = test_health()
    if not health_ok:
        print("\n=== Health check failed, cannot proceed with tests ===")
        sys.exit(1)

    print("\n=== Health check passed, proceeding with tests ===")
    unsecured_ok = test_unsecured()
    secured_ok = test_secured()

    print("\n=== Test Results ===")
    print(f"Ping test: {'PASS' if ping_ok else 'FAIL'}")
    print(f"Health check: {'PASS' if health_ok else 'FAIL'}")
    print(f"Unsecured test: {'PASS' if unsecured_ok else 'FAIL'}")
    print(f"Secured test: {'PASS' if secured_ok else 'FAIL'}")

    # Only point the user at saved output when the secured run produced it.
    if secured_ok:
        print("\n=== Results ===")
        print("The crawl results have been saved to the 'crawl_results' directory.")
        print("You can view the results in the following ways:")
        print("1. Open the JSON file for the full result")
        print("2. Open the Markdown file for the formatted content")
        print("3. Use the DevDocs UI to view the content in the browser")