app_web_scraping/streamlit_app.py at main · hoangvantuan123/app_web_scraping · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
import requests
from bs4 import BeautifulSoup
import streamlit as st
import pandas as pd
from collections import OrderedDict
import io
# Fetch data from a web page


def get_data(url, tags_attributes, timeout=10):
    """Scrape the text of selected HTML elements from a web page.

    Args:
        url: Address of the page to download.
        tags_attributes: Mapping of HTML tag name to a dict of attribute
            filters; each pair is passed to ``soup.find_all(tag, attributes)``.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely.

    Returns:
        A list of row tuples. Every tag that matched at least one element
        contributes one column, in the iteration order of
        ``tags_attributes``. Rows are built with ``zip``, so the result is
        truncated to the length of the shortest column. An empty list is
        returned (after reporting through ``st.error``) when the request
        fails or the response status is not 200.
    """
    # Keep the try body limited to the network call so that only genuine
    # request failures are reported as connection problems.
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        st.error("Không thể kết nối đến đường dẫn URL. Vui lòng kiểm tra lại.")
        return []

    if response.status_code != 200:
        st.error(
            f"Lỗi {response.status_code}: Không thể kết nối đến đường dẫn URL. Vui lòng kiểm tra lại.")
        return []

    soup = BeautifulSoup(response.content, 'html.parser')

    # One column of stripped text per tag; a key is only created once the
    # tag has produced a match, so unmatched tags add no column.
    columns = OrderedDict()
    for tag, attributes in tags_attributes.items():
        for item in soup.find_all(tag, attributes):
            columns.setdefault(str(tag), []).append(item.text.strip())

    # Transpose the columns into rows.
    return list(zip(*columns.values()))


# Render the sidebar title, then a highlighted notice in the main area
st.sidebar.title('Thu thập dữ liệu từ trang web')

# HTML for the framed notice: a legal/policy reminder about scraping plus an
# example URL. Rendered as raw HTML, hence unsafe_allow_html below.
notice_html = """
    <div style='background-color:#F0F2F6; padding: 10px ; border-radius: 10px'>
        <p >Lưu ý: Việc trích xuất dữ liệu từ các trang web có thể vi phạm các quy định bảo vệ dữ liệu hoặc các quy định của trang web đó. Do đó, trước khi sử dụng kĩ thuật screen scraping, cần phải đảm bảo rằng việc này không vi phạm pháp luật hoặc chính sách của trang web được truy xuất.</p>
        <p>VD url: Newegg : https://www.newegg.com</p>
    </div>
    <br>
    """
st.markdown(notice_html, unsafe_allow_html=True)

# Collapsible box showing a sample scraping script
with st.expander("Xem đoạn mã code mẫu"):
    # Raw string: the sample contains "\n" escapes that must be displayed
    # literally. In a normal string they turn into real line breaks, which
    # splits the sample's string literals across lines and shows broken code.
    code = r'''
    import bs4
    from urllib.request import urlopen as uReq
    from bs4 import BeautifulSoup as soup

    # Đường dẫn của trang web bạn muốn lấy dữ liệu
    my_url = 'https://www.example.com'

    # Lấy HTML của trang web
    uClient = uReq(my_url)
    page_html = uClient.read()
    uClient.close()

    # Parse HTML bằng BeautifulSoup
    page_soup = soup(page_html, 'html.parser')
    page_soup__h1 = page_soup.h1
    page_soup__p = page_soup.p
    containers = page_soup.findAll("div", {"class": "item-container"})

    product_length = len(containers)

    # Lu file duoi dang csv
    filename = "products.csv"
    f = open(filename, "w")

    headers = "Product_name,Price\n"

    f.write(headers)

    # print(product_length)
    for container in containers:
        title_a = container.find("a", {"class": "item-title"})
        product_name = title_a.text if title_a else "Khong co ten san pham"
        #print("Product: " + product_name)

        price_current = container.find("li", {"class": "price-current"})
        price = price_current.text
        #print("Price: " + price)

        f.write(f"{product_name},{price}\n")

    f.close()
    '''
    st.code(code, language='python')

url = st.sidebar.text_input(
    'Nhập đường dẫn URL', value='https://www.example.com',  key='url')

# Build the mapping {tag name: {attribute: value}} from the sidebar inputs.
# A new pair of inputs is rendered for every tag already filled in; the loop
# ends at the first empty tag field.
tag_count = 0
tags_attributes = {}
while True:
    tag_count += 1
    # Strip so that whitespace-only input ends the loop and padded names
    # such as " div" still match real tags.
    tag = st.sidebar.text_input(
        "Nhập tên thẻ HTML (để trống để dừng)", key=f"tag_{tag_count}").strip()
    if not tag:
        break
    attributes = st.sidebar.text_input(
        f"Nhập thuộc tính của thẻ {tag} (để trống nếu không có)", key=f"attributes_{tag_count}")
    attributes_dict = {}
    if attributes:
        # Expected format: "key: value, key2: value2"
        for part in attributes.split(","):
            part = part.strip()
            # Split on the FIRST colon only, so values that themselves
            # contain ":" (e.g. "href: https://example.com") stay intact.
            key, separator, value = part.partition(":")
            if not separator:
                st.warning(f"Thuộc tính không đúng định dạng: {part}")
                continue
            attributes_dict[key.strip()] = value.strip()
    tags_attributes[tag] = attributes_dict


# Only scrape once both a URL and at least one tag have been entered
if url and tags_attributes:

    def convert_df(df):
        """Return *df* serialized as CSV bytes for the download button.

        The CSV text is encoded with "utf-8-sig", which prepends a UTF-8
        byte-order mark to the output.
        """
        # With no target given, to_csv returns the CSV text directly.
        return df.to_csv(index=False).encode("utf-8-sig")

    # Fetch the data from the web page
    data = get_data(url, tags_attributes)
    st.code('url: ' + url)
    # Echo the parsed tag/attribute mapping so the user can verify it
    st.subheader("THUỘC TÍNH THẺ HTML (JSON)")
    st.write(tags_attributes)
    # Show the raw scraped rows
    st.subheader("DỮ LIỆU THU THẬP ĐƯỢC (JSON)")
    st.write(data)
    if data:
        # Show the rows as a table with generated column names
        st.subheader('DỮ LIỆU THU THẬP ĐƯỢC (CSV)')
        df = pd.DataFrame(
            data, columns=[f'Item_{i}' for i in range(len(data[0]))])
        st.write(df)

        # Offer the download only when there is something to download, and
        # export the same frame that is displayed so the CSV header matches
        # the table's column names.
        st.download_button(
            label="Download Data",
            data=convert_df(df),
            file_name='data_name.csv',
            mime='text/csv'
        )
    else:
        st.warning('Không có dữ liệu để hiển thị')
else:
    st.warning(
        'Vui lòng nhập đường dẫn URL và các thẻ HTML cùng với các thuộc tính tương ứng để thu thập dữ liệu.')