Project-2/scrape1.py at main · ThieuLam1202/Project-2 · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
from bs4 import BeautifulSoup
from unidecode import unidecode
import numpy as np
import re
import requests
import csv

url = "https://vieclam24h.vn/tim-kiem-viec-lam-nhanh?page="

def get_last_page():
    """Return the number of the last result page.

    Fetches the first result page, reads the total job count shown there and
    divides it by 30 (the number of jobs listed on one page), rounding up.

    Raises:
        requests.RequestException: if the request fails, times out or the
            server answers with an HTTP error status.
    """
    # A timeout keeps the script from hanging forever on a stalled connection.
    response = requests.get(url + str(1), timeout=30)
    # Fail with a clear HTTP error instead of an obscure parsing error later.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html5lib')
    divs = soup("div", "flex items-center")
    count = divs[0].find("span", "font-semibold").text
    # Keep only the digits of the displayed count (drops separators and labels).
    count = re.sub(r"\D", "", count)
    # Ceiling division on integers; avoids the float round-trip.
    return -(-int(count) // 30)

# Seconds to wait for each page request before giving up.
REQUEST_TIMEOUT = 30

# Class attribute of the <div> wrapping one job card.
_JOB_CARD_CLASS = "relative lg:my-auto pl-2 lg:pl-2 leading-6 mb-4 flex-grow overflow-hidden mt-[14px]"
# Class attributes of the elements holding each field inside a job card.
_NAME_CLASS = "text-se-neutral-100-n font-medium text-[18px] leading-6 tracking-tighter line-clamp-2 lg:line-clamp-none lg:truncate lg:block"
# Jobs labeled "hot" render their name with a different class.
_HOT_NAME_CLASS = "text-red-bright font-bold text-[18px] leading-6 tracking-tighter line-clamp-2 lg:line-clamp-none lg:truncate lg:block"
_COMPANY_CLASS = "block text-grey-48 text-[16px] leading-6 truncate pr-2 max-w-[240px] lg:max-w-full"
_SALARY_CLASS = "text-se-neutral-80 text-14 whitespace-nowrap font-medium"
_LOCATION_CLASS = "text-se-neutral-80 text-14 whitespace-nowrap truncate"

# A salary figure: digits with an optional "," or "." fractional part.
_SALARY_NUMBER = re.compile(r"\d+(?:[.,]\d+)?")


def _text_of(node, tag, css_class):
    """Return the stripped text of the first matching descendant, or ""."""
    found = node.find(tag, css_class)
    return found.text.strip() if found is not None else ""


def _parse_number(token):
    """Convert a salary token to int, or to float when it has a fraction."""
    # NOTE(review): "," and "." are treated as decimal separators - confirm
    # against the values the site actually shows.
    if "," in token or "." in token:
        return float(token.replace(",", "."))
    return int(token)


def _average_salary(salary):
    """Reduce a salary text to one number, or "" when it cannot be parsed.

    Only texts containing the currency word "triệu" are converted. A range
    ("10 - 15 triệu") becomes the mean of its two bounds; otherwise the first
    number found is returned.
    """
    if "triệu" not in salary:
        return ""
    values = [_parse_number(token) for token in _SALARY_NUMBER.findall(salary)]
    if not values:
        return ""
    if "-" in salary and len(values) >= 2:
        return (values[0] + values[1]) / 2
    return values[0]


def work_info(div):
    """Extract one CSV row (as a dict) from a job card element.

    Missing fields are returned as empty strings. Text fields are
    transliterated to ASCII with unidecode.
    """
    # A regular job name is tried first, then the "hot" variant.
    name = _text_of(div, "p", _NAME_CLASS) or _text_of(div, "p", _HOT_NAME_CLASS)
    company = _text_of(div, "p", _COMPANY_CLASS)
    salary = _text_of(div, "span", _SALARY_CLASS)
    location = _text_of(div, "span", _LOCATION_CLASS)
    return {
        'name': unidecode(name),
        'company': unidecode(company),
        'location': unidecode(location),
        'average_salary': _average_salary(salary),
    }


# Create the dataset, stored in a csv file named 'dataset1.csv'.
# newline='' is required by the csv module so no blank rows appear on Windows;
# the context manager closes the file, so no explicit close() is needed.
with open('dataset1.csv', 'w', newline='', encoding='utf-8') as csvfile:
    fieldnames = ['name', 'company', 'location', 'average_salary']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()

    # Scrape every page; +1 because range() excludes its upper bound and the
    # last page must be included.
    for x in range(1, get_last_page() + 1):
        html = requests.get(url + str(x), timeout=REQUEST_TIMEOUT).text
        soup = BeautifulSoup(html, 'html5lib')
        divs = soup("div", _JOB_CARD_CLASS)
        writer.writerows(work_info(div) for div in divs)