-
Notifications
You must be signed in to change notification settings - Fork 167
Expand file tree
/
Copy pathCompany.py
More file actions
105 lines (80 loc) · 3.64 KB
/
Company.py
File metadata and controls
105 lines (80 loc) · 3.64 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
import logging
import re
from bs4 import BeautifulSoup
from .ResultsObject import ResultsObject
from .utils import AnyEC, all_or_default, get_info, one_or_default
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
class Company(ResultsObject):
    """LinkedIn company page, built from the HTML of its scraped sections.

    Each constructor argument is the HTML source of one section of the page.
    It is parsed with BeautifulSoup's ``html.parser`` and the result is
    exposed through the property of the same name.
    """

    # Names of the result properties defined below.
    # NOTE(review): presumably read by ResultsObject to serialise the result;
    # ResultsObject is not visible from this file -- confirm.
    attributes = ['overview', 'jobs', 'life', 'insights']

    def __init__(self, overview, jobs, life, insights):
        """Parse the HTML of every page section into its own soup.

        Args:
            overview: HTML of the overview section.
            jobs: HTML of the jobs section.
            life: HTML of the life section.
            insights: HTML of the insights section.
        """
        self.overview_soup = BeautifulSoup(overview, 'html.parser')
        self.jobs_soup = BeautifulSoup(jobs, 'html.parser')
        self.life_soup = BeautifulSoup(life, 'html.parser')
        self.insights_soup = BeautifulSoup(insights, 'html.parser')

    @property
    def overview(self):
        """Return a dict describing the overview section of the page.

        Keys, in insertion order:
            description: stripped text of the first ``section > p`` of the
                overview container, or ``None`` when it cannot be found.
            name: looked up in the top banner via ``get_info``.
            company_size: looked up in the overview container via
                ``get_info``.
            <metadata term>: one entry per definition term of the container,
                keyed by the term text lower-cased with spaces replaced by
                underscores.
            num_employees: employee count as an ``int``, or ``None`` when no
                number is found in the banner link.
            image: ``src`` of the logo tag, or ``''`` when there is no logo
                or the tag has no ``src`` attribute.
        """
        # Banner containing company name + location.
        banner = one_or_default(
            self.overview_soup, 'section.org-top-card')
        # Main container with the company overview info.
        container = one_or_default(
            self.overview_soup, 'section.artdeco-card.p4.mb3')

        overview = {}

        # The container or its first paragraph may be absent; report a
        # missing description as None instead of raising AttributeError.
        description_tag = (
            container.select_one('section > p') if container else None)
        if description_tag is not None:
            overview['description'] = description_tag.get_text().strip()
        else:
            overview['description'] = None

        if container:
            # "Company size" is dropped from the terms because its value is
            # read separately below through its own selector.
            metadata_keys = [
                term
                for term in container.select(
                    '.org-page-details__definition-term')
                if "Company size" not in term.get_text()
            ]
            metadata_values = container.select(
                '.org-page-details__definition-text')
        else:
            metadata_keys = []
            metadata_values = []

        overview.update(
            get_info(banner, {'name': '.t-24.t-black.t-bold'}))
        overview.update(
            get_info(container, {
                'company_size':
                    '.org-about-company-module__company-size-definition-text',
            }))

        # Terms and values are paired positionally.
        for key, val in zip(metadata_keys, metadata_values):
            dict_key = key.get_text().strip().lower().replace(" ", "_")
            dict_val = val.get_text().strip()
            # Never let a metadata term overwrite the company size set above.
            if "company_size" not in dict_key:
                overview[dict_key] = dict_val

        # Locates the "See all ### employees on LinkedIn" link text.
        all_employees_links = all_or_default(
            banner, '.mt1 > div > a:nth-of-type(2) > span')
        if all_employees_links:
            all_employees_text = all_employees_links[-1].text
        else:
            all_employees_text = ''

        # First run of digits (optionally comma-grouped) in the link text.
        match = re.search(r'((\d+?,?)+)', all_employees_text)
        if match:
            overview['num_employees'] = int(match.group(1).replace(',', ''))
        else:
            overview['num_employees'] = None

        logo_image_tag = one_or_default(
            banner, '.org-top-card-primary-content__logo')
        # Tag.get avoids a KeyError when the logo tag carries no ``src``.
        overview['image'] = (
            logo_image_tag.get('src', '') if logo_image_tag else '')

        return overview

    @property
    def jobs(self):
        """Return ``None``; nothing is extracted from the jobs section."""
        return None

    @property
    def life(self):
        """Return ``None``; nothing is extracted from the life section."""
        return None

    @property
    def insights(self):
        """Return a dict of values read from the insights summary table.

        The keys are ``'6m change'``, ``'1y change'`` and ``'2y change'``.
        Each value is looked up via ``get_info`` in the third ``span`` of
        the 2nd, 3rd and 4th ``td`` of the table respectively.
        """
        # Summary table with the % change in headcount at 6m, 1y and 2y.
        table = one_or_default(
            self.insights_soup, '.org-insights-module__summary-table')

        insights = {}
        insights.update(get_info(table, {
            '6m change': 'td:nth-of-type(2) span:nth-of-type(3)',
            '1y change': 'td:nth-of-type(3) span:nth-of-type(3)',
            '2y change': 'td:nth-of-type(4) span:nth-of-type(3)'
        }))
        return insights