-
Notifications
You must be signed in to change notification settings - Fork 10
/
Copy pathlinkedin.py
44 lines (35 loc) · 1.78 KB
/
linkedin.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
# -*- coding: utf-8 -*-
import scrapy
from BrowserNavigator.cookieManager import CookieManager
class LinkedinSpider(scrapy.Spider):
    """Spider that extracts company details from LinkedIn company pages.

    The pages to crawl are supplied to the constructor; every request is sent
    with the cookies returned by ``CookieManager().get_cookies()``.
    """

    name = 'linkedin'
    allowed_domains = ['linkedin.com']
    # No static ``start_urls``: the target pages are passed to ``__init__``.

    # Compound class selectors. Class names must be chained with "." -- a
    # space is the CSS descendant combinator, which would turn every class
    # after the first into an (unmatchable) element type name.
    _OVERVIEW_CSS = (
        'p.break-words.white-space-pre-wrap.mb5.t-14.t-black--light.t-normal'
    )
    _WEBSITE_CSS = 'span.link-without-visited-state'
    _DETAIL_CSS = (
        'dd.org-page-details__definition-text.t-14.t-black--light.t-normal'
    )
    _SIZE_CSS = (
        'dd.org-about-company-module__company-size-definition-text'
        '.t-14.t-black--light.mb1.fl'
    )

    def __init__(self, urls, **kwargs):
        """Create the spider.

        Args:
            urls: Iterable of page URLs to request. A single string is
                treated as one URL rather than iterated character by
                character.
            **kwargs: Forwarded unchanged to ``scrapy.Spider.__init__``.
        """
        # Do not forward ``urls`` positionally: the first positional parameter
        # of ``scrapy.Spider.__init__`` is ``name``, so doing that would
        # replace the spider's name with the URL list.
        super().__init__(**kwargs)
        if isinstance(urls, str):
            urls = [urls]
        self.urls = urls
        # Kept for backward compatibility; holds the most recently parsed
        # response.
        self.response = None
        self.cookie = CookieManager().get_cookies()

    def start_requests(self):
        """Yield one request per configured URL, handled by ``parse``."""
        for url in self.urls:
            yield scrapy.Request(
                url=url,
                callback=self.parse,
                cookies=self.cookie,
            )

    def parse(self, response):
        """Extract the company fields from ``response``.

        Yields:
            A single dict ``{'Company': {...}}`` with the keys ``name``,
            ``overview``, ``website``, ``industry``, ``size``, ``type``,
            ``founded`` and ``specialities``. A field is ``None`` when its
            selector matches nothing.
        """
        # Still recorded on the instance for any external reader, but the
        # extraction below uses the local ``response`` so that concurrently
        # running callbacks cannot swap the page out from under us.
        self.response = response

        # NOTE(review): "ember58" looks like an auto-generated element id and
        # may not be stable between page loads -- confirm. The expression
        # selects the element itself, so the value is markup, not bare text.
        company_name = response.xpath(
            '//*[@id="ember58"]/h1/span'
        ).extract_first()

        # NOTE(review): industry, type and founded all use the same selector
        # and take its first match, so they always receive the same value.
        # Telling them apart needs knowledge of the page layout -- TODO.
        first_detail = response.css(self._DETAIL_CSS + '::text').get()

        company = {
            'name': company_name,
            'overview': response.css(self._OVERVIEW_CSS + '::text').get(),
            'website': response.css(self._WEBSITE_CSS + '::text').get(),
            'industry': first_detail,
            'size': response.css(self._SIZE_CSS + '::text').get(),
            'type': first_detail,
            'founded': first_detail,
            # No ``::text`` here (as in the original): this is the element's
            # markup, not only its text.
            'specialities': response.css(self._DETAIL_CSS).get(),
        }
        yield {'Company': company}