diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..26ee672
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,142 @@
+# Created by .ignore support plugin (hsz.mobi)
+### Python template
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+# For a library or package, you might want to ignore these files since the code is
+# intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+.idea
+
diff --git a/README.md b/README.md
index 203c379..7a23b2d 100644
--- a/README.md
+++ b/README.md
@@ -22,13 +22,20 @@ this tool is for educational purposes only and violates LinkedIn.com's TOS. Use
 
 ## Screenshots
-![alt tag](https://s18.postimg.org/nh7dtdkux/Screen_Shot_2018-03-29_at_7.09.04_AM.png)
+![alt tag](screenshots/Screenshot_1.png)
 
-![alt tag](https://s4.postimg.org/vu9izninx/Screen_Shot_2017_03_15_at_11_45_11_PM.png)
+![alt tag](screenshots/Screenshot_2.png)
 
-![alt tag](https://s8.postimg.org/st0h8maxx/Screen_Shot_2017_03_20_at_11_04_00_AM.png)
+![alt tag](screenshots/Screenshot_3.png)
+
+![alt tag](screenshots/Screenshot_4.png)
 
 ## How to install
 `sudo apt-get install python-pip -y`
 
 `sudo pip install -r requirements.txt`
+
+## How to build
+```shell
+pyinstaller --onefile ScrapedIn.py
+```
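+
+## How to use
+An illustrative run with Python 3 (the keyword string and output name below are placeholders; both values are prompted for interactively when the flags are omitted):
+```shell
+python3 ScrapedIn.py -u "search keywords" -o results
+```
+`-u/--keywords` is the search string and `-o/--output` is the output file name without the `.xlsx` extension.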
diff --git a/ScrapedIn.py b/ScrapedIn.py
index 71920a8..9138f2f 100644
--- a/ScrapedIn.py
+++ b/ScrapedIn.py
@@ -1,63 +1,69 @@
 #!/usr/bin/python
 __title__ = "ScrapeIn - Tool to Scrape LinkedIn"
-__author__ = 'Danny Chrastil'
-__email__ = 'danny.chrastil@gmail.com'
+__author__ = 'Danny Chrastil, Sezer BOZKIR'
+__email__ = 'danny.chrastil@gmail.com, admin@sezerbozkir.com'
 __description__ = "A recon tool that allows you to scrape profile search results from LinkedIn"
 __disclaimer__ = "This tool violates TOS of LinkedIn.com. For educational purposes only. Use at your own risk"
-__version__ = '2.0'
+__version__ = '3.0'
 
 import sys
-import re
-import time
+
 import xlsxwriter
 import json
 import argparse
 import requests
-import subprocess
-import urllib
-import math
 import config
-
+from stdiomask import getpass
 from bs4 import BeautifulSoup
-from thready import threaded
-
-reload(sys)
-sys.setdefaultencoding('utf-8')
 
 """ Setup Argument Parameters """
 parser = argparse.ArgumentParser(description='Discovery LinkedIn')
 parser.add_argument('-u', '--keywords', help='Keywords to search')
 parser.add_argument('-o', '--output', help='Output file (do not include extentions)')
 args = parser.parse_args()
 
+title = """
+  __                                 _ _____
+/ _\ ___ _ __ __ _ _ __   ___  __| | \_ \_ __
+\ \ / __| '__/ _` | '_ \ / _ \/ _` | / /\/ '_ \
+_\ \ (__| | | (_| | |_) | __/ (_| /\/ /_ | | | |
+\__/\___|_| \__,_| .__/ \___|\__,_\____/ |_| |_|
+                 |_|
+tool to scrape linkedin v3.0
+"""
+
+
 def linkedIn(proxies=None):
     s = requests.Session()
     html = s.get("https://www.linkedin.com/", proxies=proxies)
     soup = BeautifulSoup(html.text, "html.parser")
-    csrf = soup.find(id="loginCsrfParam-login")['value']
+    csrf = soup.find('input', {'name': 'loginCsrfParam'})['value']
+    if not (config.linkedin['username'] and config.linkedin['password']):
+        username = input("Please enter your LinkedIn account e-mail or username: ")
+        password = getpass(prompt="Please enter your LinkedIn account password: ")
     login_data = {
-        'session_key': config.linkedin['username'],
-        'session_password': config.linkedin['password'],
+        'session_key': config.linkedin['username'] if config.linkedin['username'] else username,
+        'session_password': config.linkedin['password'] if config.linkedin['password'] else password,
         'loginCsrfParam': csrf,
     }
-
+    # login operation
    logged_in = s.post("https://www.linkedin.com/uas/login-submit",
-            data=login_data,
-            proxies=proxies)
-    soup = BeautifulSoup(logged_in.text, "html.parser")
+                       data=login_data,
+                       proxies=proxies)
+    # soup = BeautifulSoup(logged_in.text, "html.parser")
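+    # A successful login sets the 'li_at' cookie on this session; authenticate()
+    # below treats a missing 'li_at' value as a failed login.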
"https://www.linkedin.com/voyager/api/search/cluster?count=40&guides=List(v-%%3EPEOPLE,facetGeoRegion-%%3Ear%%3A0)&keywords=%s&origin=FACETED_SEARCH&q=guided&start=0" % search - url = "https://www.linkedin.com/voyager/api/search/cluster"#?count=40&guides=List(v-%%3EPEOPLE,facetGeoRegion-%%3Ear%%3A0)&keywords=%s&origin=FACETED_SEARCH&q=guided&start=0" % search - #url = 'https://www.linkedin.com/voyager/api/search/cluster?count=40&guides=List(v->PEOPLE,facetCurrentCompany->31752)&origin=GLOBAL_SEARCH_HEADER&q=guided&start=0' - #url = "https://www.linkedin.com/voyager/api/search/cluster?count=40&guides=List(v->PEOPLE,facetCurrentCompany->31752)&origin=OTHER&q=guided&start=0" - #url = 'https://www.linkedin.com/search/results/people/?facetCurrentCompany=%5B"75769"%5D' + # url = 'https://www.linkedin.com/voyager/api/search/cluster?count=40&guides=List()&keywords=%s&origin=GLOBAL_SEARCH_HEADER&q=guided&searchId=1489295486936&start=0' % search + # url = "https://www.linkedin.com/voyager/api/search/cluster?count=40&guides=List(v-%%3EPEOPLE,facetGeoRegion-%%3Ear%%3A0)&keywords=%s&origin=FACETED_SEARCH&q=guided&start=0" % search + url = "https://www.linkedin.com/voyager/api/search/cluster" # ?count=40&guides=List(v-%%3EPEOPLE,facetGeoRegion-%%3Ear%%3A0)&keywords=%s&origin=FACETED_SEARCH&q=guided&start=0" % search + # url = 'https://www.linkedin.com/voyager/api/search/cluster?count=40&guides=List(v->PEOPLE,facetCurrentCompany->31752)&origin=GLOBAL_SEARCH_HEADER&q=guided&start=0' + # url = "https://www.linkedin.com/voyager/api/search/cluster?count=40&guides=List(v->PEOPLE,facetCurrentCompany->31752)&origin=OTHER&q=guided&start=0" + # url = 'https://www.linkedin.com/search/results/people/?facetCurrentCompany=%5B"75769"%5D' params = { 'count': 40, 'guides': 'List(v-%>PEOPLE,facetGeoRegion-%>ar%:0)', @@ -66,30 +72,31 @@ def get_search(search): 'q': 'guided', 'start': 0 } - headers = {'Csrf-Token':'ajax:7736867257193100830'} + headers = {'Csrf-Token': 'ajax:7736867257193100830'} cookies['JSESSIONID'] = 'ajax:7736867257193100830' - cookies['X-RestLi-Protocol-Version'] = '2.0.0' + cookies['X-RestLi-Protocol-Version'] = '2.0.0' r = requests.get(url, cookies=cookies, headers=headers, params=params) content = json.loads(r.text) data_total = content['paging']['total'] - + if not data_total: + input("0 results found. Please any button for exit!") + sys.exit(0) # Calculate pages off final results at 40 results/page pages = data_total / 40 if data_total % 40 == 0: - # Becuase we count 0... Subtract a page if there are no left over results on the last page - pages = pages - 1 - if pages == 0: + # Because we count 0... Subtract a page if there are no left over results on the last page + pages -= 1 + if pages == 0: pages = 1 - - print "[Info] %i Results Found" % data_total + + print("[Info] %i Results Found" % data_total) if data_total > 1000: pages = 24 - print "[Notice] LinkedIn only allows 1000 results. Refine keywords to capture all data" - print "[Info] Fetching %i Pages" % pages - print - + print("[Notice] LinkedIn only allows 1000 results. 
     cookies['JSESSIONID'] = 'ajax:7736867257193100830'
-    cookies['X-RestLi-Protocol-Version'] = '2.0.0' 
+    cookies['X-RestLi-Protocol-Version'] = '2.0.0'
     r = requests.get(url, cookies=cookies, headers=headers, params=params)
     content = json.loads(r.text)
    data_total = content['paging']['total']
-
+    if not data_total:
+        input("0 results found. Press Enter to exit.")
+        sys.exit(0)
     # Calculate pages off final results at 40 results/page
-    pages = data_total / 40
+    pages = data_total // 40
     if data_total % 40 == 0:
-        # Becuase we count 0... Subtract a page if there are no left over results on the last page
-        pages = pages - 1
-        if pages == 0:
+        # Because we count 0... Subtract a page if there are no left over results on the last page
+        pages -= 1
+        if pages == 0:
             pages = 1
-
-    print "[Info] %i Results Found" % data_total
+
+    print("[Info] %i Results Found" % data_total)
     if data_total > 1000:
         pages = 24
-        print "[Notice] LinkedIn only allows 1000 results. Refine keywords to capture all data"
-    print "[Info] Fetching %i Pages" % pages
-    print
-
+        print("[Notice] LinkedIn only allows 1000 results. Refine keywords to capture all data")
+    print(f"[Info] Fetching {pages:d} Pages")
+
     # Set record position for XLSX
-    recordpos = 1
+    recordpos = 2
 
     for p in range(pages):
         # Request results for each page using the start offset
@@ -97,89 +104,111 @@ def get_search(search):
         r = requests.get(url, cookies=cookies, headers=headers, params=params)
         content = r.text.encode('UTF-8')
         content = json.loads(content)
-        print "[Info] Fetching page %i with %i results" % (p+1,len(content['elements'][0]['elements']))
-        for c in content['elements'][0]['elements']:
+        print(f"[Info] Fetching page {p + 1:d} with {len(content['elements'][0]['elements']):d} results")
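+        # The response groups results into clusters keyed by hitType; only the
+        # PEOPLE cluster contains profiles, so every other cluster is ignored.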
+        people = None
+        for element in content["elements"]:
+            if element['hitType'] == "PEOPLE":
+                people = element
+        if not people:
+            sys.exit("[Fatal] Could not find any users in the LinkedIn results.")
+        for c in people["elements"]:
             try:
-                if c['hitInfo']['com.linkedin.voyager.search.SearchProfile']['headless'] == False:
+                if not c['hitInfo']['com.linkedin.voyager.search.SearchProfile']['headless']:
                     try:
                         data_industry = c['hitInfo']['com.linkedin.voyager.search.SearchProfile']['industry']
                     except:
-                        data_industry = ""
-                    data_firstname = c['hitInfo']['com.linkedin.voyager.search.SearchProfile']['miniProfile']['firstName']
+                        data_industry = ""
+                    data_firstname = c['hitInfo']['com.linkedin.voyager.search.SearchProfile']['miniProfile'][
+                        'firstName']
                     data_lastname = c['hitInfo']['com.linkedin.voyager.search.SearchProfile']['miniProfile']['lastName']
-                    data_slug = "https://www.linkedin.com/in/%s" % c['hitInfo']['com.linkedin.voyager.search.SearchProfile']['miniProfile']['publicIdentifier']
-                    data_occupation = c['hitInfo']['com.linkedin.voyager.search.SearchProfile']['miniProfile']['occupation']
+                    data_slug = "https://www.linkedin.com/in/%s" % \
+                                c['hitInfo']['com.linkedin.voyager.search.SearchProfile']['miniProfile'][
+                                    'publicIdentifier']
+                    data_occupation = c['hitInfo']['com.linkedin.voyager.search.SearchProfile']['miniProfile'][
+                        'occupation']
                     data_location = c['hitInfo']['com.linkedin.voyager.search.SearchProfile']['location']
                     try:
-                        data_picture = "https://media.licdn.com/mpr/mpr/shrinknp_400_400%s" % c['hitInfo']['com.linkedin.voyager.search.SearchProfile']['miniProfile']['picture']['com.linkedin.voyager.common.MediaProcessorImage']['id']
+                        # old version
+                        # data_picture = "https://media.licdn.com/mpr/mpr/shrinknp_400_400%s" % \
+                        #     c['hitInfo']['com.linkedin.voyager.search.SearchProfile']['miniProfile'][
+                        #         'picture']['com.linkedin.voyager.common.MediaProcessorImage']['id']
+                        data_base_picture = \
+                            c['hitInfo']['com.linkedin.voyager.search.SearchProfile']['miniProfile']['picture'][
+                                'com.linkedin.common.VectorImage']
+                        data_picture = data_base_picture['rootUrl'] + data_base_picture['artifacts'][2][
+                            'fileIdentifyingUrlPathSegment']
                     except:
-                        #print "[Notice] No picture found for %s %s, %s" % (data_firstname, data_lastname, data_occupation)
+                        # print "[Notice] No picture found for %s %s, %s" % (data_firstname, data_lastname, data_occupation)
                        data_picture = ""
-
+
                     # Write data to XLSX file
-                    worksheet1.write('A%i' % recordpos, data_firstname)
-                    worksheet1.write('B%i' % recordpos, data_lastname)
-                    worksheet1.write('C%i' % recordpos, data_occupation)
-                    worksheet1.write('D%i' % recordpos, data_location)
-                    worksheet1.write('E%i' % recordpos, data_industry)
-                    worksheet1.write('F%i' % recordpos, data_slug)
-                    worksheet1.write('G%i' % recordpos, data_picture)
+                    worksheet1.write('A%i' % recordpos, data_firstname)
+                    worksheet1.write('B%i' % recordpos, data_lastname)
+                    worksheet1.write('C%i' % recordpos, data_occupation)
+                    worksheet1.write('D%i' % recordpos, data_location)
+                    worksheet1.write('E%i' % recordpos, data_industry)
+                    worksheet1.write_url('F%i' % recordpos, data_slug, string="LinkedIn Profile")
+                    worksheet1.write_url('G%i' % recordpos, data_picture, string="Profile Image Link")
                     worksheet2.write('A%i' % recordpos, '=IMAGE(dataset!G%i)' % recordpos)
-                    worksheet2.write('B%i' % recordpos, '=dataset!A%i&" "&dataset!B%i&"\n"&dataset!C%i&"\n"&dataset!D%i&"\n"&dataset!E%i' % (recordpos,recordpos,recordpos,recordpos,recordpos))
+                    worksheet2.write('B%i' % recordpos,
+                                     '=dataset!A%i&" "&dataset!B%i&"\n"&dataset!C%i&"\n"&dataset!D%i&"\n"&dataset!E%i' % (
+                                         recordpos, recordpos, recordpos, recordpos, recordpos))
                     worksheet2.write('C%i' % recordpos, '=HYPERLINK(dataset!F%i)' % recordpos)
-                    worksheet2.set_row(recordpos-1,125)
+                    worksheet2.set_row(recordpos - 1, 125)
 
                     # Increment Record Position
-                    recordpos = recordpos + 1
+                    recordpos += 1
                 else:
-                    print "[Notice] Headless profile found. Skipping"
+                    print("[Notice] Headless profile found. Skipping")
             except:
-                print "[Notice] Skipping"
+                print("[Notice] Skipping")
                 continue
-        print
+
 
 def authenticate():
     try:
         cookies = linkedIn()
-        print "[Info] Obtained new session: %s" % cookies['li_at']
+        print(f"[Info] Obtained new session: {cookies['li_at']}")
         li_cookie = dict(li_at=cookies['li_at'])
     except KeyError as k:
-        print k
+        print(k)
         sys.exit('[Fatal] li_at cookie value not found')
     except Exception as e:
-        print e
+        print(e)
         sys.exit("[Fatal] Could not authenticate to linkedin.")
     return li_cookie
 
+
 if __name__ == '__main__':
-    title = """
-  __                                 _ _____
-/ _\ ___ _ __ __ _ _ __   ___  __| | \_ \_ __
-\ \ / __| '__/ _` | '_ \ / _ \/ _` | / /\/ '_ \
-_\ \ (__| | | (_| | |_) | __/ (_| /\/ /_ | | | |
-\__/\___|_| \__,_| .__/ \___|\__,_\____/ |_| |_|
-                 |_|
-tool to scrape linkedin v2.0
-"""
-    print title.decode('UTF-8')
-
+    print(title)
+
     # Prompt user for data variables
-    search = args.keywords if args.keywords!=None else raw_input("Enter search Keywords (use quotes for more percise results)\n")
-    outfile = args.output if args.output!=None else raw_input("Enter filename for output (exclude file extension)\n")
-    print
-
+    search = args.keywords if args.keywords is not None else input(
+        "Enter search Keywords (use quotes for more precise results)")
+    outfile = args.output if args.output is not None else input("Enter filename for output (exclude file extension)")
+
     # URL Encode for the querystring
-    #search = urllib.quote_plus(search)
+    # search = urllib.quote_plus(search)
     cookies = authenticate()
-
+
     # Initiate XLSX File
-    workbook = xlsxwriter.Workbook('results/%s.xlsx' % outfile)
+    workbook = xlsxwriter.Workbook('%s.xlsx' % outfile)
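+    # The workbook is now written to the current working directory; the old
+    # results/ placeholder directory was dropped from the repository.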
Please any button for exit!") + sys.exit(0) diff --git a/config.py b/config.py index c3ac51c..ab1d725 100644 --- a/config.py +++ b/config.py @@ -5,8 +5,8 @@ # account to avoid account suspension linkedin = dict( - username = '', - password = '', + username=None, + password=None, ) ## [PROXY LIST] ## @@ -15,9 +15,8 @@ # blocked for sending too much traffic proxylist = [] -#proxylist.append('http://127.0.0.1:8080') +# proxylist.append('http://127.0.0.1:8080') ## [MISCELLANEOUS] ## timeout = 10 - diff --git a/requirements.txt b/requirements.txt index 47b3b9a..a28a40c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,20 +1,15 @@ -beautifulsoup4==4.6.0 -certifi==2018.1.18 -chardet==3.0.4 -cryptography==2.1.4 -enum34==1.1.6 -futures==3.2.0 -idna==2.6 -ipaddress==1.0.17 -keyring==10.6.0 -keyrings.alt==3.0 -pycrypto==2.6.1 -pyxdg==0.25 -requests==2.18.4 -SecretStorage==2.3.1 -six==1.11.0 -threaded==1.0.0 -thready==0.1.5 -typing==3.6.4 -urllib3==1.22 -XlsxWriter==1.0.2 +altgraph==0.17 +beautifulsoup4==4.9.3 +certifi==2020.12.5 +chardet==4.0.0 +future==0.18.2 +idna==3.1 +pefile==2019.4.18 +pyinstaller==4.2 +pyinstaller-hooks-contrib==2020.11 +pywin32-ctypes==0.2.0 +requests==2.25.1 +soupsieve==2.2 +stdiomask==0.0.6 +urllib3==1.26.3 +XlsxWriter==1.3.7 diff --git a/results/outputfilesgohere b/results/outputfilesgohere deleted file mode 100644 index e69de29..0000000 diff --git a/screenshots/Screenshot_1.png b/screenshots/Screenshot_1.png new file mode 100644 index 0000000..88595f2 Binary files /dev/null and b/screenshots/Screenshot_1.png differ diff --git a/screenshots/Screenshot_2.png b/screenshots/Screenshot_2.png new file mode 100644 index 0000000..b3b850f Binary files /dev/null and b/screenshots/Screenshot_2.png differ diff --git a/screenshots/Screenshot_3.png b/screenshots/Screenshot_3.png new file mode 100644 index 0000000..362907a Binary files /dev/null and b/screenshots/Screenshot_3.png differ diff --git a/screenshots/Screenshot_4.png b/screenshots/Screenshot_4.png new file mode 100644 index 0000000..a7ebcc0 Binary files /dev/null and b/screenshots/Screenshot_4.png differ