-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathTeamTfrrs.py
131 lines (108 loc) · 4.27 KB
/
TeamTfrrs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
import pandas as pd
import requests
import re
from bs4 import BeautifulSoup
from AthleteTfrrs import Athlete
class Team:
def __init__(self, State, Gender, Name):
# Construct the url and make the request
url_stub = "http://www.tfrrs.org/teams/{}_college_{}_{}".format(
State, Gender.lower(), Name
)
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.102 Safari/537.36"
}
response = requests.get(url_stub, headers=headers)
# If the reponse succeeded
if response.status_code < 300 and response.status_code >= 200:
# panda's read_html doesn't accept percent colspan arguments
self.HTML = response.text.replace('colspan="100%"', 'colspan="3"')
self.initialize()
# If the response failed
else:
self.HTML = None
raise Exception("Could not retrieve", response.status_code)
def getAthleteIDs(self):
# TODO: Append Athlete ID(s) to Top Marks
links = self.soup.find_all("a")
tempIDs = {}
# Pull the athlete ID from href URLs
for link in links:
link = str(link)
if "//www.tfrrs.org/athletes/" in link:
link = link[34 : link.index(".html")]
link = link.replace("__", "_")
ID, school, name = link.split("/")
first, last = name.split("_")
name = last + ", " + first
tempIDs[name] = ID
# sort it since we had duplicates from top marks
IDs = {}
for item in sorted(tempIDs.items()):
IDs[item[0]] = item[1]
return IDs
# MERGE Into meetIds, AthleteIds = self.getIDs() to reduce double soup.findl_all("a")
# not working
# breaks at Bates, RPI, MIT, Tufts
def getMeetIds(self):
links = self.soup.find_all("a")
IDs = []
# Pull the meet ID from href URLs
for link in links:
if re.search(
'href="//www.tfrrs.org/results/(xc/){0,1}\d{5,6}/\w+(,|_)', str(link)
):
link = str(link)
idStart = re.search("/results/(xc/){0,1}\d{5,6}/", link).start() + len(
"/results/"
)
IDs += (
[link[idStart + 3 : idStart + 8]]
if "xc" in link
else [link[idStart : idStart + 5]]
)
return IDs
def initialize(self):
self.dfs = pd.read_html(self.HTML)
self.soup = BeautifulSoup(self.HTML, "html5lib")
self.AthleteIDs = self.getAthleteIDs()
self.MeetIDs = self.getMeetIds()
return None
def getAll(self):
if self.HTML:
# Setup
data = {}
data["Roster"] = self.getRoster()
data["Latest Results"] = self.getLatestResults()
data["Top Marks"] = self.getTopMarks()
return data
else:
raise Exception("No HTML loaded. Retry with different information")
def getRoster(self, asDict=False):
Roster = self.dfs[1]
RosterIDs = pd.Series(
[self.AthleteIDs[name] for name in Roster["NAME"].to_list()]
)
# Append IDs to roster
Roster = pd.concat([Roster, RosterIDs.rename("Athlete ID")], axis=1)
return Roster.to_dict() if asDict else Roster
def getLatestResults(self, asDict=False):
LatestResults = self.dfs[2]
MeetIDs = pd.Series(self.MeetIDs)
# Append Meet IDs to results
LatestResults = pd.concat((LatestResults, MeetIDs.rename("Meet ID")), axis=1)
return LatestResults.to_dict() if asDict else LatestResults
def getTopMarks(self, asDict=False):
TopMarks = self.dfs[0]
MarkIDs = pd.Series(
[
self.AthleteIDs[name] if name in self.AthleteIDs.keys() else None
for name in TopMarks["ATHLETE/SQUAD"]
]
)
# Append Athlete IDs to top marks
TopMarks = pd.concat((TopMarks, MarkIDs.rename("Athlete ID")), axis=1)
return TopMarks.to_dict() if asDict else TopMarks
if __name__ == "__main__":
Test = Team("NY", "F", "RPI")
print(Test.getRoster())