Private
Public Access
1
0
Files
rowsandall/boatmovers/scrapers.py
2022-06-27 21:28:49 +02:00

128 lines
3.9 KiB
Python

import re
from bs4 import BeautifulSoup
import requests
from boatmovers.models import *
import pandas as pd
# Empty placeholder; nothing in the visible code reads it.
# NOTE(review): presumably meant to hold the results URL of the Heineken
# regatta for time_team_scraper -- confirm or remove.
url_heineken = ''
def csv_reader(filename, raceid, clubcol='Ploeg', bankjes=None, uitslag='Pos'):
    """Import the results of one race from a CSV file.

    For every CSV row a ``Crew`` and a ``Result`` are stored. Each athlete
    named in the seat columns is looked up by first and last name, created
    when no match exists, and attached to the crew.

    A name is split at its last space: everything before it becomes the
    first name, the remainder the last name. An empty cell is stored as the
    athlete "Unknown Athlete"; any other non-text value is used verbatim as
    the first name with an empty last name.

    Args:
        filename: Path or buffer accepted by ``pandas.read_csv``.
        raceid: Id of the ``Race`` the results belong to.
        clubcol: Name of the CSV column holding the crew name. The same
            value is stored as the club of newly created athletes.
        bankjes: Names of the CSV columns that each hold one athlete name.
            A single column may be given as a plain string. Defaults to
            ``['Slag']``. At most ``race.crew_size`` columns are read.
        uitslag: Name of the CSV column holding the finishing position.

    Raises:
        KeyError: If a requested column is not present in the CSV file.
    """
    race = Race.objects.get(id=raceid)
    nr = race.crew_size

    # ``None`` replaces the former mutable list default.
    if bankjes is None:
        bankjes = ['Slag']
    elif isinstance(bankjes, str):
        bankjes = [bankjes]
    # A crew never gets more athletes than the race has seats.
    seats = list(bankjes)[:nr]

    df = pd.read_csv(filename)
    # Fail before anything is written when a requested column is absent.
    missing = [col for col in [clubcol, uitslag] + seats
               if col not in df.columns]
    if missing:
        raise KeyError('CSV is missing column(s): {}'.format(
            ', '.join(str(col) for col in missing)))

    # Records are keyed by the real column name, so the column parameters
    # are honoured even for names that are not valid Python identifiers.
    for record in df.to_dict('records'):
        order = record[uitslag]
        crewname = record[clubcol]
        crew = Crew(name=crewname)
        crew.save()

        for seat in seats:
            raw_name = record[seat]
            if isinstance(raw_name, str):
                # Everything before the last space is the first name.
                first_name, _, last_name = raw_name.rpartition(' ')
            elif pd.isna(raw_name):
                # Empty CSV cell: use the placeholder athlete.
                first_name, last_name = 'Unknown', 'Athlete'
            else:
                first_name, last_name = str(raw_name), ''

            # Reuse an existing athlete with the same name when there is one.
            athlete = Athlete.objects.filter(first_name=first_name,
                                             last_name=last_name).first()
            if athlete is None:
                athlete = Athlete(first_name=first_name,
                                  last_name=last_name,
                                  club=crewname)
                athlete.save()
            print(athlete)
            crew.athletes.add(athlete)

        result = Result(crew=crew, race=race, order=order)
        try:
            result.save()
        except ValidationError as e:
            # Best effort: report the rejected result and continue.
            print(e)
def time_team_scraper(url, raceid):
    """Scrape a results page and store its crews, athletes and results.

    The first table on the page at ``url`` is scanned for links whose
    ``href`` contains ``'entry'``. Each such link is treated as one crew, in
    finishing order: the linked page is read with ``pandas.read_html`` and
    the first ``race.crew_size`` values of its ``naam`` column are taken as
    the crew's athletes. Athletes are looked up by first and last name and
    created when no match exists. A ``Result`` with a running position
    (starting at 1) is saved for every crew.

    A name is split at its last space: everything before it becomes the
    first name, the remainder the last name. A missing name is stored as
    the athlete "Unknown Athlete"; any other non-text value is used verbatim
    as the first name with an empty last name.

    Args:
        url: Address of the results page. It must contain ``'results'``;
            entry links are resolved against the part that precedes it.
        raceid: Id of the ``Race`` the results belong to.

    Raises:
        ValueError: If ``url`` does not contain ``'results'`` or the page
            has no table.
        requests.HTTPError: If the results page answers with an error status.
    """
    race = Race.objects.get(id=raceid)
    nr = race.crew_size

    # The match must not be bound to the name ``str``: doing so hid the
    # builtin and broke the ``str(...)`` conversion of non-text names.
    url_match = re.search(r'(.*)results(.*)', url)
    if url_match is None:
        raise ValueError("url must contain 'results': {!r}".format(url))
    base = url_match.group(1)

    r = requests.get(url, timeout=30)
    r.raise_for_status()
    soup = BeautifulSoup(r.content, features='lxml')
    tbl = soup.find('table')
    if tbl is None:
        raise ValueError('no table found at {!r}'.format(url))

    order = 1
    for tr in tbl.find_all('tr'):
        for cell in tr.find_all('td'):
            anchor = cell.find('a')
            # Cells without a link, or with an empty link, are not entries.
            if anchor is None or not anchor.contents:
                continue
            link = anchor.get('href', '')
            if 'entry' not in link:
                continue
            name = anchor.contents[0]

            print(order, name)
            # The first three characters of the link are dropped before it
            # is appended to the base (presumably a leading '../' -- confirm).
            entry_df = pd.read_html(base + link[3:])[0]
            crew = Crew(name=name)
            crew.save()

            # Positional slice: a short entry list yields fewer athletes
            # instead of raising KeyError.
            for raw_name in entry_df['naam'].iloc[:nr]:
                if isinstance(raw_name, str):
                    # Everything before the last space is the first name.
                    first_name, _, last_name = raw_name.rpartition(' ')
                elif pd.isna(raw_name):
                    first_name, last_name = 'Unknown', 'Athlete'
                else:
                    first_name, last_name = str(raw_name), ''

                # Reuse an existing athlete with the same name if present.
                athlete = Athlete.objects.filter(first_name=first_name,
                                                 last_name=last_name).first()
                if athlete is None:
                    athlete = Athlete(first_name=first_name,
                                      last_name=last_name,
                                      club=name)
                    athlete.save()
                print(athlete)
                crew.athletes.add(athlete)

            result = Result(crew=crew, race=race, order=order)
            try:
                result.save()
            except ValidationError as e:
                # Best effort: report the rejected result and continue.
                print(e)
            order += 1
            print('')