Private
Public Access
1
0
Files
rowsandall/boatmovers/scrapers.py
2022-10-17 18:34:28 +02:00

156 lines
4.9 KiB
Python

import re
from bs4 import BeautifulSoup
import requests
from boatmovers.models import *
import pandas as pd
from django.core.exceptions import ValidationError
# Empty by default and not read by csv_reader or time_team_scraper below;
# presumably a placeholder for a regatta results URL -- TODO confirm.
url_heineken = ''
def csv_reader(filename, raceid, clubcol='Ploeg', bankjes=('Slag',), uitslag='Pos', gender='m',
               startorder=1):
    """Import race results from a CSV file into the database.

    Every CSV row is treated as one crew: a ``Crew`` is created, one athlete
    per seat is looked up (or created) from the seat columns, and a ``Result``
    linking the crew to the race is saved when the row's finishing order is at
    least ``startorder``.

    Seat cells are interpreted as follows:

    * a string is split on spaces; the last word becomes the last name and
      everything before it the first name;
    * a missing value (NaN/None) yields the dummy athlete "Unknown Athlete";
    * any other value is converted with ``str()`` and used as the first name.

    Args:
        filename: Path or buffer accepted by ``pandas.read_csv``.
        raceid: Primary key of the ``Race`` the results belong to.
        clubcol: Name of the column holding the crew/club name.
        bankjes: Names of the columns holding the athlete names, one per seat.
            Needs at least ``race.crew_size`` entries; extra entries are
            ignored.
        uitslag: Name of the column holding the finishing order.
        gender: Gender used to look up and create athletes.
        startorder: Rows whose order is below this value get no ``Result``
            (their crew and athletes are still created).

    Raises:
        IndexError: If ``bankjes`` lists fewer columns than the crew size.
        KeyError: If one of the named columns is missing from the CSV.
    """
    race = Race.objects.get(id=raceid)
    nr = race.crew_size

    # Fail before touching the database instead of half-way through a crew.
    if len(bankjes) < nr:
        raise IndexError(
            'bankjes lists {have} seat column(s) but the race needs {need}'.format(
                have=len(bankjes), need=nr))

    df = pd.read_csv(filename)

    # itertuples() yields the index at position 0, so every column position
    # is shifted by one.  The lookups do not depend on the row: do them once.
    order_pos = df.columns.get_loc(uitslag) + 1
    club_pos = df.columns.get_loc(clubcol) + 1
    seat_positions = [df.columns.get_loc(col) + 1 for col in bankjes[:nr]]

    for row in df.itertuples():
        order = row[order_pos]
        crewname = row[club_pos]

        crew = Crew(name=crewname)
        crew.save()

        for seat_pos in seat_positions:
            value = row[seat_pos]
            if isinstance(value, str):
                parts = value.split(' ')
                first_name = ' '.join(parts[:-1])
                last_name = parts[-1]
                dummy = False
            elif pd.isna(value):
                # Empty cell: register a placeholder athlete.
                first_name = 'Unknown'
                last_name = 'Athlete'
                dummy = True
            else:
                # Non-text cell (e.g. a number): keep its text form as the name.
                first_name = str(value)
                last_name = ''
                dummy = False

            athlete = Athlete.objects.filter(
                full_name='{f} {l}'.format(f=first_name, l=last_name),
                gender=gender,
            ).first()
            if athlete is None:
                athlete = Athlete(first_name=first_name,
                                  last_name=last_name,
                                  club=crewname,
                                  gender=gender,
                                  dummy=dummy)
                try:
                    athlete.save()
                except ValidationError as e:
                    # The message is expected to end in ":<existing athlete id>"
                    # (format defined by the Athlete model -- verify there).
                    # Split on the last colon so colons in the text are harmless.
                    _, athlete_id = e.message.rsplit(':', 1)
                    athlete = Athlete.objects.get(id=athlete_id.strip())

            crew.athletes.add(athlete)

        result = Result(crew=crew, race=race, order=order)
        try:
            if order >= startorder:
                result.save()
        except ValidationError as e:
            print(e)

        print(' ')
def time_team_scraper(url, raceid, gender='m', startorder=1):
    """Scrape a results web page and store its crews, athletes and results.

    The first ``<table>`` of the page at ``url`` is scanned cell by cell. Every
    cell whose link contains ``'entry'`` is treated as one crew, in finishing
    order starting at 1. The linked entry page is read with
    ``pandas.read_html`` and the first ``race.crew_size`` values of its
    ``'naam'`` column are used as athlete names.

    Names are interpreted as follows:

    * a string is split on spaces; the last word becomes the last name and
      everything before it the first name;
    * a missing value (NaN/None) yields the dummy athlete "Unknown Athlete";
    * any other value is converted with ``str()`` and used as the first name.

    Args:
        url: Address of the results page. Must contain the text ``results``;
            everything before its last occurrence is the base that entry links
            are appended to.
        raceid: Primary key of the ``Race`` the results belong to.
        gender: Gender assigned to newly created athletes.
        startorder: Crews whose order is below this value get no ``Result``
            (their crew and athletes are still created).

    Raises:
        AttributeError: If ``url`` does not contain ``results`` or the page
            has no ``<table>``.
        KeyError: If an entry page has no ``'naam'`` column or fewer rows than
            the crew size.
    """
    race = Race.objects.get(id=raceid)
    nr = race.crew_size

    r = requests.get(url)
    soup = BeautifulSoup(r.content, features='lxml')
    tbl = soup.find('table')

    # Greedy match: the first group is everything before the last "results".
    base = re.search(r'(.*)results(.*)', url).groups()[0]

    order = 1
    for tr in tbl.find_all("tr"):
        for cell in tr.find_all("td"):
            try:
                anchor = cell.find('a')
                link = anchor['href']
                name = anchor.contents[0]
            except (TypeError, KeyError, IndexError):
                # No anchor, anchor without href, or anchor without content.
                link = ''
                name = ''

            if 'entry' not in link:
                continue

            print(order, name)
            # The first three characters of the href are dropped before it is
            # appended to the base (presumably a leading "../" -- TODO confirm).
            df = pd.read_html(base + link[3:])[0]
            namen = df['naam']

            crew = Crew(name=name)
            crew.save()

            for i in range(nr):
                value = namen[i]
                if isinstance(value, str):
                    parts = value.split(' ')
                    first_name = ' '.join(parts[:-1])
                    last_name = parts[-1]
                    dummy = False
                    full_name = value
                else:
                    if pd.isna(value):
                        # Empty cell: register a placeholder athlete.
                        first_name = 'Unknown'
                        last_name = 'Athlete'
                        dummy = True
                    else:
                        # Non-text cell: keep its text form as the name.
                        first_name = str(value)
                        last_name = ''
                        dummy = False
                    full_name = '{f} {l}'.format(f=first_name, l=last_name)

                athlete = Athlete.objects.filter(full_name=full_name).first()
                if athlete is None:
                    athlete = Athlete(first_name=first_name,
                                      last_name=last_name,
                                      club=name,
                                      gender=gender,
                                      dummy=dummy)
                    try:
                        athlete.save()
                    except ValidationError as e:
                        # The message is expected to end in
                        # ":<existing athlete id>" (format defined by the
                        # Athlete model -- verify there).  Split on the last
                        # colon so colons in the text are harmless.
                        _, athlete_id = e.message.rsplit(':', 1)
                        athlete = Athlete.objects.get(id=athlete_id.strip())

                print(athlete)
                crew.athletes.add(athlete)

            result = Result(crew=crew, race=race, order=order)
            if order >= startorder:
                try:
                    result.save()
                except ValidationError as e:
                    print(e)

            order += 1
            print('')