Private
Public Access
1
0
Files
rowsandall/rowers/traverselinktest.py
Sander Roosendaal a547e851be more pep
2022-02-17 15:16:27 +01:00

199 lines
7.5 KiB
Python

from bs4 import BeautifulSoup
import re
from django.test import TestCase, Client, override_settings
from django.core.management import call_command
from django.utils.six import StringIO
from django.test.client import RequestFactory
from .views import c2_open
from rowers.models import Workout, User, Rower, WorkoutForm, RowerForm, GraphImage
from rowers.forms import DocumentsForm, CNsummaryForm, RegistrationFormUniqueEmail
import rowers.plots as plots
import rowers.interactiveplots as iplots
import datetime
from rowingdata import rowingdata as rdata
from rowingdata import rower as rrower
from django.utils import timezone
from rowers.rows import handle_uploaded_file
from django.core.files.uploadedfile import SimpleUploadedFile
from time import strftime, strptime, mktime, time, daylight
import os
from rowers.tasks import handle_makeplot
from rowers.utils import serialize_list, deserialize_list
from shutil import copyfile
from minimocktest import MockTestCase
import pandas as pd
import json
import numpy as np
from rowers import urls
from rowers.views import error500_view, error404_view, error400_view, error403_view
from dataprep import delete_strokedata
from redis import StrictRedis
# Module-level Redis client built with StrictRedis' default arguments. It is
# created at import time; nothing in the test code visible below references it.
redis_connection = StrictRedis()
# When True, the link crawler prints a line for every link it inspects and
# dumps its work lists at the end of the run.
VERBOSE = True
class TraverseLinksTest(TestCase):
    """Crawl the site as a logged-in superuser and fail on any broken link.

    Starting from a seed list of URLs, every ``<a href>`` found on a page
    that answers HTTP 200 is queued and visited in turn, unless it matches
    one of the "avoid" regular expressions. Any URL that does not end in a
    200 response is recorded as an error and makes the test fail.
    """

    def setUp(self):
        """Create a superuser with a rower profile and two workouts, then log in."""
        self.u = User.objects.create_superuser(
            'superuser1',
            'superuser1@example.com', 'pwd')
        self.r = Rower.objects.create(
            user=self.u, gdproptin=True, gdproptindate=timezone.now())
        nu = datetime.datetime.now()
        self.w = Workout.objects.create(
            name='testworkout', workouttype='On-water',
            user=self.r, date=nu.strftime('%Y-%m-%d'),
            starttime=nu.strftime('%H:%M:%S'),
            duration="0:55:00", distance=8000)
        self.w2 = Workout.objects.create(
            name='testworkout 2', workouttype='On-water',
            user=self.r, date=nu.strftime('%Y-%m-%d'),
            starttime=nu.strftime('%H:%M:%S'),
            duration="0:55:00", distance=8000)
        if not self.client.login(username="superuser1", password="pwd"):
            # Report a regular test failure instead of raising BaseException,
            # which is reserved for interpreter-exit style signals.
            self.fail('Login failed')
        if VERBOSE:
            print('\nLogin as superuser OK')

    @classmethod
    def setUpTestData(cls):
        """Hook for class-wide fixtures; intentionally empty."""
        # Initialise your database here as needed
        pass

    def test_traverse_urls(self):
        """Crawl from the workout list and fail if any visited URL is not 200."""
        # Fill these lists as needed with your site specific URLs to check and to avoid
        to_traverse_list = ['/rowers/list-workouts']
        # Raw strings keep the regex escapes (\. and \d) free of
        # invalid-escape warnings; the pattern text itself is unchanged.
        to_avoid_list = [r'^/$', r'^$', r'javascript:history\.back()',
                         r'javascript:history\.go\(-1\)', r'^mailto:.*',
                         r'.*github\.io.*', r'javascript:.*',
                         r'.*biorow\.com.*', r'.*facebook.*',
                         r'.*wordpress.*', r'.*analytics.*', r'.*freenet.*',
                         r'.*twitter.*', r'^blog.*',
                         r'.*\d+-\d+-\d+.*',
                         r'.*flexchart/.*',
                         r'.*heroku.*',
                         r'.*oauth.*',
                         r'.*rowingdata.*',
                         r'.*thisisant.*',
                         r'.*garmin.*',
                         r'.*sub7.*',
                         r'.*bitbucket.*',
                         r'.*rathburn.*',
                         r'.*team.*',
                         r'.*concept2.*',
                         r'.*static.*',
                         r'.*authorize.*',
                         r'.*youtu.*',
                         r'.*earth.*',
                         r'.*c2list.*',
                         r'.*stravaimport.*',
                         r'.*performancephones.*',
                         r'.*sporttracks.*',
                         r'.*join-select.*',
                         ]
        done_list = []
        error_list = []
        # Maps each queued URL to the page on which it was discovered.
        source_of_link = dict()
        for link in to_traverse_list:
            source_of_link[link] = 'initial'

        (to_traverse_list, to_avoid_list, done_list, error_list, source_of_link) = \
            self.recurse_into_path(
                to_traverse_list, to_avoid_list, done_list, error_list, source_of_link)

        print('END REACHED\nStats:')
        if VERBOSE:
            print('\nto_traverse_list = ' + str(to_traverse_list))
            print('\nto_avoid_list = ' + str(to_avoid_list))
            print('\nsource_of_link = ' + str(source_of_link))
            print('\ndone_list = ' + str(done_list))
        print('Followed ' + str(len(done_list)) + ' links successfully')
        # Note: this count includes the initial avoid patterns as well as the
        # failing URLs appended during the crawl.
        print('Avoided ' + str(len(to_avoid_list)) + ' links')

        if error_list:
            print('!! ' + str(len(error_list)) + ' error(s) : ')
            for error in error_list:
                print(str(error) + ' found in page ' +
                      source_of_link[error[0]])
            print('Errors found traversing links')
            # self.fail is used instead of a bare ``assert False`` because
            # assert statements are removed when Python runs with -O.
            self.fail('Errors found traversing links: ' + str(error_list))
        else:
            print('No errors')

    def recurse_into_path(self, to_traverse_list, to_avoid_list, done_list, error_list, source_of_link):
        """Visit every queued URL, queueing newly discovered links as it goes.

        URLs are taken from the END of ``to_traverse_list`` (``list.pop()``).
        All five arguments are mutated in place. The crawl is implemented as
        a loop rather than one recursive call per URL so that large sites do
        not exhaust the interpreter's recursion limit.

        Args:
            to_traverse_list: URLs still to visit.
            to_avoid_list: regular expressions of URLs that must be skipped;
                failing URLs are appended (escaped) so they are not retried.
            done_list: URLs that answered with HTTP 200.
            error_list: ``(url, status_code)`` tuples for non-200 responses.
            source_of_link: maps each URL to the page where it was found.

        Returns:
            (to_traverse_list, to_avoid_list, done_list, error_list,
            source_of_link)
        """
        while to_traverse_list:
            url = to_traverse_list.pop()
            if not match_any(url, to_avoid_list):
                print('Surfing to ' + str(url) +
                      ', discovered in ' + str(source_of_link[url]))
                response = self.client.get(url, follow=True)
                if response.status_code == 200:
                    # Mark the page as done BEFORE scanning its links so a
                    # link from the page to itself is not queued again.
                    done_list.append(url)
                    soup = BeautifulSoup(response.content, 'html.parser')
                    for link in soup.find_all('a'):
                        new_link = link.get('href')
                        if VERBOSE:
                            print(' Found link: ' + str(new_link))
                        if match_any(new_link, to_avoid_list):
                            if VERBOSE:
                                print(' Avoiding it')
                        elif new_link in done_list:
                            if VERBOSE:
                                print(' Already done, ignoring')
                        elif new_link in to_traverse_list:
                            if VERBOSE:
                                print(' Already in to traverse list, ignoring')
                        else:
                            if VERBOSE:
                                print(
                                    ' New, unknown link: Storing it to traverse later')
                            source_of_link[new_link] = url
                            to_traverse_list.append(new_link)
                    if VERBOSE:
                        print('Done')
                else:
                    error_list.append((url, response.status_code))
                    # The avoid list holds regular expressions, so the URL is
                    # escaped: characters such as '?', '+' or '(' must be
                    # matched literally and must not raise re.error.
                    to_avoid_list.append(re.escape(url))
            # Log wording kept from the recursive implementation.
            if VERBOSE:
                print('Diving into next level')

        # Nothing to traverse
        if VERBOSE:
            print('Returning to upper level')
        return to_traverse_list, to_avoid_list, done_list, error_list, source_of_link
def match_any(my_string, regexp_list):
    """Tell whether ``my_string`` matches any pattern in ``regexp_list``.

    Each pattern is tried with ``re.match``, i.e. anchored at the start of
    the string. Patterns are tested one by one instead of being joined into
    a single alternation, so an empty pattern list matches nothing and a
    pattern may carry its own inline flags.

    Args:
        my_string: the string to test; may be ``None`` or empty.
        regexp_list: iterable of regular expression strings.

    Returns:
        True if ``my_string`` is ``None``/empty or at least one pattern
        matches, otherwise False.
    """
    if not my_string:
        # 'None' as string always matches
        return True
    return any(re.match(pattern, my_string) for pattern in regexp_list)