from bs4 import BeautifulSoup
import re
from django.test import TestCase, Client, override_settings
from django.core.management import call_command
from django.utils.six import StringIO
from django.test.client import RequestFactory
from .views import c2_open
from rowers.models import Workout, User, Rower, WorkoutForm, RowerForm, GraphImage
from rowers.forms import DocumentsForm, CNsummaryForm, RegistrationFormUniqueEmail
import rowers.plots as plots
import rowers.interactiveplots as iplots
import datetime
from rowingdata import rowingdata as rdata
from rowingdata import rower as rrower
from django.utils import timezone
from rowers.rows import handle_uploaded_file
from django.core.files.uploadedfile import SimpleUploadedFile
from time import strftime, strptime, mktime, time, daylight
import os
from rowers.tasks import handle_makeplot
from rowers.utils import serialize_list, deserialize_list
from shutil import copyfile
from minimocktest import MockTestCase
import pandas as pd
import json
import numpy as np
from rowers import urls
from rowers.views import error500_view, error404_view, error400_view, error403_view
from dataprep import delete_strokedata
from redis import StrictRedis

redis_connection = StrictRedis()

# When True, the link crawler prints a trace of every decision it takes.
VERBOSE = True


class TraverseLinksTest(TestCase):
    """Crawl the site as a superuser and fail if any followed link errors.

    Starting from a seed list of URLs, every ``<a href>`` found on a page
    that answers with HTTP 200 is queued and visited in turn, except for
    links matching one of the "avoid" regular expressions.
    """

    def setUp(self):
        """Create a superuser with a rower profile and two workouts, then log in."""
        self.u = User.objects.create_superuser(
            'superuser1', 'superuser1@example.com', 'pwd')
        self.r = Rower.objects.create(
            user=self.u, gdproptin=True, gdproptindate=timezone.now())

        nu = datetime.datetime.now()
        self.w = Workout.objects.create(
            name='testworkout', workouttype='On-water',
            user=self.r, date=nu.strftime('%Y-%m-%d'),
            starttime=nu.strftime('%H:%M:%S'),
            duration="0:55:00", distance=8000)
        self.w2 = Workout.objects.create(
            name='testworkout 2', workouttype='On-water',
            user=self.r, date=nu.strftime('%Y-%m-%d'),
            starttime=nu.strftime('%H:%M:%S'),
            duration="0:55:00", distance=8000)

        if self.client.login(username="superuser1", password="pwd"):
            if VERBOSE:
                print('\nLogin as superuser OK')
        else:
            # Report through the test framework rather than raising a bare
            # BaseException, which would slip past ``except Exception``.
            self.fail('Login failed')

    @classmethod
    def setUpTestData(cls):
        # Initialise your database here as needed
        pass

    def test_traverse_urls(self):
        """Follow every reachable link and fail if any returns a non-200 status."""
        # Fill these lists as needed with your site specific URLs to check
        # and to avoid
        to_traverse_list = ['/rowers/list-workouts']
        # Regular expressions, applied with re.match (anchored at the start
        # of the link).  Raw strings keep the backslashes literal.
        to_avoid_list = [
            r'^/$',
            r'^$',
            r'javascript:history\.back()',
            r'javascript:history\.go\(-1\)',
            r'^mailto:.*',
            r'.*github\.io.*',
            r'javascript:.*',
            r'.*biorow\.com.*',
            r'.*facebook.*',
            r'.*wordpress.*',
            r'.*analytics.*',
            r'.*freenet.*',
            r'.*twitter.*',
            r'^blog.*',
            r'.*\d+-\d+-\d+.*',
            r'.*flexchart/.*',
            r'.*heroku.*',
            r'.*oauth.*',
            r'.*rowingdata.*',
            r'.*thisisant.*',
            r'.*garmin.*',
            r'.*sub7.*',
            r'.*bitbucket.*',
            r'.*rathburn.*',
            r'.*team.*',
            r'.*concept2.*',
            r'.*static.*',
            r'.*authorize.*',
            r'.*youtu.*',
            r'.*earth.*',
            r'.*stravaimport.*',
            r'.*performancephones.*',
            r'.*sporttracks.*',
            r'.*join-select.*',
        ]
        done_list = []
        error_list = []
        # Maps each discovered link to the page it was found on.
        source_of_link = dict()
        for link in to_traverse_list:
            source_of_link[link] = 'initial'

        (to_traverse_list, to_avoid_list, done_list, error_list,
         source_of_link) = self.recurse_into_path(
            to_traverse_list, to_avoid_list, done_list, error_list,
            source_of_link)

        print('END REACHED\nStats:')
        if VERBOSE:
            print('\nto_traverse_list = ' + str(to_traverse_list))
            print('\nto_avoid_list = ' + str(to_avoid_list))
            print('\nsource_of_link = ' + str(source_of_link))
            print('\ndone_list = ' + str(done_list))
        print('Followed ' + str(len(done_list)) + ' links successfully')
        print('Avoided ' + str(len(to_avoid_list)) + ' links')

        if error_list:
            print('!! ' + str(len(error_list)) + ' error(s) : ')
            for error in error_list:
                print(str(error) + ' found in page ' +
                      source_of_link[error[0]])
            print('Errors found traversing links')
            # self.fail keeps working under ``python -O``, unlike a bare
            # ``assert False``.
            self.fail('Errors found traversing links')
        else:
            print('No errors')

    def recurse_into_path(self, to_traverse_list, to_avoid_list, done_list,
                          error_list, source_of_link):
        """Visit every queued URL, queueing the new links found on each page.

        URLs are taken from the END of ``to_traverse_list`` (``list.pop()``).
        All list/dict arguments are mutated in place.  The traversal is
        iterative so that large sites cannot exhaust the interpreter's
        recursion limit; the method name is kept for compatibility.

        Args:
            to_traverse_list: URLs still to visit.
            to_avoid_list: regular expressions for links that must be skipped;
                URLs that answer with a non-200 status are added (escaped).
            done_list: URLs that answered with HTTP 200.
            error_list: ``(url, status_code)`` tuples for failed URLs.
            source_of_link: maps each link to the page it was discovered on.

        Returns:
            (to_traverse_list, to_avoid_list, done_list, error_list,
            source_of_link)
        """
        while to_traverse_list:
            url = to_traverse_list.pop()
            if not match_any(url, to_avoid_list):
                print('Surfing to ' + str(url) + ', discovered in ' +
                      str(source_of_link[url]))
                response = self.client.get(url, follow=True)
                if response.status_code == 200:
                    soup = BeautifulSoup(response.content, 'html.parser')
                    for link in soup.find_all('a'):
                        new_link = link.get('href')
                        if VERBOSE:
                            print('  Found link: ' + str(new_link))
                        if match_any(new_link, to_avoid_list):
                            if VERBOSE:
                                print('    Avoiding it')
                        elif new_link == url or new_link in done_list:
                            # The page being processed is not in done_list
                            # yet; without the explicit check a self-link
                            # would be queued and fetched a second time.
                            if VERBOSE:
                                print('    Already done, ignoring')
                        elif new_link in to_traverse_list:
                            if VERBOSE:
                                print('    Already in to traverse list, ignoring')
                        else:
                            if VERBOSE:
                                print(
                                    '    New, unknown link: Storing it to traverse later')
                            source_of_link[new_link] = url
                            to_traverse_list.append(new_link)
                    done_list.append(url)
                    if VERBOSE:
                        print('Done')
                else:
                    error_list.append((url, response.status_code))
                    # Entries of to_avoid_list are compiled as regular
                    # expressions, so store the URL escaped: a raw URL such
                    # as "?page=2" is not a valid pattern.
                    to_avoid_list.append(re.escape(url))
            if VERBOSE:
                print('Diving into next level')

        # Nothing to traverse
        if VERBOSE:
            print('Returning to upper level')
        return (to_traverse_list, to_avoid_list, done_list, error_list,
                source_of_link)


def match_any(my_string, regexp_list):
    """Return True if ``my_string`` matches any pattern in ``regexp_list``.

    Patterns are applied with ``re.match``, i.e. anchored at the start of the
    string.  A missing or empty string (for example an ``<a>`` tag without an
    ``href``) is reported as matching so that callers skip it.  An empty
    pattern list matches nothing.
    """
    if not my_string:
        # 'None' as string always matches
        return True
    return any(re.match(pattern, my_string) for pattern in regexp_list)