# -*- coding: utf-8 -*-
"""Small web-scraping helpers built on requests + BeautifulSoup."""
import requests
import string
import csv
import numpy as np
from bs4 import BeautifulSoup


def scrap(url, html_element, class_t):
    """Return all elements of tag *html_element* with CSS class *class_t* at *url*.

    Fetches the page with requests and parses it with BeautifulSoup's
    built-in "html.parser"; returns the (possibly empty) ResultSet from
    ``find_all``.
    """
    page = requests.get(url)
    soup = BeautifulSoup(page.content, "html.parser")
    results = soup.find_all(html_element, class_=class_t)
    return results


def page_iterator(base_url, main_container_type, main_container_class,
                  next_type, next_class, string_parse):
    """Look for a "next page" button/link on *base_url* and return its URL.

    Scans all elements of tag *next_type* with class *next_class*; the first
    one whose stripped text equals *string_parse* is treated as the
    next-page link and its ``href`` attribute is returned.

    Returns:
        The ``href`` string of the matching element, or 0 when no element
        matches (no more pages).
    """
    # FIX: the original was missing the colon after the def line (SyntaxError),
    # had an unreachable `break` after `return`, and computed an unused
    # `results`/`pages` pair from the main container.
    page = requests.get(base_url)
    soup = BeautifulSoup(page.content, "html.parser")
    next_url_names = soup.find_all(next_type, class_=next_class)
    for i2 in next_url_names:
        striped_string = i2.text.strip()
        print(i2.text.strip(), "stripped:", striped_string)
        if striped_string == string_parse:
            print(i2)
            next_url = i2.get("href")
            print("url of next site")
            return next_url
    # Fell through the loop: no candidate text matched string_parse.
    print("No (more) elements found")
    return 0


def filter(data, type_t, class_t, type_content):
    """Extract one piece of content from the first entry of *data*.

    NOTE(review): this shadows the builtin ``filter``; the name is kept for
    backward compatibility with existing callers.

    For the first entry in *data*, finds the child of tag *type_t* with class
    *class_t* and returns:
      * its ``href`` attribute when *type_content* == "url",
      * its stripped text when *type_content* == "text",
      * the element itself for any other *type_content*,
      * 0 when no such child exists.

    NOTE(review): every branch returns on the first iteration, so only the
    first entry of *data* is ever inspected — preserved from the original.
    """
    # FIX: the original was missing the colon after the def line (SyntaxError)
    # and compared against the undefined name `type_of_sub_content`
    # (NameError); it is `type_content`.
    for entry in data:
        item = entry.find(type_t, class_=class_t)
        if item is None:
            return 0
        if type_content == "url":
            return item.get("href")
        if type_content == "text":
            return item.text.strip()
        return item