#-*- coding: utf-8 -*-
"""Small web-scraping helpers built on requests + BeautifulSoup.

Provides a page fetcher (`scrap`), a "next page" link finder
(`page_iterator`) and a DOM-element extractor (`filter`).
"""
import requests
import string
import csv
import numpy as np
from bs4 import BeautifulSoup
from enum import Enum


class filter_type(Enum):
    """What to search a DOM element by / what to extract from it."""
    href = 1     # the element's href attribute
    text = 2     # the element's (stripped) text content
    class_t = 3  # the element's CSS class


# returns all result for a html-element by class from a url
def scrap(url, html_element, class_t):
    """Fetch *url* and return the parsed BeautifulSoup document.

    NOTE(review): html_element and class_t are currently unused (the
    original find_all call was commented out); the parameters are kept
    so existing callers keep working.
    """
    page = requests.get(url)
    soup = BeautifulSoup(page.content, "html.parser")
    return soup


# search for a "next button" or link given some DOM elements and returns the next url
def page_iterator(base_url, next_type, next_class, parent_iteration, string_parse):
    """Return the href of the "next page" link found on *base_url*.

    Scans all elements of tag *next_type* with class *next_class*; the
    first one whose stripped text equals *string_parse* is taken as the
    next-page link and its href is returned. Returns 0 when no such
    element exists.

    NOTE(review): parent_iteration is unused; kept for interface
    compatibility with existing callers.
    """
    page = requests.get(base_url)
    soup = BeautifulSoup(page.content, "html.parser")
    # find_all returns a (possibly empty) list, never None, so iterate
    # directly; the original `!= None` guard and the unreachable `break`
    # after `return` were dead code.
    for candidate in soup.find_all(next_type, class_=next_class):
        striped_string = candidate.text.strip()
        print(candidate.text.strip(), "stripped:", striped_string)
        if striped_string == string_parse:
            print(candidate)
            print("url of next site")
            return candidate.get("href")
    print("No (more) elements found")
    return 0


# url gets href param, text gets textparam stripped and formated
def filter(page, tag, search_by, search_string, to_extract, parent_iteration):
    """Extract data from elements of *page* matching a search criterion.

    Finds all *tag* elements selected by *search_by* (class name or text
    content) matching *search_string*, climbs *parent_iteration* levels
    up the DOM from each hit, then collects either the href attribute or
    the stripped text (per *to_extract*, a `filter_type` member).

    Returns the list of extracted values, or 0 when the filter type is
    unknown or nothing matched.

    NOTE(review): this shadows the builtin `filter`; renaming would
    break callers, so the name is kept.
    """
    content = []
    # BUG FIX: the original used two independent `if` statements, so the
    # `else` ("filter_type not known") fired whenever search_by was
    # filter_type.class_t, discarding the just-computed result and
    # returning 0. `elif` ties the branches together correctly.
    if search_by == filter_type.class_t:
        items = page.find_all(tag, class_=search_string)
    elif search_by == filter_type.text:
        items = page.find_all(tag, string=search_string)
    else:
        print("filter_type not known: ", search_by)
        return 0
    # find_all returns [] (never None) on no match; the original's
    # `item == None` branch was unreachable. Restore its clear intent:
    # report and return 0 when nothing matched.
    if not items:
        print("No Item found")
        return 0
    for it in items:
        # climb parent_iteration levels up the DOM tree
        for _ in range(parent_iteration):
            it = it.parent
        if to_extract == filter_type.href:
            content.append(it.get("href"))
        if to_extract == filter_type.text:
            content.append(it.string.strip())
    return content