# -*- coding: utf-8 -*-
import requests
import string
import csv
import numpy as np
from bs4 import BeautifulSoup


def scrap(url, html_element, class_t):
    """Fetch *url* and return every *html_element* tag carrying CSS class *class_t*.

    Returns a bs4 ResultSet (possibly empty) of matching elements.
    """
    page = requests.get(url)
    soup = BeautifulSoup(page.content, "html.parser")
    results = soup.find_all(html_element, class_=class_t)
    return results


def page_iterator(base_url, main_container_type, main_container_class,
                  next_type, next_class, string_parse):
    """Locate a "next page" link on *base_url* and return its URL.

    Scans all *next_type* elements with class *next_class*; the first one
    whose stripped text equals *string_parse* is treated as the next-page
    link and its ``href`` attribute is returned. Returns 0 when no such
    element exists.

    NOTE: *main_container_type* / *main_container_class* are kept for
    backward compatibility with existing callers; the original container
    lookup was unused (and crashed with AttributeError when the container
    was absent), so it has been removed.
    """
    page = requests.get(base_url)
    soup = BeautifulSoup(page.content, "html.parser")

    next_url_names = soup.find_all(next_type, class_=next_class)
    for candidate in next_url_names:
        striped_string = candidate.text.strip()
        print(candidate.text.strip(), "stripped:", striped_string)
        if striped_string == string_parse:
            print(candidate)
            next_url = candidate.get("href")
            print("url of next site")
            return next_url

    # Loop finished without a match: no (more) pages to follow.
    print("No (more) elements found")
    return 0


def filter(data, type_t, class_t, type_content):  # noqa: A001 — shadows builtin; name kept for API compatibility
    """Extract one value from the first entry of *data*.

    Looks up the first *type_t* element with class *class_t* inside the
    first entry of *data*:
      - ``type_content == "url"``  -> its ``href`` attribute
      - ``type_content == "text"`` -> its stripped text
      - anything else              -> the element itself
    Returns 0 when the first entry has no matching element.
    (Original bug fixed: the "text" branch referenced the undefined name
    ``type_of_sub_content``, raising NameError.)
    """
    for entry in data:
        item = entry.find(type_t, class_=class_t)
        if item is not None:
            if type_content == "url":
                return item.get("href")
            if type_content == "text":
                return item.text.strip()
            return item
        # Mirrors the original behaviour: give up after the first entry
        # when it has no matching child element.
        return 0