Init, basic functions

This commit is contained in:
ccppi 2024-03-28 10:54:08 +01:00
commit 24fcd9ddf0

54
scrap.py Normal file
View File

@ -0,0 +1,54 @@
#-*- coding: utf-8 -*-
import requests
import string
import csv
import numpy as np
from bs4 import BeautifulSoup
def scrap(url, html_element, class_t, timeout=30):
    """Fetch *url* and return every *html_element* tag with CSS class *class_t*.

    Args:
        url: Address of the page to download.
        html_element: Tag name to search for (passed to ``find_all``).
        class_t: CSS class the tag must carry (passed as ``class_``).
        timeout: Seconds to wait for the server before ``requests`` gives
            up. Without a timeout a stalled connection would block forever.

    Returns:
        Whatever ``BeautifulSoup.find_all`` yields for the query: a
        list-like collection of matching elements, empty when nothing matches.
    """
    page = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(page.content, "html.parser")
    return soup.find_all(html_element, class_=class_t)
def page_iterator(base_url, main_container_type, main_container_class,
                  next_type, next_class, string_parse, timeout=30):
    """Look for a "next page" link on *base_url* and return its target.

    The page is downloaded and every *next_type* tag with class *next_class*
    is inspected; the first one whose stripped text equals *string_parse* wins.

    Args:
        base_url: Address of the page to download.
        main_container_type: Tag name of the surrounding container.
        main_container_class: CSS class of the surrounding container.
        next_type: Tag name of the candidate "next" elements.
        next_class: CSS class of the candidate "next" elements.
        string_parse: Exact (stripped) link text that identifies the
            "next" element.
        timeout: Seconds to wait for the server before ``requests`` gives up.

    Returns:
        The ``href`` attribute of the matching element, or ``0`` when no
        candidate matches.
    """
    # NOTE(review): main_container_type / main_container_class are accepted
    # for interface compatibility but do not restrict the search. The previous
    # code looked the container up, never used the result, and crashed with
    # AttributeError when it was absent. Confirm whether the search should be
    # scoped to the container.
    page = requests.get(base_url, timeout=timeout)
    soup = BeautifulSoup(page.content, "html.parser")
    for candidate in soup.find_all(next_type, class_=next_class):
        striped_string = candidate.text.strip()
        print(striped_string, "stripped:", striped_string)
        if striped_string == string_parse:
            print(candidate)
            print("url of next site")
            return candidate.get("href")
    print("No (more) elements found")
    return 0
def filter(data, type_t, class_t, type_content):
    """Extract one piece of content from the first entry of *data*.

    Only the first entry is ever examined: every branch inside the loop
    returns, which matches the behaviour of the original implementation.

    Args:
        data: Iterable of objects exposing ``find(name, class_=...)``
            (presumably BeautifulSoup tags; verify against callers).
        type_t: Tag name to look for inside the entry.
        class_t: CSS class the tag must carry.
        type_content: ``"url"`` to get the tag's ``href`` attribute,
            ``"text"`` to get its stripped text, anything else to get the
            tag object itself.

    Returns:
        The requested content, ``0`` when the first entry contains no
        matching tag, or ``None`` when *data* is empty.
    """
    for entry in data:
        item = entry.find(type_t, class_=class_t)
        if item is None:
            return 0
        if type_content == "url":
            return item.get("href")
        # The original compared an undefined name here (NameError on every
        # non-"url" request); the selector is the type_content parameter.
        if type_content == "text":
            return item.text.strip()
        return item
    return None