comparis-scrapper/scrap.py

# -*- coding: utf-8 -*-
from enum import Enum

import requests
from bs4 import BeautifulSoup


# Selects what a DOM query matches on (class or visible text) and what it
# extracts from a match (href attribute or text content).
class filter_type(Enum):
    href = 1
    text = 2
    class_t = 3


# Fetches a URL and returns the parsed document as a BeautifulSoup object.
# Note: html_element and class_t are currently unused; the per-element
# lookup is left commented out and filtering happens in filter() below.
def scrap(url, html_element, class_t):
    page = requests.get(url)
    soup = BeautifulSoup(page.content, "html.parser")
    # results = soup.find_all(html_element, class_=class_t)
    return soup
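
# Example usage (the URL and class name below are hypothetical):
#   soup = scrap("https://www.comparis.ch/immobilien/marktplatz", "div", "result-row")
#   print(soup.title.string)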


# Searches a page for a "next" button or link, given the element tag, its
# class, and its visible label text, and returns the href of the next page
# (or 0 when no further page is found). parent_iteration is currently unused.
def page_iterator(base_url, next_type, next_class, parent_iteration, string_parse):
    page = requests.get(base_url)
    soup = BeautifulSoup(page.content, "html.parser")
    # find_all returns a (possibly empty) list, never None.
    next_url_names = soup.find_all(next_type, class_=next_class)
    for candidate in next_url_names:
        stripped_string = candidate.text.strip()
        if stripped_string == string_parse:
            # The matching element carries the link to the next page.
            return candidate.get("href")
    print("No (more) elements found")
    return 0
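
# Example crawl loop (hypothetical selectors; the pagination link is assumed
# to be an <a> element labelled "Weiter"):
#   url = "https://www.comparis.ch/immobilien/marktplatz?page=1"
#   while url:
#       soup = scrap(url, "div", "result-row")
#       url = page_iterator(url, "a", "pagination-next", 0, "Weiter")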


# Finds elements by class or by text, optionally climbs parent_iteration
# levels up the DOM tree, then extracts either the href attribute or the
# stripped text of each match.
def filter(page, tag, search_by, search_string, to_extract, parent_iteration):
    content = []
    if search_by == filter_type.class_t:
        items = page.find_all(tag, class_=search_string)
    elif search_by == filter_type.text:
        items = page.find_all(tag, string=search_string)
    else:
        print("filter_type not known: ", search_by)
        return 0
    if not items:
        print("No Item found")
        return 0
    for it in items:
        # Climb up the DOM tree before extracting, if requested.
        for _ in range(parent_iteration):
            it = it.parent
        if to_extract == filter_type.href:
            content.append(it.get("href"))
        elif to_extract == filter_type.text:
            content.append(it.string.strip())
    return content
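

# Minimal end-to-end sketch. The URL, tags, and class names below are
# assumptions; adjust them to the actual comparis.ch markup before running.
if __name__ == "__main__":
    url = "https://www.comparis.ch/immobilien/marktplatz/mieten"
    while url:
        soup = scrap(url, "div", "result-row")
        # Collect the link of every result entry on the current page.
        links = filter(soup, "a", filter_type.class_t, "result-link",
                       filter_type.href, 0)
        if links != 0:
            for link in links:
                print(link)
        # Follow the pagination link labelled "Weiter" ("Next").
        url = page_iterator(url, "a", "pagination-next", 0, "Weiter")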