From 8bf6299f7f6296bec266327629eac63a4e780d13 Mon Sep 17 00:00:00 2001
From: ccppi
Date: Wed, 3 Apr 2024 11:03:21 +0200
Subject: [PATCH] Rework filter() around a filter_type enum

scrap() now returns the parsed soup instead of pre-filtered results.
filter() takes a filter_type enum to pick the search key (text or
class) and the attribute to extract (href or text), and can climb a
configurable number of parent elements before extracting.

---
 main.py  | 11 ++++++----
 scrap.py | 67 +++++++++++++++++++++++++++++++++-----------------------
 2 files changed, 46 insertions(+), 32 deletions(-)

diff --git a/main.py b/main.py
index fbbc281..69163ff 100644
--- a/main.py
+++ b/main.py
@@ -1,5 +1,8 @@
 import scrap
-
-content = scrap.scrap("https://www.comparis.ch/immobilien/marktplatz/lenzburg/mieten",'div','css-1d60yla ehesakb4')
-content = scrap.filter(content,"p","Wohnung","text")
-print(content)
+from scrap import filter_type
+page = scrap.scrap("https://www.comparis.ch/immobilien/marktplatz/lenzburg/mieten",'div','css-1d60yla ehesakb4')
+#print(content)
+content2 = scrap.filter(page,'p',filter_type.text,"Wohnung",filter_type.href,5)
+print(content2)
+url = scrap.filter(page,'svg',filter_type.class_t,"fa-arrow-right",filter_type.href,0)
+print("url:",url)
diff --git a/scrap.py b/scrap.py
index abf7d5a..93e4094 100644
--- a/scrap.py
+++ b/scrap.py
@@ -4,27 +4,34 @@ import string
 import csv
 import numpy as np
 from bs4 import BeautifulSoup
+from enum import Enum
+
+class filter_type(Enum):
+    href = 1
+    text = 2
+    class_t = 3
 
 
 #returns all results for an html element by class from a url
 def scrap(url,html_element,class_t):
     page = requests.get(url)
     soup = BeautifulSoup(page.content,"html.parser")
-    results = soup.find_all(html_element,class_=class_t)
-    print("class",class_t)
-    return results
+#    results = soup.find_all(html_element,class_=class_t)
+#    print("class",class_t)
+#    print("results:",results)
+    return soup
 
 #search for a "next button" or link given some DOM elements and return the next url
-def page_iterator(base_url,main_container_type,main_container_class,next_type,next_class,string_parse):
+def page_iterator(base_url,next_type,next_class,parent_iteration,string_parse):
     found = False
     page = requests.get(base_url)
     soup = BeautifulSoup(page.content,"html.parser")
-    results = soup.find(main_container_type,class_=main_container_class)
-    if pages != None:
-        pages = results.text
-    else:
-        print("found nothing on page")
-        return 0
+#    results = soup.find(main_container_type,class_=main_container_class)
+#    if pages != None:
+#        pages = results.text
+#    else:
+#        print("found nothing on page")
+#        return 0
 
     next_url_names = soup.find_all(next_type,class_=next_class)
     if next_url_names != None:
@@ -46,22 +53,26 @@ def page_iterator(base_url,main_container_type,main_container_class,next_type,ne
     return 0
 
 #url gets href param, text gets text param stripped and formatted
-def filter(data,type_t,class_t,type_content):
-    text=[]
-    url=[]
-    for entry in data:
-        item = entry.find_all(type_t,class_=class_t)
-        if item != None:
-            if type_content == "url":
-                for it in item:
-                    url.append(it.get("href"))
-                return url
-            if type_content == "text":
-                for it in item:
-                    text.append(it.text.strip())
-                return text
-            else:
-                return item
-        if item == None:
-            return 0
+def filter(page,tag,search_by,search_string,to_extract,parent_iteration):
+    content=[]
+    if search_by == filter_type.class_t:
+        item = page.find_all(tag,class_=search_string)
+    elif search_by == filter_type.text:
+        item = page.find_all(tag,string=search_string)
+    else:
+        print("filter_type not known:",search_by)
+        return 0
+    if item:
+        for it in item:
+            for i in range(0,parent_iteration):
+                it = it.parent
+            if to_extract == filter_type.href:
+                content.append(it.get("href"))
+            if to_extract == filter_type.text:
+                content.append(it.get_text().strip())
+        return content
+    else:
+        print("No Item found")
+        return 0
+
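
Usage sketch (not part of the diff): a minimal offline example of the new
filter() semantics, using a hypothetical inline HTML snippet so that no
request to comparis.ch is needed; it assumes the patched scrap.py is
importable.

    from bs4 import BeautifulSoup
    import scrap
    from scrap import filter_type

    # Hypothetical markup standing in for a listing card.
    html = '<div><a href="/flat/1"><p>Wohnung</p></a></div>'
    page = BeautifulSoup(html, "html.parser")

    # Search <p> by exact text, climb one parent to the enclosing <a>,
    # then extract its href attribute.
    print(scrap.filter(page, 'p', filter_type.text, "Wohnung", filter_type.href, 1))
    # ['/flat/1']

    # Same search, but extract the stripped text itself (no climbing).
    print(scrap.filter(page, 'p', filter_type.text, "Wohnung", filter_type.text, 0))
    # ['Wohnung']

The parent_iteration hop is what lets a match on an inner tag yield an
attribute of an enclosing tag; main.py above uses it the same way to climb
from a matched <p> to a surrounding link.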