ccppi 2024-04-03 11:03:21 +02:00
parent 6598218abe
commit 8bf6299f7f
2 changed files with 46 additions and 32 deletions

main.py (11 changes)

@@ -1,5 +1,8 @@
 import scrap
-content = scrap.scrap("https://www.comparis.ch/immobilien/marktplatz/lenzburg/mieten",'div','css-1d60yla ehesakb4')
-content = scrap.filter(content,"p","Wohnung","text")
-print(content)
+from scrap import filter_type
+page = scrap.scrap("https://www.comparis.ch/immobilien/marktplatz/lenzburg/mieten",'div','css-1d60yla ehesakb4')
+#print(content)
+content2 = scrap.filter(page,'p',filter_type.text,"Wohnung",filter_type.href,5)
+print(content2)
+url = scrap.filter(page,'svg',filter_type.class_t,"fa-arrow-right",filter_type.href,0)
+print("url:",url)
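Read off this hunk, scrap.filter() now takes six positional arguments instead of four. Below is an annotated restatement of the new main.py calls; the parameter readings in the comments are inferred from the scrap.py hunk further down, not documentation shipped with the commit.

import scrap
from scrap import filter_type

# scrap() now returns the whole parsed page; filtering happens afterwards.
page = scrap.scrap(
    "https://www.comparis.ch/immobilien/marktplatz/lenzburg/mieten",
    'div',                    # still accepted, but no longer used for filtering
    'css-1d60yla ehesakb4',
)

# filter(page, tag, search_by, search_string, to_extract, parent_iteration)
content2 = scrap.filter(
    page,               # BeautifulSoup object returned by scrap()
    'p',                # tag to search for
    filter_type.text,   # match the tag by its text ...
    "Wohnung",          # ... equal to "Wohnung"
    filter_type.href,   # extract the href attribute ...
    5,                  # ... after walking 5 parents up the DOM
)
print(content2)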

scrap.py

@@ -4,27 +4,34 @@ import string
 import csv
 import numpy as np
 from bs4 import BeautifulSoup
+from enum import Enum
+
+class filter_type(Enum):
+    href = 1
+    text = 2
+    class_t = 3
+
 #returns all result for a html-element by class from a url
 def scrap(url,html_element,class_t):
     page = requests.get(url)
     soup = BeautifulSoup(page.content,"html.parser")
-    results = soup.find_all(html_element,class_=class_t)
-    print("class",class_t)
-    return results
+    # results = soup.find_all(html_element,class_=class_t)
+    # print("class",class_t)
+    # print("results:",results)
+    return soup
 #search for a "next button" or link given som DOM elements and returns the next url
-def page_iterator(base_url,main_container_type,main_container_class,next_type,next_class,string_parse):
+def page_iterator(base_url,next_type,next_class,parent_iteration,string_parse):
     found = False
     page = requests.get(base_url)
     soup = BeautifulSoup(page.content,"html.parser")
-    results = soup.find(main_container_type,class_=main_container_class)
-    if pages != None:
-        pages = results.text
-    else:
-        print("found nothing on page")
-        return 0
+    # results = soup.find(main_container_type,class_=main_container_class)
+    # if pages != None:
+    #     pages = results.text
+    # else:
+    #     print("found nothing on page")
+    #     return 0
     next_url_names = soup.find_all(next_type,class_=next_class)
     if next_url_names != None:
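Taken together, this hunk separates fetching from filtering: scrap() now only downloads and parses the page, and the new filter_type Enum names how the later filter() call should match elements and what it should extract. A self-contained sketch of that pattern, using an invented HTML snippet so it runs without a network request:

from enum import Enum
from bs4 import BeautifulSoup

class filter_type(Enum):
    href = 1     # extract the href attribute
    text = 2     # match a tag by its text
    class_t = 3  # match a tag by its CSS class

# Stand-in for what the new scrap() returns: a parsed page.
html = '<div><a href="/wohnung/42"><p>Wohnung</p></a></div>'
soup = BeautifulSoup(html, "html.parser")

# filter_type.text style lookup: find the <p> whose text is "Wohnung",
# walk one parent up, and read that ancestor's href.
match = soup.find_all("p", string="Wohnung")[0]
print(match.parent.get("href"))   # -> /wohnung/42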
@@ -46,22 +53,26 @@ def page_iterator(base_url,main_container_type,main_container_class,next_type,next_class,string_parse):
         return 0
 #url gets href param, text gets textparam stripped and formated
-def filter(data,type_t,class_t,type_content):
-    text=[]
-    url=[]
-    for entry in data:
-        item = entry.find_all(type_t,class_=class_t)
-        if item != None:
-            if type_content == "url":
-                for it in item:
-                    url.append(it.get("href"))
-                return url
-            if type_content == "text":
-                for it in item:
-                    text.append(it.text.strip())
-                return text
-            else:
-                return item
-        if item == None:
-            return 0
+def filter(page,tag,search_by,search_string,to_extract,parent_iteration):
+    content=[]
+    if search_by == filter_type.class_t:
+        item = page.find_all(tag,class_=search_string)
+    if search_by == filter_type.text:
+        item = page.find_all(tag,string=search_string)
+    else:
+        print("filter_type not known: ",search_by)
+        return 0
+    if item != None:
+        for it in item:
+            for i in range(0,parent_iteration):
+                it = it.parent
+            if to_extract == filter_type.href:
+                content.append(it.get("href"))
+            if to_extract == filter_type.text:
+                content.append(it.string.strip())
+        return content
+    if item == None:
+        print("No Item found")
+        return 0
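The rewritten filter() dispatches on search_by to pick the BeautifulSoup lookup, climbs parent_iteration levels up the DOM with .parent, and then pulls out either the href attribute or the stripped text. A minimal offline check of the class_t-plus-href path; the HTML snippet and the elif-based dispatch are illustrative choices, not code from the commit:

from enum import Enum
from bs4 import BeautifulSoup

class filter_type(Enum):
    href = 1
    text = 2
    class_t = 3

def extract(page, tag, search_by, search_string, to_extract, parent_iteration):
    # Same idea as the new scrap.filter(), sketched with elif dispatch.
    if search_by == filter_type.class_t:
        items = page.find_all(tag, class_=search_string)
    elif search_by == filter_type.text:
        items = page.find_all(tag, string=search_string)
    else:
        return []
    content = []
    for it in items:
        for _ in range(parent_iteration):   # climb parent_iteration levels up the DOM
            it = it.parent
        if to_extract == filter_type.href:
            content.append(it.get("href"))
        elif to_extract == filter_type.text:
            content.append(it.string.strip())
    return content

html = '<a href="?page=2"><svg class="fa-arrow-right"></svg></a>'
soup = BeautifulSoup(html, "html.parser")
# The <svg> itself carries no href, so walk one level up to the enclosing <a>.
print(extract(soup, "svg", filter_type.class_t, "fa-arrow-right", filter_type.href, 1))   # -> ['?page=2']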