soidfj
commit 8bf6299f7f
parent 6598218abe
main.py (11 changed lines)

@@ -1,5 +1,8 @@
 import scrap
-
-content = scrap.scrap("https://www.comparis.ch/immobilien/marktplatz/lenzburg/mieten",'div','css-1d60yla ehesakb4')
-content = scrap.filter(content,"p","Wohnung","text")
-print(content)
+from scrap import filter_type
+page = scrap.scrap("https://www.comparis.ch/immobilien/marktplatz/lenzburg/mieten",'div','css-1d60yla ehesakb4')
+#print(content)
+content2 = scrap.filter(page,'p',filter_type.text,"Wohnung",filter_type.href,5)
+print(content2)
+url = scrap.filter(page,'svg',filter_type.class_t,"fa-arrow-right",filter_type.href,0)
+print("url:",url)
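The rewritten main.py drives the new enum-based filter in scrap.py: fetch and parse the page once, then search tags by a filter_type criterion, climb parent_iteration ancestors, and extract an attribute. A minimal self-contained sketch of that same logic (URL, tag names, and parameters taken from the diff; requests and beautifulsoup4 assumed available):

    import requests
    from bs4 import BeautifulSoup

    URL = "https://www.comparis.ch/immobilien/marktplatz/lenzburg/mieten"

    # Fetch and parse the page, as the new scrap.scrap() does.
    soup = BeautifulSoup(requests.get(URL).content, "html.parser")

    # Mirrors filter(page,'p',filter_type.text,"Wohnung",filter_type.href,5):
    # find <p> tags whose string is "Wohnung", walk 5 parents up the tree,
    # then read the ancestor's href attribute.
    links = []
    for tag in soup.find_all("p", string="Wohnung"):
        node = tag
        for _ in range(5):
            node = node.parent
        links.append(node.get("href"))
    print(links)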
scrap.py (61 changed lines)

@@ -4,27 +4,34 @@ import string
 import csv
 import numpy as np
 from bs4 import BeautifulSoup
+from enum import Enum
+
+class filter_type(Enum):
+    href = 1
+    text = 2
+    class_t = 3
 
 #returns all results for an html-element by class from a url
 def scrap(url,html_element,class_t):
     page = requests.get(url)
     soup = BeautifulSoup(page.content,"html.parser")
-    results = soup.find_all(html_element,class_=class_t)
-    print("class",class_t)
-    return results
+    # results = soup.find_all(html_element,class_=class_t)
+    # print("class",class_t)
+    # print("results:",results)
+    return soup
 
 #search for a "next button" or link given some DOM elements and returns the next url
-def page_iterator(base_url,main_container_type,main_container_class,next_type,next_class,string_parse):
+def page_iterator(base_url,next_type,next_class,parent_iteration,string_parse):
     found = False
 
     page = requests.get(base_url)
     soup = BeautifulSoup(page.content,"html.parser")
-    results = soup.find(main_container_type,class_=main_container_class)
-    if pages != None:
-        pages = results.text
-    else:
-        print("found nothing on page")
-        return 0
+    # results = soup.find(main_container_type,class_=main_container_class)
+    # if pages != None:
+    #     pages = results.text
+    # else:
+    #     print("found nothing on page")
+    #     return 0
 
     next_url_names = soup.find_all(next_type,class_=next_class)
     if next_url_names != None:
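The filter_type Enum above replaces the old "url"/"text" string flags; callers now pass enum members both for how to search (text or class_t) and what to extract (href or text). A small sketch of the comparison semantics the reworked filter() relies on:

    from enum import Enum

    class filter_type(Enum):  # as introduced in this commit
        href = 1
        text = 2
        class_t = 3

    search_by = filter_type.text
    print(search_by == filter_type.text)     # True: members compare by identity
    print(search_by == filter_type.class_t)  # False
    print(search_by == 2)                    # False: ints never equal Enum members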
@@ -46,22 +53,26 @@ def page_iterator(base_url,main_container_type,main_container_class,next_type,next_class,string_parse):
         return 0
 
 #url gets href param, text gets text param stripped and formatted
-def filter(data,type_t,class_t,type_content):
-    text=[]
-    url=[]
-    for entry in data:
-        item = entry.find_all(type_t,class_=class_t)
-        if item != None:
-            if type_content == "url":
-                for it in item:
-                    url.append(it.get("href"))
-                return url
-            if type_content == "text":
-                for it in item:
-                    text.append(it.text.strip())
-                return text
+def filter(page,tag,search_by,search_string,to_extract,parent_iteration):
+    content=[]
+    if search_by == filter_type.class_t:
+        item = page.find_all(tag,class_=search_string)
+    elif search_by == filter_type.text:
+        item = page.find_all(tag,string=search_string)
+    else:
+        print("filter_type not known: ",search_by)
+        return 0
+    if item != None:
+        for it in item:
+            for i in range(0,parent_iteration):
+                it = it.parent
+            if to_extract == filter_type.href:
+                content.append(it.get("href"))
+            if to_extract == filter_type.text:
+                content.append(it.string.strip())
+        return content
+    if item == None:
+        print("No Item found")
+        return 0
 
 
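Since scrap() now returns the whole soup, filter() can be applied repeatedly to a single download. A usage sketch of the new search-climb-extract pattern against hypothetical HTML (invented here for illustration, not taken from the target site):

    from bs4 import BeautifulSoup

    html = """
    <div class="listing"><a href="/wohnung/1"><p>Wohnung</p></a></div>
    <div class="listing"><a href="/haus/2"><p>Haus</p></a></div>
    """
    soup = BeautifulSoup(html, "html.parser")

    # filter(soup,'p',filter_type.text,"Wohnung",filter_type.href,1) searches
    # <p> tags by string, climbs one parent to the <a>, and reads its href:
    for tag in soup.find_all("p", string="Wohnung"):
        anchor = tag.parent        # parent_iteration = 1
        print(anchor.get("href"))  # -> /wohnung/1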