create functions

This commit is contained in:
ccppi 2024-04-02 11:37:58 +02:00
parent 24fcd9ddf0
commit aaff6694fd

View File

@ -10,36 +10,43 @@ def scrap(url,html_element,class_t):
page = requests.get(url) page = requests.get(url)
soup = BeautifulSoup(page.content,"html.parser") soup = BeautifulSoup(page.content,"html.parser")
results = soup.find_all(html_element,class_=class_t) results = soup.find_all(html_element,class_=class_t)
print("class",class_t)
return results return results
#search for a "next button" or link given some DOM elements and return the next url #search for a "next button" or link given some DOM elements and return the next url
def page_iterator(base_url,main_container_type,main_container_class,next_type,next_class,string_parse):
    """Return the URL of the "next page" link found on ``base_url``, or 0.

    The page at ``base_url`` is downloaded with ``requests.get`` and parsed
    with BeautifulSoup's ``html.parser``. The function first checks that a
    container element (``main_container_type`` with CSS class
    ``main_container_class``) exists, then scans every ``next_type`` element
    with CSS class ``next_class`` and picks the first one whose stripped
    text equals ``string_parse``.

    Args:
        base_url: URL of the page to download.
        main_container_type: tag name of the container that must be present.
        main_container_class: CSS class of that container.
        next_type: tag name of the candidate "next" elements.
        next_class: CSS class of the candidate "next" elements.
        string_parse: exact (already stripped) text that identifies the
            "next" element.

    Returns:
        The ``href`` attribute of the first matching element (whatever
        ``element.get("href")`` yields), or ``0`` when the container is
        missing or no candidate matches.
    """
    page = requests.get(base_url)
    soup = BeautifulSoup(page.content,"html.parser")

    # find() yields None when nothing matches, so the guard must test the
    # lookup result itself before anything else touches it.
    results = soup.find(main_container_type,class_=main_container_class)
    if results is None:
        print("found nothing on page")
        return 0

    # find_all() always yields a (possibly empty) list, never None, so the
    # loop simply does not run when there are no candidates.
    for i2 in soup.find_all(next_type,class_=next_class):
        striped_string = i2.text.strip()
        print(striped_string,"stripped:",striped_string)
        if striped_string == string_parse:
            print(i2)
            next_url = i2.get("href")
            print("url of next site")
            return next_url

    print("No (more) elements found")
    return 0
#url gets href param, text gets textparam stripped and formatted #url gets href param, text gets textparam stripped and formatted
def filter(data,type_t,class_t,type_content) def filter(data,type_t,class_t,type_content):
for entry in data: for entry in data:
item = entry.find(type_t,class_=class_t) item = entry.find(type_t,class_=class_t)
if item != None: if item != None: