diff --git a/scrap.py b/scrap.py
index fa39538..5a44d35 100644
--- a/scrap.py
+++ b/scrap.py
@@ -10,36 +10,43 @@ def scrap(url,html_element,class_t):
     page = requests.get(url)
     soup = BeautifulSoup(page.content,"html.parser")
     results = soup.find_all(html_element,class_=class_t)
+    print("class",class_t)
+
     return results
 
 #search for a "next button" or link given some DOM elements and returns the next url
-def page_iterator(base_url,main_container_type,main_container_class,next_type,next_class,string_parse)
+def page_iterator(base_url,main_container_type,main_container_class,next_type,next_class,string_parse):
     found = False
     page = requests.get(base_url)
     soup = BeautifulSoup(page.content,"html.parser")
     results = soup.find(main_container_type,class_=main_container_class)
-    pages = results.text
+    if results != None:
+        pages = results.text
+    else:
+        print("found nothing on page")
+        return 0
     next_url_names = soup.find_all(next_type,class_=next_class)
-    for i2 in next_url_names:
-        striped_string = i2.text.strip()
-        print(i2.text.strip(),"stripped:",striped_string)
-        # print("Printable characters?",striped_string.isprintable())
-        if (striped_string) == string_parse:
-            print(i2)
-            next_url = i2.get("href")
-            print("url of next site")
-            found = True
-            return next_url
-            break
-
+    if next_url_names != None:
+        for i2 in next_url_names:
+            striped_string = i2.text.strip()
+            print(i2.text.strip(),"stripped:",striped_string)
+            # print("Printable characters?",striped_string.isprintable())
+            if (striped_string) == string_parse:
+                print(i2)
+                next_url = i2.get("href")
+                print("url of next site")
+                found = True
+                return next_url
+                break
+    else:
+        found = False
     if found == False:
         print("No (more) elements found")
         return 0
 
 #url gets href param, text gets textparam stripped and formatted
-def filter(data,type_t,class_t,type_content)
+def filter(data,type_t,class_t,type_content):
     for entry in data:
         item = entry.find(type_t,class_=class_t)
        if item != None:
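
For reference, a minimal driver loop built on top of these helpers could look like the sketch below. It is illustrative only: the start URL and the element/class selectors are hypothetical placeholders, page_iterator is assumed to return 0 once no further "next" link is found (as in the change above), and the return value of filter is not visible in this hunk, so the sketch does not rely on it.

    # sketch.py -- usage illustration only, not part of the change above
    from scrap import scrap, page_iterator, filter

    url = "https://example.com/listing"   # hypothetical start page
    while url:
        # collect all entry containers on the current page (selectors are placeholders)
        results = scrap(url, "article", "post")
        # pull e.g. the link text out of each entry; what filter returns is not shown in this diff
        filter(results, "a", "title", "text")
        # follow the "Next" link; page_iterator returns 0 when no further page exists
        # note: the href it returns may be relative and might need urljoin before the next request
        url = page_iterator(url, "div", "pagination", "a", "next-page", "Next")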