create functions
parent 24fcd9ddf0
commit aaff6694fd
scrap.py (37 changed lines)
@@ -10,36 +10,43 @@ def scrap(url,html_element,class_t):
     page = requests.get(url)
     soup = BeautifulSoup(page.content,"html.parser")
     results = soup.find_all(html_element,class_=class_t)
-
+    print("class",class_t)
     return results

 #search for a "next button" or link given som DOM elements and returns the next url
-def page_iterator(base_url,main_container_type,main_container_class,next_type,next_class,string_parse)
+def page_iterator(base_url,main_container_type,main_container_class,next_type,next_class,string_parse):
     found = False

     page = requests.get(base_url)
     soup = BeautifulSoup(page.content,"html.parser")
     results = soup.find(main_container_type,class_=main_container_class)
-    pages = results.text
+    if pages != None:
+        pages = results.text
+    else:
+        print("found nothing on page")
+        return 0

     next_url_names = soup.find_all(next_type,class_=next_class)
-    for i2 in next_url_names:
-        striped_string = i2.text.strip()
-        print(i2.text.strip(),"stripped:",striped_string)
-        # print("Printable characters?",striped_string.isprintable())
-        if (striped_string) == string_parse:
-            print(i2)
-            next_url = i2.get("href")
-            print("url of next site")
-            found = True
-            return next_url
-            break
+    if next_url_names != None:
+        for i2 in next_url_names:
+            striped_string = i2.text.strip()
+            print(i2.text.strip(),"stripped:",striped_string)
+            # print("Printable characters?",striped_string.isprintable())
+            if (striped_string) == string_parse:
+                print(i2)
+                next_url = i2.get("href")
+                print("url of next site")
+                found = True
+                return next_url
+                break
+            else:
+                found = False
     if found == False:
         print("No (more) elements found")
         return 0

 #url gets href param, text gets textparam stripped and formated
-def filter(data,type_t,class_t,type_content)
+def filter(data,type_t,class_t,type_content):
     for entry in data:
         item = entry.find(type_t,class_=class_t)
         if item != None:
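For orientation, below is a minimal usage sketch of the two helpers this commit introduces; it is not part of the repository. The URL, element names, CSS classes, and the "Next" label are made-up placeholders, and the import assumes the top of scrap.py (outside this hunk) already brings in requests and BeautifulSoup. Note that at this revision page_iterator tests pages before assigning it, so the sketch may not run end to end until that is fixed.

# usage_sketch.py -- illustrative only; the URL and all selectors are hypothetical
from scrap import scrap, page_iterator

base_url = "https://example.com/catalog?page=1"   # placeholder listing page

# collect every <article class="product"> element on the first page
entries = scrap(base_url, "article", "product")
print(len(entries), "entries found")

# look for a link whose stripped text equals "Next" inside the pagination block
next_url = page_iterator(base_url, "div", "pagination", "a", "next-link", "Next")
if next_url:    # page_iterator returns 0 when nothing matches
    print("next page would be:", next_url)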