2024-03-28 09:54:08 +00:00
|
|
|
#-*- coding: utf-8 -*-
|
|
|
|
import requests
|
|
|
|
import string
|
|
|
|
import csv
|
|
|
|
import numpy as np
|
|
|
|
from bs4 import BeautifulSoup
|
|
|
|
|
|
|
|
#returns all results for an html-element selected by class from a url
|
|
|
|
def scrap(url, html_element, class_t):
    """Fetch *url* and return every element of tag *html_element* carrying CSS class *class_t*.

    Parameters:
        url: page to download.
        html_element: tag name to search for (e.g. "div", "a").
        class_t: CSS class to filter by.

    Returns:
        A bs4 ResultSet of matching elements (possibly empty).

    Raises:
        requests.RequestException: on network failure or timeout.
        requests.HTTPError: on a non-2xx response.
    """
    # timeout prevents the request from hanging indefinitely on a dead host
    page = requests.get(url, timeout=30)
    # fail loudly on HTTP errors instead of silently parsing an error page
    page.raise_for_status()

    soup = BeautifulSoup(page.content, "html.parser")

    results = soup.find_all(html_element, class_=class_t)

    # debug trace kept from the original implementation
    print("class", class_t)

    return results
|
|
|
|
|
|
|
|
#search for a "next button" or link given some DOM elements and returns the next url
|
2024-04-02 09:37:58 +00:00
|
|
|
def page_iterator(base_url, main_container_type, main_container_class, next_type, next_class, string_parse):
    """Find the "next page" link on *base_url* and return its href.

    First verifies that the main content container exists on the page, then
    scans candidate "next" elements for one whose stripped text equals
    *string_parse* and returns that element's href.

    Parameters:
        base_url: page to inspect.
        main_container_type / main_container_class: tag and class of the main
            content container; used as a sanity check that the page has content.
        next_type / next_class: tag and class of candidate "next" links.
        string_parse: exact (stripped) link text identifying the next button.

    Returns:
        The href of the next page, or 0 when the container is missing or no
        matching link is found.
    """
    page = requests.get(base_url, timeout=30)
    soup = BeautifulSoup(page.content, "html.parser")

    results = soup.find(main_container_type, class_=main_container_class)

    # BUG FIX: the original tested `pages` before it was ever assigned
    # (NameError); the intended check is whether the container was found.
    if results is None:
        print("found nothing on page")
        return 0

    next_url_names = soup.find_all(next_type, class_=next_class)

    # find_all returns a (possibly empty) list, never None, so iterating
    # directly is safe
    for candidate in next_url_names:
        striped_string = candidate.text.strip()
        print(candidate.text.strip(), "stripped:", striped_string)

        if striped_string == string_parse:
            print(candidate)
            next_url = candidate.get("href")
            print("url of next site")
            # original set a `found` flag and had a dead `break` after this
            # return; returning directly is equivalent
            return next_url

    print("No (more) elements found")
    return 0
|
|
|
|
|
|
|
|
#"url" returns the href attributes, "text" returns the text content stripped and formatted
|
2024-04-02 09:37:58 +00:00
|
|
|
def filter(data, type_t, class_t, type_content):
    """Extract hrefs or text from a collection of scraped elements.

    NOTE: the name shadows the builtin ``filter`` but is kept for
    backward compatibility with existing callers.

    Parameters:
        data: iterable of bs4 elements (e.g. the result of ``scrap``).
        type_t: tag name to search for inside each entry.
        class_t: CSS class to filter by.
        type_content: "url" to collect href attributes, "text" to collect
            stripped text; any other value returns the raw matches of the
            first entry (original behavior).

    Returns:
        A list of hrefs or stripped strings collected across ALL entries,
        the raw match list for an unknown *type_content*, or 0 when *data*
        is empty and *type_content* is unknown.
    """
    text = []
    url = []

    for entry in data:
        # find_all returns a (possibly empty) list, never None; the
        # original None checks were dead code
        item = entry.find_all(type_t, class_=class_t)

        if type_content == "url":
            for it in item:
                url.append(it.get("href"))
        elif type_content == "text":
            for it in item:
                text.append(it.text.strip())
        else:
            # unknown content type: return the raw matches (as before)
            return item

    # BUG FIX: the original returned from inside the loop after the FIRST
    # entry; results are now accumulated across every entry in *data*.
    if type_content == "url":
        return url
    if type_content == "text":
        return text
    return 0
|
|
|
|
|