comparis-scrapper/scrap.py

# -*- coding: utf-8 -*-
from enum import Enum

import requests
from bs4 import BeautifulSoup


# Selects what a DOM query matches on (class or visible text) and what it
# extracts from a match (href attribute or text content).
class filter_type(Enum):
    href = 1
    text = 2
    class_t = 3


# Fetches a URL and returns the parsed document as a BeautifulSoup object.
# Note: html_element and class_t are currently unused; the per-element
# lookup is left commented out and filtering happens in filter() below.
def scrap(url, html_element, class_t):
    page = requests.get(url)
    soup = BeautifulSoup(page.content, "html.parser")
    # results = soup.find_all(html_element, class_=class_t)
    return soup
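
# Example usage (the URL and class name below are hypothetical):
#   soup = scrap("https://www.comparis.ch/immobilien/marktplatz", "div", "result-row")
#   print(soup.title.string)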


# Searches a page for a "next" button or link, given the element tag, its
# class, and its visible label text, and returns the href of the next page
# (or 0 when no further page is found). parent_iteration is currently unused.
def page_iterator(base_url, next_type, next_class, parent_iteration, string_parse):
    page = requests.get(base_url)
    soup = BeautifulSoup(page.content, "html.parser")
    # find_all returns a (possibly empty) list, never None.
    next_url_names = soup.find_all(next_type, class_=next_class)
    for candidate in next_url_names:
        stripped_string = candidate.text.strip()
        if stripped_string == string_parse:
            # The matching element carries the link to the next page.
            return candidate.get("href")
    print("No (more) elements found")
    return 0
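
# Example crawl loop (hypothetical selectors; the pagination link is assumed
# to be an <a> element labelled "Weiter"):
#   url = "https://www.comparis.ch/immobilien/marktplatz?page=1"
#   while url:
#       soup = scrap(url, "div", "result-row")
#       url = page_iterator(url, "a", "pagination-next", 0, "Weiter")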


# Finds elements by class or by text, optionally climbs parent_iteration
# levels up the DOM tree, then extracts either the href attribute or the
# stripped text of each match.
def filter(page, tag, search_by, search_string, to_extract, parent_iteration):
    content = []
    if search_by == filter_type.class_t:
        items = page.find_all(tag, class_=search_string)
    elif search_by == filter_type.text:
        items = page.find_all(tag, string=search_string)
    else:
        print("filter_type not known: ", search_by)
        return 0
    if not items:
        print("No Item found")
        return 0
    for it in items:
        # Climb up the DOM tree before extracting, if requested.
        for _ in range(parent_iteration):
            it = it.parent
        if to_extract == filter_type.href:
            content.append(it.get("href"))
        elif to_extract == filter_type.text:
            content.append(it.string.strip())
    return content
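

# Minimal end-to-end sketch. The URL, tags, and class names below are
# assumptions; adjust them to the actual comparis.ch markup before running.
if __name__ == "__main__":
    url = "https://www.comparis.ch/immobilien/marktplatz/mieten"
    while url:
        soup = scrap(url, "div", "result-row")
        # Collect the link of every result entry on the current page.
        links = filter(soup, "a", filter_type.class_t, "result-link",
                       filter_type.href, 0)
        if links != 0:
            for link in links:
                print(link)
        # Follow the pagination link labelled "Weiter" ("Next").
        url = page_iterator(url, "a", "pagination-next", 0, "Weiter")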