job-scrapper/lib/helpers.py
2024-07-22 10:54:28 +02:00

191 lines
5.8 KiB
Python

import string
import requests
from bs4 import BeautifulSoup
from enum import Enum
import re
from dateconverter import *
from datetime import datetime
# Module-wide switch for the diagnostic output produced by log().
DEBUG = True


def log(*s):
    """Print the given positional arguments when DEBUG is enabled.

    The arguments are printed as a single tuple (e.g. ``('name', 'x')``),
    not as space-separated values.
    """
    if not DEBUG:
        return
    print(s)
class mode:
    """Namespace listing the names of the mode flags.

    The attribute names mirror the keyword flags that ``finder`` reads from
    its ``**modes`` argument (plus ``DEFAULT``). Every attribute is defined
    with the value 0.
    """

    DEFAULT = 0
    # Flag names matching the keyword arguments looked up by finder().
    ATTRS = 0
    LINK = 0
    LOCATION_CLEANUP = 0
    SWAPDATE = 0
    CLEANDATE = 0
# English month names paired with their zero-padded two-digit month number,
# e.g. ('January', '01') ... ('December', '12').
months = [
    (month_name, "%02d" % month_number)
    for month_number, month_name in enumerate(
        (
            'January',
            'February',
            'March',
            'April',
            'May',
            'June',
            'July',
            'August',
            'September',
            'October',
            'November',
            'December',
        ),
        start=1,
    )
]
class item:
    """Describes one field to look up inside a scraped entry.

    Attributes:
        tag: Tag name handed to the element search.
        tag_content: Filter value for the search (used by ``finder`` either
            as the ``class_`` value or as the ``attrs`` argument).
        index: Position of the wanted match among all matches.
        name: Label used in log output; "not defined" when none is given.
    """

    def __init__(self, tag, tag_content, index, name=None):
        self.tag, self.tag_content, self.index = tag, tag_content, index
        # Only a missing name (None) gets the placeholder; "" is kept as is.
        self.name = "not defined" if name is None else name
class job:
    """Plain record holding the fields of one job posting."""

    def __init__(self, title, profession, company, location, date, description, link, tag, starred):
        # Descriptive fields of the posting.
        self.title = title
        self.profession = profession
        self.company = company
        self.location = location
        self.date = date
        self.description = description
        self.link = link
        # Bookkeeping fields; they are not part of the string form below.
        self.tag = tag
        self.starred = starred

    def __str__(self):
        """Return the posting as one pipe-separated line (without tag/starred)."""
        shown_fields = (
            self.title,
            self.profession,
            self.company,
            self.location,
            self.date,
            self.description,
            self.link,
        )
        return "%s| %s|%s|%s|%s|%s|%s" % shown_fields
def finder(results, item, **modes):
    """Collect one value per entry of ``results`` for the field described by ``item``.

    For every entry, all descendants matching ``item.tag`` are searched
    (filtered by CSS class, or by ``attrs`` when ``ATTRS=1``). The match at
    position ``item.index`` is then converted according to the selected mode.
    Exactly one element is appended per entry, so the returned list stays
    index-aligned with ``results``.

    Args:
        results: Iterable of elements offering BeautifulSoup's ``find_all``
            (presumably one tag per job posting).
        item: Object with ``tag``, ``tag_content``, ``index`` and ``name``.
        **modes: Optional flags, checked in this order of precedence:
            ATTRS=1 -- pass ``item.tag_content`` as ``attrs=`` instead of
                ``class_=`` (affects the search only).
            GETCHILDREN=<tag> -- take the stripped text of the first
                ``<tag>`` child found in any match, or
                ``"CHILD_NOT_FOUND: <tag>"``.
            LOCATION_CLEANUP=1 -- run the text through ``CleanLocation``.
            LINK=1 -- take the ``href`` attribute, prefixed with ``BASEURL``
                when that is given.
            SWAPDATE=1 -- run the text through ``DateCHToUS`` (not defined in
                this module; presumably provided by ``dateconverter``).
            CLEANDATE=1 -- run the text through ``jobs_ch_clean_date``.
            With no flag, the stripped text of the match is used.

    Returns:
        list: One value per entry. Entries without any match contribute
        today's date (``YYYY-MM-DD``) when ``item.tag_content`` is
        ``"pubdate"`` and ``"NOTFound"`` otherwise. In LINK mode the value
        is ``None`` for a tag without ``href``.
    """
    GETCHILDREN = modes.get("GETCHILDREN", '')
    ATTRS = modes.get('ATTRS', 0)
    LOCATION_CLEANUP = modes.get('LOCATION_CLEANUP', 0)
    LINK = modes.get('LINK', 0)
    SWAPDATE = modes.get('SWAPDATE', 0)
    CLEANDATE = modes.get('CLEANDATE', 0)
    BASEURL = modes.get('BASEURL', '')
    content = []
    log("name", item.name)
    log("Item tag: ", item.tag)
    log("Modes:", modes)
    log("tag_content: ", item.tag_content)

    for entry in results:
        if ATTRS == 1:
            matches = entry.find_all(item.tag, attrs=item.tag_content)
            log(item.tag_content)
        else:
            matches = entry.find_all(item.tag, class_=item.tag_content)
        log("found count results:", len(matches))

        if not matches:
            if DEBUG:
                log("No entry found for: ", item.name, item.tag, item.tag_content)
                # Debug aid: wait for Enter so the miss can be inspected.
                # EOFError is tolerated so non-interactive runs do not crash.
                # NOTE(review): set DEBUG to False to skip this pause.
                try:
                    input()
                except EOFError:
                    pass
            if item.tag_content == "pubdate":
                # A missing publication date falls back to today's date.
                content.append(datetime.today().strftime('%Y-%m-%d'))
            else:
                content.append("NOTFound")
            continue

        log("theres a result")
        # The index is resolved per entry: clamping it for one short entry
        # must not change which match is picked for the following entries.
        index = item.index
        last = len(matches) - 1
        if index > last:
            log("index", index, "out of bounds, falling back to last match at", last)
            index = last
        match = matches[index]

        if GETCHILDREN != '':
            child_text = None
            for candidate in matches:
                child = candidate.find(GETCHILDREN)
                log(child)
                if child is not None:
                    child_text = child.text.strip()
                    log("CHILD: ", child_text)
                    break
            if child_text is None:
                content.append("CHILD_NOT_FOUND: " + GETCHILDREN)
            else:
                content.append(child_text)
        elif LOCATION_CLEANUP == 1:
            content.append(CleanLocation(match.text.strip()))
        elif LINK == 1:
            href = match.get("href")
            # A tag without href yields None; only prefix real values so a
            # missing link does not raise TypeError.
            if BASEURL and href is not None:
                href = BASEURL + href
            content.append(href)
        elif SWAPDATE == 1:
            content.append(DateCHToUS(match.text.strip()))
        elif CLEANDATE == 1:
            content.append(jobs_ch_clean_date(match.text.strip()))
        else:
            log(match)
            content.append(match.text.strip())
    return content
def CleanLocation(location):
    """Strip postal codes and spaces from a location string.

    Removes every ``CH-`` followed by four digits, every remaining run of
    four digits, and every space character (including spaces inside the
    place name).

    NOTE(review): this function is defined a second time, with an identical
    body, further down in this module.
    """
    noise = re.compile('CH-[0-9]{4}|[0-9]{4}| ')
    return noise.sub('', location)
def arrayToClass(titles, companys, locations, dates, links, tag):
    """Combine parallel field lists into a list of ``job`` objects.

    Element ``i`` of every list describes the same posting. Profession and
    description are filled with the placeholders ``"test_prof"`` and
    ``"test_desc"``, and ``starred`` is set to 0.

    Args:
        titles, companys, locations, dates, links: Equally long sequences.
        tag: Value stored in the ``tag`` attribute of every created job.

    Returns:
        list of ``job`` on success (empty when all inputs are empty), or the
        integer 0 when the sequences differ in length.
    """
    columns = (titles, companys, locations, dates, links)
    lengths = [len(column) for column in columns]
    # links is part of the check as well: a shorter links list would
    # otherwise raise IndexError halfway through building the jobs.
    if len(set(lengths)) != 1:
        log("Something went wrong unequal length of data arrays: ", *lengths)
        return 0

    log("len:", len(titles))
    jobs = []
    for title, company, location, date, link in zip(*columns):
        posting = job(title, "test_prof", company, location, date, "test_desc", link, tag, 0)
        jobs.append(posting)
        log("class job:", posting)
    return jobs
def jobs_ch_clean_date(date):
    """Drop the leading label from a date text and convert the remainder.

    The first 11 characters are cut off unconditionally (the label
    "Published:" plus one following character); whatever is left is handed
    to ``jobs_ch_switch_month``, whose result is returned.
    """
    label_length = 11  # "Published:" and the character after it
    return jobs_ch_switch_month(date[label_length:])
def jobs_ch_switch_month(date):
    """Convert ``"<day> <Month> <year>"`` into ``"YYYY-MM-DD"``.

    The text is split on whitespace instead of being sliced at fixed
    offsets, so one-digit days (``"5 July 2024"``) are handled and
    zero-padded.

    Args:
        date: Date text with an English month name, e.g. ``"22 July 2024"``.

    Returns:
        str: The ISO-style date, e.g. ``"2024-07-22"``. A four-digit year is
        used as is; any other year keeps its last two characters prefixed
        with ``"20"``. A month name missing from ``months`` is left in place
        unchanged. Text that does not consist of exactly three
        whitespace-separated parts is returned unmodified.
    """
    parts = date.split()
    if len(parts) != 3:
        # Not in the expected shape: hand the text back untouched instead of
        # raising IndexError or assembling a garbage date.
        return date

    day, month_name, year = parts
    month_number = dict(months).get(month_name, month_name)
    if len(year) != 4:
        year = '20' + year[-2:]
    return '-'.join((year, month_number, day.zfill(2)))
def CleanLocation(location):
    """Return ``location`` without postal codes and without any spaces.

    Deleted are: ``CH-`` plus four digits, any other four-digit run, and
    all space characters.

    NOTE(review): an identical definition appears earlier in this module;
    being the later one, this definition is the one bound after import.
    """
    return re.sub(pattern='CH-[0-9]{4}|[0-9]{4}| ', repl='', string=location)
def extractDomain(url):
    """Return the ``scheme://host`` prefix of ``url``.

    The host must contain at least one dot. Matching stops at the first
    ``/``, ``?`` or ``#`` after the scheme, so dots inside the path can no
    longer drag path segments into the result, and URLs without any path
    (``"https://jobs.ch"``) are accepted. Both ``http`` and ``https`` are
    recognised.

    Args:
        url: URL text; it must start with the scheme.

    Returns:
        str: e.g. ``"https://www.jobs.ch"``, or the integer 0 when ``url``
        does not start with a matching scheme and dotted host.
    """
    pattern = r'https?://[^/?#\s]*\.[^/?#\s]+'
    domain = re.match(pattern, url)
    if domain:
        return domain.group()
    return 0
def makeSession(url):
    """Open a ``requests`` session, fetch ``url`` once and return the session.

    The response of the initial request is discarded; the request is made so
    the returned session carries whatever state that request produced
    (presumably cookies set by the site).

    The session is deliberately not created in a ``with`` block: leaving
    such a block closes the session, and the caller would receive an already
    closed object. The caller is responsible for closing it.

    Args:
        url: Address to request with the new session.

    Returns:
        requests.Session: The open session.

    Raises:
        Any exception raised by ``session.get``; the session is closed
        before the exception propagates.
    """
    session = requests.Session()
    try:
        session.get(url)
    except Exception:
        # Do not leak the connection pool when the first request fails.
        session.close()
        raise
    return session