job-scrapper/lib/helpers.py

167 lines
4.8 KiB
Python

import string
import requests
from bs4 import BeautifulSoup
from enum import Enum
import re
from dateconverter import *
from datetime import datetime
# Module-wide switch: set to True to enable the debug output of log().
DEBUG = False


def log(*s):
    """Print the given values when DEBUG is enabled; otherwise do nothing.

    The positional arguments are not joined or formatted: they are handed
    to print() as one tuple, so the output looks like ``('len:', 3)``.
    """
    if not DEBUG:
        return
    print(s)
class mode:
    """Namespace of mode-flag constants.

    Apart from DEFAULT, the names match the keyword flags that finder()
    looks up in its ``**modes`` argument. Every constant is 0.
    """

    # Assigned left to right, so the attribute order matches the names below.
    DEFAULT = LINK = LOCATION_CLEANUP = SWAPDATE = CLEANDATE = ATTRS = 0
# Lookup table of (English month name, zero-padded month number) pairs in
# calendar order; jobs_ch_switch_month() uses it to translate month names.
months = [
    (name, "%02d" % number)
    for number, name in enumerate(
        (
            'January',
            'February',
            'March',
            'April',
            'May',
            'June',
            'July',
            'August',
            'September',
            'October',
            'November',
            'December',
        ),
        start=1,
    )
]
class item:
    """Selector telling finder() which element to pull out of an entry.

    Attributes:
        tag: tag name used for the element search in finder().
        tag_content: value finder() passes as the ``class_`` filter, or as
            ``attrs`` when its ATTRS flag is 1.
        index: position of the wanted element among the matches.
    """

    def __init__(self, tag, tag_content, index):
        self.tag, self.tag_content, self.index = tag, tag_content, index
class job:
    """Record holding the fields of a single job posting."""

    def __init__(self, title, profession, company, location, date,
                 description, link, tag, starred):
        # What the posting says.
        self.title = title
        self.profession = profession
        self.company = company
        self.location = location
        self.date = date
        self.description = description
        self.link = link
        # Bookkeeping values; they are not part of the __str__ output.
        self.tag = tag
        self.starred = starred

    def __str__(self):
        """Return the pipe-separated summary line of the posting.

        Only title, profession, company, location, date, description and
        link are included, in that order.
        """
        fields = (
            self.title,
            self.profession,
            self.company,
            self.location,
            self.date,
            self.description,
            self.link,
        )
        return "%s| %s|%s|%s|%s|%s|%s" % fields
def finder(results,item,**modes):
    """Collect one value per entry of ``results`` as described by ``item``.

    For every entry the elements matching ``item.tag`` (filtered by
    ``item.tag_content``) are searched, the one at position ``item.index``
    is picked, and a value derived from it is appended to the result list.
    If an entry has fewer matches than ``item.index`` requires, the last
    match of *that entry* is used. The requested index is re-evaluated for
    every entry, so one short entry no longer lowers the index used for
    all following entries.

    Args:
        results: iterable of parsed entries exposing ``find_all``
            (presumably BeautifulSoup tags -- confirm against callers).
        item: selector object with ``tag``, ``tag_content`` and ``index``.
        **modes: optional flags; a flag is active when it equals 1. When
            several are set, the first of LOCATION_CLEANUP, LINK, SWAPDATE,
            CLEANDATE wins.
            ATTRS: pass ``item.tag_content`` as ``attrs`` instead of
                ``class_``.
            LOCATION_CLEANUP: run the stripped text through CleanLocation().
            LINK: take the element's ``href`` instead of its text.
            SWAPDATE: run the stripped text through DateCHToUS().
            CLEANDATE: run the stripped text through jobs_ch_clean_date().
            BASEURL: string prepended to the href in LINK mode (default '').

    Returns:
        A list with exactly one value per entry. Entries without any match
        contribute today's date (``YYYY-MM-DD``) when ``item.tag_content``
        is ``"pubdate"`` and the string ``"NOTFound"`` otherwise.
    """
    use_attrs = modes.get('ATTRS', 0) == 1
    clean_location = modes.get('LOCATION_CLEANUP', 0) == 1
    want_link = modes.get('LINK', 0) == 1
    swap_date = modes.get('SWAPDATE', 0) == 1
    clean_date = modes.get('CLEANDATE', 0) == 1
    base_url = modes.get('BASEURL', '')

    content = []
    log("Modes:", modes)

    for entry in results:
        if use_attrs:
            matches = entry.find_all(item.tag, attrs=item.tag_content)
            log(item.tag_content)
        else:
            matches = entry.find_all(item.tag, class_=item.tag_content)
        log("found count count results:", len(matches))

        if not matches:
            # Keep the output aligned with the entries: always append a
            # placeholder so every entry yields exactly one value.
            if item.tag_content == "pubdate":
                content.append(datetime.today().strftime('%Y-%m-%d'))
            else:
                content.append("NOTFound")
            continue

        log("theres a result")
        # Start from the requested index for every entry; clamping must not
        # leak into the following iterations.
        idx = item.index
        last = len(matches) - 1
        if idx > last:
            log("len:", last, "i:", idx)
            log("index out of bounds fall back to the %d count", idx)
            idx = last
        element = matches[idx]

        if clean_location:
            content.append(CleanLocation(element.text.strip()))
        elif want_link:
            href = element.get("href")
            if base_url:
                # NOTE(review): href is presumably None when the element has
                # no href attribute, in which case this concatenation raises
                # TypeError -- confirm whether callers rely on that.
                href = base_url + href
            content.append(href)
        elif swap_date:
            content.append(DateCHToUS(element.text.strip()))
        elif clean_date:
            content.append(jobs_ch_clean_date(element.text.strip()))
        else:
            content.append(element.text.strip())
    return content
def CleanLocation(location):
    """Strip postal codes and spaces from a location string.

    Removes every ``CH-`` prefix followed by four digits, every other run
    of four digits, and every space character; returns what is left.

    NOTE: an identical CleanLocation is defined again further down in this
    module; that later definition is the one bound after import.
    """
    postal_or_space = re.compile('CH-[0-9]{4}|[0-9]{4}| ')
    return postal_or_space.sub('', location)
def arrayToClass(titles,companys,locations,dates,links,tag):
    """Zip the parallel scrape columns into a list of ``job`` objects.

    Args:
        titles, companys, locations, dates: parallel lists that must all
            have the same length.
        links: parallel list of links; it must provide at least one element
            per title (surplus elements are ignored, as before).
        tag: value stored on every created job as its ``tag``.

    Returns:
        The list of jobs (profession and description are filled with the
        placeholders ``"test_prof"`` / ``"test_desc"`` and ``starred`` with
        0), or the integer 0 when the input lengths do not line up. The 0
        return is kept so existing callers that test for it keep working.
    """
    count = len(titles)
    columns_match = count == len(companys) == len(locations) == len(dates)
    # links is indexed alongside the other columns, so it has to be checked
    # too; otherwise a short links list fails with IndexError mid-build.
    if not columns_match or len(links) < count:
        log("Something went wrong unequal length of data arrays: ",
            len(titles), len(companys), len(locations), len(dates), len(links))
        return 0

    log("len:", count)
    jobs = []
    for i, title in enumerate(titles):
        entry = job(title, "test_prof", companys[i], locations[i], dates[i],
                    "test_desc", links[i], tag, 0)
        jobs.append(entry)
        log("class job:", entry)
    return jobs
def jobs_ch_clean_date(date):
    """Drop the leading label from a date string and normalise the rest.

    The first 11 characters are discarded (meant to remove the string
    "Published:") and the remainder is converted by
    jobs_ch_switch_month(), whose result is returned.
    """
    without_label = date[11:]
    return jobs_ch_switch_month(without_label)
def jobs_ch_switch_month(date):
    """Rearrange a positional date string into ``20YY-MM-DD`` form.

    The input is read purely by position: the first two characters are the
    day, the last two characters are the year digits, and everything from
    index 3 up to (not including) the last five characters is the month
    name. For example ``"12 January 2023"`` becomes ``"2023-01-12"``.
    A month name without an entry in ``months`` is passed through as is.

    Raises:
        IndexError: if the input has fewer than two characters.
    """
    month_name = date[3:-5]
    month = dict(months).get(month_name, month_name)
    # Explicit indexing (rather than a slice) keeps the IndexError for
    # inputs shorter than two characters.
    day = date[0] + date[1]
    year_digits = date[-2:]
    return '20' + year_digits + '-' + month + '-' + day
def CleanLocation(location):
    """Strip postal codes and spaces from a location string.

    Removes every ``CH-`` prefix followed by four digits, every other run
    of four digits, and every space character; returns what is left.

    NOTE: this repeats the CleanLocation defined earlier in this module;
    being the later definition, it is the one bound after import.
    """
    postal_or_space = re.compile('CH-[0-9]{4}|[0-9]{4}| ')
    return postal_or_space.sub('', location)
def extractDomain(url):
    """Return the ``https://host`` prefix of ``url``, or 0 if there is none.

    Only ``https`` URLs whose host part contains a dot are recognised. The
    host ends at the first ``/``, ``?`` or ``#`` (or at the end of the
    string), so dots inside the path can no longer drag path segments into
    the result, and a URL without anything after the host is accepted too.

    Returns:
        The matched prefix as a string, e.g. ``"https://www.example.com"``,
        or the integer 0 when ``url`` does not match (kept for callers that
        test the result for falsiness).
    """
    # Character classes instead of a greedy ".*": the match cannot cross
    # the first path/query/fragment delimiter.
    pattern = r'https://[^/?#]*\.[^/?#]+'
    domain = re.match(pattern, url)
    if domain is None:
        return 0
    return domain.group()
def makeSession(url):
    """Create a requests session, issue one GET to ``url`` and return it.

    The response of the initial GET is discarded; the request is made
    through the session, presumably so that cookies set by the site are
    stored on it -- confirm against callers.

    The session is returned open. It is not wrapped in a ``with`` block
    because leaving that block closes the session before the caller ever
    receives it. The caller owns the session and should close it when done.

    Raises:
        Whatever ``session.get`` raises; the session is closed first.
    """
    session = requests.Session()
    try:
        session.get(url)
    except Exception:
        # Do not leak the session when the warm-up request fails.
        session.close()
        raise
    return session