167 lines
4.7 KiB
Python
167 lines
4.7 KiB
Python
import string
|
|
import requests
|
|
from bs4 import BeautifulSoup
|
|
from enum import Enum
|
|
import re
|
|
from dateconverter import *
|
|
from datetime import datetime
|
|
# Module-wide switch read by log(): debug output is printed only while this is truthy.
DEBUG = False
|
|
|
|
def log(*s):
    """Print the positional arguments as a single tuple when ``DEBUG`` is truthy.

    Nothing is printed while the module-level ``DEBUG`` flag is falsy.
    """
    if not DEBUG:
        return
    print(s)
|
|
class mode:
    """Namespace of flag names; every flag is a class attribute equal to 0."""

    DEFAULT = LINK = LOCATION_CLEANUP = SWAPDATE = CLEANDATE = ATTRS = 0
|
|
# Lookup table of (English month name, zero-padded two-digit month number) pairs,
# in calendar order.
months = [
    (month_name, '%02d' % month_number)
    for month_number, month_name in enumerate(
        (
            'January',
            'February',
            'March',
            'April',
            'May',
            'June',
            'July',
            'August',
            'September',
            'October',
            'November',
            'December',
        ),
        start=1,
    )
]
|
|
class item:
    """Plain record holding the three constructor arguments unchanged.

    Attributes:
        tag: stored as given.
        tag_content: stored as given.
        index: stored as given.
    """

    def __init__(self, tag, tag_content, index):
        self.tag, self.tag_content, self.index = tag, tag_content, index
|
|
|
|
class job:
    """Record for one job posting; each constructor argument is stored under its own name."""

    def __init__(self, title, profession, company, location, date,
                 description, link, tag, starred):
        self.title, self.profession, self.company = title, profession, company
        self.location, self.date = location, date
        self.description, self.link = description, link
        self.tag, self.starred = tag, starred

    def __str__(self):
        """Return the fields as ``title| profession|company|location|date|description|link``.

        ``tag`` and ``starred`` are not part of the text.
        """
        remaining = (
            self.profession,
            self.company,
            self.location,
            self.date,
            self.description,
            self.link,
        )
        # Only the first separator carries a trailing space.
        return str(self.title) + "| " + "|".join(str(value) for value in remaining)
|
|
|
|
def finder(results, item, **modes):
    """Collect exactly one value per entry of ``results`` for the element ``item`` describes.

    For every entry, ``entry.findAll`` is called with ``item.tag`` and
    ``item.tag_content``; the match at position ``item.index`` (clamped to the
    last match when the index is too large) is converted according to the
    active mode and appended to the returned list.

    Args:
        results: Iterable of objects exposing ``findAll`` (presumably
            BeautifulSoup tags -- confirm against the caller).
        item: Object with ``tag``, ``tag_content`` and ``index`` attributes.
        **modes: Optional switches, active when equal to 1. If several are
            set, the first of LOCATION_CLEANUP, LINK, SWAPDATE, CLEANDATE wins.
            ATTRS: pass ``item.tag_content`` as ``attrs=`` instead of ``class_=``.
            LOCATION_CLEANUP: pass the stripped text through ``CleanLocation``.
            LINK: use the element's ``href`` attribute instead of its text.
            SWAPDATE: pass the stripped text through ``DateCHToUS``.
            CLEANDATE: pass the stripped text through ``jobs_ch_clean_date``.
            BASEURL: string prepended to the ``href`` in LINK mode (default ``''``).

    Returns:
        A list with one element per entry. An entry without any match
        contributes today's date as ``YYYY-MM-DD`` when CLEANDATE is set and
        the string ``"NOTFound"`` otherwise.
    """
    use_attrs = modes.get('ATTRS', 0) == 1
    location_cleanup = modes.get('LOCATION_CLEANUP', 0) == 1
    link_mode = modes.get('LINK', 0) == 1
    swap_date = modes.get('SWAPDATE', 0) == 1
    clean_date = modes.get('CLEANDATE', 0)
    base_url = modes.get('BASEURL', '')

    content = []
    log("Modes:", modes)

    for entry in results:
        if use_attrs:
            result = entry.findAll(item.tag, attrs=item.tag_content)
            log(item.tag_content)
        else:
            result = entry.findAll(item.tag, class_=item.tag_content)
        log("found:", len(result))

        if not result:
            # Append exactly ONE placeholder so the output stays aligned with
            # the other per-entry lists built from the same results.
            if clean_date:
                content.append(datetime.today().strftime('%Y-%m-%d'))
            else:
                content.append("NOTFound")
            continue

        log("theres a result")
        # Clamp per entry: a short match list for one entry must not lower the
        # index used for the entries that follow.
        last = len(result) - 1
        position = item.index
        if position > last:
            log("len:", last, "i:", position)
            log("index out of bounds fall back to the %d count" % last)
            position = last
        element = result[position]

        if location_cleanup:
            content.append(CleanLocation(element.text.strip()))
        elif link_mode:
            href = element.get("href")
            # An element without href yields None; only prefix real links.
            if base_url and href is not None:
                href = base_url + href
            content.append(href)
        elif swap_date:
            content.append(DateCHToUS(element.text.strip()))
        elif clean_date == 1:
            content.append(jobs_ch_clean_date(element.text.strip()))
        else:
            content.append(element.text.strip())

    return content
|
|
|
|
|
|
def CleanLocation(location):
    """Return ``location`` with postal-code-like tokens and all spaces removed.

    Every ``CH-`` followed by four digits, every run of four digits, and every
    space character is deleted.
    """
    unwanted = re.compile('CH-[0-9]{4}|[0-9]{4}| ')
    return unwanted.sub('', location)
|
|
|
|
def arrayToClass(titles, companys, locations, dates, links, tag):
    """Combine the parallel field lists into a list of ``job`` objects.

    Args:
        titles, companys, locations, dates, links: Sequences of equal length;
            position ``n`` of each describes the same posting.
        tag: Value stored as ``tag`` on every created job.

    Returns:
        A list of ``job`` instances (profession ``"test_prof"``, description
        ``"test_desc"``, starred ``0``), or ``0`` after printing a message
        when the sequences differ in length.
    """
    columns = (titles, companys, locations, dates, links)
    # ``links`` is validated too: it is consumed position by position just
    # like the other columns, so a shorter list would fail mid-loop.
    if len({len(column) for column in columns}) != 1:
        print("Something went wrong unequal length of data arrays")
        return 0

    log("len:", len(titles))
    jobs = []
    for title, company, location, date, link in zip(*columns):
        new_job = job(title, "test_prof", company, location, date,
                      "test_desc", link, tag, 0)
        log(new_job)
        jobs.append(new_job)
    return jobs
|
|
def jobs_ch_clean_date(date):
    """Drop the first 11 characters of ``date`` and convert the rest.

    The fixed-width prefix is meant to remove the leading "Published:" label;
    the remainder is handed to ``jobs_ch_switch_month`` and its result returned.
    """
    without_label = date[11:]
    return jobs_ch_switch_month(without_label)
|
|
|
|
def jobs_ch_switch_month(date):
    """Rearrange a date string into ``20YY-MM-DD`` using fixed character positions.

    The day is the first two characters, the year the last two characters,
    and the month name everything from index 3 up to five characters before
    the end (e.g. ``"05 January 2024"``). A month name listed in ``months`` is
    replaced by its number; any other text is kept unchanged.

    Raises:
        IndexError: if ``date`` has fewer than two characters.
    """
    month_name = date[3:max(3, len(date) - 5)]
    month = dict(months).get(month_name, month_name)
    # Explicit indexing (not a slice) so too-short input raises IndexError.
    day = date[0] + date[1]
    year = date[-2:]
    return '20' + year + '-' + month + '-' + day
|
|
|
|
def CleanLocation(location):
    """Delete ``CH-`` + four digits, bare four-digit runs, and spaces from ``location``."""
    # NOTE(review): this repeats an identical definition earlier in the file;
    # being the later one, this is the binding in effect after import.
    return re.sub('CH-[0-9]{4}|[0-9]{4}| ', '', location)
|
|
|
|
def extractDomain(url):
    """Return the ``https://host`` prefix of ``url``, or ``0`` if there is none.

    The host must contain at least one dot. Matching stops at the first
    ``/``, ``?`` or ``#`` (or the end of the string), so dots later in the
    path or query can no longer extend the result, and a URL without any
    path is accepted as well.

    Args:
        url: URL string; only the ``https`` scheme at the very start matches.

    Returns:
        The matched ``https://...`` string, or the integer ``0`` when ``url``
        does not start with an https URL whose host contains a dot.
    """
    # Character classes exclude '/', '?' and '#' to keep the match inside the
    # authority part of the URL.
    pattern = r'https://[^/?#]*\.[^/?#]+(?=[/?#]|$)'
    domain = re.match(pattern, url)
    if domain:
        return domain.group()
    return 0
|
|
|
|
def makeSession(url):
    """Create a ``requests`` session, fetch ``url`` once with it, and return it.

    The session is returned open so the caller can keep using it; the caller
    is responsible for closing it. Previously the session was created in a
    ``with`` block, which closed it before it reached the caller.

    Args:
        url: Address requested once through the new session.

    Returns:
        The ``requests.Session`` used for the request.

    Raises:
        Whatever ``session.get`` raises; the session is closed first.
    """
    session = requests.Session()
    try:
        # The response itself is discarded -- presumably the request is only
        # made so the session picks up server-set state such as cookies.
        session.get(url)
    except Exception:
        session.close()
        raise
    return session
|