228 lines
7.1 KiB
Python
228 lines
7.1 KiB
Python
import string
|
|
import requests
|
|
from bs4 import BeautifulSoup
|
|
from enum import Enum
|
|
import re
|
|
import shutil
|
|
from dateconverter import *
|
|
from datetime import datetime
|
|
import os
|
|
import sqlite3
|
|
import webbrowser
|
|
import mozilla
|
|
|
|
# Master switch for the log() helper: nothing is printed while this is False.
DEBUG = False

# The ten ASCII decimal digits, each as a one-character string.
number = list("0123456789")
|
|
|
|
|
|
|
|
def log(*s):
    """Print the positional arguments as a single tuple when DEBUG is truthy.

    Does nothing (and returns None) while the module-level DEBUG flag is
    falsy.
    """
    if not DEBUG:
        return
    print(s)
|
|
class mode:
    """Holder for mode flag names, every one of them initialised to 0.

    NOTE(review): finder() in this file accepts several of the same names
    as keyword arguments but reads them from its ``**modes`` dict, not from
    this class -- confirm whether this class is still used anywhere.
    """

    DEFAULT = LINK = LOCATION_CLEANUP = SWAPDATE = CLEANDATE = ATTRS = 0
|
|
# (English month name, zero-padded month number) pairs, January..December.
months = [
    (month_name, "%02d" % position)
    for position, month_name in enumerate(
        (
            'January',
            'February',
            'March',
            'April',
            'May',
            'June',
            'July',
            'August',
            'September',
            'October',
            'November',
            'December',
        ),
        start=1,
    )
]
|
|
class item:
    """Plain record describing what to look for inside a result entry.

    Attributes:
        tag: stored as given.
        tag_content: stored as given.
        index: stored as given.
        name: the supplied name, or the string "not defined" when the
            caller passed None (the default).
    """

    def __init__(self, tag, tag_content, index, name=None):
        self.tag, self.tag_content, self.index = tag, tag_content, index
        self.name = "not defined" if name is None else name
|
|
|
|
class job:
    """Plain record for a single job posting.

    All nine constructor arguments are stored unchanged under attributes of
    the same name.
    """

    def __init__(self, title, profession, company, location, date, description, link, tag, starred):
        # What the posting is.
        self.title = title
        self.profession = profession
        self.company = company
        self.location = location
        self.date = date
        self.description = description
        self.link = link
        # Bookkeeping fields; they are not part of the string form below.
        self.tag = tag
        self.starred = starred

    def __str__(self):
        """Return the seven descriptive fields joined by '|' characters."""
        fields = (
            self.title,
            self.profession,
            self.company,
            self.location,
            self.date,
            self.description,
            self.link,
        )
        return "%s| %s|%s|%s|%s|%s|%s" % fields
|
|
|
|
def finder(results,item,**modes):
    """Collect one value per entry of ``results`` for the field described by ``item``.

    For every entry, ``entry.findAll`` is called with ``item.tag`` and
    ``item.tag_content`` and the match at position ``item.index`` is
    selected (falling back to the last match when the index is too large
    for that entry).  The selected match is then converted according to the
    first mode that applies, in this order: GETCHILDREN, LOCATION_CLEANUP,
    LINK, SWAPDATE, CLEANDATE, INDEEDDATE; with no mode, its stripped text
    is used.

    Args:
        results: iterable of objects exposing ``findAll`` (presumably
            BeautifulSoup tags -- confirm against the caller).
        item: object with ``tag``, ``tag_content``, ``index`` and ``name``.
        **modes: optional switches:
            ATTRS=1            pass ``tag_content`` as ``attrs=`` instead of ``class_=``.
            GETCHILDREN=<tag>  use the text of the first ``<tag>`` child found
                               in any match; "CHILD_NOT_FOUND: <tag>" if none.
            LOCATION_CLEANUP=1 run the text through CleanLocation().
            LINK=1             use the ``href`` attribute, prefixed with BASEURL
                               when BASEURL is non-empty.
            BASEURL=<str>      prefix for LINK mode.
            SWAPDATE=1         run the text through DateCHToUS() (not defined
                               in this file; presumably from ``dateconverter``).
            CLEANDATE=1        run the text through jobs_ch_clean_date().
            INDEEDDATE=1       run the text through indeedExtractDays().

    Returns:
        A list with exactly one element per entry.  Entries without any
        match contribute today's date (``%Y-%m-%d``) when
        ``item.tag_content == "pubdate"`` and the string "NOTFound"
        otherwise.
    """
    GETCHILDREN = modes.get("GETCHILDREN",'')
    ATTRS = modes.get('ATTRS',0)
    LOCATION_CLEANUP = modes.get('LOCATION_CLEANUP',0)
    LINK = modes.get('LINK',0)
    SWAPDATE = modes.get('SWAPDATE',0)
    CLEANDATE = modes.get('CLEANDATE',0)
    BASEURL = modes.get('BASEURL','')
    INDEEDDATE = modes.get('INDEEDDATE',0)

    content = []
    log("name",item.name)
    log("Item tag: ",item.tag)
    log("Modes:",modes)
    log("tag_content: ",item.tag_content)

    for entry in results:
        if ATTRS == 1:
            matches = entry.findAll(item.tag, attrs=item.tag_content)
            log(item.tag_content)
        else:
            matches = entry.findAll(item.tag, class_=item.tag_content)
        log("found count results:", len(matches))

        if not matches:
            if DEBUG:
                log("No entry found for: ", item.name, item.tag, item.tag_content)
                # Debug-only pause so the miss can be inspected.
                # NOTE(review): indentation was lost in the source; the
                # pause is assumed to sit at this level -- confirm.
                input()
            if item.tag_content == "pubdate":
                # A missing publication date is treated as "published today".
                content.append(datetime.today().strftime('%Y-%m-%d'))
            else:
                content.append("NOTFound")
            continue

        log("theres a result")
        # Clamp per entry: a short entry must not lower the index that is
        # used for the entries that follow it.
        position = item.index
        if position > len(matches) - 1:
            log("len:", len(matches) - 1, "i:", position)
            position = len(matches) - 1
            log("index out of bounds fall back to the %d count" % position)
        selected = matches[position]

        if GETCHILDREN != '':
            # Every match is searched, not only the selected one; the first
            # child found wins.
            child = None
            for candidate in matches:
                child = candidate.find(GETCHILDREN)
                log("[finder] search for '",GETCHILDREN,"' in: ",child)
                if child is not None:
                    break
            if child is not None:
                log("CHILD text strip: ",child.text.strip())
                content.append(child.text.strip())
            else:
                log("[finder] No matching Child found: ",child)
                content.append("CHILD_NOT_FOUND: " + GETCHILDREN)
        elif LOCATION_CLEANUP == 1:
            content.append(CleanLocation(selected.text.strip()))
        elif LINK == 1:
            href = selected.get("href")
            # A tag without href yields None; only prefix real strings so a
            # missing attribute does not raise TypeError.
            if BASEURL and href is not None:
                href = BASEURL + href
            content.append(href)
        elif SWAPDATE == 1:
            content.append(DateCHToUS(selected.text.strip()))
        elif CLEANDATE == 1:
            log("[finder] pre cleandate:", selected.text.strip())
            content.append(jobs_ch_clean_date(selected.text.strip()))
        elif INDEEDDATE == 1:
            log("[finder] pre indeeddate:", selected.text.strip())
            content.append(indeedExtractDays(selected.text.strip()))
        else:
            log(selected)
            content.append(selected.text.strip())

    return content
|
|
|
|
|
|
def CleanLocation(location):
    """Return ``location`` without postal codes and without spaces.

    Every occurrence of "CH-" followed by four digits, of four consecutive
    digits, and of a single space character is deleted.
    """
    return re.sub('CH-[0-9]{4}|[0-9]{4}| ', '', location)
|
|
|
|
def arrayToClass(titles,companys,locations,dates,links,tag):
    """Combine parallel field lists into a list of ``job`` objects.

    Element ``i`` of each list describes the same posting.  Profession and
    description are filled with the placeholders "test_prof" and
    "test_desc", and ``starred`` is set to 0.

    Args:
        titles, companys, locations, dates, links: equally long sequences.
        tag: value stored as ``tag`` on every created job.

    Returns:
        The list of ``job`` objects, or 0 when the sequences differ in
        length (the mismatch is reported through log()).
    """
    columns = (titles, companys, locations, dates, links)
    lengths = [len(column) for column in columns]
    # links is part of the check as well: it is consumed in lockstep with
    # the other columns, so a shorter list would otherwise fail mid-way.
    if len(set(lengths)) != 1:
        log("Something went wrong unequal length of data arrays: ", *lengths)
        return 0

    log("len:", len(titles))
    jobs = []
    for title, company, location, date, link in zip(*columns):
        entry = job(title, "test_prof", company, location, date, "test_desc", link, tag, 0)
        log("class job:", entry)
        jobs.append(entry)
    return jobs
|
|
def jobs_ch_clean_date(date):
    """Drop the first 11 characters of ``date`` and convert what is left.

    The 11 characters are the "Published:" label plus the following
    separator; the remainder is handed to jobs_ch_switch_month().
    """
    without_label = date[11:]  # remove string "Published:"
    return jobs_ch_switch_month(without_label)
|
|
|
|
def jobs_ch_switch_month(date):
    """Convert a "DD Month YYYY" date into "YYYY-MM-DD".

    The string is split on whitespace, so one-digit days ("5 January 2021")
    are handled and zero-padded.  The month name is translated through the
    module-level ``months`` table; a name that is not in the table is kept
    unchanged in the output.  A four-digit year is used as is; anything
    else falls back to "20" plus its last two characters.

    Input that does not consist of exactly three whitespace-separated parts
    is read with the fixed column layout instead: first two characters as
    day, last two as year, and everything between position 3 and the final
    five characters as month.

    Args:
        date: date text such as "12 January 2021".

    Returns:
        The reformatted date string.
    """
    parts = date.split()
    if len(parts) == 3:
        day, month_name, year = parts
    else:
        # Unexpected shape: keep the fixed-column interpretation.
        day, month_name, year = date[:2], date[3:-5], date[-2:]

    month = dict(months).get(month_name, month_name)
    if not (len(year) == 4 and year.isdigit()):
        year = '20' + year[-2:]
    return year + '-' + month + '-' + day.zfill(2)
|
|
|
|
# NOTE: an identical CleanLocation is already defined earlier in this file;
# this later definition is the one that stays bound to the name.
def CleanLocation(location):
    """Strip postal codes ("CH-1234" or "1234") and all spaces from ``location``."""
    postal_code_or_space = re.compile('CH-[0-9]{4}|[0-9]{4}| ')
    cleaned = postal_code_or_space.sub('', location)
    return cleaned
|
|
|
|
def extractDomain(url):
    """Return the "https://host" prefix of ``url``.

    The host must contain a dot and must be followed by a "/" in ``url``.
    Matching stops at the first "/" after the scheme, so dots that appear
    later in the path (e.g. ".../detail.html/...") are never pulled into
    the result.

    Args:
        url: the URL to inspect.

    Returns:
        The scheme-plus-host string, or 0 when ``url`` does not start with
        "https://", has no dot in its host, or has no "/" after the host.
    """
    # [^/] keeps both halves of the host inside the first path segment.
    pattern = r'https://[^/]*\.[^/]+(?=/)'
    domain = re.match(pattern, url)
    if domain is None:
        return 0
    return domain.group()
|
|
|
|
def makeSession(url):
    """Create a ``requests`` session, fetch ``url`` once with it, and return it.

    The session is handed back open: it is not wrapped in a ``with`` block,
    because leaving such a block closes the session before the caller ever
    receives it.  The caller is responsible for closing the session.

    Args:
        url: address requested once through the new session.

    Returns:
        The ``requests.Session`` that performed the request.

    Raises:
        Whatever ``session.get`` raises; the session is closed first.
    """
    session = requests.Session()
    try:
        # The response is discarded; the request is made only for its
        # effect on the session (presumably cookies -- confirm).
        session.get(url)
    except Exception:
        session.close()
        raise
    return session
|
|
def indeedExtractDays(datestr):
    """Turn an "N days ago"-style text into the date N days before today.

    The first run of ASCII digits in ``datestr`` is read as a number of
    days and subtracted from the current local date.

    Args:
        datestr: text that may contain a day count, e.g. "vor 3 Tagen".

    Returns:
        The resulting date formatted as "%Y-%m-%d", or the string
        "NOTFound" when ``datestr`` contains no digit.
    """
    # Local import: the file only imports ``datetime`` from this module.
    from datetime import timedelta

    first_number = re.search(r'[0-9]+', datestr)
    if first_number is None:
        return "NOTFound"
    # timedelta counts full 24-hour days and keeps the arithmetic in local
    # time from start to finish.
    posted = datetime.now() - timedelta(days=int(first_number.group()))
    return posted.strftime('%Y-%m-%d')
|
|
|