import string
import requests
from bs4 import BeautifulSoup
from enum import Enum
import re
import shutil
from dateconverter import *  # provides DateCHToUS() used by finder()
from datetime import datetime
import os
import sqlite3

DEBUG = True

number = ['0','1','2','3','4','5','6','7','8','9']

homePath = os.path.expanduser('~')
cookiePath = homePath + "/.mozilla/firefox/imibizoh.default/cookies.sqlite"
tmpPath = "/tmp/cookies.sqlite"
DBFILE = "../db/sqlite3.db"

def log(*s):
    if DEBUG:
        print(*s)  # unpack the args so output reads as plain text, not a tuple


class mode():
    # Flag names recognised by finder(); they are passed as keyword
    # arguments, e.g. finder(results, it, LINK=1).
    #def set(self,flag,value):
    #    self.flag = flag
    #    self.value = value
    #def __init__(self,):
    DEFAULT = 0
    LINK = 0
    LOCATION_CLEANUP = 0
    SWAPDATE = 0
    CLEANDATE = 0
    ATTRS = 0

months = [
    ('January','01'),
    ('February','02'),
    ('March','03'),
    ('April','04'),
    ('May','05'),
    ('June','06'),
    ('July','07'),
    ('August','08'),
    ('September','09'),
    ('October','10'),
    ('November','11'),
    ('December','12')]

class item():
    # Describes one piece of data to extract: an HTML tag, the class (or
    # attribute dict) identifying it, and the index to pick when the tag
    # matches more than once.
    def __init__(self,tag,tag_content,index,name=None):
        self.tag = tag
        self.tag_content = tag_content
        self.index = index
        if name is not None:
            self.name = name
        else:
            self.name = "not defined"

class job():
    # Container for one scraped job posting.
    def __init__(self,title,profession,company,location,date,description,link,tag,starred):
        self.title = title
        self.profession = profession
        self.company = company
        self.location = location
        self.date = date
        self.description = description
        self.link = link
        self.tag = tag
        self.starred = starred

    def __str__(self):
        return "%s| %s|%s|%s|%s|%s|%s" % (self.title,self.profession,self.company,self.location,self.date,self.description,self.link)

def finder(results,item,**modes):
    GETCHILDREN = modes.get("GETCHILDREN",'')
    ATTRS = modes.get('ATTRS',0)
    LOCATION_CLEANUP = modes.get('LOCATION_CLEANUP',0)
    LINK = modes.get('LINK',0)
    SWAPDATE = modes.get('SWAPDATE',0)
    CLEANDATE = modes.get('CLEANDATE',0)
    BASEURL = modes.get('BASEURL','')
    INDEEDDATE = modes.get('INDEEDDATE',0)
    content = []
    i = item.index
    log("name",item.name)
    log("Item tag: ",item.tag)
    log("Modes:",modes)
    log("tag_content: ",item.tag_content)
    for entry in results:
        if ATTRS==1:
            result = entry.findAll(item.tag,attrs=item.tag_content)
            log(item.tag_content)
        else:
            result = entry.findAll(item.tag,class_=item.tag_content)
        log("found count results:",len(result))
        if len(result)==0 and DEBUG:
            # nothing matched; log the selector itself so it can be inspected
            # (the original looped over the empty result, so it never logged)
            log("No entry found for: ",item.name,item.tag,item.tag_content," -->",entry)
            input()
        if result:
            log("theres a result")
            if i>(len(result)-1):
                log("len:",len(result)-1,"i:",i)
                log("index out of bounds, falling back to index",len(result)-1)
                i=(len(result)-1)
            result2 = result[i]
            if GETCHILDREN != '':
                found = False
                for res in result:  # renamed from 'results' to avoid shadowing the parameter
                    child = res.find(GETCHILDREN)
                    log("[finder] search for '",GETCHILDREN,"' in: ",child)
                    if child is not None and found == False:
                        log("CHILD text strip: ",child.text.strip())
                        found = True
                        content.append(child.text.strip())
                if found == False:
                    log("[finder] No matching child found: ",child)
                    content.append("CHILD_NOT_FOUND: " + GETCHILDREN)
            elif LOCATION_CLEANUP==1:
                location = CleanLocation(result2.text.strip())
                content.append(location)
            elif LINK==1:
                link = result2.get("href")  # renamed from 'string' to avoid shadowing the stdlib module
                if BASEURL:
                    link = BASEURL+link
                content.append(link)
            elif SWAPDATE==1:
                content.append(DateCHToUS(result2.text.strip()))
            elif CLEANDATE==1:
                log("[finder] pre cleandate:",result2.text.strip())  # was missing the call parentheses
                content.append(jobs_ch_clean_date(result2.text.strip()))
            elif INDEEDDATE==1:
                log("[finder] pre indeeddate:",result2.text.strip())  # was missing the call parentheses
                content.append(indeedExtractDays(result2.text.strip()))
            else:
                log(result2)
                content.append(result2.text.strip())
        if not result:
            if item.tag_content == "pubdate":
                today = datetime.today().strftime('%Y-%m-%d')
                content.append(today)
            else:
                content.append("NOTFound")
    return content
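
# Usage sketch for finder(): the tag/class names below are placeholders,
# not selectors from a real job board.
#
#   soup = BeautifulSoup(page.content, "html.parser")
#   ads = soup.findAll("div", class_="job-ad")                 # hypothetical class
#   title_item = item("h2", "job-title", 0, name="title")      # hypothetical class
#   titles = finder(ads, title_item, GETCHILDREN="a")
#   links = finder(ads, item("a", "job-link", 0, "link"), LINK=1,
#                  BASEURL="https://example.com")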

def CleanLocation(location):
    # strip Swiss postal codes ("CH-1234" or bare "1234") and spaces
    #p = re.compile('CH-[0-9]{4}')
    location = re.sub('CH-[0-9]{4}|[0-9]{4}| ','',location)
    return location

def arrayToClass(titles,companys,locations,dates,links,tag):
    jobs = []
    # links is indexed below as well, so include it in the sanity check
    if(len(titles) == len(companys) == len(locations) == len(dates) == len(links)):
        log("len:",len(titles))
        for i, title in enumerate(titles):
            jobs.append(job(title,"test_prof",companys[i],locations[i],dates[i],"test_desc",links[i],tag,0))
            log("class job:",jobs[i])
        return jobs
    else:
        log("Something went wrong: unequal length of data arrays: ",len(titles),len(companys),len(locations),len(dates),len(links))
        return 0
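
# Sketch: typical call after separate finder() passes over the same result set.
#   jobs = arrayToClass(titles, companys, locations, dates, links, "jobs.ch")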
def jobs_ch_clean_date(date):
    # drop the leading "Published: " (11 characters), then reorder the rest
    newdate = date[11:]
    return jobs_ch_switch_month(newdate)


def jobs_ch_switch_month(date):
    # expects 'DD Month YYYY', e.g. '13 August 2024'
    day = date[0:2]
    newmonth = date[3:len(date)-5]
    year = date[len(date)-2:]
    for month in months:
        if month[0] == newmonth:
            newmonth = month[1]
    newdate = '20'+year+'-'+newmonth+'-'+day
    return newdate
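
# Example (format derived from the slicing above):
#   jobs_ch_clean_date("Published: 13 August 2024")  -> "2024-08-13"
#   jobs_ch_switch_month("13 August 2024")           -> "2024-08-13"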
def extractDomain(url):
    # match scheme + host up to (but not including) the first path slash
    pattern = r'https:\/\/.*\..+?(?=\/)'
    domain = re.match(pattern,url)
    if domain:
        return domain.group()
    else:
        return 0
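
# Example:
#   extractDomain("https://www.jobs.ch/en/vacancies/")  -> "https://www.jobs.ch"
# Note the lookahead (?=\/) requires a slash after the host, so a bare
# "https://www.jobs.ch" (no path) returns 0.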

def makeSession(url):
    with requests.Session() as session:
        session.get(url)  # prime the session (cookies, keep-alive); the response itself is unused
        return session
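
# Usage sketch:
#   session = makeSession("https://www.jobs.ch")
#   page = session.get("https://www.jobs.ch/en/vacancies/")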

def indeedExtractDays(datestr):
    # pull the first run of digits out of a relative date like 'Posted 3 days ago'
    cleannumstr = ''
    foundchar = False
    for a in datestr:
        if a in number:
            foundchar = True
            cleannumstr += a
        elif foundchar:
            break
    if cleannumstr != '':
        cleannumint = int(cleannumstr)
        today = int(datetime.utcnow().timestamp())
        # one day is 60*60*24 seconds; the original 60*60*7*4 undercounted
        cleandate = today - cleannumint * 60 * 60 * 24
        #print("int:",cleannumint,"today:",today,"cleandate:",datetime.fromtimestamp(cleandate).strftime('%Y-%m-%d'))
        return datetime.fromtimestamp(cleandate).strftime('%Y-%m-%d')
    return "NOTFound"

def getCookiesFromBrowser(url):
    # Reads cookies straight from the Firefox profile, e.g.:
    #   SELECT value FROM moz_cookies WHERE host LIKE '%indeed%'
    # The live database is locked while Firefox runs, so work on a copy:
    #   cp .mozilla/firefox/imibizoh.default/cookies.sqlite cookies.sqlite
    shutil.copyfile(cookiePath,tmpPath)
    cookie = ''
    rows = [0]  # non-empty sentinel so the fetch loop runs at least once
    with sqlite3.connect(tmpPath) as connection:
        cmd_read_cookies = """SELECT name,value FROM moz_cookies WHERE host like ?;"""
        log(cmd_read_cookies)
        cursor = connection.cursor()
        cursor.execute(cmd_read_cookies,('%'+url+'%',))
        while len(rows)!=0:
            rows = cursor.fetchmany(25)
            for row in rows:
                log("row:",row)
                cookie = cookie + row[0] + '=' + row[1]
                cookie += ";"
    log("Cookies:",cookie)
    return cookie
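
# Usage sketch (assumes a Firefox profile exists at cookiePath above):
#   cookie_header = getCookiesFromBrowser("indeed")
#   page = requests.get(url, headers={"Cookie": cookie_header})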

def urlToDomain(url):
    # 'https://www.indeed.com/jobs' -> 'www.indeed.com/jobs' -> 'indeed.com/jobs' -> 'indeed'
    pos = patternSearch(url,"https://")
    urlCut = dropBeforePos(url,pos)
    log("url cut",urlCut)
    posDot = skipAfterChar(urlCut,'.')
    urlCut = dropBeforePos(urlCut,posDot)
    log("url after cut dot:",urlCut)
    posDot = skipAfterChar(urlCut,'.')
    urlCut = dropAfterPos(urlCut,posDot)
    log("url after cut dot:",urlCut)
    return urlCut  # the original fell through without returning the result

def patternSearch(url,pattern):
    # returns the index of the last character of the first full match
    x = 0
    a = 0
    for a,i in enumerate(url):  # a = index, i = character
        if i == pattern[x]:
            if x<len(pattern)-1:
                x = x + 1
            elif x==len(pattern)-1:
                log("full pattern found at pos:",a)
                break
        else:
            x = 0
    return a

def skipAfterChar(aString,char):
    # index of the first occurrence of char (len-1 if absent)
    a = 0
    for a,i in enumerate(aString):
        if i == char:
            break
    return a

def dropBeforePos(aString,pos):
    # everything after index pos; the slice keeps the original behavior
    return aString[pos+1:]

def dropAfterPos(aString,pos):
    # everything before index pos; '' when pos is out of range, as before
    if pos < len(aString):
        return aString[:pos]
    return ''
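
# Quick demo of the helpers (index semantics as implemented above):
#   patternSearch("https://www.indeed.com", "https://")  -> 7   (index of last pattern char)
#   dropBeforePos("https://www.indeed.com", 7)           -> "www.indeed.com"
#   skipAfterChar("www.indeed.com", '.')                 -> 3
#   dropAfterPos("indeed.com", 6)                        -> "indeed"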