# job-scrapper/lib/helpers.py

import os
import re
import shutil
import sqlite3

import requests
from bs4 import BeautifulSoup
from datetime import datetime

from dateconverter import *  # provides DateCHToUS()

DEBUG = True
number = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']  # digit characters, used by indeedExtractDays()
homePath = os.path.expanduser('~')
cookiePath = homePath + "/.mozilla/firefox/imibizoh.default/cookies.sqlite"  # hard-coded Firefox profile
tmpPath = "/tmp/cookies.sqlite"
DBFILE = "../db/sqlite3.db"

def log(*s):
    if DEBUG:
        print(*s)


class mode():
    # Default flag values for finder(); callers override them per call via
    # the **modes keyword arguments.
    DEFAULT = 0
    LINK = 0
    LOCATION_CLEANUP = 0
    SWAPDATE = 0
    CLEANDATE = 0
    ATTRS = 0

# (month name, zero-padded month number) pairs used by jobs_ch_switch_month()
months = [
    ('January', '01'),
    ('February', '02'),
    ('March', '03'),
    ('April', '04'),
    ('May', '05'),
    ('June', '06'),
    ('July', '07'),
    ('August', '08'),
    ('September', '09'),
    ('October', '10'),
    ('November', '11'),
    ('December', '12')]

class item():
    def __init__(self, tag, tag_content, index, name=None):
        self.tag = tag
        self.tag_content = tag_content
        self.index = index
        if name is not None:
            self.name = name
        else:
            self.name = "not defined"

class job():
    def __init__(self, title, profession, company, location, date, description, link, tag, starred):
        self.title = title
        self.profession = profession
        self.company = company
        self.location = location
        self.date = date
        self.description = description
        self.link = link
        self.tag = tag
        self.starred = starred

    def __str__(self):
        return "%s|%s|%s|%s|%s|%s|%s" % (self.title, self.profession, self.company, self.location, self.date, self.description, self.link)

def finder(results, item, **modes):
    # Walk a list of bs4 result sets and extract one field per entry,
    # post-processing it according to the mode flags passed in.
    GETCHILDREN = modes.get("GETCHILDREN", '')
    ATTRS = modes.get('ATTRS', 0)
    LOCATION_CLEANUP = modes.get('LOCATION_CLEANUP', 0)
    LINK = modes.get('LINK', 0)
    SWAPDATE = modes.get('SWAPDATE', 0)
    CLEANDATE = modes.get('CLEANDATE', 0)
    BASEURL = modes.get('BASEURL', '')
    INDEEDDATE = modes.get('INDEEDDATE', 0)
    content = []
    i = item.index
    log("name", item.name)
    log("Item tag: ", item.tag)
    log("Modes:", modes)
    log("tag_content: ", item.tag_content)
    for entry in results:
        if ATTRS == 1:
            result = entry.findAll(item.tag, attrs=item.tag_content)
            log(item.tag_content)
        else:
            result = entry.findAll(item.tag, class_=item.tag_content)
        log("found count results:", len(result))
        if len(result) == 0 and DEBUG:
            log("No entry found for: ", item.name, item.tag, item.tag_content)
        if result:
            log("theres a result")
            if i > (len(result) - 1):
                log("len:", len(result) - 1, "i:", i)
                log("index out of bounds, falling back to the last entry")
                i = len(result) - 1
            result2 = result[i]
            if GETCHILDREN != '':
                # Take the text of the first matching child of any result.
                found = False
                for res in result:
                    child = res.find(GETCHILDREN)
                    log("[finder] search for '", GETCHILDREN, "' in: ", child)
                    if child is not None and not found:
                        log("CHILD text strip: ", child.text.strip())
                        found = True
                        content.append(child.text.strip())
                if not found:
                    log("[finder] No matching child found: ", child)
                    content.append("CHILD_NOT_FOUND: " + GETCHILDREN)
            elif LOCATION_CLEANUP == 1:
                location = CleanLocation(result2.text.strip())
                content.append(location)
            elif LINK == 1:
                href = result2.get("href")
                if BASEURL:
                    href = BASEURL + href
                content.append(href)
            elif SWAPDATE == 1:
                content.append(DateCHToUS(result2.text.strip()))
            elif CLEANDATE == 1:
                log("[finder] pre cleandate:", result2.text.strip())
                content.append(jobs_ch_clean_date(result2.text.strip()))
            elif INDEEDDATE == 1:
                log("[finder] pre indeeddate:", result2.text.strip())
                content.append(indeedExtractDays(result2.text.strip()))
            else:
                log(result2)
                content.append(result2.text.strip())
        if not result:
            # Entries selected by "pubdate" default to today's date when missing.
            if item.tag_content == "pubdate":
                today = datetime.today().strftime('%Y-%m-%d')
                content.append(today)
            else:
                content.append("NOTFound")
    return content
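
# Example call (hypothetical selectors; the real tag names and classes
# depend on the site being scraped):
#   titleItem = item("span", "job-title", 0, name="title")
#   titles = finder(results, titleItem)
#   links = finder(results, item("a", "job-link", 0, name="link"),
#                  LINK=1, BASEURL="https://www.jobs.ch")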

def CleanLocation(location):
    # Strip Swiss postal codes ("CH-8000" or a bare "8000") and space characters.
    location = re.sub('CH-[0-9]{4}|[0-9]{4}| ', '', location)
    return location
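
# Example:
#   CleanLocation("CH-8000 Zürich")  ->  "Zürich"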

def arrayToClass(titles, companys, locations, dates, links, tag):
    # Zip the parallel result lists from finder() into job objects.
    jobs = []
    if len(titles) == len(companys) == len(locations) == len(dates):
        log("len:", len(titles))
        for i, title in enumerate(titles):
            jobs.append(job(title, "test_prof", companys[i], locations[i], dates[i], "test_desc", links[i], tag, 0))
            log("class job:", jobs[i])
        return jobs
    else:
        log("Something went wrong, unequal length of data arrays:", len(titles), len(companys), len(locations), len(dates))
        return 0
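
# Example (hypothetical lists, e.g. the outputs of the finder() calls above):
#   jobs = arrayToClass(titles, companys, locations, dates, links, "jobs.ch")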

def jobs_ch_clean_date(date):
    # Drop the leading 'Published: ' label (11 characters), then normalise
    # the remaining date string.
    newdate = date[11:]
    return jobs_ch_switch_month(newdate)

def jobs_ch_switch_month(date):
    # Expects 'DD Month YYYY' (e.g. '05 June 2024') and returns 'YYYY-MM-DD'.
    newmonth = date[3:len(date) - 5]
    for month in months:
        if month[0] == newmonth:
            newmonth = month[1]
    day = date[0:2]
    year = date[len(date) - 2:]
    return '20' + year + '-' + newmonth + '-' + day
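
# Example (assuming the 'Published: DD Month YYYY' layout described above):
#   jobs_ch_clean_date("Published: 05 June 2024")  ->  "2024-06-05"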

def extractDomain(url):
    # Match the scheme and host, stopping just before the first path slash.
    pattern = r'https:\/\/.*\..+?(?=\/)'
    domain = re.match(pattern, url)
    if domain:
        return domain.group()
    else:
        return 0
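
# Example (illustrative URL):
#   extractDomain("https://www.jobs.ch/en/vacancies/")  ->  "https://www.jobs.ch"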

def makeSession(url):
    # Prime a session with an initial GET so it picks up the site's cookies;
    # created outside a 'with' block so it stays open for the caller.
    session = requests.Session()
    session.get(url)
    return session
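
# Example (illustrative URL):
#   session = makeSession("https://www.jobs.ch")
#   page = session.get("https://www.jobs.ch/en/vacancies/")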

def indeedExtractDays(datestr):
    # Pull the first run of digits out of strings like 'Posted 3 days ago'
    # and turn that 'N days ago' into an absolute YYYY-MM-DD date.
    cleannumstr = ''
    for a in datestr:
        if a in number:
            cleannumstr += a
        elif cleannumstr != '':
            break
    if cleannumstr != '':
        cleannumint = int(cleannumstr)
        today = int(datetime.now().timestamp())
        cleandate = today - cleannumint * 60 * 60 * 24  # N days, in seconds
        return datetime.fromtimestamp(cleandate).strftime('%Y-%m-%d')
    return "NOTFound"

def getCookiesFromBrowser(url):
    # Firefox keeps cookies.sqlite locked while it runs, so query a copy.
    shutil.copyfile(cookiePath, tmpPath)
    cookie = ''
    rows = [0]
    with sqlite3.connect(tmpPath) as connection:
        cmd_read_cookies = "SELECT name,value FROM moz_cookies WHERE host LIKE ?;"
        cursor = connection.cursor()
        cursor.execute(cmd_read_cookies, ('%' + url + '%',))
        while len(rows) != 0:
            rows = cursor.fetchmany(25)
            for row in rows:
                log("row:", row)
                cookie = cookie + row[0] + '=' + row[1] + ';'
    log("Cookies:", cookie)
    return cookie

# Accessing cookies from Firefox manually:
#   copy the database first (because it is locked):
#     cp .mozilla/firefox/imibizoh.default/cookies.sqlite cookies.sqlite
#   then: SELECT value FROM moz_cookies WHERE host LIKE '%indeed%'
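
# Example (hypothetical pairing with requests; the scraper's real request
# code may differ):
#   headers = {"Cookie": getCookiesFromBrowser("indeed")}
#   page = requests.get("https://indeed.com", headers=headers)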

def urlToDomain(url):
    # Strip the scheme, then the subdomain, then everything after the
    # registered name, e.g. "https://www.example.com/jobs" -> "example".
    pos = patternSearch(url, "https://")
    urlCut = dropBeforePos(url, pos)
    log("url cut", urlCut)
    posDot = skipAfterChar(urlCut, '.')
    urlCut = dropBeforePos(urlCut, posDot)
    log("url after cut dot:", urlCut)
    posDot = skipAfterChar(urlCut, '.')
    urlCut = dropAfterPos(urlCut, posDot)
    log("url after cut dot:", urlCut)
    return urlCut

def patternSearch(url, pattern):
    # Walk url character by character and return the index of the last
    # character of the first full occurrence of pattern.
    x = 0
    pos = 0
    for pos, char in enumerate(url):
        if char == pattern[x]:
            if x < len(pattern) - 1:
                x = x + 1
            elif x == len(pattern) - 1:
                log("FULL PATTERN FOUND at pos:", pos)
                break
        else:
            x = 0
    return pos

def skipAfterChar(aString, char):
    # Return the index of the first occurrence of char
    # (or the last index if char is absent).
    pos = 0
    for pos, c in enumerate(aString):
        if c == char:
            break
    return pos

def dropBeforePos(aString, pos):
    # Return the substring after index pos (empty if pos is out of range).
    return aString[pos + 1:]

def dropAfterPos(aString, pos):
    # Return the substring before index pos (empty if pos is out of range).
    if pos < len(aString):
        return aString[:pos]
    return ''