added indeed support
start extracting the date from "vor X days" strings, where date = today - X

parent 8d59514ddf
commit 42d11c1c8d
 lib/conf | 7 +++++++

@@ -1,3 +1,10 @@
+[quereinsteiger_indeed]
+USER = NONE
+PW = NONE
+LOGINURL = NONE
+SCRAPURL = https://ch.indeed.com/jobs?q=quereinsteiger+it&l=&from=searchOnDesktopSerp&vjk=7c069ecf7f0cabb6
+TAG = Informatiker, Quereinsteiger
+
 [jobs.ch_seilbahn]
 USER = NONE
 PW = NONE
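For reference, a minimal sketch of reading this new section with Python's standard configparser (assuming lib/conf is plain INI, which the KEY = VALUE lines suggest; the variable names are illustrative):

    import configparser

    config = configparser.ConfigParser()
    config.read("lib/conf")

    entry = config["quereinsteiger_indeed"]
    print(entry["SCRAPURL"])  # the indeed search URL above
    print(entry["TAG"])       # "Informatiker, Quereinsteiger"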
@@ -1,3 +1,5 @@
+import re
+import datetime
 def DateCHToUS(date):
     #01.02.2010 --> 2010-02-01
     day=""
@@ -12,3 +14,10 @@ def DateCHToUS(date):
     newdate = year+"-"+month+"-"+day
     return(newdate)

+
+def indeed_date(date):
+    # "vor X days" --> today - X days, as YYYY-MM-DD (same format DateCHToUS emits)
+    redate = re.search(r'\d+', date)
+    fixdate = datetime.date.today() - datetime.timedelta(days=int(redate.group()))
+    return fixdate.strftime("%Y-%m-%d")
+
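A usage sketch for the new helper (output is relative to the day it runs; the concrete date below assumes it runs on 2024-07-10):

    indeed_date("vor 3 days")   # -> '2024-07-07'

Note that re.search returns None when the scraped string contains no digits (e.g. a posting published today), so callers should guard against that before relying on .group().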
@@ -95,12 +95,13 @@ def finder(results,item,**modes):
         found = False
         for results in result:
             child = results.find(GETCHILDREN)
-            log(child)
+            log("[finder] search for '",GETCHILDREN,"' in: ",child)
             if child != None and found == False:
-                log("CHILD: ",child.text.strip())
+                log("CHILD text strip: ",child.text.strip())
                 found = True
                 content.append(child.text.strip())
         if found == False:
+            log("[finder] No matching Child found: ",child)
             content.append("CHILD_NOT_FOUND: " + GETCHILDREN)

     elif LOCATION_CLEANUP==1:
@@ -114,6 +115,7 @@ def finder(results,item,**modes):
             elif SWAPDATE==1:
                 content.append(DateCHToUS(result2.text.strip()))
             elif CLEANDATE==1:
+                log("[finder] pre cleandate:",result2.text.strip())
                 content.append(jobs_ch_clean_date(result2.text.strip()))
             else:
                 log(result2)
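For context, the GETCHILDREN branch these log calls instrument is a find-first-child-and-collect pattern; a self-contained sketch of the same logic (the tag name and marker string mirror the hunk above, the sample markup is made up):

    from bs4 import BeautifulSoup

    def first_child_text(results, tag):
        # mirror of the GETCHILDREN branch: return the first matching
        # child's text, else a CHILD_NOT_FOUND marker
        for row in results:
            child = row.find(tag)
            if child is not None:
                return child.text.strip()
        return "CHILD_NOT_FOUND: " + tag

    rows = BeautifulSoup('<li><a class="jcs-JobTitle"><span>Systemadministrator</span></a></li>',
                         "html.parser").find_all("li")
    print(first_child_text(rows, "span"))  # -> Systemadministrator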
@@ -4,31 +4,43 @@ DEBUG = True
 def log(*s):
     if DEBUG:
         print(s)
-def indeed_com(url,session):
+
+def scrap_indeed_com(url,entry,session):
+    log("[scrap_indeed_com] url: ",url)
     jobs = []
-    if(session == 0):
-        with requests.Session() as session:
-            page = session.get(url)
-            log(page)
-    else:
-        page = session.get(url)
-        log(page)
+    # if(session == 0):
+    with requests.Session() as session:
+        session.headers = {
+            "User-Agent" : "Mozilla/5.0 (X11; Linux x86_64; rv:127.0) Gecko/20100101 Firefox/127.0"
+        }
+        page = session.get(url)
+        log(page)
+    # else:
+    #     page = session.get(url)
+    #     log(page)
     soup = BeautifulSoup(page.content,"html.parser")
     #print(soup.prettify())

-    results = soup.find_all("li",class_= 'css-5lfssm eu4oa1w0')
+    results = soup.find_all("li",class_= 'css-5lfssm eu4oa1w0') #top level list element

-    location = item("p",{'data-testid':'text-location'},0)
-    ar_location = finder(results,location,LOCATION_CLEANUP=1,ATTRS=1)
+    location = item("div",{'data-testid':'text-location'},0,"indeed location")
+    ar_location = finder(results,location,ATTRS=1,LOCATION_CLEANUP=1)

-    company = item("p",{'data-testid':'company-name'},0)
-    ar_company = finder(results,location,ATTRS=1)
+    company = item("span",{'data-testid':'company-name'},0,"indeed company")
+    ar_company = finder(results,company,ATTRS=1)

-    title = item("a",'jobTitle',0)
-    ar_title = finder(results,location)
+    title = item("a",'jcs-JobTitle',0,"indeed title")
+    ar_title = finder(results,title,GETCHILDREN="span")

-    date = item("span","Span-sc-1ybanni-0 Text__span-sc-1lu7urs-12 Text-sc-1lu7urs-13 krGudM hUhFmL",0)
-    ar_date = finder(results,date,CLEANDATE=1)
+    date = item("span",{'data-testid':'myJobsStateDate'},0,"indeed date")
+    ar_date = finder(results,date,ATTRS=1)
+
+    link = item("a",'jcs-JobTitle',0,"link")
+    ar_link = finder(results,link,LINK=1,BASEURL="https://ch.indeed.com")
+
+    tag = entry.tag # get from config
+    return arrayToClass(ar_title,ar_company,ar_location,ar_date,ar_link,tag)
+
 def scrap_jobs(url,entry,session):
     jobs = []
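For reference, the ATTRS=1 lookups above boil down to plain BeautifulSoup attribute queries; a self-contained equivalent (the selectors come from the hunk, the sample markup is made up):

    from bs4 import BeautifulSoup

    html = '''
    <li class="css-5lfssm eu4oa1w0">
      <a class="jcs-JobTitle" href="/rc/clk?jk=123"><span>Netzwerktechniker</span></a>
      <span data-testid="company-name">Example AG</span>
      <div data-testid="text-location">Zürich, ZH</div>
    </li>
    '''
    soup = BeautifulSoup(html, "html.parser")
    for row in soup.find_all("li", class_="css-5lfssm eu4oa1w0"):
        print(row.find("a", class_="jcs-JobTitle").find("span").text.strip())
        print(row.find("span", attrs={"data-testid": "company-name"}).text.strip())
        print(row.find("div", attrs={"data-testid": "text-location"}).text.strip())

The data-testid attributes are likely more stable than an obfuscated utility class such as css-5lfssm eu4oa1w0, which Indeed can regenerate on any deploy.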
@@ -64,6 +76,28 @@ def scrap_jobs(url,entry,session):
     tag = entry.tag # get from config
     return arrayToClass(ar_title,ar_company,ar_location,ar_date,ar_link,tag)

+def next_url_indeed_com(url,session,baseurl):
+    next_link_str = ''
+    if(session == 0):
+        with requests.Session() as session:
+            page = session.get(url)
+    else:
+        page = requests.get(url)
+    soup = BeautifulSoup(page.content,"html.parser")
+    result_next = soup.findAll("nav",attrs={"role":"navigation"})
+    next_ = item("a",{'data-testid':'pagination-page-next'},0)
+    next_link = finder(result_next,next_,ATTRS=1,LINK=1)
+    if next_link:
+        if(next_link[0] != "NOTFound"):
+            next_link_str = str(next_link[0])
+            next_link_str = baseurl + next_link_str
+            log(next_link_str)
+        else:
+            return 0
+    if next_link_str != '':
+        return next_link_str
+    else:
+        return 0
 def next_url_jobs_ch(url,session,baseurl):
     next_link_str = ''
     if(session == 0):
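Taken together, the two new functions are meant to be driven in the same fetch/parse/next-page loop the runner uses for the other scrapers; a hedged sketch of that flow (entry.scrapurl is an assumed attribute name, derived from the SCRAPURL config key):

    url = entry.scrapurl  # assumed attribute, per the [quereinsteiger_indeed] section
    session = 0
    while url:
        jobs = scrap_indeed_com(url, entry, session)
        if jobs:
            writedb(jobs)
        # next_url_indeed_com returns 0 (falsy) when there is no next page
        url = next_url_indeed_com(url, session, "https://ch.indeed.com")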
@@ -23,6 +23,8 @@ def choose_scraper(entry,session):
             runner(entry,session,scrap_jobagent,next_url_jobagent)
         case 'https://www.jobagent.ch':
             runner(entry,session,scrap_jobagent,next_url_jobagent)
+        case 'https://ch.indeed.com':
+            runner(entry,session,scrap_indeed_com,next_url_indeed_com)

 def parse(**kwargs):
     session=0
@@ -95,7 +97,6 @@ def runner(entry,session,scrap_func,next_url_func):
     print(domain)
     if domain == 'https://www.jobagent.ch' or domain == 'https://software-job.ch':
         jobs = scrap_func(b_url,entry,session)
-        log("jobs passing to db:",jobs)
         if jobs:
             writedb(jobs)
         else:
@@ -108,6 +109,15 @@ def runner(entry,session,scrap_func,next_url_func):
         else:
             print("nothing found on this page")
         b_url = next_url_func(b_url,session,"https://www.jobs.ch")
+    elif domain == 'https://ch.indeed.com':
+        jobs = scrap_func(b_url,entry,session)
+        if jobs:
+            writedb(jobs)
+        else:
+            print("nothing found on this page")
+        b_url = next_url_func(b_url,session,domain)
+
+

     if b_url != 0:
         print("main:" + b_url)
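Worth noting: the new elif duplicates the jobs.ch branch verbatim except for the baseurl argument, where it passes domain instead of the hard-coded "https://www.jobs.ch". If the other next_url_* functions also accept the page's own domain as baseurl, the per-domain branches could collapse into one; a sketch under that assumption:

    # assumes every next_url_* function takes the page's own domain as baseurl
    jobs = scrap_func(b_url, entry, session)
    if jobs:
        writedb(jobs)
    else:
        print("nothing found on this page")
    b_url = next_url_func(b_url, session, domain)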