diff --git a/lib/conf b/lib/conf
index 0c824cd..ebdceb9 100644
--- a/lib/conf
+++ b/lib/conf
@@ -1,3 +1,10 @@
+[quereinsteiger_indeed]
+USER = NONE
+PW = NONE
+LOGINURL = NONE
+SCRAPURL = https://ch.indeed.com/jobs?q=quereinsteiger+it&l=&from=searchOnDesktopSerp&vjk=7c069ecf7f0cabb6
+TAG = Informatiker, Quereinsteiger
+
 [jobs.ch_seilbahn]
 USER = NONE
 PW = NONE
diff --git a/lib/dateconverter.py b/lib/dateconverter.py
index 9081d39..8a17d66 100644
--- a/lib/dateconverter.py
+++ b/lib/dateconverter.py
@@ -1,3 +1,5 @@
+import re
+import datetime
 def DateCHToUS(date):
     #01.02.2010 --> 2010-02-01
     day=""
@@ -12,3 +14,10 @@ def DateCHToUS(date):
     newdate = year+"-"+month+"-"+day
     return(newdate)
 
+def indeed_date(date):
+    #"vor 3 Tagen" --> date 3 days before today, same ISO format as DateCHToUS
+    redate = re.search(r'\d+',date)
+    days = int(redate.group()) if redate else 0
+    fixdate = (datetime.date.today() - datetime.timedelta(days=days)).strftime("%Y-%m-%d")
+    return fixdate
+
diff --git a/lib/helpers.py b/lib/helpers.py
index 39ff331..0535c83 100644
--- a/lib/helpers.py
+++ b/lib/helpers.py
@@ -95,12 +95,13 @@ def finder(results,item,**modes):
             found = False
             for results in result:
                 child = results.find(GETCHILDREN)
-                log(child)
+                log("[finder] search for '",GETCHILDREN,"' in: ",child)
                 if child != None and found == False:
-                    log("CHILD: ",child.text.strip())
+                    log("CHILD text strip: ",child.text.strip())
                     found = True
                     content.append(child.text.strip())
             if found == False:
+                log("[finder] no matching child found for: ",GETCHILDREN)
                 content.append("CHILD_NOT_FOUND: " + GETCHILDREN)
 
         elif LOCATION_CLEANUP==1:
@@ -114,6 +115,7 @@ def finder(results,item,**modes):
         elif SWAPDATE==1:
             content.append(DateCHToUS(result2.text.strip()))
         elif CLEANDATE==1:
+            log("[finder] pre cleandate: ",result2.text.strip())
             content.append(jobs_ch_clean_date(result2.text.strip()))
         else:
             log(result2)
diff --git a/lib/scrap_jobs.py b/lib/scrap_jobs.py
index 7760302..1d70472 100644
--- a/lib/scrap_jobs.py
+++ b/lib/scrap_jobs.py
@@ -4,31 +4,43 @@ DEBUG = True
 def log(*s):
     if DEBUG:
         print(s)
-def indeed_com(url,session):
+
+def scrap_indeed_com(url,entry,session):
+    log("[scrap_indeed_com] url: ",url)
     jobs = []
-    if(session == 0):
-        with requests.Session() as session:
-            page = session.get(url)
-            log(page)
-    else:
+    # if(session == 0):
+    with requests.Session() as session:
+        session.headers = {
+            "User-Agent" : "Mozilla/5.0 (X11; Linux x86_64; rv:127.0) Gecko/20100101 Firefox/127.0"
+        }
         page = session.get(url)
         log(page)
+#    else:
+#        page = session.get(url)
+#        log(page)
     soup = BeautifulSoup(page.content,"html.parser")
     #print(soup.prettify())
-    results = soup.find_all("li",class_= 'css-5lfssm eu4oa1w0')
+
+    results = soup.find_all("li",class_= 'css-5lfssm eu4oa1w0') #top level list element
 
-    location = item("p",{'data-testid':'text-location'},0)
-    ar_location = finder(results,location,LOCATION_CLEANUP=1,ATTRS=1)
+    location = item("div",{'data-testid':'text-location'},0,"indeed location")
+    ar_location = finder(results,location,ATTRS=1,LOCATION_CLEANUP=1)
 
-    company = item("p",{'data-testid':'company-name'},0)
-    ar_company = finder(results,location,ATTRS=1)
+    company = item("span",{'data-testid':'company-name'},0,"indeed company")
+    ar_company = finder(results,company,ATTRS=1)
 
-    title = item("a",'jobTitle',0)
-    ar_title = finder(results,location)
+    title = item("a",'jcs-JobTitle',0,"indeed title")
+    ar_title = finder(results,title,GETCHILDREN="span")
 
-    date = item("span","Span-sc-1ybanni-0 Text__span-sc-1lu7urs-12 Text-sc-1lu7urs-13 krGudM hUhFmL",0)
-    ar_date = finder(results,date,CLEANDATE=1)
+    date = item("span",{'data-testid':'myJobsStateDate'},0,"indeed date")
+    ar_date = finder(results,date,ATTRS=1)
+
+    link = item("a",'jcs-JobTitle',0,"link")
+    ar_link = finder(results,link,LINK=1,BASEURL="https://ch.indeed.com")
+
+    tag = entry.tag#get from config
+    return arrayToClass(ar_title,ar_company,ar_location,ar_date,ar_link,tag)
 
 def scrap_jobs(url,entry,session):
     jobs = []
@@ -64,6 +76,28 @@ def scrap_jobs(url,entry,session):
     tag = entry.tag#get from config
     return arrayToClass(ar_title,ar_company,ar_location,ar_date,ar_link,tag)
 
+def next_url_indeed_com(url,session,baseurl):
+    next_link_str = ''
+    if(session == 0):
+        with requests.Session() as session:
+            page = session.get(url)
+    else:
+        page = session.get(url)
+    soup = BeautifulSoup(page.content,"html.parser")
+    result_next = soup.find_all("nav",attrs={"role":"navigation"})
+    next_ = item("a",{'data-testid':'pagination-page-next'},0)
+    next_link = finder(result_next,next_,ATTRS=1,LINK=1)
+    if next_link:
+        if(next_link[0] != "NOTFound"):
+            next_link_str = str(next_link[0])
+            next_link_str = baseurl + next_link_str
+            log(next_link_str)
+        else:
+            return 0
+    if next_link_str != '':
+        return next_link_str
+    else:
+        return 0
 def next_url_jobs_ch(url,session,baseurl):
     next_link_str = ''
     if(session == 0):
diff --git a/lib/sysparse.py b/lib/sysparse.py
index b067237..c063524 100644
--- a/lib/sysparse.py
+++ b/lib/sysparse.py
@@ -23,6 +23,8 @@ def choose_scraper(entry,session):
             runner(entry,session,scrap_jobagent,next_url_jobagent)
         case 'https://www.jobagent.ch':
             runner(entry,session,scrap_jobagent,next_url_jobagent)
+        case 'https://ch.indeed.com':
+            runner(entry,session,scrap_indeed_com,next_url_indeed_com)
 
 def parse(**kwargs):
     session=0
@@ -95,7 +97,6 @@ def runner(entry,session,scrap_func,next_url_func):
         print(domain)
         if domain == 'https://www.jobagent.ch' or domain == 'https://software-job.ch':
             jobs = scrap_func(b_url,entry,session)
-            log("jobs passing to db:",jobs)
             if jobs:
                 writedb(jobs)
             else:
@@ -108,6 +109,15 @@ def runner(entry,session,scrap_func,next_url_func):
             else:
                 print("nothing found on this page")
             b_url = next_url_func(b_url,session,"https://www.jobs.ch")
+        elif domain == 'https://ch.indeed.com':
+            jobs = scrap_func(b_url,entry,session)
+            if jobs:
+                writedb(jobs)
+            else:
+                print("nothing found on this page")
+            b_url = next_url_func(b_url,session,domain)
+
+
         if b_url != 0:
             print("main:" + b_url)
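
Note: indeed_date converts Indeed's relative dates ("vor 3 Tagen") into the same ISO format DateCHToUS emits. A minimal smoke test, assuming the repo root is on sys.path so lib/ is importable; the sample strings are hypothetical and the exact wording Indeed serves may differ:

    # hypothetical smoke test for indeed_date, not part of the patch
    from lib.dateconverter import indeed_date

    print(indeed_date("vor 3 Tagen"))  # date three days before today, e.g. 2024-06-01
    print(indeed_date("Heute"))        # no digits found -> falls back to today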