added indeed support
start extracting the posting date from "vor X Tagen" strings, where date = today - X days
This commit is contained in:
parent
8d59514ddf
commit
42d11c1c8d
7
lib/conf
7
lib/conf
@ -1,3 +1,10 @@
|
||||
[quereinsteiger_indeed]
|
||||
USER = NONE
|
||||
PW = NONE
|
||||
LOGINURL = NONE
|
||||
SCRAPURL = https://ch.indeed.com/jobs?q=quereinsteiger+it&l=&from=searchOnDesktopSerp&vjk=7c069ecf7f0cabb6
|
||||
TAG = Informatiker, Quereinsteiger
|
||||
|
||||
[jobs.ch_seilbahn]
|
||||
USER = NONE
|
||||
PW = NONE
|
||||
|
@ -1,3 +1,5 @@
|
||||
import re
|
||||
import datetime
|
||||
def DateCHToUS(date):
|
||||
#01.02.2010 --> 2010-02-01
|
||||
day=""
|
||||
@ -12,3 +14,10 @@ def DateCHToUS(date):
|
||||
newdate = year+"-"+month+"-"+day
|
||||
return(newdate)
|
||||
|
||||
def indeed_date(date):
    """Convert an indeed relative date string ("vor 3 Tagen") to "YYYY-MM-DD".

    Extracts the first run of digits from *date*, interprets it as a number
    of days in the past, and returns today minus that many days formatted
    the same way DateCHToUS formats dates (YYYY-MM-DD).

    Returns *date* unchanged when it contains no digits (e.g. "Heute"),
    so the caller still gets a printable value instead of a crash.
    """
    # re.search, not re.match: the digits are NOT at the start of the
    # string ("vor 3 Tagen"), so match() would always return None here.
    redate = re.search(r'\d+', date)
    if redate is None:
        # no day offset present -- pass the original text through
        return date
    days_ago = int(redate.group())
    # subtract first, format afterwards; the old code tried to subtract a
    # timedelta from an already-formatted string
    fixdate = (datetime.date.today() - datetime.timedelta(days=days_ago)).strftime("%Y-%m-%d")
    return fixdate
|
||||
|
||||
|
||||
|
@ -95,12 +95,13 @@ def finder(results,item,**modes):
|
||||
found = False
|
||||
for results in result:
|
||||
child = results.find(GETCHILDREN)
|
||||
log(child)
|
||||
log("[finder] search for '",GETCHILDREN,"' in: ",child)
|
||||
if child != None and found == False:
|
||||
log("CHILD: ",child.text.strip())
|
||||
log("CHILD text strip: ",child.text.strip())
|
||||
found = True
|
||||
content.append(child.text.strip())
|
||||
if found == False:
|
||||
log("[finder] No matching Child found: ",child)
|
||||
content.append("CHILD_NOT_FOUND: " + GETCHILDREN)
|
||||
|
||||
elif LOCATION_CLEANUP==1:
|
||||
@ -114,6 +115,7 @@ def finder(results,item,**modes):
|
||||
elif SWAPDATE==1:
|
||||
content.append(DateCHToUS(result2.text.strip()))
|
||||
elif CLEANDATE==1:
|
||||
log("[finder] pre cleandate:",result2.text.strip)
|
||||
content.append(jobs_ch_clean_date(result2.text.strip()))
|
||||
else:
|
||||
log(result2)
|
||||
|
@ -4,31 +4,43 @@ DEBUG = True
|
||||
def log(*messages):
    """Debug helper: print *messages* (as a tuple) when module-level DEBUG is truthy."""
    if not DEBUG:
        return
    print(messages)
|
||||
def indeed_com(url,session):
|
||||
|
||||
def scrap_indeed_com(url,entry,session):
|
||||
log("[scrap_indeed_com] url: ",url)
|
||||
jobs = []
|
||||
if(session == 0):
|
||||
# if(session == 0):
|
||||
with requests.Session() as session:
|
||||
session.headers = {
|
||||
"User-Agent" : "Mozilla/5.0 (X11; Linux x86_64; rv:127.0) Gecko/20100101 Firefox/127.0"
|
||||
}
|
||||
page = session.get(url)
|
||||
log(page)
|
||||
else:
|
||||
page = session.get(url)
|
||||
log(page)
|
||||
# else:
|
||||
# page = session.get(url)
|
||||
# log(page)
|
||||
soup = BeautifulSoup(page.content,"html.parser")
|
||||
#print(soup.prettify())
|
||||
|
||||
results = soup.find_all("li",class_= 'css-5lfssm eu4oa1w0')
|
||||
|
||||
location = item("p",{'data-testid':'text-location'},0)
|
||||
ar_location = finder(results,location,LOCATION_CLEANUP=1,ATTRS=1)
|
||||
results = soup.find_all("li",class_= 'css-5lfssm eu4oa1w0') #top level list element
|
||||
|
||||
company = item("p",{'data-testid':'company-name'},0)
|
||||
ar_company = finder(results,location,ATTRS=1)
|
||||
location = item("div",{'data-testid':'text-location'},0,"indeed location")
|
||||
ar_location = finder(results,location,ATTRS=1,LOCATION_CLEANUP=1)
|
||||
|
||||
title = item("a",'jobTitle',0)
|
||||
ar_title = finder(results,location)
|
||||
company = item("span",{'data-testid':'company-name'},0,"indeed company")
|
||||
ar_company = finder(results,company,ATTRS=1)
|
||||
|
||||
date = item("span","Span-sc-1ybanni-0 Text__span-sc-1lu7urs-12 Text-sc-1lu7urs-13 krGudM hUhFmL",0)
|
||||
ar_date = finder(results,date,CLEANDATE=1)
|
||||
title = item("a",'jcs-JobTitle',0,"indeed title")
|
||||
ar_title = finder(results,title,GETCHILDREN="span")
|
||||
|
||||
date = item("span",{'data-testid':'myJobsStateDate'},0,"indeed date")
|
||||
ar_date = finder(results,date,ATTRS=1)
|
||||
|
||||
link = item("a",'jcs-JobTitle',0,"link")
|
||||
ar_link = finder(results,link,LINK=1,BASEURL="https://ch.indeed.com")
|
||||
|
||||
tag = entry.tag#get from config
|
||||
return arrayToClass(ar_title,ar_company,ar_location,ar_date,ar_link,tag)
|
||||
|
||||
def scrap_jobs(url,entry,session):
|
||||
jobs = []
|
||||
@ -64,6 +76,28 @@ def scrap_jobs(url,entry,session):
|
||||
tag = entry.tag#get from config
|
||||
return arrayToClass(ar_title,ar_company,ar_location,ar_date,ar_link,tag)
|
||||
|
||||
def next_url_indeed_com(url,session,baseurl):
    """Find the pagination "next" link on an indeed results page.

    Fetches *url* (creating a temporary requests.Session when *session* is 0,
    otherwise reusing the caller's session), looks for the
    data-testid="pagination-page-next" anchor inside the navigation element,
    and returns the absolute next-page URL (*baseurl* + relative link).
    Returns 0 when there is no next page.
    """
    next_link_str = ''
    if session == 0:
        # no session supplied -- use a throwaway one for this single request
        with requests.Session() as session:
            page = session.get(url)
    else:
        # BUG FIX: was requests.get(url), which silently ignored the caller's
        # session (and its User-Agent header/cookies). Reuse it, matching the
        # else-branch of scrap_indeed_com.
        page = session.get(url)
    soup = BeautifulSoup(page.content, "html.parser")
    result_next = soup.findAll("nav", attrs={"role": "navigation"})
    next_ = item("a", {'data-testid': 'pagination-page-next'}, 0)
    next_link = finder(result_next, next_, ATTRS=1, LINK=1)
    if next_link:
        if next_link[0] != "NOTFound":
            # finder returns a relative href; prefix with the site base URL
            next_link_str = baseurl + str(next_link[0])
            log(next_link_str)
        else:
            # finder's explicit "no such element" marker -> last page reached
            return 0
    if next_link_str != '':
        return next_link_str
    return 0
|
||||
def next_url_jobs_ch(url,session,baseurl):
|
||||
next_link_str = ''
|
||||
if(session == 0):
|
||||
|
@ -23,6 +23,8 @@ def choose_scraper(entry,session):
|
||||
runner(entry,session,scrap_jobagent,next_url_jobagent)
|
||||
case 'https://www.jobagent.ch':
|
||||
runner(entry,session,scrap_jobagent,next_url_jobagent)
|
||||
case 'https://ch.indeed.com':
|
||||
runner(entry,session,scrap_indeed_com,next_url_indeed_com)
|
||||
|
||||
def parse(**kwargs):
|
||||
session=0
|
||||
@ -95,7 +97,6 @@ def runner(entry,session,scrap_func,next_url_func):
|
||||
print(domain)
|
||||
if domain == 'https://www.jobagent.ch' or domain == 'https://software-job.ch':
|
||||
jobs = scrap_func(b_url,entry,session)
|
||||
log("jobs passing to db:",jobs)
|
||||
if jobs:
|
||||
writedb(jobs)
|
||||
else:
|
||||
@ -108,6 +109,15 @@ def runner(entry,session,scrap_func,next_url_func):
|
||||
else:
|
||||
print("nothing found on this page")
|
||||
b_url = next_url_func(b_url,session,"https://www.jobs.ch")
|
||||
elif domain == 'https://ch.indeed.com':
|
||||
jobs = scrap_func(b_url,entry,session)
|
||||
if jobs:
|
||||
writedb(jobs)
|
||||
else:
|
||||
print("nothing found on this page")
|
||||
b_url = next_url_func(b_url,session,domain)
|
||||
|
||||
|
||||
|
||||
if b_url != 0:
|
||||
print("main:" + b_url)
|
||||
|
Loading…
Reference in New Issue
Block a user