job-scrapper/lib/scrap_jobs.py

from helpers import *
DEBUG = True  # module-wide switch: log() only prints while this is true


def log(*s):
    """Print the given values to stdout when DEBUG is enabled.

    The values are written space-separated on one line, exactly as
    ``print(*s)`` would; nothing is written when DEBUG is false.
    """
    if DEBUG:
        # Unpack the arguments so the values themselves are printed rather
        # than the repr of the argument tuple, e.g. "url: x" instead of
        # "('url: ', 'x')".
        print(*s)

def scrap_indeed_com(url,entry,session):
    """Download an indeed.com result page and collect its job postings.

    ``session`` is accepted but not used: a fresh ``requests.Session``
    whose headers consist of a single Firefox User-Agent entry is opened
    for every call. ``entry`` only contributes its ``tag`` attribute. The
    return value is whatever ``arrayToClass`` produces from the extracted
    title, company, location, date and link lists.
    """
    log("[scrap_indeed_com] url: ",url)
    user_agent = "Mozilla/5.0 (X11; Linux x86_64; rv:127.0) Gecko/20100101 Firefox/127.0"
    # NOTE(review): the caller's session is never consulted here -- confirm
    # that always opening a new session is intended.
    with requests.Session() as http:
        http.headers = {"User-Agent" : user_agent}
        page = http.get(url)
        log(page)

    soup = BeautifulSoup(page.content,"html.parser")
    # Top level list elements that are searched for the individual fields.
    postings = soup.find_all("li",class_='css-5lfssm eu4oa1w0')

    locations = finder(
        postings,
        item("div",{'data-testid':'text-location'},0,"indeed location"),
        ATTRS=1,
        LOCATION_CLEANUP=1,
    )
    companies = finder(
        postings,
        item("span",{'data-testid':'company-name'},0,"indeed company"),
        ATTRS=1,
    )
    titles = finder(
        postings,
        item("a",'jcs-JobTitle',0,"indeed title"),
        GETCHILDREN="span",
    )
    dates = finder(
        postings,
        item("span",{'data-testid':'myJobsStateDate'},0,"indeed date"),
        ATTRS=1,
    )
    # Links are looked up on the same anchor element as the titles.
    links = finder(
        postings,
        item("a",'jcs-JobTitle',0,"link"),
        LINK=1,
        BASEURL="https://ch.indeed.com",
    )

    # The tag comes from the config entry.
    return arrayToClass(titles,companies,locations,dates,links,entry.tag)

def scrap_jobs(url,entry,session):
    """Download a jobs.ch result page and collect its job postings.

    ``session`` is either ``0`` (a temporary ``requests.Session`` is opened
    for this one request) or an object whose ``get`` method performs the
    request. ``entry`` only contributes its ``tag`` attribute. The return
    value is whatever ``arrayToClass`` produces from the extracted title,
    company, location, date and link lists.
    """
    log("in scrap jobs,url",url)
    if session != 0:
        page = session.get(url)
        log(page)
    else:
        with requests.Session() as http:
            page = http.get(url)
            log(page)

    soup = BeautifulSoup(page.content,"html.parser")
    postings = soup.find_all("div",attrs={'data-feat':'searched_jobs'})

    # Location and company are both searched with this same class string.
    text_class = "P-sc-hyu5hk-0 Text__p2-sc-1lu7urs-10 Span-sc-1ybanni-0 Text__span-sc-1lu7urs-12 Text-sc-1lu7urs-13 jZCxUn"
    locations = finder(
        postings,
        item("p",text_class,0,"location"),
        LOCATION_CLEANUP=1,
    )
    companies = finder(
        postings,
        item("p",text_class,0,"company"),
        DEFAULT=1,
        GETCHILDREN='strong',
    )
    titles = finder(
        postings,
        item("span","jlFpCz",0,"TITLE"),
        DEFAULT=1,
    )
    dates = finder(
        postings,
        item("span","Span-sc-1ybanni-0 Text__span-sc-1lu7urs-12 Text-sc-1lu7urs-13 krGudM hUhFmL",0,"date"),
        CLEANDATE=1,
    )
    links = finder(
        postings,
        item("a",{'data-cy' :'job-link'},0,"link"),
        LINK=1,
        ATTRS=1,
        BASEURL="https://jobs.ch",
    )

    # The tag comes from the config entry.
    return arrayToClass(titles,companies,locations,dates,links,entry.tag)

def next_url_indeed_com(url,session,baseurl):
    """Return the URL of the next indeed.com result page, or 0 if none.

    Args:
        url: Result page whose pagination is inspected.
        session: Object whose ``get`` method performs the request, or ``0``
            to open a temporary ``requests.Session`` for this call.
        baseurl: Prefix that is prepended to the link returned by ``finder``.

    Returns:
        ``baseurl`` concatenated with the next-page link as a string, or
        ``0`` when no next-page link was found.
    """
    if session == 0:
        with requests.Session() as temp_session:
            page = temp_session.get(url)
    else:
        # Go through the supplied session (not the bare requests.get) so
        # the request is made the same way as in the scrap_* functions.
        page = session.get(url)

    soup = BeautifulSoup(page.content,"html.parser")
    result_next = soup.find_all("nav",attrs={"role":"navigation"})
    next_ = item("a",{'data-testid':'pagination-page-next'},0)
    next_link = finder(result_next,next_,ATTRS=1,LINK=1)

    # "NOTFound" is presumably finder's placeholder for a missing element.
    if not next_link or next_link[0] == "NOTFound":
        return 0

    next_link_str = baseurl + str(next_link[0])
    log(next_link_str)
    # An empty result is reported as 0, like every other "no link" case.
    return next_link_str or 0
def next_url_jobs_ch(url,session,baseurl):
    """Return the URL of the next jobs.ch result page, or 0 if none.

    Args:
        url: Result page whose paginator is inspected.
        session: Object whose ``get`` method performs the request, or ``0``
            to open a temporary ``requests.Session`` for this call.
        baseurl: Prefix that is prepended to the link returned by ``finder``.

    Returns:
        ``baseurl`` concatenated with the next-page link as a string, or
        ``0`` when no next-page link was found.
    """
    if session == 0:
        with requests.Session() as temp_session:
            page = temp_session.get(url)
    else:
        # Go through the supplied session (not the bare requests.get) so
        # the request is made the same way as in the scrap_* functions.
        page = session.get(url)

    soup = BeautifulSoup(page.content,"html.parser")
    result_next = soup.find_all("div",attrs={"data-cy":"paginator"})
    next_ = item("a",{"data-cy":"paginator-next"},0)
    next_link = finder(result_next,next_,ATTRS=1,LINK=1)

    # "NOTFound" is presumably finder's placeholder for a missing element.
    if not next_link or next_link[0] == "NOTFound":
        return 0

    next_link_str = baseurl + str(next_link[0])
    log(next_link_str)
    # An empty result is reported as 0, like every other "no link" case.
    return next_link_str or 0

def next_url_jobagent(base_url,session,c):#deprecated, will be removed in the future
    """Return the href of the "Nächste Seite" (next page) link, or 0.

    Args:
        base_url: Page whose pagination buttons are inspected.
        session: Object whose ``get`` method performs the request, or ``0``
            to open a temporary ``requests.Session`` for this call.
        c: Not used; kept so existing callers keep working.

    Returns:
        The ``href`` attribute of the first matching button exactly as
        found in the page, or ``0`` when no such button exists.
    """
    if session == 0:
        with requests.Session() as temp_session:
            page = temp_session.get(base_url)
    else:
        # Go through the supplied session (not the bare requests.get) so
        # the request is made the same way as in the scrap_* functions.
        page = session.get(base_url)

    soup = BeautifulSoup(page.content,"html.parser")

    # Purely informational: the button search below runs either way.
    if soup.find("ul",class_="pagination") is None:
        print("pagination next not found, probably end of pages:")

    for anchor in soup.find_all("a",class_="btn btn-sm btn-secondary"):
        label = anchor.text.strip()
        log(label,"stripped:",label)
        log("Printable characters?",label.isprintable())
        if label == "Nächste Seite":
            log(anchor)
            next_url = anchor.get("href")
            log("url of next site")
            return next_url

    return 0

def scrap_jobagent(url,entry,session):
    """Download a jobagent result page and collect its job postings.

    ``session`` is either ``0`` (a temporary ``requests.Session`` is opened
    for this one request) or an object whose ``get`` method performs the
    request. ``entry`` only contributes its ``tag`` attribute. The return
    value is whatever ``arrayToClass`` produces from the extracted title,
    company, location, date and link lists.
    """
    log("in scrap jobs,url",url)
    if session != 0:
        page = session.get(url)
        log(page)
    else:
        with requests.Session() as http:
            page = http.get(url)
            log(page)

    soup = BeautifulSoup(page.content,"html.parser")
    postings = soup.find_all("li",class_="item")
    if not postings:
        # Extraction still runs below, just on an empty result list.
        print("no li items found")
        log("page:",page)

    titles = finder(postings,item("span","jobtitle",0,"jobagent title"))
    locations = finder(
        postings,
        item("span","location",0,"jobagent location"),
        LOCATION_CLEANUP=1,
    )
    companies = finder(
        postings,
        item("span","company",0,"jobagent company"),
        DEFAULT=1,
    )
    links = finder(postings,item("a","title",0,"jobagent link"),LINK=1)
    dates = finder(postings,item("span","pubdate",0),SWAPDATE=1)

    return arrayToClass(titles,companies,locations,dates,links,entry.tag)
Initial commit 2024-06-13 09:14:04 +00:00			`from helpers import *`
- change search classes - change some to attributes - implement better debuging solution in finder() 2024-07-18 09:26:13 +00:00			`DEBUG = True`
Initial commit 2024-06-13 09:14:04 +00:00
			`def log(*s):`
			`if DEBUG:`
			`print(s)`
added indeed suport start extracting date from "vor X days" where time = today-X 2024-07-23 12:55:37 +00:00
			`def scrap_indeed_com(url,entry,session):`
			`log("[scrap_indeed_com] url: ",url)`
Initial commit 2024-06-13 09:14:04 +00:00			`jobs = []`
added indeed suport start extracting date from "vor X days" where time = today-X 2024-07-23 12:55:37 +00:00			`# if(session == 0):`
			`with requests.Session() as session:`
			`session.headers = {`
			`"User-Agent" : "Mozilla/5.0 (X11; Linux x86_64; rv:127.0) Gecko/20100101 Firefox/127.0"`
			`}`
Initial commit 2024-06-13 09:14:04 +00:00			`page = session.get(url)`
logic error when pubdate not found 2024-06-17 08:27:13 +00:00			`log(page)`
added indeed suport start extracting date from "vor X days" where time = today-X 2024-07-23 12:55:37 +00:00			`# else:`
			`# page = session.get(url)`
			`# log(page)`
Initial commit 2024-06-13 09:14:04 +00:00			`soup = BeautifulSoup(page.content,"html.parser")`
			`#print(soup.prettify())`

added indeed suport start extracting date from "vor X days" where time = today-X 2024-07-23 12:55:37 +00:00
			`results = soup.find_all("li",class_= 'css-5lfssm eu4oa1w0') #top level list element`
Initial commit 2024-06-13 09:14:04 +00:00
added indeed suport start extracting date from "vor X days" where time = today-X 2024-07-23 12:55:37 +00:00			`location = item("div",{'data-testid':'text-location'},0,"indeed location")`
			`ar_location = finder(results,location,ATTRS=1,LOCATION_CLEANUP=1)`
Initial commit 2024-06-13 09:14:04 +00:00
added indeed suport start extracting date from "vor X days" where time = today-X 2024-07-23 12:55:37 +00:00			`company = item("span",{'data-testid':'company-name'},0,"indeed company")`
			`ar_company = finder(results,company,ATTRS=1)`
Initial commit 2024-06-13 09:14:04 +00:00
added indeed suport start extracting date from "vor X days" where time = today-X 2024-07-23 12:55:37 +00:00			`title = item("a",'jcs-JobTitle',0,"indeed title")`
			`ar_title = finder(results,title,GETCHILDREN="span")`
Initial commit 2024-06-13 09:14:04 +00:00
added indeed suport start extracting date from "vor X days" where time = today-X 2024-07-23 12:55:37 +00:00			`date = item("span",{'data-testid':'myJobsStateDate'},0,"indeed date")`
			`ar_date = finder(results,date,ATTRS=1)`

			`link = item("a",'jcs-JobTitle',0,"link")`
			`ar_link = finder(results,link,LINK=1,BASEURL="https://ch.indeed.com")`

			`tag = entry.tag#get from config`
			`return arrayToClass(ar_title,ar_company,ar_location,ar_date,ar_link,tag)`
Initial commit 2024-06-13 09:14:04 +00:00
			`def scrap_jobs(url,entry,session):`
			`jobs = []`
			`log("in scrap jobs,url",url)`
			`if(session == 0):`
			`with requests.Session() as session:`
			`page = session.get(url)`
			`log(page)`
			`else:`
			`page = session.get(url)`
			`log(page)`
			`soup = BeautifulSoup(page.content,"html.parser")`
			`#print(soup.prettify())`

- change search classes - change some to attributes - implement better debuging solution in finder() 2024-07-18 09:26:13 +00:00			`results = soup.find_all("div",attrs={'data-feat':'searched_jobs'})`
Initial commit 2024-06-13 09:14:04 +00:00
			`location_class = "P-sc-hyu5hk-0 Text__p2-sc-1lu7urs-10 Span-sc-1ybanni-0 Text__span-sc-1lu7urs-12 Text-sc-1lu7urs-13 jZCxUn"`
sdf 2024-07-18 11:48:44 +00:00			`location = item("p",location_class,0,"location")`
Initial commit 2024-06-13 09:14:04 +00:00			`ar_location = finder(results,location,LOCATION_CLEANUP=1)`
			`company_class = "P-sc-hyu5hk-0 Text__p2-sc-1lu7urs-10 Span-sc-1ybanni-0 Text__span-sc-1lu7urs-12 Text-sc-1lu7urs-13 jZCxUn"`
sdf 2024-07-18 11:48:44 +00:00			`company = item("p",company_class,0,"company")`
-implement GETCHILD from a tag like <strong>sdfsafd</strong> 2024-07-18 11:30:20 +00:00			`ar_company = finder(results,company,DEFAULT=1,GETCHILDREN='strong')`
Initial commit 2024-06-13 09:14:04 +00:00
sdf 2024-07-18 11:48:44 +00:00			`title = item("span","jlFpCz",0,"TITLE")`
Initial commit 2024-06-13 09:14:04 +00:00			`ar_title = finder(results,title,DEFAULT=1)`

sdf 2024-07-18 11:48:44 +00:00			`date = item("span","Span-sc-1ybanni-0 Text__span-sc-1lu7urs-12 Text-sc-1lu7urs-13 krGudM hUhFmL",0,"date")`
Initial commit 2024-06-13 09:14:04 +00:00			`ar_date = finder(results,date,CLEANDATE=1)`

sdf 2024-07-18 11:48:44 +00:00			`link = item("a",{'data-cy' :'job-link'},0,"link")`
- change search classes - change some to attributes - implement better debuging solution in finder() 2024-07-18 09:26:13 +00:00			`ar_link = finder(results,link,LINK=1,ATTRS=1,BASEURL="https://jobs.ch")`
Initial commit 2024-06-13 09:14:04 +00:00
			`tag = entry.tag#get from config`
			`return arrayToClass(ar_title,ar_company,ar_location,ar_date,ar_link,tag)`

added indeed suport start extracting date from "vor X days" where time = today-X 2024-07-23 12:55:37 +00:00			`def next_url_indeed_com(url,session,baseurl):`
			`next_link_str = ''`
			`if(session == 0):`
			`with requests.Session() as session:`
			`page = session.get(url)`
			`else:`
			`page = requests.get(url)`
			`soup = BeautifulSoup(page.content,"html.parser")`
			`result_next = soup.findAll("nav",attrs={"role":"navigation"})`
			`next_=item("a",{'data-testid':'pagination-page-next'},0)`
			`next_link = finder(result_next,next_,ATTRS=1,LINK=1)`
			`if next_link:`
			`if(next_link[0] != "NOTFound"):`
			`next_link_str = str(next_link[0])`
			`next_link_str = baseurl + next_link_str`
			`log(next_link_str)`
			`else:`
			`return 0`
			`if next_link_str != '':`
			`return next_link_str`
			`else:`
			`return 0`
Initial commit 2024-06-13 09:14:04 +00:00			`def next_url_jobs_ch(url,session,baseurl):`
			`next_link_str = ''`
			`if(session == 0):`
			`with requests.Session() as session:`
			`page = session.get(url)`
			`else:`
			`page = requests.get(url)`
			`soup = BeautifulSoup(page.content,"html.parser")`
			`result_next = soup.findAll("div",attrs={"data-cy":"paginator"})`
			`next_=item("a",{"data-cy":"paginator-next"},0)`
			`next_link = finder(result_next,next_,ATTRS=1,LINK=1)`
			`if next_link:`
			`if(next_link[0] != "NOTFound"):`
			`next_link_str = str(next_link[0])`
			`next_link_str = baseurl + next_link_str`
			`log(next_link_str)`
			`else:`
			`return 0`
			`if next_link_str != '':`
			`return next_link_str`
			`else:`
			`return 0`

			`def next_url_jobagent(base_url,session,c):#depreacted will be removed in the future`
			`found = False`

			`if(session == 0):`
			`with requests.Session() as session:`
			`page = session.get(base_url)`
			`else:`
			`page = requests.get(base_url)`

			`soup = BeautifulSoup(page.content,"html.parser")`
			`results = soup.find("ul",class_="pagination")`

			`if(results != None):`
			`pages = results.text`
			`if(results == None):`
			`print("pagination next not found, probably end of pages:")`

			`next_url_names = soup.find_all("a",class_="btn btn-sm btn-secondary")`
			`for i2 in next_url_names:`
			`striped_string = i2.text.strip()`
			`log(i2.text.strip(),"stripped:",striped_string)`
logic error when pubdate not found 2024-06-17 08:27:13 +00:00			`log("Printable characters?",striped_string.isprintable())`
Initial commit 2024-06-13 09:14:04 +00:00			`if (striped_string) == "Nächste Seite":`
			`log(i2)`
			`next_url = i2.get("href")`
			`log("url of next site")`
			`found = True`
			`return next_url`
			`break`

			`if found == False:`
			`return 0`

			`def scrap_jobagent(url,entry,session):`
			`jobs = []`
			`log("in scrap jobs,url",url)`
			`if(session == 0):`
			`with requests.Session() as session:`
			`page = session.get(url)`
			`log(page)`
			`else:`
			`page = session.get(url)`
			`log(page)`
			`soup = BeautifulSoup(page.content,"html.parser")`
			`#print(soup.prettify())`

			`results = soup.find_all("li",class_="item")`
logic error when pubdate not found 2024-06-17 08:22:28 +00:00			`if not results:`
			`print("no li items found")`
logic error when pubdate not found 2024-06-17 08:27:13 +00:00			`log("page:",page)`
Initial commit 2024-06-13 09:14:04 +00:00
sdf 2024-07-18 11:48:44 +00:00			`title = item("span","jobtitle",0,"jobagent title")`
Initial commit 2024-06-13 09:14:04 +00:00			`ar_title = finder(results,title)`

sdf 2024-07-18 11:48:44 +00:00			`location = item("span","location",0,"jobagent location")`
Initial commit 2024-06-13 09:14:04 +00:00			`ar_location = finder(results,location,LOCATION_CLEANUP=1)`

sdf 2024-07-18 11:48:44 +00:00			`company = item("span","company",0,"jobagent company")`
Initial commit 2024-06-13 09:14:04 +00:00			`ar_company = finder(results,company,DEFAULT=1)`

sdf 2024-07-18 11:48:44 +00:00			`link = item("a","title",0,"jobagent link")`
Initial commit 2024-06-13 09:14:04 +00:00			`ar_link = finder(results,link,LINK=1)`

			`date = item("span","pubdate",0)`
			`ar_date = finder(results,date,SWAPDATE=1)`
			`tag = entry.tag`

			`return arrayToClass(ar_title,ar_company,ar_location,ar_date,ar_link,tag)`