# job-scrapper/lib/scrap_jobs.py (210 lines, 6.9 KiB, Python)

2024-06-13 09:14:04 +00:00
from helpers import *
2024-08-22 09:12:11 +00:00
from login import solveCaptcha
2024-08-23 10:13:15 +00:00
import mozilla
# Module-wide switch for the diagnostic output produced by log().
DEBUG = True


def log(*s):
    """Print the positional arguments when DEBUG is truthy.

    The arguments are handed to ``print`` as one tuple, so
    ``log("a", 1)`` writes ``('a', 1)``. Nothing is printed when DEBUG
    is falsy.
    """
    if not DEBUG:
        return
    print(s)
def scrap_indeed_com(url, entry, session):
    """Download one indeed.com result page and return its parsed job entries.

    Args:
        url: Address of the result page to download.
        entry: Config entry; only ``entry.tag`` is read here.
        session: A ``requests`` session, or the sentinel ``0`` / ``-1`` when
            the caller has none. In the sentinel case a temporary session is
            created for this call and closed again before parsing.

    Returns:
        The value produced by ``arrayToClass`` from the title, company,
        location, date and link columns plus the tag.
    """
    # The fallback session must exist *before* headers are assigned:
    # setting ``.headers`` on the 0 / -1 sentinel raises AttributeError.
    owns_session = session == 0 or session == -1
    if owns_session:
        session = requests.Session()
    try:
        moz_cookies = mozilla.getCookiesFromBrowser(url)
        print("[scrap]cookies:", moz_cookies)
        session.headers = {
            "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:127.0) Gecko/20100101 Firefox/127.0",
            "Referer": "https://ch.indeed.com/jobs?&from=searchOnHP",
            "Cookie": moz_cookies,
        }
        log("in scrap jobs,url", url)
        page = session.get(url)
        log(page)
        # NOTE(review): a return value of 1 is treated as "captcha not
        # passed", going by the message printed below - confirm against
        # login.solveCaptcha.
        if solveCaptcha(session, page) == 1:
            print("Cookie stealing unsuccesfull retry with force")
            moz_cookies = mozilla.getCookiesFromBrowser(url, force=True)
            # Actually use the freshly forced cookies: previously they were
            # fetched and then discarded, so the stale page was parsed.
            session.headers["Cookie"] = moz_cookies
            page = session.get(url)
            log(page)
    finally:
        if owns_session:
            session.close()

    soup = BeautifulSoup(page.content, "html.parser")
    results = soup.find_all("li", class_='css-5lfssm eu4oa1w0')  # top level list element

    # One item()/finder() pair per output column; the flags are passed
    # through to the helpers unchanged.
    location = item("div", {'data-testid': 'text-location'}, 0, "indeed location")
    ar_location = finder(results, location, ATTRS=1, LOCATION_CLEANUP=1)

    company = item("span", {'data-testid': 'company-name'}, 0, "indeed company")
    ar_company = finder(results, company, ATTRS=1)

    title = item("a", 'jcs-JobTitle', 0, "indeed title")
    ar_title = finder(results, title, GETCHILDREN="span")

    date = item("span", {'data-testid': 'myJobsStateDate'}, 0, "indeed date")
    ar_date = finder(results, date, ATTRS=1, INDEEDDATE=1)

    link = item("a", 'jcs-JobTitle', 0, "link")
    ar_link = finder(results, link, LINK=1, BASEURL="https://ch.indeed.com")

    tag = entry.tag  # get from config
    return arrayToClass(ar_title, ar_company, ar_location, ar_date, ar_link, tag)
2024-06-13 09:14:04 +00:00
def scrap_jobs(url, entry, session):
    """Download one jobs.ch result page and return its parsed job entries.

    Args:
        url: Address of the result page to download.
        entry: Config entry; only ``entry.tag`` is read here.
        session: A ``requests`` session, or the sentinel ``0`` / ``-1`` when
            the caller has none. In the sentinel case a temporary session is
            created for this call and closed again before parsing.

    Returns:
        The value produced by ``arrayToClass`` from the title, company,
        location, date and link columns plus the tag.
    """
    # The fallback session must exist *before* headers are assigned:
    # setting ``.headers`` on the 0 / -1 sentinel raises AttributeError.
    owns_session = session == 0 or session == -1
    if owns_session:
        session = requests.Session()
    try:
        session.headers = {
            "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:127.0) Gecko/20100101 Firefox/127.0"
        }
        log("in scrap jobs,url", url)
        page = session.get(url)
        log(page)
    finally:
        if owns_session:
            session.close()

    soup = BeautifulSoup(page.content, "html.parser")
    results = soup.find_all("div", attrs={'data-feat': 'searched_jobs'})

    # One item()/finder() pair per output column; the flags are passed
    # through to the helpers unchanged.
    location_class = "d_grid items_start gap_s12 grid-cols_[auto_1fr] px_s8"
    location = item("div", location_class, 0, "location")
    ar_location = finder(results, location, GETCHILDREN='p', LOCATION_CLEANUP=1)

    company_class = "mb_s12 lastOfType:mb_s0 textStyle_p2"
    company = item("p", company_class, 0, "company")
    ar_company = finder(results, company, DEFAULT=1, GETCHILDREN='strong')

    title = item("span", "text_link.brand.base", 0, "TITLE")
    ar_title = finder(results, title, DEFAULT=1)

    date = item("span", "pos_absolute", 0, "date")
    ar_date = finder(results, date, CLEANDATE=1)

    link = item("a", {'data-cy': 'job-link'}, 0, "link")
    ar_link = finder(results, link, LINK=1, ATTRS=1, BASEURL="https://jobs.ch")

    tag = entry.tag  # get from config
    return arrayToClass(ar_title, ar_company, ar_location, ar_date, ar_link, tag)
def next_url_indeed_com(url, session, baseurl):
    """Return the URL of the next indeed.com result page, or 0 if none.

    Args:
        url: Address of the current result page.
        session: A ``requests`` session, or the sentinel ``0`` / ``-1`` when
            the caller has none (a temporary session is used instead).
        baseurl: Prefix that is prepended to the link found in the page.

    Returns:
        ``baseurl`` + the "next page" link as a string, or the integer 0
        when no such link is found.
    """
    if session == 0 or session == -1:
        with requests.Session() as fallback:
            page = fallback.get(url)
    else:
        # Use the caller's session so its headers and cookies are sent;
        # a bare requests.get() would silently drop them.
        page = session.get(url)

    soup = BeautifulSoup(page.content, "html.parser")
    result_next = soup.find_all("nav", attrs={"role": "navigation"})
    next_ = item("a", {'data-testid': 'pagination-page-next'}, 0)
    next_link = finder(result_next, next_, ATTRS=1, LINK=1)

    # finder() reports a miss either as an empty result or as the
    # "NOTFound" placeholder in the first slot.
    if not next_link or next_link[0] == "NOTFound":
        return 0

    next_link_str = baseurl + str(next_link[0])
    log(next_link_str)
    return next_link_str if next_link_str != '' else 0
2024-06-13 09:14:04 +00:00
def next_url_jobs_ch(url, session, baseurl):
    """Return the URL of the next jobs.ch result page, or 0 if none.

    Args:
        url: Address of the current result page.
        session: A ``requests`` session, or the sentinel ``0`` / ``-1`` when
            the caller has none (a temporary session is used instead).
        baseurl: Prefix that is prepended to the link found in the page.

    Returns:
        ``baseurl`` + the "next page" link as a string, or the integer 0
        when no such link is found.
    """
    if session == 0 or session == -1:
        with requests.Session() as fallback:
            page = fallback.get(url)
    else:
        # Use the caller's session so its headers and cookies are sent;
        # a bare requests.get() would silently drop them.
        page = session.get(url)

    soup = BeautifulSoup(page.content, "html.parser")
    result_next = soup.find_all("div", attrs={"data-cy": "paginator"})
    next_ = item("a", {"data-cy": "paginator-next"}, 0)
    next_link = finder(result_next, next_, ATTRS=1, LINK=1)

    # finder() reports a miss either as an empty result or as the
    # "NOTFound" placeholder in the first slot.
    if not next_link or next_link[0] == "NOTFound":
        return 0

    next_link_str = baseurl + str(next_link[0])
    log(next_link_str)
    return next_link_str if next_link_str != '' else 0
def next_url_jobagent(base_url, session, c):  # deprecated, will be removed in the future
    """Return the ``href`` of the "Nächste Seite" button, or 0 if absent.

    Args:
        base_url: Address of the current result page.
        session: A ``requests`` session, or the sentinel ``0`` / ``-1`` when
            the caller has none (a temporary session is used instead).
        c: Unused; kept so existing callers keep working.

    Returns:
        The ``href`` attribute of the first matching anchor whose stripped
        text equals "Nächste Seite", or the integer 0 when there is none.
    """
    if session == 0 or session == -1:
        with requests.Session() as fallback:
            page = fallback.get(base_url)
    else:
        # Use the caller's session so its headers and cookies are sent;
        # a bare requests.get() would silently drop them.
        page = session.get(base_url)

    soup = BeautifulSoup(page.content, "html.parser")
    if soup.find("ul", class_="pagination") is None:
        print("pagination next not found, probably end of pages:")

    for anchor in soup.find_all("a", class_="btn btn-sm btn-secondary"):
        striped_string = anchor.text.strip()
        log(anchor.text.strip(), "stripped:", striped_string)
        log("Printable characters?", striped_string.isprintable())
        if striped_string == "Nächste Seite":
            log(anchor)
            log("url of next site")
            return anchor.get("href")
    return 0
def scrap_jobagent(url, entry, session):
    """Download one jobagent result page and return its parsed job entries.

    Args:
        url: Address of the result page to download.
        entry: Config entry; only ``entry.tag`` is read here.
        session: A ``requests`` session, or the sentinel ``0`` / ``-1`` when
            the caller has none. In the sentinel case a temporary session is
            created for this call and closed again after the download.

    Returns:
        The value produced by ``arrayToClass`` from the title, company,
        location, date and link columns plus the tag.
    """
    log("[scrap_jobagent],url", url)
    owns_session = session == 0 or session == -1
    if owns_session:
        log("session not sucessful transmitted ", session)
        session = requests.Session()
    try:
        # Fetch exactly once; the page used to be requested twice in a row.
        page = session.get(url)
    finally:
        if owns_session:
            session.close()
    log("[scrap_jobagent]page:", page)

    soup = BeautifulSoup(page.content, "html.parser")
    # Full-page and header dumps are diagnostics, so they follow DEBUG.
    if DEBUG:
        print(soup.prettify())
        print(session.headers)

    results = soup.find_all("li", class_="item")
    if not results:
        print("no li items found")
        log("page:", page)

    # One item()/finder() pair per output column; the flags are passed
    # through to the helpers unchanged.
    title = item("span", "jobtitle", 0, "jobagent title")
    ar_title = finder(results, title)

    location = item("span", "location", 0, "jobagent location")
    ar_location = finder(results, location, LOCATION_CLEANUP=1)

    company = item("span", "company", 0, "jobagent company")
    ar_company = finder(results, company, DEFAULT=1)

    link = item("a", "title", 0, "jobagent link")
    ar_link = finder(results, link, LINK=1)

    date = item("span", "pubdate", 0)
    ar_date = finder(results, date, SWAPDATE=1)

    tag = entry.tag
    return arrayToClass(ar_title, ar_company, ar_location, ar_date, ar_link, tag)
2024-08-22 09:12:11 +00:00