2024-06-13 09:14:04 +00:00
|
|
|
from helpers import *
|
2024-08-22 09:12:11 +00:00
|
|
|
from login import solveCaptcha
|
2024-07-18 09:26:13 +00:00
|
|
|
# Module-wide verbosity switch: when True, log() prints its arguments.
DEBUG = True
|
2024-06-13 09:14:04 +00:00
|
|
|
|
|
|
|
def log(*s):
    """Debug print helper: emit *s* (as a tuple) when the module-level
    DEBUG flag is set; otherwise do nothing."""
    if not DEBUG:
        return
    print(s)
|
2024-07-23 12:55:37 +00:00
|
|
|
|
|
|
|
def scrap_indeed_com(url, entry, session):
    """Scrape one indeed.com search-result page into job records.

    Parameters:
        url: the search-result page URL to fetch.
        entry: a config entry object; only ``entry.tag`` is read here.
        session: a requests.Session prepared by the caller, or 0 / -1 when
            no session is available (a throwaway session is then used for
            the single request).

    Returns whatever ``arrayToClass`` builds from the parallel arrays of
    title, company, location, date, link plus the config tag.
    """
    # Indeed blocks anonymous clients; reuse real browser cookies for this URL.
    moz_cookies = getCookiesFromBrowser(url)
    print("[scrap]cookies:", moz_cookies)
    session.headers = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:127.0) Gecko/20100101 Firefox/127.0",
        "Referer": "https://ch.indeed.com/jobs?&from=searchOnHP",
        "Cookie": moz_cookies,
    }
    log("in scrap jobs,url", url)
    if session == 0 or session == -1:
        # No usable session handed in: fetch with a temporary one.
        # NOTE(review): the temporary session is closed when the with-block
        # exits but is still passed to solveCaptcha below — confirm that
        # solveCaptcha tolerates a closed session.
        with requests.Session() as session:
            page = session.get(url)
            log(page)
    else:
        page = session.get(url)
        log(page)
    solveCaptcha(session, page)
    soup = BeautifulSoup(page.content, "html.parser")
    # Top-level <li> list elements, one per job card.
    results = soup.find_all("li", class_='css-5lfssm eu4oa1w0')

    location = item("div", {'data-testid': 'text-location'}, 0, "indeed location")
    ar_location = finder(results, location, ATTRS=1, LOCATION_CLEANUP=1)

    company = item("span", {'data-testid': 'company-name'}, 0, "indeed company")
    ar_company = finder(results, company, ATTRS=1)

    title = item("a", 'jcs-JobTitle', 0, "indeed title")
    ar_title = finder(results, title, GETCHILDREN="span")

    date = item("span", {'data-testid': 'myJobsStateDate'}, 0, "indeed date")
    ar_date = finder(results, date, ATTRS=1, INDEEDDATE=1)

    link = item("a", 'jcs-JobTitle', 0, "link")
    ar_link = finder(results, link, LINK=1, BASEURL="https://ch.indeed.com")

    tag = entry.tag  # get from config
    return arrayToClass(ar_title, ar_company, ar_location, ar_date, ar_link, tag)
|
2024-06-13 09:14:04 +00:00
|
|
|
|
|
|
|
def scrap_jobs(url, entry, session):
    """Scrape one jobs.ch search-result page into job records.

    Parameters:
        url: the search-result page URL to fetch.
        entry: a config entry object; only ``entry.tag`` is read here.
        session: a requests.Session prepared by the caller, or 0 / -1 when
            no session is available (a throwaway session is then used for
            the single request).

    Returns whatever ``arrayToClass`` builds from the parallel arrays of
    title, company, location, date, link plus the config tag.
    """
    session.headers = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:127.0) Gecko/20100101 Firefox/127.0"
    }
    log("in scrap jobs,url", url)
    if session == 0 or session == -1:
        # No usable session handed in: fetch with a temporary one.
        with requests.Session() as session:
            page = session.get(url)
            log(page)
    else:
        page = session.get(url)
        log(page)
    soup = BeautifulSoup(page.content, "html.parser")
    # Each result card sits in a <div data-feat="searched_jobs">.
    results = soup.find_all("div", attrs={'data-feat': 'searched_jobs'})

    # Class strings below mirror jobs.ch's utility-CSS markup and will need
    # updating whenever the site redesigns.
    location_class = "d_grid items_start gap_s12 grid-cols_[auto_1fr] px_s8"
    location = item("div", location_class, 0, "location")
    ar_location = finder(results, location, GETCHILDREN='p', LOCATION_CLEANUP=1)

    company_class = "mb_s12 lastOfType:mb_s0 textStyle_p2"
    company = item("p", company_class, 0, "company")
    ar_company = finder(results, company, DEFAULT=1, GETCHILDREN='strong')

    title = item("span", "text_link.brand.base", 0, "TITLE")
    ar_title = finder(results, title, DEFAULT=1)

    date = item("span", "pos_absolute", 0, "date")
    ar_date = finder(results, date, CLEANDATE=1)

    link = item("a", {'data-cy': 'job-link'}, 0, "link")
    ar_link = finder(results, link, LINK=1, ATTRS=1, BASEURL="https://jobs.ch")

    tag = entry.tag  # get from config
    return arrayToClass(ar_title, ar_company, ar_location, ar_date, ar_link, tag)
|
|
|
|
|
2024-07-23 12:55:37 +00:00
|
|
|
def next_url_indeed_com(url, session, baseurl):
    """Find the pagination 'next' link on an indeed.com result page.

    Parameters:
        url: current result-page URL.
        session: a requests.Session, or 0 to fetch with a throwaway session.
        baseurl: prefix prepended to the relative href that is found.

    Returns the absolute next-page URL, or 0 when no next link exists.
    """
    next_link_str = ''
    if session == 0:
        with requests.Session() as session:
            page = session.get(url)
    else:
        # BUG FIX: previously called requests.get(url), silently ignoring the
        # caller's session (and its headers/cookies). Use the session passed in,
        # matching the scrap_* functions.
        page = session.get(url)
    soup = BeautifulSoup(page.content, "html.parser")
    result_next = soup.findAll("nav", attrs={"role": "navigation"})
    next_ = item("a", {'data-testid': 'pagination-page-next'}, 0)
    next_link = finder(result_next, next_, ATTRS=1, LINK=1)
    if next_link:
        if next_link[0] != "NOTFound":
            next_link_str = baseurl + str(next_link[0])
            log(next_link_str)
    else:
        return 0
    if next_link_str != '':
        return next_link_str
    return 0
|
2024-06-13 09:14:04 +00:00
|
|
|
def next_url_jobs_ch(url, session, baseurl):
    """Find the pagination 'next' link on a jobs.ch result page.

    Parameters:
        url: current result-page URL.
        session: a requests.Session, or 0 to fetch with a throwaway session.
        baseurl: prefix prepended to the relative href that is found.

    Returns the absolute next-page URL, or 0 when no next link exists.
    """
    next_link_str = ''
    if session == 0:
        with requests.Session() as session:
            page = session.get(url)
    else:
        # BUG FIX: previously called requests.get(url), silently ignoring the
        # caller's session (and its headers/cookies). Use the session passed in,
        # matching the scrap_* functions.
        page = session.get(url)
    soup = BeautifulSoup(page.content, "html.parser")
    result_next = soup.findAll("div", attrs={"data-cy": "paginator"})
    next_ = item("a", {"data-cy": "paginator-next"}, 0)
    next_link = finder(result_next, next_, ATTRS=1, LINK=1)
    if next_link:
        if next_link[0] != "NOTFound":
            next_link_str = baseurl + str(next_link[0])
            log(next_link_str)
    else:
        return 0
    if next_link_str != '':
        return next_link_str
    return 0
|
|
|
|
|
|
|
|
def next_url_jobagent(base_url, session, c):  # deprecated, will be removed in the future
    """Find the 'Nächste Seite' (next page) link on a jobagent result page.

    Parameters:
        base_url: current result-page URL.
        session: a requests.Session, or 0 to fetch with a throwaway session.
        c: unused; kept for signature compatibility with callers.

    Returns the href of the next-page anchor, or 0 when none is found.
    """
    if session == 0:
        with requests.Session() as session:
            page = session.get(base_url)
    else:
        # BUG FIX: previously called requests.get(base_url), ignoring the
        # caller's session. Use the session passed in.
        page = session.get(base_url)
    soup = BeautifulSoup(page.content, "html.parser")
    results = soup.find("ul", class_="pagination")
    if results is None:
        print("pagination next not found, probably end of pages:")
    next_url_names = soup.find_all("a", class_="btn btn-sm btn-secondary")
    for anchor in next_url_names:
        striped_string = anchor.text.strip()
        log(anchor.text.strip(), "stripped:", striped_string)
        log("Printable characters?", striped_string.isprintable())
        if striped_string == "Nächste Seite":
            log(anchor)
            next_url = anchor.get("href")
            log("url of next site")
            # BUG FIX: an unreachable `break` after this return was removed,
            # along with an unused `pages`/`found` bookkeeping pair.
            return next_url
    return 0
|
|
|
|
|
|
|
|
def scrap_jobagent(url, entry, session):
    """Scrape one jobagent search-result page into job records.

    Parameters:
        url: the search-result page URL to fetch.
        entry: a config entry object; only ``entry.tag`` is read here.
        session: a requests.Session prepared by the caller, or 0 / -1 when
            no session is available (a throwaway session is then used for
            the single request).

    Returns whatever ``arrayToClass`` builds from the parallel arrays of
    title, company, location, date, link plus the config tag.
    """
    log("[scrap_jobagent],url", url)
    if session == 0 or session == -1:
        log("session not sucessful transmitted ", session)
        with requests.Session() as session:
            page = session.get(url)
            log(page)
    else:
        # BUG FIX: the page was fetched twice here (two consecutive
        # session.get(url) calls); the redundant request was removed.
        page = session.get(url)
    log("[scrap_jobagent]page:", page)
    soup = BeautifulSoup(page.content, "html.parser")
    print(soup.prettify())
    print(session.headers)
    results = soup.find_all("li", class_="item")
    if not results:
        print("no li items found")
        log("page:", page)

    title = item("span", "jobtitle", 0, "jobagent title")
    ar_title = finder(results, title)

    location = item("span", "location", 0, "jobagent location")
    ar_location = finder(results, location, LOCATION_CLEANUP=1)

    company = item("span", "company", 0, "jobagent company")
    ar_company = finder(results, company, DEFAULT=1)

    link = item("a", "title", 0, "jobagent link")
    ar_link = finder(results, link, LINK=1)

    date = item("span", "pubdate", 0)
    ar_date = finder(results, date, SWAPDATE=1)

    tag = entry.tag  # get from config
    return arrayToClass(ar_title, ar_company, ar_location, ar_date, ar_link, tag)
|
|
|
|
|
2024-08-22 09:12:11 +00:00
|
|
|
|