160 lines
5.2 KiB
Python
160 lines
5.2 KiB
Python
from helpers import *
|
|
# Debug switch: when truthy, log() prints its arguments to stdout.
DEBUG = False
|
|
|
|
def log(*s):
    """Print the positional arguments (as one tuple) when DEBUG is truthy.

    Does nothing when DEBUG is falsy.
    """
    if not DEBUG:
        return
    print(s)
|
|
def indeed_com(url,session):
    """Download a result page and extract location, company, title and date lists.

    url: address of the result page to download.
    session: an object with a ``get`` method to reuse for the request, or 0 to
        open a temporary ``requests.Session`` for this single request.

    NOTE(review): the extracted lists are not returned or stored, so the
    function currently returns None. It looks unfinished (no link extraction,
    no tag) -- confirm the intended return value before wiring it up.
    """
    if session == 0:
        with requests.Session() as session:
            page = session.get(url)
            log(page)
    else:
        page = session.get(url)
        log(page)
    soup = BeautifulSoup(page.content,"html.parser")

    results = soup.find_all("li",class_='css-5lfssm eu4oa1w0')

    location = item("p",{'data-testid':'text-location'},0)
    ar_location = finder(results,location,LOCATION_CLEANUP=1,ATTRS=1)

    # Fixed: the company and title lookups previously passed the *location*
    # descriptor to finder(), so both lists were filled with location data.
    company = item("p",{'data-testid':'company-name'},0)
    ar_company = finder(results,company,ATTRS=1)

    title = item("a",'jobTitle',0)
    ar_title = finder(results,title)

    # NOTE(review): this class string is identical to the one scrap_jobs uses
    # for its date lookup -- confirm it matches the markup of this site.
    date = item("span","Span-sc-1ybanni-0 Text__span-sc-1lu7urs-12 Text-sc-1lu7urs-13 krGudM hUhFmL",0)
    ar_date = finder(results,date,CLEANDATE=1)
|
|
|
|
def scrap_jobs(url,entry,session):
    """Fetch a search-result page and turn its job cards into job objects.

    url: address of the result page to download.
    entry: config entry; only its ``tag`` attribute is read here.
    session: an object with a ``get`` method to reuse for the request, or 0 to
        open a temporary ``requests.Session`` for this single request.

    Returns whatever ``arrayToClass`` builds from the extracted title, company,
    location, date and link lists together with the entry's tag.
    """
    log("in scrap jobs,url",url)

    if session != 0:
        page = session.get(url)
        log(page)
    else:
        with requests.Session() as session:
            page = session.get(url)
            log(page)

    soup = BeautifulSoup(page.content,"html.parser")
    cards = soup.find_all("div",attrs={"data-feat":"searched_jobs"})

    # Location and company use the same CSS class string; the two lookups
    # differ only in the third item() argument (0 vs 3).
    shared_p_class = "P-sc-hyu5hk-0 Text__p2-sc-1lu7urs-10 Span-sc-1ybanni-0 Text__span-sc-1lu7urs-12 Text-sc-1lu7urs-13 jZCxUn"
    ar_location = finder(cards,item("p",shared_p_class,0),LOCATION_CLEANUP=1)
    ar_company = finder(cards,item("p",shared_p_class,3),DEFAULT=1)

    title_class = "Span-sc-1ybanni-0 Text__span-sc-1lu7urs-12 Text-sc-1lu7urs-13 VacancyItem___StyledText2-sc-iugtv6-5 iaJYDR jlFpCz dMwMcR"
    ar_title = finder(cards,item("span",title_class,0),DEFAULT=1)

    date_class = "Span-sc-1ybanni-0 Text__span-sc-1lu7urs-12 Text-sc-1lu7urs-13 krGudM hUhFmL"
    ar_date = finder(cards,item("span",date_class,0),CLEANDATE=1)

    link_class = "Link__ExtendedRR6Link-sc-czsz28-1 khAvCu Link-sc-czsz28-2 VacancyLink___StyledLink-sc-ufp08j-0 dXKwhi dDgwgk"
    ar_link = finder(cards,item("a",link_class,0),LINK=1,BASEURL="https://jobs.ch")

    # The tag comes from the config entry.
    return arrayToClass(ar_title,ar_company,ar_location,ar_date,ar_link,entry.tag)
|
|
|
|
def next_url_jobs_ch(url,session,baseurl):
    """Return the URL of the next result page, or 0 when there is none.

    url: address of the current result page.
    session: an object with a ``get`` method to reuse for the request, or 0 to
        open a temporary ``requests.Session`` for this single request.
    baseurl: prefix prepended to the link found in the paginator.

    Returns ``baseurl`` + the paginator's "next" link as a string, or 0 if no
    such link was found.
    """
    if session == 0:
        with requests.Session() as session:
            page = session.get(url)
    else:
        # Fixed: use the caller's session instead of a bare requests.get(),
        # so the request goes through the same session as the scrape
        # functions' requests.
        page = session.get(url)

    soup = BeautifulSoup(page.content,"html.parser")
    result_next = soup.find_all("div",attrs={"data-cy":"paginator"})
    next_ = item("a",{"data-cy":"paginator-next"},0)
    next_link = finder(result_next,next_,ATTRS=1,LINK=1)

    # "NOTFound" is the placeholder this code treats as "no link extracted".
    if not next_link or next_link[0] == "NOTFound":
        return 0

    next_link_str = baseurl + str(next_link[0])
    log(next_link_str)
    # An empty combined string still counts as "no next page".
    return next_link_str if next_link_str != '' else 0
|
|
|
|
def next_url_jobagent(base_url,session,c):  # deprecated, will be removed in the future
    """Return the href of the "Nächste Seite" (next page) button, or 0 if absent.

    base_url: address of the current result page.
    session: an object with a ``get`` method to reuse for the request, or 0 to
        open a temporary ``requests.Session`` for this single request.
    c: unused; kept so existing callers keep working.

    Returns the value of the matching anchor's ``href`` attribute, or 0 when no
    anchor with the expected label exists on the page.
    """
    if session == 0:
        with requests.Session() as session:
            page = session.get(base_url)
    else:
        # Fixed: use the caller's session instead of a bare requests.get().
        page = session.get(base_url)

    soup = BeautifulSoup(page.content,"html.parser")

    # Purely informational: the button search below runs either way.
    if soup.find("ul",class_="pagination") is None:
        print("pagination next not found, probably end of pages:")

    for anchor in soup.find_all("a",class_="btn btn-sm btn-secondary"):
        label = anchor.text.strip()
        log(anchor.text.strip(),"stripped:",label)
        log("Printable characters?",label.isprintable())
        if label == "Nächste Seite":
            log(anchor)
            log("url of next site")
            return anchor.get("href")

    return 0
|
|
|
|
def scrap_jobagent(url,entry,session):
    """Fetch a result page and turn its ``li.item`` entries into job objects.

    url: address of the result page to download.
    entry: config entry; only its ``tag`` attribute is read here.
    session: an object with a ``get`` method to reuse for the request, or 0 to
        open a temporary ``requests.Session`` for this single request.

    Returns whatever ``arrayToClass`` builds from the extracted title, company,
    location, date and link lists together with the entry's tag.
    """
    log("in scrap jobs,url",url)

    if session != 0:
        page = session.get(url)
        log(page)
    else:
        with requests.Session() as session:
            page = session.get(url)
            log(page)

    soup = BeautifulSoup(page.content,"html.parser")
    entries = soup.find_all("li",class_="item")
    if not entries:
        # Extraction still runs on the empty result set below.
        print("no li items found")
        log("page:",page)

    ar_title = finder(entries,item("span","jobtitle",0))
    ar_location = finder(entries,item("span","location",0),LOCATION_CLEANUP=1)
    ar_company = finder(entries,item("span","company",0),DEFAULT=1)
    ar_link = finder(entries,item("a","title",0),LINK=1)
    ar_date = finder(entries,item("span","pubdate",0),SWAPDATE=1)

    return arrayToClass(ar_title,ar_company,ar_location,ar_date,ar_link,entry.tag)
|
|
|