from helpers import *

# Module-wide debug switch: log() is silent unless this is True.
DEBUG = False


def log(*s):
    """Print the argument tuple when the module-level DEBUG flag is set."""
    if DEBUG:
        print(s)


def _fetch(url, session):
    """Return the HTTP response for *url*.

    *session* is either a requests.Session-like object or the sentinel 0,
    meaning "no shared session available" — in that case a temporary
    session is created for this single request.
    """
    if session == 0:
        with requests.Session() as own_session:
            return own_session.get(url)
    return session.get(url)


def indeed_com(url, session):
    """Scrape one indeed.com result page.

    NOTE(review): this scraper looks unfinished — unlike scrap_jobs() and
    scrap_jobagent() it never extracts links, never builds a tag, and
    returns None.  The copy/paste bugs below (company and title lookups
    reusing the `location` selector) are fixed so the extracted arrays
    are at least correct; completing the return is left as a TODO.
    """
    page = _fetch(url, session)
    log(page)  # was an unconditional print(page); use log() like the siblings
    soup = BeautifulSoup(page.content, "html.parser")
    # print(soup.prettify())
    results = soup.find_all("li", class_='css-5lfssm eu4oa1w0')

    location = item("p", {'data-testid': 'text-location'}, 0)
    ar_location = finder(results, location, LOCATION_CLEANUP=1, ATTRS=1)

    company = item("p", {'data-testid': 'company-name'}, 0)
    # BUGFIX: originally passed `location` here instead of `company`.
    ar_company = finder(results, company, ATTRS=1)

    title = item("a", 'jobTitle', 0)
    # BUGFIX: originally passed `location` here instead of `title`.
    ar_title = finder(results, title)

    date = item("span", "Span-sc-1ybanni-0 Text__span-sc-1lu7urs-12 Text-sc-1lu7urs-13 krGudM hUhFmL", 0)
    ar_date = finder(results, date, CLEANDATE=1)
    # TODO(review): build links + tag and return arrayToClass(...) like the
    # other scrapers once this site's integration is finished.


def scrap_jobs(url, entry, session):
    """Scrape one jobs.ch result page.

    Parameters:
        url:     result page to fetch
        entry:   config entry; only entry.tag is read here
        session: shared requests session, or 0 to create a temporary one

    Returns whatever arrayToClass() builds from the scraped columns.
    """
    log("in scrap jobs,url", url)
    page = _fetch(url, session)
    log(page)
    soup = BeautifulSoup(page.content, "html.parser")
    # print(soup.prettify())
    results = soup.find_all("div", attrs={"data-feat": "searched_jobs"})

    location_class = "P-sc-hyu5hk-0 Text__p2-sc-1lu7urs-10 Span-sc-1ybanni-0 Text__span-sc-1lu7urs-12 Text-sc-1lu7urs-13 jZCxUn"
    location = item("p", location_class, 0)
    ar_location = finder(results, location, LOCATION_CLEANUP=1)

    # Same generated class as the location <p>; the company is the 4th match.
    company_class = "P-sc-hyu5hk-0 Text__p2-sc-1lu7urs-10 Span-sc-1ybanni-0 Text__span-sc-1lu7urs-12 Text-sc-1lu7urs-13 jZCxUn"
    company = item("p", company_class, 3)
    ar_company = finder(results, company, DEFAULT=1)

    title = item("span", "Span-sc-1ybanni-0 Text__span-sc-1lu7urs-12 Text-sc-1lu7urs-13 VacancyItem___StyledText2-sc-iugtv6-5 iaJYDR jlFpCz dMwMcR", 0)
    ar_title = finder(results, title, DEFAULT=1)

    date = item("span", "Span-sc-1ybanni-0 Text__span-sc-1lu7urs-12 Text-sc-1lu7urs-13 krGudM hUhFmL", 0)
    ar_date = finder(results, date, CLEANDATE=1)

    # NOTE(review): this class string was line-wrapped in the original file;
    # rejoined with a single space — confirm against the live markup.
    link = item("a", "Link__ExtendedRR6Link-sc-czsz28-1 khAvCu Link-sc-czsz28-2 VacancyLink___StyledLink-sc-ufp08j-0 dXKwhi dDgwgk", 0)
    ar_link = finder(results, link, LINK=1, BASEURL="https://jobs.ch")

    tag = entry.tag  # get from config
    return arrayToClass(ar_title, ar_company, ar_location, ar_date, ar_link, tag)


def next_url_jobs_ch(url, session, baseurl):
    """Return the absolute URL of the next jobs.ch result page, or 0 if none.

    *baseurl* is prepended to the relative href found in the paginator.
    """
    # BUGFIX(consistency): the original else-branch called requests.get(url),
    # silently ignoring the passed session; _fetch uses it like the scrapers.
    page = _fetch(url, session)
    soup = BeautifulSoup(page.content, "html.parser")
    result_next = soup.findAll("div", attrs={"data-cy": "paginator"})
    next_ = item("a", {"data-cy": "paginator-next"}, 0)
    next_link = finder(result_next, next_, ATTRS=1, LINK=1)
    if next_link:
        if next_link[0] == "NOTFound":
            return 0
        next_link_str = baseurl + str(next_link[0])
        log(next_link_str)
        if next_link_str != '':
            return next_link_str
    return 0


def next_url_jobagent(base_url, session, c):
    """Deprecated — will be removed in the future.

    Find the "Nächste Seite" (next page) anchor on a jobagent result page
    and return its href, or 0 when there is no next page.  *c* is unused
    (kept for call-site compatibility).
    """
    # BUGFIX(consistency): original else-branch used requests.get(base_url),
    # ignoring the passed session.
    page = _fetch(base_url, session)
    soup = BeautifulSoup(page.content, "html.parser")
    if soup.find("ul", class_="pagination") is None:
        print("pagination next not found, probably end of pages:")
    for anchor in soup.find_all("a", class_="btn btn-sm btn-secondary"):
        stripped_string = anchor.text.strip()
        log(anchor.text.strip(), "stripped:", stripped_string)
        if stripped_string == "Nächste Seite":
            log(anchor)
            next_url = anchor.get("href")
            log("url of next site")
            # Original had an unreachable `break` after this return; removed.
            return next_url
    return 0


def scrap_jobagent(url, entry, session):
    """Scrape one jobagent result page.

    Parameters:
        url:     result page to fetch
        entry:   config entry; only entry.tag is read here
        session: shared requests session, or 0 to create a temporary one

    Returns whatever arrayToClass() builds from the scraped columns.
    """
    log("in scrap jobs,url", url)
    page = _fetch(url, session)
    log(page)
    soup = BeautifulSoup(page.content, "html.parser")
    # print(soup.prettify())
    results = soup.find_all("li", class_="item")

    title = item("span", "jobtitle", 0)
    ar_title = finder(results, title)

    location = item("span", "location", 0)
    ar_location = finder(results, location, LOCATION_CLEANUP=1)

    company = item("span", "company", 0)
    ar_company = finder(results, company, DEFAULT=1)

    link = item("a", "title", 0)
    ar_link = finder(results, link, LINK=1)

    date = item("span", "pubdate", 0)
    ar_date = finder(results, date, SWAPDATE=1)

    tag = entry.tag
    return arrayToClass(ar_title, ar_company, ar_location, ar_date, ar_link, tag)