getting upstream
This commit is contained in:
parent
f3ab4db625
commit
850538a92f
40
lib/login.py
40
lib/login.py
@ -1,4 +1,5 @@
|
|||||||
import requests
|
import requests
|
||||||
|
from requests_html import HTMLSession
|
||||||
from helpers import *
|
from helpers import *
|
||||||
def login(entry):
|
def login(entry):
|
||||||
user = entry.user
|
user = entry.user
|
||||||
@ -17,7 +18,7 @@ def login(entry):
|
|||||||
"Origin": "https://www.jobagent.ch",
|
"Origin": "https://www.jobagent.ch",
|
||||||
"Connection": "keep-alive",
|
"Connection": "keep-alive",
|
||||||
"Referer": "https://jobagent.ch",
|
"Referer": "https://jobagent.ch",
|
||||||
"Cookie": "datadome=BbGio7V9YBqYELb~B2a7DqE9Zr5EWb315OypbcxGQOFKbhkJR48etFSooYwtnKF2sK5leCh7Q_0o6W5YMwl0qEH~Fw3BU0m~48MgrkuaxO3Z1s5MTqCnTZVW3PcQv7KM; _uc=ad_storage=granted:analytics_storage=granted; _gcl_au=1.1.1328234550.1724056973.1502078804.1724062620.1724062680; _ga=GA1.1.1840632117.1724056971; _ga_T0E2JNNRW2=GS1.1.1724062555.3.1.1724062683.0.1.2098134382; JSESSIONID=AB8CC398C2576A6A87C53A74CCD8F7F5; _pin_unauth=dWlkPU56Y3pabU14WW1JdFptTTFNeTAwTkdFMkxUbGlZV0V0TWprNVkyTXpZemd4WldNNA; _fbp=fb.1.1724056975123.543612652217076856; _clck=16bp9by%7C2%7Cfog%7C0%7C1692; _clsk=1o7y6b9%7C1724062683361%7C9%7C1%7Cu.clarity.ms%2Fcollect; _rm=ai53eXNzJTQwa29sYWJub3cuY2g6MTcyNjY1NDY4MTA0NDpTSEEyNTY6ZGRkMmZhYTRjZWY3MWZkZDU1M2VlMTI4ZjYzOGY1NmFiYmRkNjNiMmI3ZjE1NWRhNmU3YzcwZWU1NjQ2Mjc0Mw; _uetsid=0737af805e0711efbe7bdd027b00b063; _uetvid=0737b3005e0711efb7c7035382896421",
|
#"Cookie": "datadome=BbGio7V9YBqYELb~B2a7DqE9Zr5EWb315OypbcxGQOFKbhkJR48etFSooYwtnKF2sK5leCh7Q_0o6W5YMwl0qEH~Fw3BU0m~48MgrkuaxO3Z1s5MTqCnTZVW3PcQv7KM; _uc=ad_storage=granted:analytics_storage=granted; _gcl_au=1.1.1328234550.1724056973.1502078804.1724062620.1724062680; _ga=GA1.1.1840632117.1724056971; _ga_T0E2JNNRW2=GS1.1.1724062555.3.1.1724062683.0.1.2098134382; JSESSIONID=AB8CC398C2576A6A87C53A74CCD8F7F5; _pin_unauth=dWlkPU56Y3pabU14WW1JdFptTTFNeTAwTkdFMkxUbGlZV0V0TWprNVkyTXpZemd4WldNNA; _fbp=fb.1.1724056975123.543612652217076856; _clck=16bp9by%7C2%7Cfog%7C0%7C1692; _clsk=1o7y6b9%7C1724062683361%7C9%7C1%7Cu.clarity.ms%2Fcollect; _rm=ai53eXNzJTQwa29sYWJub3cuY2g6MTcyNjY1NDY4MTA0NDpTSEEyNTY6ZGRkMmZhYTRjZWY3MWZkZDU1M2VlMTI4ZjYzOGY1NmFiYmRkNjNiMmI3ZjE1NWRhNmU3YzcwZWU1NjQ2Mjc0Mw; _uetsid=0737af805e0711efbe7bdd027b00b063; _uetvid=0737b3005e0711efb7c7035382896421",
|
||||||
# "Upgrade-Insecure-Requests": "1",
|
# "Upgrade-Insecure-Requests": "1",
|
||||||
# "Sec-Fetch-Dest": "document",
|
# "Sec-Fetch-Dest": "document",
|
||||||
# "Sec-Fetch-Mode": "navigate",
|
# "Sec-Fetch-Mode": "navigate",
|
||||||
@ -29,11 +30,40 @@ def login(entry):
|
|||||||
payload = {"redirectUrl":"","email":user,"password":pw}
|
payload = {"redirectUrl":"","email":user,"password":pw}
|
||||||
resp = session.post(loginurl,data=payload)
|
resp = session.post(loginurl,data=payload)
|
||||||
print(payload)
|
print(payload)
|
||||||
print("response from login attempt",resp)
|
checkBlockers(session,resp)
|
||||||
if resp.url == 'https://www.jobagent.ch/user/login?error':
|
|
||||||
print("Error on login")
|
|
||||||
return -1
|
|
||||||
r = session.get(scrapurl)
|
r = session.get(scrapurl)
|
||||||
print(session.headers)
|
print(session.headers)
|
||||||
print("response:",r)
|
print("response:",r)
|
||||||
return session
|
return session
|
||||||
|
#solveCaptcha when :
|
||||||
|
#string "captcha" is in response
|
||||||
|
#search for <iframe
|
||||||
|
#get src tag
|
||||||
|
#open a webbrowser to solve the captcha
|
||||||
|
#somehow getting the cookie, maybe?
|
||||||
|
|
||||||
|
def solveCaptcha(session, resp):
    """Report whether a response looks like a captcha / anti-bot page.

    Args:
        session: HTTP session the response came from. Currently unused;
            kept so existing callers keep working.
        resp: Either the page body as a string, or a response object
            exposing the body through a ``text`` attribute.

    Returns:
        ``True`` when the body contains ``"captcha"`` or
        ``"Enable JavaScript"``, otherwise ``False``.
    """
    # Work on the body text; membership tests on a response object itself
    # do not search its HTML.
    if isinstance(resp, str):
        body = resp
    else:
        body = getattr(resp, "text", "") or ""

    # Each marker needs its own membership test: the expression
    # `"captcha" or "Enable JavaScript" in body` is always truthy because
    # the non-empty literal "captcha" short-circuits the `or`.
    markers = ("captcha", "Enable JavaScript")
    found = any(marker in body for marker in markers)

    if found:
        print("captcha link!! found:")
        # TODO: locate the captcha <iframe>, read its src and let the user
        # solve it in a browser.

    print("exit loop")
    print("response:", resp)
    return found
|
||||||
|
def checkBlockers(session, resp):
    """Inspect a login response for a failed login or a captcha page.

    Args:
        session: HTTP session, passed through to ``solveCaptcha``.
        resp: Response returned by the login request, or ``None`` when
            there is nothing to inspect.

    Returns:
        ``-1`` when the response URL is the login-error page, otherwise
        ``None``.
    """
    print("response from login attempt", resp)

    # Compare against None instead of relying on truthiness: a requests
    # ``Response`` evaluates as false for 4xx/5xx statuses, which are the
    # responses most likely to carry a block or captcha page.
    if resp is None:
        return None

    if resp.url == 'https://www.jobagent.ch/user/login?error':
        print("Error on login")
        return -1

    solveCaptcha(session, resp)
    return None
|
||||||
|
@ -1,4 +1,5 @@
|
|||||||
from helpers import *
|
from helpers import *
|
||||||
|
from login import solveCaptcha
|
||||||
DEBUG = True
|
DEBUG = True
|
||||||
|
|
||||||
def log(*s):
|
def log(*s):
|
||||||
@ -6,20 +7,25 @@ def log(*s):
|
|||||||
print(s)
|
print(s)
|
||||||
|
|
||||||
def scrap_indeed_com(url,entry,session):
|
def scrap_indeed_com(url,entry,session):
|
||||||
log("[scrap_indeed_com] url: ",url)
|
moz_cookies = getCookiesFromBrowser(url)
|
||||||
jobs = []
|
print("[scrap]cookies:", moz_cookies)
|
||||||
# if(session == 0):
|
|
||||||
with requests.Session() as session:
|
|
||||||
session.headers = {
|
session.headers = {
|
||||||
"User-Agent" : "Mozilla/5.0 (X11; Linux x86_64; rv:127.0) Gecko/20100101 Firefox/127.0"
|
"User-Agent" : "Mozilla/5.0 (X11; Linux x86_64; rv:127.0) Gecko/20100101 Firefox/127.0",
|
||||||
|
"Referer" : "https://ch.indeed.com/jobs?&from=searchOnHP",
|
||||||
|
"Cookie" : moz_cookies# """CTK=1i5q619l6jvkj800; indeed_rcc="LV:CTK:RQ"; CSRF=aEb4JWpfbYPy3j3g2rsUPDSixXqBe1Oe; INDEED_CSRF_TOKEN=4p83HqsTMan9QrVZun2Q0wrFeCoGm9mG; LV="LA=1724238852:LV=1724234376:CV=1724238852:TS=1724234376"; _cfuvid=27ptQm94DDaFeIjNGSNxW3g9GyDAJExtQz_RNr0jvE0-1724238843162-0.0.1.1-604800000; JSESSIONID=F196631331EF16D28C0E00AC7A43CB10; OptanonConsent=isGpcEnabled=1&datestamp=Wed+Aug+21+2024+13%3A14%3A47+GMT%2B0200+(Central+European+Summer+Time)&version=202210.1.0&isIABGlobal=false&hosts=&consentId=b0f6c692-930d-4929-9251-9a4f7bc72f61&interactionCount=1&landingPath=NotLandingPage&groups=C0001%3A1%2CC0002%3A0%2CC0003%3A0%2CC0004%3A0%2CC0007%3A0&AwaitingReconsent=false; _ga_LYNT3BTHPG=GS1.1.1724238849.2.1.1724238908.0.0.454081609; _ga=GA1.1.1356051481.1724234379; SURF=WCl9mMSuWXP2jp3GlLMyXzkQkAdKDg7W; FPID=FPID2.2.Dd22VS9g0Vfjh5dQoT9s%2Bws7zDmpmQlIzsYP9ZLW8kg%3D.1724234379; FPLC=Qmy8DxSR81EJxewKgZ7RfgP%2BdXEXWWU4RKVUs2Pn1vEIp%2Fu2Upaqz5%2Blgf05XLqfdY7S4qGRwWAbQqAbKQZb%2FBWQxZwpmvOzw%2Bhgpkfvj320PLIwamECv9iYH%2Bx%2FrQ%3D%3D; RQ="q=quereinsteiger&l=&ts=1724238933002&rbsalmin=0&rbsalmax=0:q=python+qt&l=&ts=1724234491003&rbsalmin=0&rbsalmax=0"; __cf_bm=X3BsfEnAGodB.ELxHVfYTAYd4K4n3TUbHVV7OyKMjBg-1724238843-1.0.1.1-4QMaUgbvnumBKmzwOcY2o0Taikgpvn72OoTXG_ZtU8q3qOCuf06riyYIJlXD.zsd7JxmZ_VdN1S9cCbGwXid6w; gonetap=closed; SHARED_INDEED_CSRF_TOKEN=4p83HqsTMan9QrVZun2Q0wrFeCoGm9mG"""
|
||||||
}
|
}
|
||||||
|
jobs = []
|
||||||
|
log("in scrap jobs,url",url)
|
||||||
|
if(session == 0 or session == -1):
|
||||||
|
with requests.Session() as session:
|
||||||
page = session.get(url)
|
page = session.get(url)
|
||||||
log(page)
|
log(page)
|
||||||
# else:
|
else:
|
||||||
# page = session.get(url)
|
page = session.get(url)
|
||||||
# log(page)
|
log(page)
|
||||||
|
solveCaptcha(session,page)
|
||||||
soup = BeautifulSoup(page.content,"html.parser")
|
soup = BeautifulSoup(page.content,"html.parser")
|
||||||
#print(soup.prettify())
|
print(soup.prettify())
|
||||||
|
|
||||||
|
|
||||||
results = soup.find_all("li",class_= 'css-5lfssm eu4oa1w0') #top level list element
|
results = soup.find_all("li",class_= 'css-5lfssm eu4oa1w0') #top level list element
|
||||||
@ -196,3 +202,4 @@ def scrap_jobagent(url,entry,session):
|
|||||||
|
|
||||||
return arrayToClass(ar_title,ar_company,ar_location,ar_date,ar_link,tag)
|
return arrayToClass(ar_title,ar_company,ar_location,ar_date,ar_link,tag)
|
||||||
|
|
||||||
|
|
||||||
|
@ -108,7 +108,7 @@ def runner(entry,session,scrap_func,next_url_func):
|
|||||||
i=0
|
i=0
|
||||||
b_url = entry.scrapurl
|
b_url = entry.scrapurl
|
||||||
while b_url != 0 and i<50:
|
while b_url != 0 and i<50:
|
||||||
sleep(0.3)
|
sleep(0.5)
|
||||||
if b_url:
|
if b_url:
|
||||||
domain = extractDomain(b_url)
|
domain = extractDomain(b_url)
|
||||||
print(domain)
|
print(domain)
|
||||||
|
Loading…
Reference in New Issue
Block a user