From 1bf8198c707cb6f8211bf44dfca945826e71b79d Mon Sep 17 00:00:00 2001
From: ccppi
Date: Mon, 19 Aug 2024 12:55:56 +0200
Subject: [PATCH] change user agent

---
 lib/db.py         |  2 +-
 lib/login.py      | 19 ++++++++++---------
 lib/scrap_jobs.py | 13 +++++++++----
 lib/sysparse.py   | 34 ++++++++++++++++++++++------------
 4 files changed, 42 insertions(+), 26 deletions(-)

diff --git a/lib/db.py b/lib/db.py
index ba072a1..f8802e9 100644
--- a/lib/db.py
+++ b/lib/db.py
@@ -150,7 +150,7 @@ def isStillValid(file,skiprows):
         if resp.status >= 400 or isLink == False:
             print("link is no more valid, remove item")
             rm_cursor = connection.cursor()
-            rm_itm = rm_cursor.execute(f"""DELETE from jobs WHERE link = ? AND star != 1;""",(row[0],))
+            rm_itm = rm_cursor.execute("DELETE from jobs WHERE link = ? AND star != 1;",(row[0],))
             print ("Deletion resultet in: ", rm_itm)
             print("result of commit: ", connection.commit())
     return 0
diff --git a/lib/login.py b/lib/login.py
index 106efa1..5ccd8ab 100644
--- a/lib/login.py
+++ b/lib/login.py
@@ -6,17 +6,18 @@ def login(entry):
     loginurl = entry.loginurl
     scrapurl = entry.scrapurl
     with requests.Session() as session:
-        headers = {
+        session.headers = {
             "Host": "www.jobagent.ch",
             "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:124.0) Gecko/20100101 Firefox/124.0",
             "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
             "Accept-Language": "en-US,en;q=0.5",
             "Accept-Encoding": "gzip, deflate, br",
-            "Content-Type": "application/x-www-form-urlencoded",
-            "Content-Length": "58",
+            # "Content-Type": "application/x-www-form-urlencoded",
+            # "Content-Length": "58",
             "Origin": "https://www.jobagent.ch",
-            # "Connection": "keep-alive",
-            "Referer": "https://www.jobagent.ch/user/login",
+            "Connection": "keep-alive",
+            "Referer": "https://jobagent.ch",
+            "Cookie": "datadome=BbGio7V9YBqYELb~B2a7DqE9Zr5EWb315OypbcxGQOFKbhkJR48etFSooYwtnKF2sK5leCh7Q_0o6W5YMwl0qEH~Fw3BU0m~48MgrkuaxO3Z1s5MTqCnTZVW3PcQv7KM; _uc=ad_storage=granted:analytics_storage=granted; _gcl_au=1.1.1328234550.1724056973.1502078804.1724062620.1724062680; _ga=GA1.1.1840632117.1724056971; _ga_T0E2JNNRW2=GS1.1.1724062555.3.1.1724062683.0.1.2098134382; JSESSIONID=AB8CC398C2576A6A87C53A74CCD8F7F5; _pin_unauth=dWlkPU56Y3pabU14WW1JdFptTTFNeTAwTkdFMkxUbGlZV0V0TWprNVkyTXpZemd4WldNNA; _fbp=fb.1.1724056975123.543612652217076856; _clck=16bp9by%7C2%7Cfog%7C0%7C1692; _clsk=1o7y6b9%7C1724062683361%7C9%7C1%7Cu.clarity.ms%2Fcollect; _rm=ai53eXNzJTQwa29sYWJub3cuY2g6MTcyNjY1NDY4MTA0NDpTSEEyNTY6ZGRkMmZhYTRjZWY3MWZkZDU1M2VlMTI4ZjYzOGY1NmFiYmRkNjNiMmI3ZjE1NWRhNmU3YzcwZWU1NjQ2Mjc0Mw; _uetsid=0737af805e0711efbe7bdd027b00b063; _uetvid=0737b3005e0711efb7c7035382896421",
             # "Upgrade-Insecure-Requests": "1",
             # "Sec-Fetch-Dest": "document",
             # "Sec-Fetch-Mode": "navigate",
@@ -24,15 +25,15 @@ def login(entry):
             # "DNT": "1",
             # "Sec-GPC": "1"
         }
-        r = session.get(loginurl)
         payload = {"redirectUrl":"","email":user,"password":pw}
-        resp = session.post(loginurl,data=payload,headers=headers)
+        resp = session.post(loginurl,data=payload)
         print(payload)
-        print("response from login attempt",resp.url)
+        print("response from login attempt",resp)
         if resp.url == 'https://www.jobagent.ch/user/login?error':
             print("Error on login")
             return -1
         r = session.get(scrapurl)
-
+        print(session.headers)
+        print("response:",r)
         return session
 
diff --git a/lib/scrap_jobs.py b/lib/scrap_jobs.py
index 7b44d78..a3469da 100644
--- a/lib/scrap_jobs.py
+++ b/lib/scrap_jobs.py
@@ -43,6 +43,9 @@ def scrap_indeed_com(url,entry,session):
    return arrayToClass(ar_title,ar_company,ar_location,ar_date,ar_link,tag)
 
 def scrap_jobs(url,entry,session):
+    session.headers = {
+        "User-Agent" : "Mozilla/5.0 (X11; Linux x86_64; rv:127.0) Gecko/20100101 Firefox/127.0"
+    }
     jobs = []
     log("in scrap jobs,url",url)
     if(session == 0 or session == -1):
@@ -157,17 +160,19 @@ def next_url_jobagent(base_url,session,c):#depreacted will be removed in the fut
 
 def scrap_jobagent(url,entry,session):
     jobs = []
-    log("in scrap jobs,url",url)
+    log("[scrap_jobagent],url",url)
     if(session == 0 or session == -1):
+        log("session not successfully transmitted ",session)
         with requests.Session() as session:
             page = session.get(url)
             log(page)
     else:
         page = session.get(url)
-        log(page)
+        page = session.get(url)
+        log("[scrap_jobagent]page:",page)
     soup = BeautifulSoup(page.content,"html.parser")
-    #print(soup.prettify())
-
+    print(soup.prettify())
+    print(session.headers)
     results = soup.find_all("li",class_="item")
     if not results:
         print("no li items found")
diff --git a/lib/sysparse.py b/lib/sysparse.py
index fd3f6b2..1362c45 100644
--- a/lib/sysparse.py
+++ b/lib/sysparse.py
@@ -78,20 +78,30 @@ def login_loop(config_file,gui,worker):
     while (ret != 0):
         if gui:
             worker.dialog_rejected = False
-            ret = entry2 = config.readConfig(config_file,gui,worker)
-            print(entry2)
-            if(ret != 0 and ret_login != 1):
-                if(entry2.loginurl != 'NONE'):
-                    session = -1
-                    log("[pre while] worker.dialog_rejected = ",worker.dialog_rejected)
-                    while (session == -1 and worker.dialog_rejected == False):
-                        log("worker.dialog_rejected = ",worker.dialog_rejected)
-                        session = login(entry2)
-                        ret_login = entry2.input_pw(gui,entry2.user,worker)
-            if gui:
+            ret = entry2 = config.readConfig(config_file,gui,worker)
+            print(entry2)
+            if(ret != 0 and ret_login != 1):
+                if(entry2.loginurl != 'NONE'):
+                    session = -1
+                    log("[pre while] worker.dialog_rejected = ",worker.dialog_rejected)
+                    worker.dialog_rejected = False
+                    while (session == -1 and worker.dialog_rejected == False):
+                        log("worker.dialog_rejected = ",worker.dialog_rejected)
+                        session = login(entry2)
+                        ret_login = entry2.input_pw(gui,entry2.user,worker)
                 if worker.dialog_rejected == False:
                     choose_scraper(entry2,session)
-        if not gui:
+        if not gui:
+            ret = entry2 = config.readConfig(config_file,gui,worker)
+            #print(entry2)
+            if(ret != 0 and ret_login != 1):
+                if(entry2.loginurl != 'NONE'):
+                    session = -1
+                    while (session == -1):
+                        session = login(entry2)
+                        if session == -1:
+                            ret_login = entry2.input_pw(gui,entry2.user,worker)
+            log("[login_loop] session:",session)
             choose_scraper(entry2,session)
 
 def runner(entry,session,scrap_func,next_url_func):
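
A note on the pattern this patch applies: the request headers move from a per-call headers= argument onto session.headers, so the User-Agent (and, in login.py, the hard-coded datadome cookie) is sent with every request the requests.Session makes, including the login POST and the later session.get calls. The sketch below shows only that pattern; the example.com URLs and the credentials are placeholders, not the project's real endpoints or configuration:

import requests

# Minimal sketch of session-level headers; all endpoints and credentials
# here are illustrative placeholders.
with requests.Session() as session:
    # Headers stored on the session ride along on every request made
    # through it, so no per-call headers= argument is needed.
    session.headers.update({
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:127.0) Gecko/20100101 Firefox/127.0",
        "Accept-Language": "en-US,en;q=0.5",
    })

    payload = {"email": "user@example.com", "password": "secret"}
    resp = session.post("https://example.com/user/login", data=payload)

    # Cookies set by the login response land in session.cookies and are
    # replayed automatically on later requests through the same session.
    page = session.get("https://example.com/jobs")
    print(page.status_code)

One difference worth noting: session.headers.update() merges into requests' default headers, whereas the plain assignment session.headers = {...} used in login.py and scrap_jobs.py replaces them wholesale, so defaults such as Accept-Encoding are dropped unless re-listed explicitly. Both forms persist for the lifetime of the session.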