diff --git a/lib/db.py b/lib/db.py
index f8802e9..937c8f4 100644
--- a/lib/db.py
+++ b/lib/db.py
@@ -3,7 +3,7 @@ import mmh3
 import sys
 #import requests
 import httplib2
-DEBUG = True
+DEBUG = False
 
 def log(*s):
     if DEBUG:
diff --git a/lib/helpers.py b/lib/helpers.py
index fdc3acf..c9aebc3 100644
--- a/lib/helpers.py
+++ b/lib/helpers.py
@@ -11,7 +11,7 @@ import sqlite3
 import webbrowser
 import mozilla
 
-DEBUG = True
+DEBUG = False
 
 number = ['0','1','2','3','4','5','6','7','8','9']
 
diff --git a/lib/login.py b/lib/login.py
index 27d88e9..342cb52 100644
--- a/lib/login.py
+++ b/lib/login.py
@@ -43,23 +43,14 @@ def login(entry):
 
 #somehow getting the cookie maype?
 def solveCaptcha(session,resp):
-    found = 0
-    if "captcha" or "Enable JavaScript" in resp :
-        #soup = BeautifulSoup(resp,"html.parser")
-        #result = soup.find("iframe")
-        #while found==0:
-        #    if "captcha" in resp:#result:
-        print("captcha link!! found:")
-        found=1
-        #else:
-        #    result.find_next()
-        print("exit loop")
     print("response:",resp)
-    #if found:
-        #print("captchaurl:", result["src"])
-        #x = input("continue")
-    #else:
-    #    print("did not recognise a captcha")
+    if "captcha" in resp.text or "Enable JavaScript" in resp.text:
+        print("captcha link!! found:")
+        return 1
+    else:
+        return 0
+
+
 def checkBlockers(session,resp):
     print("response from login attempt",resp)
     if resp:
diff --git a/lib/mozilla.py b/lib/mozilla.py
index 134609d..bf43c17 100644
--- a/lib/mozilla.py
+++ b/lib/mozilla.py
@@ -29,7 +29,7 @@ def findDefaultProfile(path):
     else:
         return target
 
-def getCookiesFromBrowser(url):
+def getCookiesFromBrowser(url,force=False):
     DBFILE = "../db/sqlite3.db"
     if os.name == 'posix':
         homePath = os.path.expanduser('~')
@@ -63,7 +63,7 @@ def getCookiesFromBrowser(url):
             cookie += ";"
     print("Cookies:",cookie)
 
-    if cookie == '':
+    if cookie == '' and not force:
         if os.name == 'posix':
             webbrowser.register("firefox",None,webbrowser.BackgroundBrowser("firefox"))
             webbrowser.get('firefox').open(url)
diff --git a/lib/scrap_jobs.py b/lib/scrap_jobs.py
index a305f3e..0804740 100644
--- a/lib/scrap_jobs.py
+++ b/lib/scrap_jobs.py
@@ -13,7 +13,7 @@ def scrap_indeed_com(url,entry,session):
     session.headers = {
         "User-Agent" : "Mozilla/5.0 (X11; Linux x86_64; rv:127.0) Gecko/20100101 Firefox/127.0",
         "Referer" : "https://ch.indeed.com/jobs?&from=searchOnHP",
-        "Cookie" : moz_cookies# """CTK=1i5q619l6jvkj800; indeed_rcc="LV:CTK:RQ"; CSRF=aEb4JWpfbYPy3j3g2rsUPDSixXqBe1Oe; INDEED_CSRF_TOKEN=4p83HqsTMan9QrVZun2Q0wrFeCoGm9mG; LV="LA=1724238852:LV=1724234376:CV=1724238852:TS=1724234376"; _cfuvid=27ptQm94DDaFeIjNGSNxW3g9GyDAJExtQz_RNr0jvE0-1724238843162-0.0.1.1-604800000; JSESSIONID=F196631331EF16D28C0E00AC7A43CB10; OptanonConsent=isGpcEnabled=1&datestamp=Wed+Aug+21+2024+13%3A14%3A47+GMT%2B0200+(Central+European+Summer+Time)&version=202210.1.0&isIABGlobal=false&hosts=&consentId=b0f6c692-930d-4929-9251-9a4f7bc72f61&interactionCount=1&landingPath=NotLandingPage&groups=C0001%3A1%2CC0002%3A0%2CC0003%3A0%2CC0004%3A0%2CC0007%3A0&AwaitingReconsent=false; _ga_LYNT3BTHPG=GS1.1.1724238849.2.1.1724238908.0.0.454081609; _ga=GA1.1.1356051481.1724234379; SURF=WCl9mMSuWXP2jp3GlLMyXzkQkAdKDg7W; FPID=FPID2.2.Dd22VS9g0Vfjh5dQoT9s%2Bws7zDmpmQlIzsYP9ZLW8kg%3D.1724234379; FPLC=Qmy8DxSR81EJxewKgZ7RfgP%2BdXEXWWU4RKVUs2Pn1vEIp%2Fu2Upaqz5%2Blgf05XLqfdY7S4qGRwWAbQqAbKQZb%2FBWQxZwpmvOzw%2Bhgpkfvj320PLIwamECv9iYH%2Bx%2FrQ%3D%3D; RQ="q=quereinsteiger&l=&ts=1724238933002&rbsalmin=0&rbsalmax=0:q=python+qt&l=&ts=1724234491003&rbsalmin=0&rbsalmax=0"; __cf_bm=X3BsfEnAGodB.ELxHVfYTAYd4K4n3TUbHVV7OyKMjBg-1724238843-1.0.1.1-4QMaUgbvnumBKmzwOcY2o0Taikgpvn72OoTXG_ZtU8q3qOCuf06riyYIJlXD.zsd7JxmZ_VdN1S9cCbGwXid6w; gonetap=closed; SHARED_INDEED_CSRF_TOKEN=4p83HqsTMan9QrVZun2Q0wrFeCoGm9mG"""
+        "Cookie" : moz_cookies
     }
     jobs = []
     log("in scrap jobs,url",url)
@@ -24,7 +24,10 @@ def scrap_indeed_com(url,entry,session):
     else:
         page = session.get(url)
     log(page)
-    solveCaptcha(session,page)
+    if solveCaptcha(session,page) == 1:
+        print("Cookie stealing unsuccessful, retry with force")
+        moz_cookies = mozilla.getCookiesFromBrowser(url,force=True)
+
     soup = BeautifulSoup(page.content,"html.parser")
     #print(soup.prettify())
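
Review note on the scrap_jobs.py hunk: as written, the force-refresh branch assigns the new cookie string to the local moz_cookies but never writes it back into session.headers, and the captcha-blocked page is still the one handed to BeautifulSoup. A minimal sketch of the intended retry flow, assuming the solveCaptcha and getCookiesFromBrowser behavior shown in this diff; the helper name fetch_with_cookie_retry and the single-retry limit are illustrative, not part of the codebase, and the imports assume the flat lib/ layout used elsewhere in the patch:

```python
from bs4 import BeautifulSoup

import mozilla
from login import solveCaptcha

def fetch_with_cookie_retry(session, url, max_retries=1):
    # Illustrative helper: fetch the page, and if solveCaptcha() reports a
    # captcha wall, force-refresh the cookies and request the page again.
    page = session.get(url)
    for _ in range(max_retries):
        if solveCaptcha(session, page) != 1:
            break  # no captcha detected; the page is usable as-is
        print("Cookie stealing unsuccessful, retry with force")
        # Re-read the cookies from the Firefox profile and, crucially,
        # re-apply them to the session before retrying the request.
        session.headers["Cookie"] = mozilla.getCookiesFromBrowser(url, force=True)
        page = session.get(url)
    return BeautifulSoup(page.content, "html.parser")
```

Without the write-back into session.headers and the second session.get, the force=True call in this hunk has no effect on the page that ends up being parsed.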