job-scrapper/lib/sysparse.py

129 lines
4.9 KiB
Python
Raw Normal View History

2024-06-13 09:14:04 +00:00
import argparse
import config
import sys
from enum import IntEnum
from scrap_jobs import *
from login import *
from time import sleep
from db import *
2024-06-17 08:27:13 +00:00
DEBUG = False


def log(*s):
    """Print the given arguments as a single tuple when DEBUG is enabled.

    The arguments are not unpacked: ``log("a", 1)`` prints ``('a', 1)``.
    Nothing is printed while DEBUG is falsy.
    """
    if not DEBUG:
        return
    print(s)
2024-06-13 09:14:04 +00:00
def choose_scraper(entry,session):
if not session:
session = requests.Session()
domain = extractDomain(entry.scrapurl)
match domain:
case 'https://www.jobs.ch':
runner(entry,session,scrap_jobs,next_url_jobs_ch)
case 'https://software-job.ch':
runner(entry,session,scrap_jobagent,next_url_jobagent)
case 'https://www.jobagent.ch':
runner(entry,session,scrap_jobagent,next_url_jobagent)
case 'https://ch.indeed.com':
runner(entry,session,scrap_indeed_com,next_url_indeed_com)
2024-06-13 09:14:04 +00:00
def parse(**kwargs):
    """Run the tool from command-line flags and/or keyword arguments.

    If command-line arguments are present they are parsed with argparse and
    every selected action is executed in the order listed below. Separately,
    if keyword arguments are passed, the function runs in "module" mode:
    it reads ``config`` and ``worker`` from them and, when a config is
    given, runs ``login_loop`` in GUI mode.

    Keyword Args:
        config: config file handed to ``login_loop``.
        worker: object whose ``finished.emit()`` is called after the loop.
    """
    db_file = "../db/sqlite3.db"
    session = 0

    if sys.argv[1:]:
        worker = 0
        cli = argparse.ArgumentParser()
        cli.add_argument(
            "-c", "--config",
            help="Specific a config file to use,from where to scrap the jobs",
        )
        cli.add_argument(
            "-t", "--test",
            help="only for test purposes while developing",
            action="store_true",
        )
        cli.add_argument(
            "--importregiondb",
            help="Import a database used for querring by Regions or Cantons",
            action="store_true",
        )
        cli.add_argument(
            "--initdb",
            help="Initialice a new db from scratch without entrys",
            action="store_true",
        )
        cli.add_argument(
            "--rmdb",
            help="!!reove existing db!!DATALOSS!!",
            action="store_true",
        )
        cli.add_argument(
            "--login",
            nargs=3,
            help="login by specifing login and passwor by a given url",
            metavar=('USERNAME', 'PASSWORD', 'URL'),
        )
        cli.add_argument(
            "--createnwview",
            help="Create a VIEW for the Region Nordwest Schweiz",
            action="store_true",
        )
        cli.add_argument(
            "-VC", "--ValidationCheck",
            help="Check if links are still valid, if not remove them",
            action="store_true",
        )
        opts = cli.parse_args()

        # The actions are independent; several flags may be combined.
        if opts.test:
            addFineFilter(db_file, "filters", "testfilterentry")
        if opts.importregiondb:
            importdb(db_file, "../db/Cantons.db", "Cantons")
        if opts.initdb:
            initdb(db_file)
        if opts.rmdb:
            rmdb(db_file, "jobs")
        if opts.login:
            user, pw, url = opts.login
            session = login(user, pw, url, url)
            # NOTE(review): choose_scraper reads ``entry.scrapurl`` but gets
            # the plain URL string here -- confirm this path works as intended.
            choose_scraper(url, session)
        if opts.config:
            login_loop(opts.config, False, worker)
        if opts.createnwview:
            createnwview(db_file)
        if opts.ValidationCheck:
            isStillValid(db_file)

    if kwargs:
        print("no sysargs fiven, running as a module")
        vconfig = kwargs.get('config')
        worker = kwargs.get('worker')
        print("config:", vconfig)
        if vconfig:
            # NOTE(review): the nesting of the next three statements was
            # reconstructed from a flattened listing -- confirm against the
            # repository.
            login_loop(vconfig, True, worker)
            worker.finished.emit()
            print("finished sync job")
def login_loop(config_file, gui, worker):
    """Process config entries one by one: log in if required, then scrape.

    ``config.readConfig`` is called repeatedly until it returns 0. Every
    value it returns is printed. For each non-zero entry the scraper is
    started through ``choose_scraper``, unless a previous password prompt
    returned 1, in which case the remaining entries are only read and
    printed.

    Args:
        config_file: passed through to ``config.readConfig``.
        gui: passed through to ``config.readConfig`` and ``input_pw``.
        worker: passed through to ``config.readConfig`` and ``input_pw``.
    """
    pw_status = 0
    # Not reset between entries: an entry whose loginurl is 'NONE' is scraped
    # with whatever session value the previous entries left behind.
    session = 0
    while True:
        current = config.readConfig(config_file, gui, worker)
        print(current)
        has_entry = current != 0
        if has_entry and pw_status != 1:
            if current.loginurl != 'NONE':
                # Retry the login after every password prompt until it no
                # longer reports -1.
                session = login(current)
                while session == -1:
                    pw_status = current.input_pw(gui, current.user, worker)
                    session = login(current)
            choose_scraper(current, session)
        if not has_entry:
            break
def runner(entry, session, scrap_func, next_url_func, max_pages=50, delay=0.3):
    """Scrape a paginated listing page by page and store what is found.

    Starting at ``entry.scrapurl``, each page is scraped with ``scrap_func``,
    non-empty results are written with ``writedb``, and ``next_url_func``
    supplies the following page URL. The loop ends when no next URL is
    returned, when ``max_pages`` pages were processed, or when a page URL
    belongs to a domain this function has no rule for.

    Args:
        entry: object exposing ``scrapurl``; also forwarded to ``scrap_func``.
        session: session object forwarded to both callbacks.
        scrap_func: called as ``scrap_func(url, entry, session)``.
        next_url_func: called as ``next_url_func(url, session, extra)``;
            a falsy return value (0, None, "") ends the pagination.
        max_pages: upper bound on the number of pages processed (default 50).
        delay: seconds to sleep before each page request (default 0.3).
    """
    # Domains whose next-page function receives 0 as third argument; the
    # other supported domains receive the domain string itself.
    zero_arg_domains = ('https://www.jobagent.ch', 'https://software-job.ch')
    domain_arg_domains = ('https://www.jobs.ch', 'https://ch.indeed.com')

    page_url = entry.scrapurl
    pages_done = 0
    while page_url and pages_done < max_pages:
        sleep(delay)
        domain = extractDomain(page_url)
        print(domain)

        if domain in zero_arg_domains:
            extra = 0
        elif domain in domain_arg_domains:
            extra = domain
        else:
            # Without a rule the URL would never advance; stop instead of
            # sleeping through the remaining iterations.
            print("unsupported domain, stopping:", domain)
            break

        jobs = scrap_func(page_url, entry, session)
        if jobs:
            writedb(jobs)
        else:
            print("nothing found on this page")

        page_url = next_url_func(page_url, session, extra)
        if page_url:
            print("main:" + page_url)
        else:
            print("End of listed items, or did not find any other Nächste Seite Buttons")
        pages_done += 1
        print(pages_done)