job-scrapper/lib/db.py

158 lines
6.5 KiB
Python
Raw Normal View History

2024-06-13 09:14:04 +00:00
import sqlite3
import mmh3
import sys
#import requests
import httplib2
2024-07-26 10:46:36 +00:00
# Debug switch read by log(): diagnostic messages are printed only while this is True.
DEBUG = True
2024-06-13 09:14:04 +00:00
def log(*s):
    """Print the given values to stdout when the module-level DEBUG flag is set.

    Args:
        *s: Values to print. They are written space-separated on one line,
            exactly as ``print`` renders them.
    """
    if DEBUG:
        # Unpack the arguments so the values themselves are printed instead of
        # the repr of the argument tuple ("NEW_ENTRY", not "('NEW_ENTRY',)").
        print(*s)
def initdb(file):
    """Create the ``jobs`` table in the SQLite database *file*, then exit.

    Args:
        file: Path of the SQLite database; passed to ``sqlite3.connect``.

    Raises:
        sqlite3.OperationalError: If the CREATE TABLE statement fails, e.g.
            because a ``jobs`` table already exists.
        SystemExit: Always raised through ``sys.exit()`` after the table has
            been created.
    """
    connection = sqlite3.connect(file)
    try:
        # "with connection" only commits or rolls back; it never closes the
        # connection, so the close happens explicitly in the finally clause.
        with connection:
            print("db connection", connection.total_changes)
            connection.execute(
                "CREATE TABLE jobs (star INT,tag INT ,title TEXT, location TEXT, "
                "company TEXT,link TEXT,pubdate TEXT,hash INT,viewed INT)"
            )
    finally:
        connection.close()
    sys.exit()
def rmdb(file, table):
    """Drop *table* from the SQLite database *file* after confirmation, then exit.

    The user is prompted on stdin; the table is dropped only when the answer
    is exactly ``Y``. Any other answer aborts without touching the table.

    Args:
        file: Path of the SQLite database; passed to ``sqlite3.connect``.
        table: Name of the table to drop. It is inserted into the SQL text
            unescaped (identifiers cannot be bound as parameters), so it must
            come from a trusted source.

    Raises:
        sqlite3.OperationalError: If the DROP TABLE statement fails.
        SystemExit: Always raised through ``sys.exit()`` at the end.
    """
    connection = sqlite3.connect(file)
    try:
        # "with connection" only commits or rolls back; closing is done in
        # the finally clause so the database handle is not leaked.
        with connection:
            question = input("Do you really wont to empty the db(press Y)?")
            if question == "Y":
                connection.execute(f"DROP TABLE {table}")
            else:
                print("abroting removing table")
    finally:
        connection.close()
    sys.exit()
def importdb(file, importdb, table):
    """Copy *table* from another SQLite file and create the ``Canoton_Filter`` view.

    The database *importdb* is attached under the schema name ``regions``,
    *table* is copied into *file* if it does not exist there yet, and a view
    ``Canoton_Filter`` is created that selects the rows of ``jobs`` whose
    ``location`` matches (LIKE) a ``GDENAME`` of a *table* row with
    ``GDEKT = 'ZH'``. Running the function again is a no-op for both the
    table and the view.

    Args:
        file: Path of the target SQLite database.
        importdb: Path of the SQLite database to copy *table* from.
        table: Name of the table to copy. It is inserted into the SQL text
            unescaped, so it must come from a trusted source.

    Raises:
        sqlite3.OperationalError: If one of the statements fails, e.g. when
            *table* does not exist in *importdb*.
    """
    # The file name is bound as a parameter instead of being spliced into the
    # statement inside double quotes, so quotes in the path cannot break it.
    attach_cmd = "ATTACH DATABASE ? AS regions"
    copy_cmd = f"CREATE TABLE IF NOT EXISTS {table} AS SELECT * from regions.{table}"
    view_cmd = f"""
        CREATE VIEW IF NOT EXISTS Canoton_Filter
        AS
        SELECT * FROM jobs as b
        WHERE EXISTS
            (SELECT GDENAME FROM {table} as w
             where w.GDEKT = 'ZH' AND
             b.location LIKE GDENAME);"""

    connection = sqlite3.connect(file)
    try:
        # "with connection" commits/rolls back; the explicit close below also
        # detaches the imported database again.
        with connection:
            print("db connection", connection.total_changes)
            cursor = connection.cursor()
            cursor.execute(attach_cmd, (importdb,))
            print(attach_cmd, importdb, copy_cmd)
            cursor.execute(copy_cmd)
            cursor.execute(view_cmd)
            print("db connection", connection.total_changes)
    finally:
        connection.close()
def createnwview(file):
    """Create the ``Nordwest-SCHWEIZ`` view and make sure the filter table exists.

    The view selects every row of ``jobs`` whose ``location`` matches (LIKE)
    a ``GDENAME`` in the ``Cantons`` table belonging to canton ``ZH``, ``AG``
    or ``SO``. Afterwards ``createFilterTable(file)`` is called.

    Args:
        file: Path of the SQLite database; passed to ``sqlite3.connect``.

    Raises:
        sqlite3.OperationalError: If the CREATE VIEW statement fails.
    """
    cmd_create_nw_view = """CREATE VIEW IF NOT EXISTS "Nordwest-SCHWEIZ" AS SELECT * FROM jobs as b
        WHERE EXISTS
            (SELECT GDENAME FROM Cantons as w
             where w.GDEKT = 'ZH' AND
             b.location LIKE GDENAME)
        OR EXISTS
            (SELECT GDENAME FROM Cantons as w
             where w.GDEKT = 'AG' AND
             b.location LIKE GDENAME)
        OR EXISTS
            (SELECT GDENAME FROM Cantons as w
             where w.GDEKT = 'SO' AND
             b.location LIKE GDENAME)"""

    connection = sqlite3.connect(file)
    try:
        # "with connection" only commits or rolls back; the connection is
        # closed explicitly so the handle is not leaked.
        with connection:
            connection.execute(cmd_create_nw_view)
            print("db connection", connection.total_changes)
    finally:
        connection.close()

    # Runs on its own connection, after this one has been released.
    createFilterTable(file)
def createFilterTable(file):
    """Create the ``filters`` table (single TEXT column ``cmd``) if it is missing.

    Args:
        file: Path of the SQLite database; passed to ``sqlite3.connect`` with
            a 10 second busy timeout.
    """
    connection = sqlite3.connect(file, timeout=10)
    try:
        # "with connection" only commits or rolls back; the connection is
        # closed explicitly so the handle is not leaked.
        with connection:
            connection.execute("CREATE TABLE IF NOT EXISTS filters(cmd TEXT);")
            print("db connection:", connection.total_changes)
    finally:
        connection.close()
def addFineFilter(file, table, filterstr):
    """Store *filterstr* in the ``cmd`` column of *table* unless it is already there.

    Args:
        file: Path of the SQLite database; opened with a 10 second busy
            timeout.
        table: Name of the table holding the filters. It is inserted into the
            SQL text unescaped, so it must come from a trusted source.
        filterstr: Filter text to store; bound as a SQL parameter.

    Raises:
        sqlite3.OperationalError: If a statement fails, e.g. because *table*
            does not exist.
    """
    cmd_check_exists = f"SELECT 1 FROM {table} WHERE cmd = ? LIMIT 1"
    cmd_insert = f"INSERT INTO {table}(cmd) VALUES(?);"

    connection = sqlite3.connect(file, timeout=10)
    try:
        # "with connection" commits the insert (or rolls back on error); the
        # connection itself is closed in the finally clause.
        with connection:
            cursor = connection.cursor()
            already_stored = cursor.execute(cmd_check_exists, (filterstr,)).fetchone()
            if already_stored is None:
                cursor.execute(cmd_insert, (filterstr,))
    finally:
        connection.close()
2024-06-13 09:14:04 +00:00
def writedb(jobs, file="../db/sqlite3.db"):
    """Insert the jobs that are not stored yet into the ``jobs`` table.

    For every job a MurmurHash3 value is computed from title, company,
    location and date. A job is skipped when a row with the same hash or the
    same link already exists; otherwise it is inserted with ``viewed = 0``.

    Args:
        jobs: Iterable of job objects providing the attributes ``title``,
            ``company``, ``location``, ``date``, ``link``, ``starred`` and
            ``tag``.
        file: Path of the SQLite database. Defaults to the location that was
            previously hard-coded.

    Raises:
        sqlite3.OperationalError: If a statement fails, e.g. because the
            ``jobs`` table does not exist.
    """
    insert_cmd = (
        "INSERT INTO jobs (star,tag,title,company,location,link,pubdate,hash,viewed) "
        "VALUES (?,?,?,?,?,?,?,?,?)"
    )

    connection = sqlite3.connect(file, timeout=10)
    try:
        # "with connection" commits all inserts together (or rolls back on
        # error); the connection itself is closed in the finally clause.
        with connection:
            connection.execute("pragma journal_mode=wal")
            print("db connection", connection.total_changes)
            cursor = connection.cursor()
            for job in jobs:
                hash1 = mmh3.hash(job.title + job.company + job.location + job.date)
                log(hash1)
                if cursor.execute("SELECT 1 FROM jobs WHERE hash = ?", (hash1,)).fetchone() is not None:
                    log("Hash already exist")
                elif cursor.execute("SELECT 1 FROM jobs where link = ?", (job.link,)).fetchone() is not None:
                    log("link already exist")
                else:
                    log("NEW_ENTRY")
                    cursor.execute(
                        insert_cmd,
                        (job.starred, job.tag, job.title, job.company, job.location,
                         job.link, job.date, hash1, 0),
                    )
    finally:
        connection.close()
2024-08-06 11:42:58 +00:00
def viewedEntry(hash1, file="../db/sqlite3.db"):
    """Remember *hash1* as viewed and flush to the database in batches of five.

    Hashes are collected in the function attribute ``viewedEntry.list``. Once
    five or more are pending, every matching row in ``jobs`` gets
    ``viewed`` set and the buffer is emptied.

    Args:
        hash1: Hash value of the job entry that was viewed.
        file: Path of the SQLite database. Defaults to the location that was
            previously hard-coded.

    Raises:
        sqlite3.OperationalError: If the UPDATE fails; the pending hashes are
            kept in that case.
    """
    # Create the buffer only once. Re-initialising it on every call kept its
    # length at 1, so the flush threshold below could never be reached.
    if not hasattr(viewedEntry, "list"):
        viewedEntry.list = []
    viewedEntry.list.append(hash1)
    print("viewedEntry.list:", viewedEntry.list)

    if len(viewedEntry.list) < 5:
        return

    modified = 0
    connection = sqlite3.connect(file, timeout=10)
    try:
        # "with connection" commits the batch (or rolls back on error); the
        # connection itself is closed in the finally clause.
        with connection:
            cursor = connection.cursor()
            for pending in viewedEntry.list:
                print("hash:", pending)
                cursor.execute("UPDATE jobs SET viewed = '1' WHERE hash = ?", (pending,))
                # rowcount only covers the last statement, so sum it up.
                modified += cursor.rowcount
    finally:
        connection.close()

    viewedEntry.list = []
    print("modified rows: ", modified)
2024-07-26 10:46:36 +00:00
def _link_is_dead(http, url):
    """Return True when a HEAD request to *url* fails or answers with status >= 400."""
    try:
        resp, _content = http.request(url, 'HEAD')
    except httplib2.error.RelativeURIError:
        print("RelativeURIError: Not a valid link")
        return True
    except IOError as e:
        print("link is no valid URL so remove item")
        print("error: ", e)
        return True
    return resp.status >= 400


def isStillValid(file, skiprows):
    """Delete non-starred jobs whose link no longer answers a HEAD request.

    Every ``link`` stored in ``jobs`` is probed with an HTTP HEAD request. A
    row is deleted when the request raises ``IOError`` or
    ``RelativeURIError``, or when the response status is 400 or above. Rows
    with ``star = 1`` are never deleted. Work is committed after every 256
    links.

    Args:
        file: Path of the SQLite database; opened with a 10 second busy
            timeout.
        skiprows: Currently unused; kept so existing callers keep working.

    Returns:
        Always 0.
    """
    chunk_size = 256
    connection = sqlite3.connect(file, timeout=10)
    try:
        # "with connection" commits/rolls back; closing happens in finally.
        with connection:
            connection.execute("pragma journal_mode=wal")
            # Read all links before deleting anything, so no SELECT cursor is
            # still being iterated on the table while rows are removed and
            # committed.
            links = [row[0] for row in connection.execute("SELECT link from jobs;")]

            for start in range(0, len(links), chunk_size):
                http = httplib2.Http()
                for link in links[start:start + chunk_size]:
                    print("row: ", link)
                    # Each link is judged on its own result; an earlier failed
                    # request must not condemn the links that follow it.
                    if _link_is_dead(http, link):
                        print("link is no more valid, remove item")
                        removed = connection.execute(
                            "DELETE from jobs WHERE link = ? AND star != 1;", (link,)
                        )
                        print("Deletion resultet in: ", removed.rowcount)
                connection.commit()
    finally:
        connection.close()
    return 0