From a96d4b59de4c2ded404baf6eda58d9562f68ffd7 Mon Sep 17 00:00:00 2001 From: ccppi Date: Mon, 22 Jul 2024 12:23:17 +0200 Subject: [PATCH] change checking to httplib2 because we can then make a head request and do not take bandwidth --- lib/db.py | 33 ++++++++++++++++++++++++--------- 1 file changed, 24 insertions(+), 9 deletions(-) diff --git a/lib/db.py b/lib/db.py index f739385..f8da2dd 100644 --- a/lib/db.py +++ b/lib/db.py @@ -1,7 +1,8 @@ import sqlite3 import mmh3 import sys -import requests +#import requests +import httplib2 DEBUG = False def log(*s): @@ -104,18 +105,32 @@ def isStillValid(file): isLink = True rows = [] rows = cursor.fetchmany(256) + h = httplib2.Http() for row in rows: - with requests.Session() as session: + #with requests.Session() as session: print("row: ",row[0]) try: - page = session.get(row[0]) - except: + (resp,content) = h.request(row[0], 'HEAD') + #page = session.get(row[0]) + except IOError as e: print("link is no valid URL so remove item") + print("error: ",e) isLink = False - - finally: - if page.ok == False or isLink == False: - print("link is no more valid, remove item") + except RelativeURIError: + isLink = False + print("Not a valid link") rm_cursor = connection.cursor() rm_itm = rm_cursor.execute(f"""DELETE from jobs WHERE link = ?;""",(row[0],)) - print ("Deletion resultet in: ", rm_itm) + finally: + try: + resp + except NameError: + print("Not a valid link") + rm_cursor = connection.cursor() + rm_itm = rm_cursor.execute(f"""DELETE from jobs WHERE link = ?;""",(row[0],)) + else: + if resp.status >= 400 or isLink == False: + print("link is no more valid, remove item") + rm_cursor = connection.cursor() + rm_itm = rm_cursor.execute(f"""DELETE from jobs WHERE link = ?;""",(row[0],)) + print ("Deletion resultet in: ", rm_itm)