change checking to httplib2

because
we can then make a HEAD request and avoid using bandwidth
This commit is contained in:
ccppi 2024-07-22 12:23:17 +02:00
parent dd60c722d8
commit a96d4b59de

View File

@ -1,7 +1,8 @@
import sqlite3 import sqlite3
import mmh3 import mmh3
import sys import sys
import requests #import requests
import httplib2
DEBUG = False DEBUG = False
def log(*s): def log(*s):
@ -104,18 +105,32 @@ def isStillValid(file):
isLink = True isLink = True
rows = [] rows = []
rows = cursor.fetchmany(256) rows = cursor.fetchmany(256)
h = httplib2.Http()
for row in rows: for row in rows:
with requests.Session() as session: #with requests.Session() as session:
print("row: ",row[0]) print("row: ",row[0])
try: try:
page = session.get(row[0]) (resp,content) = h.request(row[0], 'HEAD')
except: #page = session.get(row[0])
except IOError as e:
print("link is no valid URL so remove item") print("link is no valid URL so remove item")
print("error: ",e)
isLink = False isLink = False
except RelativeURIError:
finally: isLink = False
if page.ok == False or isLink == False: print("Not a valid link")
print("link is no more valid, remove item")
rm_cursor = connection.cursor() rm_cursor = connection.cursor()
rm_itm = rm_cursor.execute(f"""DELETE from jobs WHERE link = ?;""",(row[0],)) rm_itm = rm_cursor.execute(f"""DELETE from jobs WHERE link = ?;""",(row[0],))
print ("Deletion resultet in: ", rm_itm) finally:
try:
resp
except NameError:
print("Not a valid link")
rm_cursor = connection.cursor()
rm_itm = rm_cursor.execute(f"""DELETE from jobs WHERE link = ?;""",(row[0],))
else:
if resp.status >= 400 or isLink == False:
print("link is no more valid, remove item")
rm_cursor = connection.cursor()
rm_itm = rm_cursor.execute(f"""DELETE from jobs WHERE link = ?;""",(row[0],))
print ("Deletion resultet in: ", rm_itm)