change checking to httplib2

because
we can then make a HEAD request and do not use up bandwidth
This commit is contained in:
ccppi 2024-07-22 12:23:17 +02:00
parent dd60c722d8
commit a96d4b59de

View File

@ -1,7 +1,8 @@
import sqlite3
import mmh3
import sys
import requests
#import requests
import httplib2
DEBUG = False
def log(*s):
@ -104,18 +105,32 @@ def isStillValid(file):
isLink = True
rows = []
rows = cursor.fetchmany(256)
h = httplib2.Http()
for row in rows:
with requests.Session() as session:
#with requests.Session() as session:
print("row: ",row[0])
try:
page = session.get(row[0])
except:
(resp,content) = h.request(row[0], 'HEAD')
#page = session.get(row[0])
except IOError as e:
print("link is no valid URL so remove item")
print("error: ",e)
isLink = False
finally:
if page.ok == False or isLink == False:
print("link is no more valid, remove item")
except RelativeURIError:
isLink = False
print("Not a valid link")
rm_cursor = connection.cursor()
rm_itm = rm_cursor.execute(f"""DELETE from jobs WHERE link = ?;""",(row[0],))
print ("Deletion resultet in: ", rm_itm)
finally:
try:
resp
except NameError:
print("Not a valid link")
rm_cursor = connection.cursor()
rm_itm = rm_cursor.execute(f"""DELETE from jobs WHERE link = ?;""",(row[0],))
else:
if resp.status >= 400 or isLink == False:
print("link is no more valid, remove item")
rm_cursor = connection.cursor()
rm_itm = rm_cursor.execute(f"""DELETE from jobs WHERE link = ?;""",(row[0],))
print ("Deletion resultet in: ", rm_itm)