# job-scrapper/lib/helpers.py

import os
import re
import shutil
import sqlite3

import requests
from bs4 import BeautifulSoup
from datetime import datetime

from dateconverter import *  # provides DateCHToUS()

DEBUG = True
number = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']  # digit characters, used by indeedExtractDays()
homePath = os.path.expanduser('~')
cookiePath = homePath + "/.mozilla/firefox/imibizoh.default/cookies.sqlite"  # hard-coded Firefox profile
tmpPath = "/tmp/cookies.sqlite"
DBFILE = "../db/sqlite3.db"

def log(*s):
    if DEBUG:
        print(*s)


class mode():
    # Default flag values for finder(); callers override them per call via
    # the **modes keyword arguments.
    DEFAULT = 0
    LINK = 0
    LOCATION_CLEANUP = 0
    SWAPDATE = 0
    CLEANDATE = 0
    ATTRS = 0

# (month name, zero-padded month number) pairs used by jobs_ch_switch_month()
months = [
    ('January', '01'),
    ('February', '02'),
    ('March', '03'),
    ('April', '04'),
    ('May', '05'),
    ('June', '06'),
    ('July', '07'),
    ('August', '08'),
    ('September', '09'),
    ('October', '10'),
    ('November', '11'),
    ('December', '12')]

class item():
    def __init__(self, tag, tag_content, index, name=None):
        self.tag = tag
        self.tag_content = tag_content
        self.index = index
        if name is not None:
            self.name = name
        else:
            self.name = "not defined"

class job():
    def __init__(self, title, profession, company, location, date, description, link, tag, starred):
        self.title = title
        self.profession = profession
        self.company = company
        self.location = location
        self.date = date
        self.description = description
        self.link = link
        self.tag = tag
        self.starred = starred

    def __str__(self):
        return "%s|%s|%s|%s|%s|%s|%s" % (self.title, self.profession, self.company, self.location, self.date, self.description, self.link)

def finder(results, item, **modes):
    # Walk a list of bs4 result sets and extract one field per entry,
    # post-processing it according to the mode flags passed in.
    GETCHILDREN = modes.get("GETCHILDREN", '')
    ATTRS = modes.get('ATTRS', 0)
    LOCATION_CLEANUP = modes.get('LOCATION_CLEANUP', 0)
    LINK = modes.get('LINK', 0)
    SWAPDATE = modes.get('SWAPDATE', 0)
    CLEANDATE = modes.get('CLEANDATE', 0)
    BASEURL = modes.get('BASEURL', '')
    INDEEDDATE = modes.get('INDEEDDATE', 0)
    content = []
    i = item.index
    log("name", item.name)
    log("Item tag: ", item.tag)
    log("Modes:", modes)
    log("tag_content: ", item.tag_content)
    for entry in results:
        if ATTRS == 1:
            result = entry.findAll(item.tag, attrs=item.tag_content)
            log(item.tag_content)
        else:
            result = entry.findAll(item.tag, class_=item.tag_content)
        log("found count results:", len(result))
        if len(result) == 0 and DEBUG:
            log("No entry found for: ", item.name, item.tag, item.tag_content)
        if result:
            log("theres a result")
            if i > (len(result) - 1):
                log("len:", len(result) - 1, "i:", i)
                log("index out of bounds, falling back to the last entry")
                i = len(result) - 1
            result2 = result[i]
            if GETCHILDREN != '':
                # Take the text of the first matching child of any result.
                found = False
                for res in result:
                    child = res.find(GETCHILDREN)
                    log("[finder] search for '", GETCHILDREN, "' in: ", child)
                    if child is not None and not found:
                        log("CHILD text strip: ", child.text.strip())
                        found = True
                        content.append(child.text.strip())
                if not found:
                    log("[finder] No matching child found: ", child)
                    content.append("CHILD_NOT_FOUND: " + GETCHILDREN)
            elif LOCATION_CLEANUP == 1:
                location = CleanLocation(result2.text.strip())
                content.append(location)
            elif LINK == 1:
                href = result2.get("href")
                if BASEURL:
                    href = BASEURL + href
                content.append(href)
            elif SWAPDATE == 1:
                content.append(DateCHToUS(result2.text.strip()))
            elif CLEANDATE == 1:
                log("[finder] pre cleandate:", result2.text.strip())
                content.append(jobs_ch_clean_date(result2.text.strip()))
            elif INDEEDDATE == 1:
                log("[finder] pre indeeddate:", result2.text.strip())
                content.append(indeedExtractDays(result2.text.strip()))
            else:
                log(result2)
                content.append(result2.text.strip())
        if not result:
            # Entries selected by "pubdate" default to today's date when missing.
            if item.tag_content == "pubdate":
                today = datetime.today().strftime('%Y-%m-%d')
                content.append(today)
            else:
                content.append("NOTFound")
    return content
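
# Example call (hypothetical selectors; the real tag names and classes
# depend on the site being scraped):
#   titleItem = item("span", "job-title", 0, name="title")
#   titles = finder(results, titleItem)
#   links = finder(results, item("a", "job-link", 0, name="link"),
#                  LINK=1, BASEURL="https://www.jobs.ch")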

def CleanLocation(location):
    # Strip Swiss postal codes ("CH-8000" or a bare "8000") and space characters.
    location = re.sub('CH-[0-9]{4}|[0-9]{4}| ', '', location)
    return location
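
# Example:
#   CleanLocation("CH-8000 Zürich")  ->  "Zürich"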

def arrayToClass(titles, companys, locations, dates, links, tag):
    # Zip the parallel result lists from finder() into job objects.
    jobs = []
    if len(titles) == len(companys) == len(locations) == len(dates):
        log("len:", len(titles))
        for i, title in enumerate(titles):
            jobs.append(job(title, "test_prof", companys[i], locations[i], dates[i], "test_desc", links[i], tag, 0))
            log("class job:", jobs[i])
        return jobs
    else:
        log("Something went wrong, unequal length of data arrays:", len(titles), len(companys), len(locations), len(dates))
        return 0
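
# Example (hypothetical lists, e.g. the outputs of the finder() calls above):
#   jobs = arrayToClass(titles, companys, locations, dates, links, "jobs.ch")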

def jobs_ch_clean_date(date):
    # Drop the leading 'Published: ' label (11 characters), then normalise
    # the remaining date string.
    newdate = date[11:]
    return jobs_ch_switch_month(newdate)

def jobs_ch_switch_month(date):
    # Expects 'DD Month YYYY' (e.g. '05 June 2024') and returns 'YYYY-MM-DD'.
    newmonth = date[3:len(date) - 5]
    for month in months:
        if month[0] == newmonth:
            newmonth = month[1]
    day = date[0:2]
    year = date[len(date) - 2:]
    return '20' + year + '-' + newmonth + '-' + day
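
# Example (assuming the 'Published: DD Month YYYY' layout described above):
#   jobs_ch_clean_date("Published: 05 June 2024")  ->  "2024-06-05"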

def extractDomain(url):
    # Match the scheme and host, stopping just before the first path slash.
    pattern = r'https:\/\/.*\..+?(?=\/)'
    domain = re.match(pattern, url)
    if domain:
        return domain.group()
    else:
        return 0
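
# Example (illustrative URL):
#   extractDomain("https://www.jobs.ch/en/vacancies/")  ->  "https://www.jobs.ch"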

def makeSession(url):
    # Prime a session with an initial GET so it picks up the site's cookies;
    # created outside a 'with' block so it stays open for the caller.
    session = requests.Session()
    session.get(url)
    return session
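
# Example (illustrative URL):
#   session = makeSession("https://www.jobs.ch")
#   page = session.get("https://www.jobs.ch/en/vacancies/")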

def indeedExtractDays(datestr):
    # Pull the first run of digits out of strings like 'Posted 3 days ago'
    # and turn that 'N days ago' into an absolute YYYY-MM-DD date.
    cleannumstr = ''
    for a in datestr:
        if a in number:
            cleannumstr += a
        elif cleannumstr != '':
            break
    if cleannumstr != '':
        cleannumint = int(cleannumstr)
        today = int(datetime.now().timestamp())
        cleandate = today - cleannumint * 60 * 60 * 24  # N days, in seconds
        return datetime.fromtimestamp(cleandate).strftime('%Y-%m-%d')
    return "NOTFound"

def getCookiesFromBrowser(url):
    # Firefox keeps cookies.sqlite locked while it runs, so query a copy.
    shutil.copyfile(cookiePath, tmpPath)
    cookie = ''
    rows = [0]
    with sqlite3.connect(tmpPath) as connection:
        cmd_read_cookies = "SELECT name,value FROM moz_cookies WHERE host LIKE ?;"
        cursor = connection.cursor()
        cursor.execute(cmd_read_cookies, ('%' + url + '%',))
        while len(rows) != 0:
            rows = cursor.fetchmany(25)
            for row in rows:
                log("row:", row)
                cookie = cookie + row[0] + '=' + row[1] + ';'
    log("Cookies:", cookie)
    return cookie

# Accessing cookies from Firefox manually:
#   copy the database first (because it is locked):
#     cp .mozilla/firefox/imibizoh.default/cookies.sqlite cookies.sqlite
#   then: SELECT value FROM moz_cookies WHERE host LIKE '%indeed%'
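
# Example (hypothetical pairing with requests; the scraper's real request
# code may differ):
#   headers = {"Cookie": getCookiesFromBrowser("indeed")}
#   page = requests.get("https://indeed.com", headers=headers)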

def urlToDomain(url):
    # Strip the scheme, then the subdomain, then everything after the
    # registered name, e.g. "https://www.example.com/jobs" -> "example".
    pos = patternSearch(url, "https://")
    urlCut = dropBeforePos(url, pos)
    log("url cut", urlCut)
    posDot = skipAfterChar(urlCut, '.')
    urlCut = dropBeforePos(urlCut, posDot)
    log("url after cut dot:", urlCut)
    posDot = skipAfterChar(urlCut, '.')
    urlCut = dropAfterPos(urlCut, posDot)
    log("url after cut dot:", urlCut)
    return urlCut

def patternSearch(url, pattern):
    # Walk url character by character and return the index of the last
    # character of the first full occurrence of pattern.
    x = 0
    pos = 0
    for pos, char in enumerate(url):
        if char == pattern[x]:
            if x < len(pattern) - 1:
                x = x + 1
            elif x == len(pattern) - 1:
                log("FULL PATTERN FOUND at pos:", pos)
                break
        else:
            x = 0
    return pos

def skipAfterChar(aString, char):
    # Return the index of the first occurrence of char
    # (or the last index if char is absent).
    pos = 0
    for pos, c in enumerate(aString):
        if c == char:
            break
    return pos

def dropBeforePos(aString, pos):
    # Return the substring after index pos (empty if pos is out of range).
    return aString[pos + 1:]

def dropAfterPos(aString, pos):
    # Return the substring before index pos (empty if pos is out of range).
    if pos < len(aString):
        return aString[:pos]
    return ''