# job-scrapper/lib/helpers.py
import string
import requests
from bs4 import BeautifulSoup
from enum import Enum
import re
import shutil
from dateconverter import *
from datetime import datetime
import os
import sqlite3
import webbrowser
from time import sleep
DEBUG = True
number = ['0','1','2','3','4','5','6','7','8','9']
homePath = os.path.expanduser('~')
cookiePath = homePath + "/.mozilla/firefox/imibizoh.default/cookies.sqlite"
tmpPath = "/tmp/cookies.sqlite"
DBFILE = "../db/sqlite3.db"
winFirefoxPath = r"C:\Program Files\Mozilla Firefox\firefox.exe"  # raw string: '\f' in '\firefox' would otherwise be a form feed
def log(*s):
    if DEBUG:
        print(*s)  # unpack so the arguments print like normal print() output
class mode():
    # flag holder mirroring the keyword modes understood by finder()
    DEFAULT = 0
    LINK = 0
    LOCATION_CLEANUP = 0
    SWAPDATE = 0
    CLEANDATE = 0
    ATTRS = 0
months = [
    ('January','01'),
    ('February','02'),
    ('March','03'),
    ('April','04'),
    ('May','05'),
    ('June','06'),
    ('July','07'),
    ('August','08'),
    ('September','09'),
    ('October','10'),
    ('November','11'),
    ('December','12')]
class item():
    def __init__(self, tag, tag_content, index, name=None):
        self.tag = tag
        self.tag_content = tag_content
        self.index = index
        if name is not None:
            self.name = name
        else:
            self.name = "not defined"
class job():
    def __init__(self, title, profession, company, location, date, description, link, tag, starred):
        self.title = title
        self.profession = profession
        self.company = company
        self.location = location
        self.date = date
        self.description = description
        self.link = link
        self.tag = tag
        self.starred = starred

    def __str__(self):
        return "%s| %s|%s|%s|%s|%s|%s" % (self.title, self.profession, self.company, self.location, self.date, self.description, self.link)
def finder(results, item, **modes):
    GETCHILDREN = modes.get("GETCHILDREN", '')
    ATTRS = modes.get('ATTRS', 0)
    LOCATION_CLEANUP = modes.get('LOCATION_CLEANUP', 0)
    LINK = modes.get('LINK', 0)
    SWAPDATE = modes.get('SWAPDATE', 0)
    CLEANDATE = modes.get('CLEANDATE', 0)
    BASEURL = modes.get('BASEURL', '')
    INDEEDDATE = modes.get('INDEEDDATE', 0)
    content = []
    i = item.index
    log("name", item.name)
    log("Item tag: ", item.tag)
    log("Modes:", modes)
    log("tag_content: ", item.tag_content)
    for entry in results:
        if ATTRS == 1:
            result = entry.findAll(item.tag, attrs=item.tag_content)
            log(item.tag_content)
        else:
            result = entry.findAll(item.tag, class_=item.tag_content)
        log("found count results:", len(result))
        if len(result) == 0 and DEBUG:
            log("len result: ", len(result))
            log("No entry found for: ", item.name, item.tag, item.tag_content, " -->", entry)
            input()  # pause so the missing selector can be inspected
        if result:
            log("theres a result")
            if i > (len(result) - 1):
                log("len:", len(result) - 1, "i:", i)
                log("index out of bounds, falling back to index %d" % i)
                i = (len(result) - 1)
            result2 = result[i]
            if GETCHILDREN != '':
                found = False
                for res in result:  # renamed from `results` to avoid shadowing the parameter
                    child = res.find(GETCHILDREN)
                    log("[finder] search for '", GETCHILDREN, "' in: ", child)
                    if child is not None and not found:
                        log("CHILD text strip: ", child.text.strip())
                        found = True
                        content.append(child.text.strip())
                if not found:
                    log("[finder] No matching child found: ", child)
                    content.append("CHILD_NOT_FOUND: " + GETCHILDREN)
            elif LOCATION_CLEANUP == 1:
                location = CleanLocation(result2.text.strip())
                content.append(location)
            elif LINK == 1:
                link = result2.get("href")  # renamed from `string` to avoid shadowing the stdlib module
                if BASEURL:
                    link = BASEURL + link
                content.append(link)
            elif SWAPDATE == 1:
                content.append(DateCHToUS(result2.text.strip()))
            elif CLEANDATE == 1:
                log("[finder] pre cleandate:", result2.text.strip())  # was missing the call parentheses
                content.append(jobs_ch_clean_date(result2.text.strip()))
            elif INDEEDDATE == 1:
                log("[finder] pre indeeddate:", result2.text.strip())
                content.append(indeedExtractDays(result2.text.strip()))
            else:
                log(result2)
                content.append(result2.text.strip())
        if not result:
            if item.tag_content == "pubdate":
                today = datetime.today().strftime('%Y-%m-%d')
                content.append(today)
            else:
                content.append("NOTFound")
    return content
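# Strips Swiss postal codes and spaces from a location string,
# e.g. CleanLocation("CH-8004 Zürich") -> "Zürich".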
def CleanLocation(location):
    # removes 'CH-NNNN' prefixes, bare 4-digit postal codes and spaces
    location = re.sub('CH-[0-9]{4}|[0-9]{4}| ', '', location)
    return location
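# Zips the parallel result lists produced by finder() into job objects;
# profession and description are filled with placeholder values for now.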
def arrayToClass(titles, companys, locations, dates, links, tag):
    jobs = []
    if len(titles) == len(companys) == len(locations) == len(dates):
        log("len:", len(titles))
        for i, title in enumerate(titles):
            jobs.append(job(title, "test_prof", companys[i], locations[i], dates[i], "test_desc", links[i], tag, 0))
            log("class job:", jobs[i])
        return jobs
    else:
        log("Something went wrong, unequal length of data arrays: ", len(titles), len(companys), len(locations), len(dates))
        return 0
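# jobs.ch prints dates as 'Published: 06 August 2024'; these two helpers cut
# the 'Published: ' prefix and rebuild the remainder as ISO, so
# jobs_ch_clean_date("Published: 06 August 2024") -> "2024-08-06".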
def jobs_ch_clean_date(date):
    newdate = date[11:]  # drop the leading "Published: " (11 chars incl. the space)
    return jobs_ch_switch_month(newdate)
def jobs_ch_switch_month(date):
    # expects 'DD Month YYYY'; keeps the last two year digits and re-prefixes '20'
    newmonth = date[3:len(date) - 5]
    for month in months:
        if month[0] == newmonth:
            newmonth = month[1]
    day = date[0:2]
    year = date[len(date) - 2:]
    return '20' + year + '-' + newmonth + '-' + day
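# Matches the scheme+host part of an URL up to (but not including) the first
# path slash, e.g. extractDomain("https://www.jobs.ch/en/") -> "https://www.jobs.ch".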
def extractDomain(url):
    pattern = r'https:\/\/.*\..+?(?=\/)'
    domain = re.match(pattern, url)
    if domain:
        return domain.group()
    else:
        return 0
def makeSession(url):
    # note: returning from inside `with requests.Session()` would close the
    # session on exit, so it is created without a context manager and handed back open
    session = requests.Session()
    session.get(url)  # prime the session with the site's cookies
    return session
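# Indeed shows relative dates like 'Posted 3 days ago'; this pulls the first
# run of digits out of the string and converts it to an absolute ISO date.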
def indeedExtractDays(datestr):
    cleannumstr = ''
    cleannumint = -1
    cleandate = ''
    foundchar = False
    for a in datestr:
        log(a)
        if a in number:
            foundchar = True
            cleannumstr += a
        elif foundchar:
            break  # stop at the first non-digit after the digit run
    if cleannumstr != '':
        cleannumint = int(cleannumstr)
        today = int(datetime.utcnow().timestamp())
        cleandate = today - cleannumint * 60 * 60 * 24  # seconds per day; was 60*60*7*4, which is no calendar unit
        return datetime.fromtimestamp(cleandate).strftime('%Y-%m-%d')
    return "NOTFound"
def getCookiesFromBrowser(url):
    # workaround for the locked database: query a copy
    tries = 0
    cookie = ''
    rows = [0]
    while cookie == '' and tries < 2:
        tries += 1
        shutil.copyfile(cookiePath, tmpPath)
        with sqlite3.connect(tmpPath) as connection:
            cmd_read_cookies = "SELECT name,value FROM moz_cookies WHERE host LIKE ?;"
            print(cmd_read_cookies)
            cursor = connection.cursor()
            cursor.execute(cmd_read_cookies, (urlToDomain(url),))
            while len(rows) != 0:
                rows = cursor.fetchmany(25)
                for row in rows:
                    print("row:", row)
                    cookie = cookie + row[0] + '=' + row[1]
                    cookie += ";"
        print("Cookies:", cookie)
        if cookie == '':
            # no stored cookie yet: open the page in Firefox and retry
            if os.name == 'posix':
                webbrowser.register("firefox", None, webbrowser.BackgroundBrowser("firefox"))
                webbrowser.get('firefox').open(url)
            elif os.name == 'nt':
                webbrowser.register("firefox", None, webbrowser.BackgroundBrowser(winFirefoxPath))
                webbrowser.get('firefox').open(url)
            sleep(1)
    return cookie
#access cookies from firefox:
#copy (because locked): cp .mozilla/firefox/imibizoh.default/cookies.sqlite cookies.sqlite
#Select value from moz_cookies where host like '%indeed%'
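# Cuts an URL down to the cookie-host form Firefox stores, keeping the leading
# dot of the registrable domain,
# e.g. urlToDomain("https://www.indeed.com/jobs") -> ".indeed.com".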
def urlToDomain(url):
    pos = patternSearch(url, "https://")
    urlCut = dropBeforePos(url, pos)
    posDot = skipAfterChar(urlCut, '.') - 1
    urlCut = dropBeforePos(urlCut, posDot)
    posDot = skipAfterChar(urlCut, '/')
    urlCut = dropAfterPos(urlCut, posDot)
    print("url after cut dot:", urlCut)
    return urlCut
def patternSearch(url, pattern):
    # returns the index of the last character of `pattern` in `url`
    x = 0
    for a, i in enumerate(url):
        log("i:", i)
        if i == pattern[x]:
            if x < len(pattern) - 1:
                x = x + 1
            elif x == len(pattern) - 1:
                log("FULL PATTERN FOUND at pos :", a)
                break
        else:
            x = 0
    return a
def skipAfterChar(aString, char):
    # index of the first occurrence of `char` (or the last index if absent)
    for a, i in enumerate(aString):
        if i == char:
            break
    return a
def dropBeforePos(aString, pos):
    # keep everything after position `pos` (empty if pos is out of range)
    return aString[pos + 1:]
def dropAfterPos(aString, pos):
    # keep everything before position `pos` (empty if pos is out of range)
    if pos < len(aString):
        return aString[:pos]
    return ''