# job-scrapper/lib/helpers.py

import string
import requests
from bs4 import BeautifulSoup
from enum import Enum
import re
from dateconverter import *
from datetime import datetime
# Module-wide debug switch: log() only prints while this value is truthy.
DEBUG = True

def log(*s):
    """Print the positional arguments when the module-level DEBUG flag is on.

    The arguments are printed as a single tuple (``print(s)``), not as
    separate space-joined values.  Nothing is printed while DEBUG is falsy.
    """
    if not DEBUG:
        return
    print(s)
class mode:
    """Namespace of flag names, every one of them set to 0.

    Apart from DEFAULT, the names match the keyword arguments that finder()
    looks up in its ``**modes`` (LINK, LOCATION_CLEANUP, SWAPDATE, CLEANDATE,
    ATTRS).  NOTE(review): nothing in the visible part of this module reads
    these attributes -- confirm whether other modules still use them.
    """
    DEFAULT = 0
    LINK = 0
    LOCATION_CLEANUP = 0
    SWAPDATE = 0
    CLEANDATE = 0
    ATTRS = 0
# Lookup table of (English month name, two-digit month number) pairs in
# calendar order; jobs_ch_switch_month() uses it to translate month names.
months = [
    (month_name, "%02d" % month_number)
    for month_number, month_name in enumerate(
        (
            'January',
            'February',
            'March',
            'April',
            'May',
            'June',
            'July',
            'August',
            'September',
            'October',
            'November',
            'December',
        ),
        start=1,
    )
]
class item:
    """Describes one field to extract from each scraped entry.

    finder() passes ``tag`` and ``tag_content`` to ``findAll`` and uses
    ``index`` to choose which of the matches to read.  ``name`` is only a
    label used in debug output.
    """

    def __init__(self, tag, tag_content, index, name=None):
        self.tag = tag
        self.tag_content = tag_content
        self.index = index
        # Only a missing name (None) is replaced; an empty string is kept.
        self.name = "not defined" if name is None else name

class job:
    """Plain record holding the scraped data of one job posting."""

    def __init__(self, title, profession, company, location, date, description, link, tag, starred):
        self.title = title
        self.profession = profession
        self.company = company
        self.location = location
        self.date = date
        self.description = description
        self.link = link
        self.tag = tag
        self.starred = starred

    def __str__(self):
        """Return the main fields separated by '|'; tag and starred are omitted."""
        shown = (
            self.title,
            self.profession,
            self.company,
            self.location,
            self.date,
            self.description,
            self.link,
        )
        return "%s| %s|%s|%s|%s|%s|%s" % shown

def finder(results,item,**modes):
    """Collect one value per entry of *results* for the field described by *item*.

    For every entry, ``entry.findAll`` is called with ``item.tag`` and
    ``item.tag_content`` (as ``class_=`` by default, or as ``attrs=`` when
    ``ATTRS=1``).  The match at position ``item.index`` is selected; if the
    index is past the last match, the last match is used for that entry only.
    The selected match is converted according to the first active mode below.

    Keyword modes, checked in this order:
        GETCHILDREN: a tag name.  The stripped text of the first non-None
            ``match.find(GETCHILDREN)`` over all matches is used, otherwise
            the string "CHILD_NOT_FOUND: <GETCHILDREN>".
        LOCATION_CLEANUP=1: stripped text passed through CleanLocation().
        LINK=1: the "href" attribute, prefixed with BASEURL when BASEURL is
            non-empty.  A match without "href" yields None.
        SWAPDATE=1: stripped text passed through DateCHToUS().
        CLEANDATE=1: stripped text passed through jobs_ch_clean_date().
        none of the above: the stripped text of the match.

    Entries without any match contribute today's date (``YYYY-MM-DD``) when
    ``item.tag_content == "pubdate"`` and the string "NOTFound" otherwise, so
    the returned list always has exactly one element per entry.

    Returns:
        list: the extracted values, in the order of *results*.
    """
    GETCHILDREN = modes.get("GETCHILDREN",'')
    ATTRS = modes.get('ATTRS',0)
    LOCATION_CLEANUP = modes.get('LOCATION_CLEANUP',0)
    LINK = modes.get('LINK',0)
    SWAPDATE = modes.get('SWAPDATE',0)
    CLEANDATE = modes.get('CLEANDATE',0)
    BASEURL = modes.get('BASEURL','')
    content = []
    log("name",item.name)
    log("Item tag: ",item.tag)
    log("Modes:",modes)
    log("tag_content: ",item.tag_content)
    for entry in results:
        if ATTRS==1:
            matches = entry.findAll(item.tag,attrs=item.tag_content)
            log(item.tag_content)
        else:
            matches = entry.findAll(item.tag,class_=item.tag_content)
            log("found count results:",len(matches))
            if not matches:
                log("len result: ",len(matches))

        if not matches:
            # Keep the output aligned with the entries: always append a placeholder.
            if item.tag_content == "pubdate":
                content.append(datetime.today().strftime('%Y-%m-%d'))
            else:
                content.append("NOTFound")
            continue

        log("theres a result")
        # Clamp the index per entry.  Starting from item.index every time keeps
        # one short entry from lowering the index used for all later entries.
        i = item.index
        last = len(matches)-1
        if i>last:
            log("len:",last,"i:",i)
            log("index out of bounds fall back to the %d count",i)
            i = last
        selected = matches[i]

        if GETCHILDREN != '':
            child_text = None
            for match in matches:
                child = match.find(GETCHILDREN)
                log(child)
                if child is not None:
                    child_text = child.text.strip()
                    log("CHILD: ",child_text)
                    break
            if child_text is None:
                content.append("CHILD_NOT_FOUND: " + GETCHILDREN)
            else:
                content.append(child_text)
        elif LOCATION_CLEANUP==1:
            content.append(CleanLocation(selected.text.strip()))
        elif LINK==1:
            href = selected.get("href")
            # A tag without href gives None; concatenating it would raise TypeError.
            if BASEURL and href is not None:
                href = BASEURL+href
            content.append(href)
        elif SWAPDATE==1:
            content.append(DateCHToUS(selected.text.strip()))
        elif CLEANDATE==1:
            content.append(jobs_ch_clean_date(selected.text.strip()))
        else:
            log(selected)
            content.append(selected.text.strip())
    return content


def CleanLocation(location):
    """Return *location* without postal codes and without spaces.

    Removes every "CH-" followed by four digits, every other run of four
    digits, and every space character.
    """
    # NOTE: a second, identical definition of CleanLocation appears later in
    # this module; being defined later, that one is the binding callers get.
    return re.sub('CH-[0-9]{4}|[0-9]{4}| ','',location)

def arrayToClass(titles,companys,locations,dates,links,tag):
    """Zip the parallel field lists into a list of job objects.

    Element ``i`` of every list describes the same posting.  Each job is
    created with the placeholder profession "test_prof", the placeholder
    description "test_desc", the shared *tag*, and ``starred`` set to 0.

    Returns:
        list of job on success.  0 when titles, companys, locations and dates
        differ in length, or when links has fewer elements than titles.
    """
    count = len(titles)
    lengths_match = count == len(companys) == len(locations) == len(dates)
    # links is indexed alongside the other lists, so it has to be at least as
    # long; without this check a short list raised IndexError halfway through.
    # Surplus links are ignored, as before.
    if not lengths_match or len(links) < count:
        log("Something went wrong unequal length of data arrays: ",len(titles),len(companys),len(locations),len(dates),len(links))
        return 0

    log("len:",count)
    jobs = []
    for i, title in enumerate(titles):
        entry = job(title,"test_prof",companys[i],locations[i],dates[i],"test_desc",links[i],tag,0)
        jobs.append(entry)
        log("class job:",entry)
    return jobs
def jobs_ch_clean_date(date):
    """Drop the leading label from *date* and convert the rest.

    The first 11 characters are discarded unconditionally (meant to remove
    the string "Published:" plus the following space); the remainder is
    handed to jobs_ch_switch_month() and its result is returned.
    """
    without_label = date[11:]
    return jobs_ch_switch_month(without_label)

def jobs_ch_switch_month(date):
    """Convert a "DD Month YYYY" string (e.g. "12 January 2021") to "YYYY-MM-DD".

    The month name is translated through the module-level ``months`` table.

    The string is first split into day, month name and four-digit year.  This
    also handles single-digit days ("5 January 2021" -> "2021-01-05") and
    keeps the real century.  If the string does not have that shape, or the
    month name is not in ``months``, the original fixed-position rule is
    applied unchanged: characters 0-1 are the day, the last two characters
    are the year (prefixed with "20"), and everything from position 3 up to
    five characters before the end is the month (left untranslated when
    unknown).

    Raises:
        IndexError: in the fixed-position fallback when *date* has fewer
            than two characters.
    """
    month_numbers = dict(months)

    parsed = re.match(r'\s*(\d{1,2})\W+([^\W\d_]+)\W+(\d{4})\s*$', date)
    if parsed:
        day, month_name, year = parsed.groups()
        month_number = month_numbers.get(month_name)
        if month_number is not None:
            return '%s-%s-%s' % (year, month_number, day.zfill(2))

    # Legacy fixed-position behaviour for anything the parser did not accept.
    newmonth = date[3:len(date)-5]
    newmonth = month_numbers.get(newmonth, newmonth)
    day = date[0] + date[1]
    year = date[-2:]
    return '20'+year+'-'+newmonth+'-'+day

def CleanLocation(location):
    """Strip postal codes and spaces from a location string.

    Every "CH-" + four digits, every remaining group of four digits and every
    space character is removed; all other characters are kept in order.
    """
    # NOTE: this repeats an identical definition made earlier in this module;
    # this later one is the binding that remains after import.
    unwanted = re.compile('CH-[0-9]{4}|[0-9]{4}| ')
    return unwanted.sub('', location)

def extractDomain(url):
    """Return the ``https://host`` prefix of *url*, or 0 if there is none.

    Only URLs that start with "https://" and whose host part contains a dot
    are accepted.  The match stops at the first "/", "?" or "#", so a dot
    further along in the path can no longer be mistaken for part of the
    domain, and a URL with nothing after the host is accepted too.

    Returns:
        str: e.g. "https://www.example.com" (a port, if present, is kept).
        int: 0 when *url* does not match.
    """
    pattern = r'https://[^/?#]*\.[^/?#]+'
    domain = re.match(pattern,url)
    if domain:
        return domain.group()
    return 0

def makeSession(url):
    """Create a requests session, issue one GET to *url* with it, and return it.

    The response of that GET is discarded; only the session is returned.
    The session is deliberately not opened in a ``with`` block: leaving such
    a block closes the session, so the caller would be handed one that has
    already been closed.  The caller owns the returned session and should
    close it when finished.

    Raises:
        Whatever ``session.get`` raises; the session is closed before the
        exception propagates.
    """
    session = requests.Session()
    try:
        session.get(url)
    except Exception:
        # Do not leak the session when the initial request fails.
        session.close()
        raise
    return session