job-scrapper/lib/helpers.py

167 lines
4.7 KiB
Python
Raw Normal View History

2024-06-13 09:11:58 +00:00
import string
import requests
from bs4 import BeautifulSoup
from enum import Enum
import re
from dateconverter import *
from datetime import datetime
DEBUG = False  # module-wide switch: set True to enable log() debug printing
def log(*s):
    """Print the received arguments (as a tuple) when the module-level DEBUG flag is on."""
    if not DEBUG:
        return
    print(s)
class mode():
    """Namespace of flag names accepted by finder(**modes).

    Every flag defaults to 0 (disabled); callers pass e.g. LINK=1
    to turn a processing mode on.
    """
    DEFAULT = 0
    LINK = 0
    LOCATION_CLEANUP = 0
    SWAPDATE = 0
    CLEANDATE = 0
    ATTRS = 0
# (month name, zero-padded month number) pairs used to translate
# English month names into their two-digit calendar numbers.
months = [
    (name, '%02d' % number)
    for number, name in enumerate(
        ('January', 'February', 'March', 'April', 'May', 'June',
         'July', 'August', 'September', 'October', 'November', 'December'),
        start=1)
]
class item():
    """Search descriptor for finder(): which HTML tag to look for,
    what class/attrs it must carry, and which occurrence to select."""

    def __init__(self, tag, tag_content, index):
        # tag name (e.g. 'a', 'div'); class string or attrs dict; match index
        self.tag = tag
        self.tag_content = tag_content
        self.index = index
class job():
    """Plain record holding one scraped job posting."""

    def __init__(self, title, profession, company, location, date, description, link, tag, starred):
        self.title = title
        self.profession = profession
        self.company = company
        self.location = location
        self.date = date
        self.description = description
        self.link = link
        self.tag = tag            # search tag the posting was found under
        self.starred = starred    # user bookmark flag

    def __str__(self):
        # pipe-separated summary; tag and starred are deliberately omitted
        fields = (self.title, self.profession, self.company,
                  self.location, self.date, self.description, self.link)
        return "%s| %s|%s|%s|%s|%s|%s" % fields
def finder(results, item, **modes):
    """Extract one post-processed value per entry in `results`.

    results : iterable of BeautifulSoup elements to search within.
    item    : descriptor with .tag, .tag_content and .index (which of the
              matches inside each entry to take).
    modes   : optional integer flags — ATTRS (match attrs dict instead of
              class), LOCATION_CLEANUP, LINK (+ BASEURL prefix), SWAPDATE,
              CLEANDATE — selecting how the matched tag is post-processed.

    Returns a list with exactly one entry per element of `results`, so the
    output stays aligned with the parallel arrays built by the callers.
    """
    ATTRS = modes.get('ATTRS', 0)
    LOCATION_CLEANUP = modes.get('LOCATION_CLEANUP', 0)
    LINK = modes.get('LINK', 0)
    SWAPDATE = modes.get('SWAPDATE', 0)
    CLEANDATE = modes.get('CLEANDATE', 0)
    BASEURL = modes.get('BASEURL', '')
    content = []
    i = item.index
    log("Modes:", modes)
    for entry in results:
        if ATTRS == 1:
            result = entry.findAll(item.tag, attrs=item.tag_content)
            log(item.tag_content)
        else:
            result = entry.findAll(item.tag, class_=item.tag_content)
        log("found:", len(result))
        if result:
            log("theres a result")
            # Clamp the requested index to the last available match.
            if i > (len(result) - 1):
                log("len:", len(result) - 1, "i:", i)
                log("index out of bounds fall back to the %d count", i)
                i = (len(result) - 1)
            result2 = result[i]
            if LOCATION_CLEANUP == 1:
                content.append(CleanLocation(result2.text.strip()))
            elif LINK == 1:
                # renamed local: the original called this `string`, which
                # shadowed the imported `string` module
                href = result2.get("href")
                if BASEURL:
                    href = BASEURL + href
                content.append(href)
            elif SWAPDATE == 1:
                content.append(DateCHToUS(result2.text.strip()))
            elif CLEANDATE == 1:
                content.append(jobs_ch_clean_date(result2.text.strip()))
            else:
                content.append(result2.text.strip())
        else:
            # No match: append exactly ONE placeholder so the output keeps
            # one entry per input. The original appended both a date and
            # "NOTFound" in CLEANDATE mode (misaligning the arrays) and used
            # the wrong strftime codes '%Y-%M-%D' (%M is minutes) instead of
            # '%Y-%m-%d'.
            if CLEANDATE:
                content.append(datetime.today().strftime('%Y-%m-%d'))
            else:
                content.append("NOTFound")
    return content
def CleanLocation(location):
    """Strip Swiss postal prefixes (CH-NNNN), bare 4-digit codes and every
    space character from a scraped location string."""
    return re.sub('CH-[0-9]{4}|[0-9]{4}| ', '', location)
def arrayToClass(titles, companys, locations, dates, links, tag):
    """Zip the parallel data arrays into a list of job objects.

    Returns the list of jobs, or 0 when the arrays disagree in length
    (original error contract, kept for callers that test for 0).
    """
    # Fix: the original validated only 4 of the 5 arrays; a short `links`
    # array would raise IndexError inside the loop instead of failing here.
    if len(titles) == len(companys) == len(locations) == len(dates) == len(links):
        log("len:", len(titles))
        jobs = []
        for i, title in enumerate(titles):
            jobs.append(job(title, "test_prof", companys[i], locations[i],
                            dates[i], "test_desc", links[i], tag, 0))
            log(jobs[i])
        return jobs
    print("Something went wrong unequal length of data arrays")
    return 0
def jobs_ch_clean_date(date):
    """Drop the leading 'Published:' label (first 11 characters) from a
    jobs.ch date string and normalise the rest to '20YY-MM-DD'."""
    # Idiom fix: slice instead of the original char-by-char loop, which
    # rebuilt the string one character at a time (quadratic string build).
    return jobs_ch_switch_month(date[11:])
def jobs_ch_switch_month(date):
    """Convert a 'DD MonthName YYYY' string into '20YY-MM-DD'.

    The month name is looked up in the module-level `months` table; an
    unknown name is passed through unchanged (original behavior).
    """
    # Idiom fix: slicing instead of the original three index-by-index loops.
    day = date[:2]
    month = date[3:len(date) - 5]   # text between the day and the 4-digit year
    year = date[-2:]                # last two digits of the year
    for name, number in months:
        if name == month:
            month = number
    return '20' + year + '-' + month + '-' + day
def CleanLocation(location):
    # NOTE(review): this is an exact duplicate of CleanLocation defined
    # earlier in this file; this later definition silently shadows the first.
    # Consider deleting one of the two copies.
    # Removes Swiss postal prefixes (CH-NNNN), bare 4-digit codes and spaces.
    #p = re.compile('CH-[0-9]{4}')
    location = re.sub('CH-[0-9]{4}|[0-9]{4}| ','',location)
    return location
def extractDomain(url):
    """Return the 'https://host' prefix of an https URL, or 0 when the URL
    does not start with an https scheme + dotted host (original contract)."""
    # Fix: the old pattern r'https:\/\/.*\..+?(?=\/)' had two defects —
    #  * greedy `.*\.` could swallow path segments containing dots, e.g.
    #    'https://a.com/b.c/d' matched 'https://a.com/b.c';
    #  * the (?=\/) lookahead made URLs without any path fail entirely.
    # Restricting the match to non-'/' characters fixes both.
    pattern = r'https://[^/]*\.[^/]+'
    domain = re.match(pattern, url)
    if domain:
        return domain.group()
    return 0
def makeSession(url):
    """Create a requests.Session primed by an initial GET to `url`
    (presumably to pick up cookies — TODO confirm against callers).

    Fix: the original wrapped the session in `with`, which closed its
    connection adapters before returning it, handing callers an
    already-closed session. The session is now returned open; the caller
    is responsible for closing it.
    """
    session = requests.Session()
    session.get(url)
    return session