"""Helpers for scraping job listings out of BeautifulSoup result sets.

The module extracts one field per listing (``finder``), normalises the
extracted location/date strings, and bundles the per-field lists into
``job`` records (``arrayToClass``).
"""
import string
import requests
from bs4 import BeautifulSoup
from enum import Enum
import re
from dateconverter import *
from datetime import datetime

# Module-wide switch for the diagnostic output produced by log().
DEBUG = True


def log(*s):
    """Print the given values, space separated, when DEBUG is truthy."""
    if DEBUG:
        # Unpack so the values are printed themselves rather than the
        # repr of the argument tuple.
        print(*s)


class mode():
    """Names of the extraction flags, all initialised to 0.

    finder() reads flags of the same names from its keyword arguments;
    nothing in this file reads these class attributes directly.
    """
    DEFAULT = 0
    LINK = 0
    LOCATION_CLEANUP = 0
    SWAPDATE = 0
    CLEANDATE = 0
    ATTRS = 0


# (English month name, zero-padded month number) pairs used by
# jobs_ch_switch_month().
months = [
    ('January', '01'),
    ('February', '02'),
    ('March', '03'),
    ('April', '04'),
    ('May', '05'),
    ('June', '06'),
    ('July', '07'),
    ('August', '08'),
    ('September', '09'),
    ('October', '10'),
    ('November', '11'),
    ('December', '12')]


class item():
    """Describes where one field lives inside a listing.

    tag         -- tag name handed to find_all()
    tag_content -- class filter, or the attrs filter when finder() is
                   called with ATTRS=1
    index       -- which of the matching tags to use
    name        -- label used in log output ("not defined" if omitted)
    """

    def __init__(self, tag, tag_content, index, name=None):
        self.tag = tag
        self.tag_content = tag_content
        self.index = index
        self.name = name if name is not None else "not defined"


class job():
    """Plain record holding the fields of one job listing."""

    def __init__(self, title, profession, company, location, date, description, link, tag, starred):
        self.title = title
        self.profession = profession
        self.company = company
        self.location = location
        self.date = date
        self.description = description
        self.link = link
        self.tag = tag
        self.starred = starred

    def __str__(self):
        # tag and starred are intentionally not part of the text form.
        return "%s| %s|%s|%s|%s|%s|%s" % (
            self.title, self.profession, self.company, self.location,
            self.date, self.description, self.link)


def _debug_pause():
    """Wait for Enter so a miss can be inspected during a debug run."""
    try:
        input()
    except EOFError:
        # No stdin attached (cron job, pipe, ...): nothing to wait for.
        pass


def _first_child_text(matches, child_tag):
    """Return the stripped text of the first `child_tag` found in matches.

    Returns "CHILD_NOT_FOUND: <child_tag>" when no match has such a child.
    """
    for match in matches:
        child = match.find(child_tag)
        log(child)
        if child is not None:
            text = child.text.strip()
            log("CHILD: ", text)
            return text
    return "CHILD_NOT_FOUND: " + child_tag


def finder(results, item, **modes):
    """Extract one value per entry of `results`, as described by `item`.

    For every entry the tags matching item.tag / item.tag_content are
    looked up and the one at item.index is used (the last match if the
    index is too large for that entry). Exactly one element is appended
    per entry, so lists produced for different items stay aligned.

    Keyword flags (first applicable one wins, in this order):
      GETCHILDREN=<tag>   text of the first <tag> child among the matches,
                          or "CHILD_NOT_FOUND: <tag>"
      LOCATION_CLEANUP=1  text passed through CleanLocation()
      LINK=1              the "href" attribute, prefixed with BASEURL
                          when one is given
      SWAPDATE=1          text passed through DateCHToUS()
                          (not defined in this file; presumably provided
                          by the dateconverter star import)
      CLEANDATE=1         text passed through jobs_ch_clean_date()
      (none)              the stripped text
    ATTRS=1 makes item.tag_content an attrs filter instead of a class
    filter.

    Entries without any match yield today's date (YYYY-MM-DD) when
    item.tag_content == "pubdate", otherwise the string "NOTFound".
    """
    child_tag = modes.get("GETCHILDREN", '')
    use_attrs = modes.get('ATTRS', 0)
    location_cleanup = modes.get('LOCATION_CLEANUP', 0)
    link_mode = modes.get('LINK', 0)
    swap_date = modes.get('SWAPDATE', 0)
    clean_date = modes.get('CLEANDATE', 0)
    base_url = modes.get('BASEURL', '')

    content = []
    log("name", item.name)
    log("Item tag: ", item.tag)
    log("Modes:", modes)
    log("tag_content: ", item.tag_content)

    for entry in results:
        if use_attrs == 1:
            matches = entry.find_all(item.tag, attrs=item.tag_content)
            log(item.tag_content)
        else:
            matches = entry.find_all(item.tag, class_=item.tag_content)
        log("found count results:", len(matches))

        if not matches:
            if DEBUG:
                for candidate in results:
                    log("No entry found for: ", item.name, item.tag, item.tag_content, " \n -->", candidate)
                _debug_pause()
            # Keep the output aligned with `results` by adding a placeholder.
            if item.tag_content == "pubdate":
                content.append(datetime.today().strftime('%Y-%m-%d'))
            else:
                content.append("NOTFound")
            continue

        log("theres a result")
        # Resolve the index per entry: a fallback needed for one entry
        # must not shrink the index used for the entries after it.
        last = len(matches) - 1
        index = item.index
        if index > last:
            log("len:", last, "i:", index)
            log("index out of bounds fall back to the %d count" % last)
            index = last
        chosen = matches[index]

        if child_tag != '':
            content.append(_first_child_text(matches, child_tag))
        elif location_cleanup == 1:
            content.append(CleanLocation(chosen.text.strip()))
        elif link_mode == 1:
            href = chosen.get("href")
            # A tag without href yields None; do not try to prefix that.
            if base_url and href is not None:
                href = base_url + href
            content.append(href)
        elif swap_date == 1:
            content.append(DateCHToUS(chosen.text.strip()))
        elif clean_date == 1:
            content.append(jobs_ch_clean_date(chosen.text.strip()))
        else:
            content.append(chosen.text.strip())

    return content


def CleanLocation(location):
    """Strip "CH-dddd" / four-digit codes and all spaces from `location`."""
    return re.sub('CH-[0-9]{4}|[0-9]{4}| ', '', location)


def arrayToClass(titles, companys, locations, dates, links, tag):
    """Zip the per-field lists into a list of job objects.

    profession and description are filled with the placeholders
    "test_prof" / "test_desc", starred with 0.

    Returns the list of jobs, or 0 when titles, companys, locations and
    dates differ in length or when there are fewer links than titles.
    """
    count = len(titles)
    aligned = count == len(companys) == len(locations) == len(dates)
    # Fewer links than titles used to raise IndexError half-way through;
    # report it like any other length mismatch. Surplus links are ignored,
    # as before.
    if not aligned or len(links) < count:
        log("Something went wrong unequal length of data arrays: ",
            len(titles), len(companys), len(locations), len(dates), len(links))
        return 0

    log("len:", count)
    jobs = []
    for title, company, location, date, link in zip(titles, companys, locations, dates, links):
        entry = job(title, "test_prof", company, location, date, "test_desc", link, tag, 0)
        jobs.append(entry)
        log("class job:", entry)
    return jobs


def jobs_ch_clean_date(date):
    """Drop the first 11 characters of `date` and convert the remainder.

    The prefix being removed is the "Published:" label (presumably
    followed by one separator character -- confirm against the scraped
    markup). The rest is handed to jobs_ch_switch_month().
    """
    return jobs_ch_switch_month(date[11:])


def jobs_ch_switch_month(date):
    """Convert "DD <MonthName> ..YY" into "20YY-MM-DD".

    The day is the first two characters, the year the last two, and the
    month name is whatever lies between index 3 and the last five
    characters. A month name not listed in `months` is passed through
    unchanged. The century is hard-coded to "20".
    """
    day = date[:2]
    month_name = date[3:max(len(date) - 5, 3)]
    year = date[-2:]
    month = dict(months).get(month_name, month_name)
    return '20' + year + '-' + month + '-' + day


def extractDomain(url):
    """Return the "https://host" prefix of `url`, or 0 if there is none.

    Only https URLs whose host contains a dot are recognised. The match
    is confined to the host part, so dots further down the path cannot
    extend the result.
    """
    domain = re.match(r'https://[^/]*\.[^/]+', url)
    if domain:
        return domain.group()
    return 0


def makeSession(url, timeout=None):
    """Open a requests session, GET `url` once and return the session.

    The response itself is discarded (presumably the request is only
    made so the session picks up cookies -- confirm with callers).
    The session is returned open; the caller is responsible for closing
    it. `timeout` is forwarded to requests (None = wait indefinitely).
    """
    session = requests.Session()
    try:
        session.get(url, timeout=timeout)
    except Exception:
        # Do not leak the session when the initial request fails.
        session.close()
        raise
    return session