Skip to content
Snippets Groups Projects
Commit 035f28b7 authored by pabmarc's avatar pabmarc
Browse files

Upload New File

parent b541baeb
Branches
No related tags found
No related merge requests found
from scrapers.WebScraper import *
class EFE(WebScraper):
    """Scraper for the EFE news site (efe.com).

    Provides the per-field extraction methods and the crawl loop for EFE.
    ``get_web``, ``extract_news_link``, ``add_article`` and ``add_tags`` are
    not defined in this class; presumably they are inherited from
    ``WebScraper`` -- TODO confirm.

    Every ``extract_*`` method is best-effort: on failure it prints a
    diagnostic and returns a fallback value instead of raising.
    """

    def extract_author(self, soup: bs4.BeautifulSoup) -> str:
        """Return the article author.

        No author is extracted for EFE, so this always returns an empty
        string, matching the empty author stored by ``iterate_articles``.
        """
        return ""

    def extract_headline(self, soup: bs4.BeautifulSoup) -> str:
        """Return the text of the first ``<a>`` element in ``soup``.

        Returns an empty string when the element cannot be read.
        """
        try:
            headline = soup.find("a").text
        except Exception:
            print("Head not found EFE")
            headline = ""
        return headline

    def extract_date(self, soup: bs4.BeautifulSoup) -> pd.Timestamp:
        """Return the timezone-naive timestamp of the first ``<time>`` element.

        The value is read from the element's ``datetime`` attribute. When it
        cannot be read or parsed, ``2100-01-01`` is returned; because
        ``iterate_articles`` stops only on dates *earlier* than its limit,
        this far-future fallback never triggers the stop condition.
        """
        try:
            raw_date = soup.find("time")['datetime']
            # Drop any timezone so the result compares against naive limits.
            timestamp = pd.to_datetime(raw_date).tz_localize(None)
        except Exception:
            print("Date not found EFE")
            timestamp = pd.Timestamp('2100-01-01')
        return timestamp

    def extract_content(self, soup: bs4.BeautifulSoup) -> str:
        """Return the article body converted to text by ``html2text``.

        Collects every ``<p>`` found inside ``div.entry-content`` elements.
        Returns an empty string when extraction fails.
        """
        try:
            containers = soup.find_all("div", {"class": "entry-content"})
            # Flatten to individual <p> tags. Stringifying each find_all()
            # result as a whole would embed list brackets and commas in the
            # HTML handed to html2text.
            paragraphs = [p for box in containers for p in box.find_all("p")]
            html_str = ' '.join(str(p) for p in paragraphs)
            text = html2text.html2text(html_str)
        except Exception:
            print("text not found EFE")
            text = ""
        return text

    def extract_tags(self, soup: bs4.BeautifulSoup) -> list:
        """Return the text of every ``<a rel="tag">`` element in ``soup``.

        Returns an empty list when extraction fails.
        """
        try:
            tags = [tag.text for tag in soup.find_all("a", {"rel": "tag"})]
        except Exception:
            print("no tags EFE")
            tags = []
        return tags

    def extract_maxpag(self, soup: bs4.BeautifulSoup) -> int:
        """Return the number of listing pages for a topic.

        Reads the second ``a.page-numbers`` link and parses its text after
        the first six characters as an integer (presumably a textual prefix
        precedes the number -- TODO confirm against the live markup).
        Falls back to 10 when the value cannot be read.
        """
        try:
            maxpag = int(soup.find_all("a", {"class": "page-numbers"})[1].text[6:])
        except Exception:
            print("maxpag not found EFE")
            maxpag = 10
        return maxpag

    def iterate_articles(self, limit_date: pd.Timestamp = pd.Timestamp("1970-01-01")):
        """Crawl every topic listing and store each article found.

        For each topic the listing pages are walked in order. Every article
        is downloaded, stored with ``add_article`` under the source name
        ``"EFE"``, and its tags stored with ``add_tags``. Crawling of a topic
        stops once an article dated before ``limit_date`` has been processed.

        Args:
            limit_date: Articles older than this end the crawl of the
                current topic.
        """
        topics = ["mundo", "espana", "deportes", "cultura", "economia"]
        for topic in topics:
            date_limit_reached = False
            first_page = self.get_web(f"https://efe.com/{topic}/page/1/")
            max_pag = self.extract_maxpag(first_page)
            for page in range(1, max_pag + 1):
                if date_limit_reached:
                    break
                if page == 1:
                    # Reuse the page already downloaded to read the page count.
                    homeweb = first_page
                else:
                    homeweb = self.get_web(f"https://efe.com/{topic}/page/{page}/")
                for article in homeweb.find_all("div", {"class": "inside-article"}):
                    article_data = {
                        'headline': self.extract_headline(article),
                        'link': self.extract_news_link(article),
                        'date': self.extract_date(article),
                        'topic': topic,
                        'author': "",
                    }
                    tags = self.extract_tags(article)
                    articleweb = self.get_web(article_data.get('link'))
                    article_data['text'] = self.extract_content(articleweb)
                    news_id = self.add_article("EFE", article_data)
                    self.add_tags(tags, news_id)
                    # The check runs after storing, so the first article older
                    # than limit_date is still saved before the topic stops.
                    if article_data.get('date') < limit_date:
                        date_limit_reached = True
                        break
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment