Upload New File

035f28b7 · pabmarc · b541baeb · 035f28b7
Commit 035f28b7 authored Jun 22, 2023 by pabmarc
--- a/Scraping/scrapers/EjemploScraper.py
+++ b/Scraping/scrapers/EjemploScraper.py
+from scrapers.WebScraper import *
+
+
+class EFE(WebScraper):
+    def extract_author(self, soup: bs4.BeautifulSoup) -> str:
+        pass
+
+    def extract_headline(self, soup: bs4.BeautifulSoup) -> str:
+        try:
+            headline = soup.find("a").text
+        except:
+            print("Head not found EFE")
+            headline = ""
+        return headline
+
+    def extract_date(self, soup: bs4.BeautifulSoup) -> pd.Timestamp:
+        try:
+            datetime = soup.find("time")['datetime']
+            datetime = pd.to_datetime(datetime)
+            datetime = datetime.tz_localize(None)
+        except:
+            print("Date not found EFE")
+            datetime = pd.Timestamp('2100-01-01')
+        return datetime
+
+    def extract_content(self, soup: bs4.BeautifulSoup) -> str:
+        try:
+            content = soup.find_all("div", {"class": "entry-content"})
+            texto = []
+            for part in content:
+                texto.append(part.find_all("p"))
+            html_str = ' '.join(str(e) for e in texto)
+            text = html2text.html2text(html_str)
+        except:
+            print("text not found EFE")
+            text = ""
+        return text
+
+    def extract_tags(self, soup: bs4.BeautifulSoup) -> list:
+        try:
+            tags = []
+            tags_full = soup.find_all("a", {"rel": "tag"})
+            for tag in tags_full:
+                tags.append(tag.text)
+        except:
+            print("no tags EFE")
+            tags = []
+        return tags
+
+    def extract_maxpag(self, soup: bs4.BeautifulSoup) -> int:
+        try:
+            maxpag = int(soup.find_all("a", {"class": "page-numbers"})[1].text[6:])
+        except:
+            print("maxpag not found EFE")
+            maxpag = 10
+        return maxpag
+
+    def iterate_articles(self, limit_date: pd.Timestamp = pd.Timestamp("1970-01-01")):
+        topics = ["mundo", "espana", "deportes", "cultura", "economia"]
+        for topic in topics:
+            date_limit_reached = False
+            url = f"https://efe.com/{topic}/page/1/"
+            homeweb = self.get_web(url)
+            max_pag = self.extract_maxpag(homeweb)
+
+            for page in range(1, max_pag + 1):
+                if date_limit_reached:
+                    break
+                url = f"https://efe.com/{topic}/page/{page}/"
+                homeweb = self.get_web(url)
+                webpage = homeweb.find_all("div", {"class": "inside-article"})
+
+                for article in webpage:
+                    article_data = {'headline': self.extract_headline(article), 'link': self.extract_news_link(article),
+                                    'date': self.extract_date(article), 'topic': topic, 'author': ""}
+                    tags = self.extract_tags(article)
+                    articleweb = self.get_web(article_data.get('link'))
+                    article_data['text'] = self.extract_content(articleweb)
+                    news_id = self.add_article("EFE", article_data)
+                    self.add_tags(tags, news_id)
+
+                    if article_data.get('date') < limit_date:
+                        date_limit_reached = True
+                        break