from abc import ABC, abstractmethod
import bs4
import requests
import html2text
import pandas as pd
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from datetime import datetime
import numpy as np
import random
import warnings
import psycopg2
import re
import psycopg2.extras
import uuid
import os
warnings.filterwarnings("ignore")
class WebScraper(ABC):
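    """Abstract base class for site-specific news scrapers.

    Subclasses implement iterate_articles() and the extract_* helpers; this base class
    handles the PostgreSQL connection, user-agent rotation, uniqueness checks when
    inserting into news_item/source/tags, and the final clean-up pass.
    """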
def __init__(self):
self.conn = None
self.agentlist = None
self.cursor = None
def extractData(self, limit_date: pd.Timestamp = pd.Timestamp("1970-01-01")):
self.connect_database()
self.create_agentlist()
self.iterate_articles(limit_date)
self.clean_data()
def connect_database(self):
self.conn = psycopg2.connect(dbname=os.environ["POSTGRES_DB"], user=os.environ["POSTGRES_USER"], password=os.environ["POSTGRES_PASSWORD"],
host=os.environ["POSTGRES_HOST"], port=os.environ["POSTGRES_PORT"])
self.cursor = self.conn.cursor()
    def create_agentlist(self):
        # Load the user-agent pool used to randomize request headers
        self.agentlist = []
        with open('/Scraping/scrapers/useragents.txt') as fp:
            for line in fp:
                line = line.strip()  # Strip newlines and surrounding whitespace
                if line:  # Skip blank lines instead of popping them afterwards
                    self.agentlist.append(line)
        self.agentlist = list(dict.fromkeys(self.agentlist))  # Drop duplicate agents, if any
    def get_web(self, url: str) -> bs4.BeautifulSoup:
        # Fetch a page with a randomly chosen user agent and return its parsed HTML
        header = {'User-Agent': random.choice(self.agentlist)}
        try:
            page = requests.get(url, headers=header)
            soup = BeautifulSoup(page.content, "html.parser")
        except requests.RequestException:
            print("URL not found")
            soup = BeautifulSoup("", "html.parser")
        return soup
def add_article(self, source: str, data: dict) -> str:
headline = data.get('headline')
author = data.get('author')
date = data.get('date')
topic = data.get('topic')
link = data.get('link')
text = data.get('text')
        # Assign a uuid4 and keep regenerating until it is unique in the table
        news_id = str(uuid.uuid4())
        self.cursor.execute("SELECT EXISTS (SELECT 1 FROM news_item WHERE news_id= %s)", (news_id,))
        while self.cursor.fetchone()[0]:
            news_id = str(uuid.uuid4())
            self.cursor.execute("SELECT EXISTS (SELECT 1 FROM news_item WHERE news_id= %s)", (news_id,))
        sql = (news_id, headline, author, date, pd.Timestamp.now(), topic, link, text)
        try:
            # Check that the article is not already stored (same publication date and topic)
            self.cursor.execute(
                "SELECT EXISTS (SELECT 1 FROM news_item WHERE publication_date=%s AND topic LIKE %s)", (date, topic))
if not self.cursor.fetchone()[0]:
self.cursor.execute(
"INSERT INTO news_item (news_id,headline,author,publication_date,collection_date,topic,link,content) VALUES (%s,%s,%s,%s,%s,%s,%s,%s);",
sql)
self.conn.commit()
sql2 = (source, news_id)
try:
self.cursor.execute("INSERT INTO source (name,news_id) VALUES (%s,%s);", sql2)
self.conn.commit()
except Exception as e:
print(e)
except Exception as e:
print(e)
return news_id
def add_tags(self, tags: list, news_id: str):
self.cursor.execute("SELECT EXISTS (SELECT 1 FROM news_item WHERE news_id= %s)", (news_id,))
if self.cursor.fetchone()[0]:
            # Insert the article's tags, deduplicated
tag_set = set(tags)
for tag in tag_set:
sql3 = (tag, news_id)
try:
self.cursor.execute("INSERT INTO tags (tag,news_id) VALUES (%s,%s);", sql3)
self.conn.commit()
except Exception as e:
print(e)
    def extract_news_link(self, soup: bs4.BeautifulSoup) -> str:
        # Return the href of the first anchor in the fragment, or "" if none is present
        try:
            link = soup.find('a')['href']
        except (TypeError, KeyError):
            print("article link not found")
            link = ""
        return link
def clean_data(self):
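        # Rows stored with the placeholder date '2100-01-01' fall back to the collection date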
self.cursor.execute("UPDATE news_item SET publication_date=collection_date WHERE publication_date='2100-01-01';")
self.conn.commit()
self.cursor.execute("WITH todelete AS (SELECT news_id FROM news_item WHERE content='' OR headline='' OR content='\n\n') DELETE FROM source WHERE news_id IN (SELECT * FROM todelete);")
self.conn.commit()
self.cursor.execute("WITH todelete AS (SELECT news_id FROM news_item WHERE content='' OR headline='' OR content='\n\n') DELETE FROM tags WHERE news_id IN (SELECT * FROM todelete);")
self.conn.commit()
self.cursor.execute("DELETE FROM news_item WHERE content='' OR headline='' OR content='\n\n';")
self.conn.commit()
self.conn.close()
@abstractmethod
def iterate_articles(self, limit_date: pd.Timestamp = pd.Timestamp("1970-01-01")):
pass
@abstractmethod
def extract_headline(self, soup: bs4.BeautifulSoup) -> str:
pass
@abstractmethod
def extract_date(self, soup: bs4.BeautifulSoup) -> pd.Timestamp:
pass
@abstractmethod
def extract_content(self, soup: bs4.BeautifulSoup) -> str:
pass
@abstractmethod
def extract_author(self, soup: bs4.BeautifulSoup) -> str:
pass
@abstractmethod
def extract_tags(self, soup: bs4.BeautifulSoup) -> list:
pass
@abstractmethod
def extract_maxpag(self, soup: bs4.BeautifulSoup) -> int:
pass
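

# --- Illustrative only: a minimal sketch of a concrete subclass, not part of the original
# --- scraper. It assumes a hypothetical site at https://example.com/news whose listing uses
# --- <article> teasers and whose article pages expose an <h1> headline, a <time datetime>
# --- stamp, an .author element, <p> body paragraphs and .tag labels. None of these selectors,
# --- URLs or the "example"/"general" values come from the original code; they only show how
# --- the abstract methods are meant to fit together.
class ExampleScraper(WebScraper):
    def iterate_articles(self, limit_date: pd.Timestamp = pd.Timestamp("1970-01-01")):
        listing = self.get_web("https://example.com/news")  # hypothetical listing page
        for teaser in listing.find_all("article"):
            link = self.extract_news_link(teaser)
            article = self.get_web(link)
            date = self.extract_date(article)
            if date < limit_date:  # stop once articles are older than the limit
                break
            news_id = self.add_article("example", {
                'headline': self.extract_headline(article),
                'author': self.extract_author(article),
                'date': date,
                'topic': "general",
                'link': link,
                'text': self.extract_content(article),
            })
            self.add_tags(self.extract_tags(article), news_id)

    def extract_headline(self, soup: bs4.BeautifulSoup) -> str:
        tag = soup.find("h1")
        return tag.get_text(strip=True) if tag else ""

    def extract_date(self, soup: bs4.BeautifulSoup) -> pd.Timestamp:
        tag = soup.find("time")
        try:
            return pd.Timestamp(tag["datetime"])
        except (TypeError, KeyError, ValueError):
            return pd.Timestamp("2100-01-01")  # sentinel later replaced by clean_data

    def extract_content(self, soup: bs4.BeautifulSoup) -> str:
        return "\n\n".join(p.get_text(strip=True) for p in soup.find_all("p"))

    def extract_author(self, soup: bs4.BeautifulSoup) -> str:
        tag = soup.find(class_="author")
        return tag.get_text(strip=True) if tag else ""

    def extract_tags(self, soup: bs4.BeautifulSoup) -> list:
        return [t.get_text(strip=True) for t in soup.find_all(class_="tag")]

    def extract_maxpag(self, soup: bs4.BeautifulSoup) -> int:
        tag = soup.find(class_="pagination")
        return int(tag.get_text(strip=True)) if tag else 1

# A subclass like the one above would then be driven through the template method, e.g.:
#     ExampleScraper().extractData(pd.Timestamp("2023-01-01"))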