from abc import ABC, abstractmethod
import bs4
import requests
import html2text
import pandas as pd
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from datetime import datetime
import numpy as np
import random
import warnings
import psycopg2
import re
import psycopg2.extras
import uuid
import os
warnings.filterwarnings("ignore")
class WebScraper(ABC):
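    """Abstract base class for site-specific news scrapers.

    Subclasses implement iterate_articles() and the extract_* helpers; this base class
    handles the PostgreSQL connection, user-agent rotation, uniqueness checks when
    inserting into news_item/source/tags, and the final clean-up pass.
    """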
def __init__(self):
self.conn = None
self.agentlist = None
self.cursor = None
def extractData(self, limit_date: pd.Timestamp = pd.Timestamp("1970-01-01")):
self.connect_database()
self.create_agentlist()
self.iterate_articles(limit_date)
self.clean_data()
def connect_database(self):
self.conn = psycopg2.connect(dbname=os.environ["POSTGRES_DB"], user=os.environ["POSTGRES_USER"], password=os.environ["POSTGRES_PASSWORD"],
host=os.environ["POSTGRES_HOST"], port=os.environ["POSTGRES_PORT"])
self.cursor = self.conn.cursor()
    def create_agentlist(self):
        # Load the user-agent pool used to randomize request headers
        self.agentlist = []
        with open('/Scraping/scrapers/useragents.txt') as fp:
            for line in fp:
                line = line.strip()  # Strip newlines and surrounding whitespace
                if line:  # Skip blank lines instead of popping them afterwards
                    self.agentlist.append(line)
        self.agentlist = list(dict.fromkeys(self.agentlist))  # Drop duplicate agents, if any
    def get_web(self, url: str) -> bs4.BeautifulSoup:
        # Fetch a page with a randomly chosen user agent and return its parsed HTML
        header = {'User-Agent': random.choice(self.agentlist)}
        try:
            page = requests.get(url, headers=header)
            soup = BeautifulSoup(page.content, "html.parser")
        except requests.RequestException:
            print("URL not found")
            soup = BeautifulSoup("", "html.parser")
        return soup
def add_article(self, source: str, data: dict) -> str:
headline = data.get('headline')
author = data.get('author')
date = data.get('date')
topic = data.get('topic')
link = data.get('link')
text = data.get('text')
        # Assign a uuid4 and keep regenerating until it is unique in the table
        news_id = str(uuid.uuid4())
        self.cursor.execute("SELECT EXISTS (SELECT 1 FROM news_item WHERE news_id= %s)", (news_id,))
        while self.cursor.fetchone()[0]:
            news_id = str(uuid.uuid4())
            self.cursor.execute("SELECT EXISTS (SELECT 1 FROM news_item WHERE news_id= %s)", (news_id,))
        sql = (news_id, headline, author, date, pd.Timestamp.now(), topic, link, text)
        try:
            # Check that the article is not already stored (same publication date and topic)
            self.cursor.execute(
                "SELECT EXISTS (SELECT 1 FROM news_item WHERE publication_date=%s AND topic LIKE %s)", (date, topic))
if not self.cursor.fetchone()[0]:
self.cursor.execute(
"INSERT INTO news_item (news_id,headline,author,publication_date,collection_date,topic,link,content) VALUES (%s,%s,%s,%s,%s,%s,%s,%s);",
sql)
self.conn.commit()
sql2 = (source, news_id)
try:
self.cursor.execute("INSERT INTO source (name,news_id) VALUES (%s,%s);", sql2)
self.conn.commit()
except Exception as e:
print(e)
except Exception as e:
print(e)
return news_id
def add_tags(self, tags: list, news_id: str):
self.cursor.execute("SELECT EXISTS (SELECT 1 FROM news_item WHERE news_id= %s)", (news_id,))
if self.cursor.fetchone()[0]:
            # Insert the article's tags, deduplicated
tag_set = set(tags)
for tag in tag_set:
sql3 = (tag, news_id)
try:
self.cursor.execute("INSERT INTO tags (tag,news_id) VALUES (%s,%s);", sql3)
self.conn.commit()
except Exception as e:
print(e)
    def extract_news_link(self, soup: bs4.BeautifulSoup) -> str:
        # Return the href of the first anchor in the fragment, or "" if none is present
        try:
            link = soup.find('a')['href']
        except (TypeError, KeyError):
            print("article link not found")
            link = ""
        return link
def clean_data(self):
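        # Rows stored with the placeholder date '2100-01-01' fall back to the collection date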
self.cursor.execute("UPDATE news_item SET publication_date=collection_date WHERE publication_date='2100-01-01';")
self.conn.commit()
self.cursor.execute("WITH todelete AS (SELECT news_id FROM news_item WHERE content='' OR headline='' OR content='\n\n') DELETE FROM source WHERE news_id IN (SELECT * FROM todelete);")
self.conn.commit()
self.cursor.execute("WITH todelete AS (SELECT news_id FROM news_item WHERE content='' OR headline='' OR content='\n\n') DELETE FROM tags WHERE news_id IN (SELECT * FROM todelete);")
self.conn.commit()
self.cursor.execute("DELETE FROM news_item WHERE content='' OR headline='' OR content='\n\n';")
self.conn.commit()
self.conn.close()
@abstractmethod
def iterate_articles(self, limit_date: pd.Timestamp = pd.Timestamp("1970-01-01")):
pass
@abstractmethod
def extract_headline(self, soup: bs4.BeautifulSoup) -> str:
pass
@abstractmethod
def extract_date(self, soup: bs4.BeautifulSoup) -> pd.Timestamp:
pass
@abstractmethod
def extract_content(self, soup: bs4.BeautifulSoup) -> str:
pass
@abstractmethod
def extract_author(self, soup: bs4.BeautifulSoup) -> str:
pass
@abstractmethod
def extract_tags(self, soup: bs4.BeautifulSoup) -> list:
pass
@abstractmethod
def extract_maxpag(self, soup: bs4.BeautifulSoup) -> int:
pass
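

# --- Illustrative only: a minimal sketch of a concrete subclass, not part of the original
# --- scraper. It assumes a hypothetical site at https://example.com/news whose listing uses
# --- <article> teasers and whose article pages expose an <h1> headline, a <time datetime>
# --- stamp, an .author element, <p> body paragraphs and .tag labels. None of these selectors,
# --- URLs or the "example"/"general" values come from the original code; they only show how
# --- the abstract methods are meant to fit together.
class ExampleScraper(WebScraper):
    def iterate_articles(self, limit_date: pd.Timestamp = pd.Timestamp("1970-01-01")):
        listing = self.get_web("https://example.com/news")  # hypothetical listing page
        for teaser in listing.find_all("article"):
            link = self.extract_news_link(teaser)
            article = self.get_web(link)
            date = self.extract_date(article)
            if date < limit_date:  # stop once articles are older than the limit
                break
            news_id = self.add_article("example", {
                'headline': self.extract_headline(article),
                'author': self.extract_author(article),
                'date': date,
                'topic': "general",
                'link': link,
                'text': self.extract_content(article),
            })
            self.add_tags(self.extract_tags(article), news_id)

    def extract_headline(self, soup: bs4.BeautifulSoup) -> str:
        tag = soup.find("h1")
        return tag.get_text(strip=True) if tag else ""

    def extract_date(self, soup: bs4.BeautifulSoup) -> pd.Timestamp:
        tag = soup.find("time")
        try:
            return pd.Timestamp(tag["datetime"])
        except (TypeError, KeyError, ValueError):
            return pd.Timestamp("2100-01-01")  # sentinel later replaced by clean_data

    def extract_content(self, soup: bs4.BeautifulSoup) -> str:
        return "\n\n".join(p.get_text(strip=True) for p in soup.find_all("p"))

    def extract_author(self, soup: bs4.BeautifulSoup) -> str:
        tag = soup.find(class_="author")
        return tag.get_text(strip=True) if tag else ""

    def extract_tags(self, soup: bs4.BeautifulSoup) -> list:
        return [t.get_text(strip=True) for t in soup.find_all(class_="tag")]

    def extract_maxpag(self, soup: bs4.BeautifulSoup) -> int:
        tag = soup.find(class_="pagination")
        return int(tag.get_text(strip=True)) if tag else 1

# A subclass like the one above would then be driven through the template method, e.g.:
#     ExampleScraper().extractData(pd.Timestamp("2023-01-01"))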