Skip to content
Snippets Groups Projects
Commit 171fcbe5 authored by javigom's avatar javigom
Browse files

cleanse

parent 90544f97
No related branches found
No related tags found
No related merge requests found
import org.apache.spark.sql.types.{IntegerType, DoubleType, StringType, StructField, StructType, FloatType}
import org.apache.spark.sql.{DataFrame, SparkSession, Row}
import org.apache.spark.sql.functions.{min, max}
import org.apache.spark.sql.functions.col
// Location of the input data, relative to the directory the shell is started from.
val path: String = "./../datos/"
// Input file names; judging by the names: GDP ("pib"), military and education datasets.
val pib: String = "pib_datos.csv"
val mil: String = "militar_datos.csv"
val edu: String = "educacion_datos.csv"
// Schema shared by the three input files: four nullable string columns that
// describe the row, followed by one nullable Double column per year from
// 1960 through 2021 (inclusive), in ascending order.
val schema = {
  val descriptiveColumns = Seq("Country Name", "Country Code", "Indicator Name", "Indicator Code")
  val descriptiveFields = descriptiveColumns.map(name => StructField(name, StringType, nullable = true))
  val yearFields = (1960 to 2021).map(year => StructField(year.toString, DoubleType, nullable = true))
  StructType(descriptiveFields ++ yearFields)
}
// DataFrames
// Every input file is read the same way: CSV with a header row, comma
// delimiter and the explicit `schema` defined above. The body is wrapped in
// braces so the method chain is parsed as one unit when loaded into the shell.
def readIndicatorCsv(fileName: String): DataFrame = {
  spark.read
    .format("csv")
    .option("header", true)
    .option("delimiter", ",")
    .schema(schema)
    .load(path + fileName)
}

// Declared as `var` because the cleansing steps below reassign them.
var df_edu = readIndicatorCsv(edu)
var df_mil = readIndicatorCsv(mil)
var df_pib = readIndicatorCsv(pib)
// Remove unneeded columns: the indicator name and indicator code columns are
// dropped from each of the three DataFrames.
val indicatorColumns = Seq("Indicator Name", "Indicator Code")
df_edu = df_edu.drop(indicatorColumns: _*)
df_mil = df_mil.drop(indicatorColumns: _*)
df_pib = df_pib.drop(indicatorColumns: _*)
// Instances: print the total number of rows in each DataFrame (edu, mil, pib, in that order).
Seq("edu" -> df_edu, "mil" -> df_mil, "pib" -> df_pib).foreach { case (label, frame) =>
  println(s"Registros totales $label: ${frame.count()}")
}
// Empty records: for each DataFrame, print how many rows contain at least one
// null value, computed as (total rows) - (rows left after `na.drop()`).
//
// Two fixes relative to the previous version:
//  * `df.drop()` with no arguments removes no columns and no rows, so the
//    difference was always 0; `df.na.drop()` removes rows that contain a null.
//  * each `println(` was missing its closing parenthesis.
// NOTE(review): if only rows whose every column is null should be counted,
// use `na.drop("all")` instead — confirm the intended definition.
println("Registros vacíos edu: " + (df_edu.count() - df_edu.na.drop().count()))
println("Registros vacíos mil: " + (df_mil.count() - df_mil.na.drop().count()))
println("Registros vacíos pib: " + (df_pib.count() - df_pib.na.drop().count()))
// Remove rows without a defined country: keep only rows whose
// "Country Name" compares as different from the empty string.
val hasCountryName = col("Country Name") =!= ""
df_edu = df_edu.filter(hasCountryName)
df_mil = df_mil.filter(hasCountryName)
df_pib = df_pib.filter(hasCountryName)
// Remove nulls: replace missing values in every Double column with the
// sentinel -1.0 in all three DataFrames.
//
// Fixes relative to the previous version:
//  * the `for` block was never closed (missing `}`);
//  * `col === ""` on a Double column never matches a null, so no value was
//    actually replaced;
//  * the string literal "-1" mixed a string into a Double column expression.
// `na.fill` keeps the columns as Double; for numeric columns it replaces both
// null and NaN values.
// The column list is taken from df_edu, as before; the three DataFrames are
// read with the same schema, so it applies to all of them.
val doubleColumns = df_edu.dtypes.collect { case (name, "DoubleType") => name }
df_edu = df_edu.na.fill(-1.0, doubleColumns)
df_mil = df_mil.na.fill(-1.0, doubleColumns)
df_pib = df_pib.na.fill(-1.0, doubleColumns)
// CSV: write each cleansed DataFrame under `path` using the default CSV
// writer settings, in the order edu, mil, pib.
Seq(df_edu -> "edu.csv", df_mil -> "mil.csv", df_pib -> "pib.csv").foreach { case (frame, target) =>
  frame.write.csv(path + target)
}
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment