Skip to content
Snippets Groups Projects
Commit bd5a43a0 authored by Mario Garrido Tapias's avatar Mario Garrido Tapias
Browse files

Data loading and univariate analysis separated in different files

parents
Branches
No related tags found
No related merge requests found
# Make the project root the working directory so that the relative
# "Datos/Kaggle/..." paths passed to read.csv2() further down resolve.
# NOTE(review): absolute, machine-specific path; the script only runs as-is
# where this directory exists.
setwd("/home/mariogt/TFGs/Estadistica/")
#####################################
# FUNCIONES #
#####################################
# Replaces mis-decoded character sequences in player and squad names with the
# intended accented letters.
#
# Args:
#   datos: object with `player` and `squad` components (the call sites in this
#          script pass data frames).
# Returns:
#   `datos` with every substitution below applied to `player`, and only the
#   "é" substitution applied to `squad`.
#
# Every pattern is handed to gsub() as a regular expression (fixed = TRUE is
# not set).
# NOTE(review): in several calls the pattern and the replacement are the same
# string as rendered here (É, Á, Ø, ć, š, á, â, ñ), which makes those calls
# no-ops. Confirm against the raw bytes of the original file.
aceptaTildes <- function(datos) {
datos$player <- gsub("É", "É", datos$player)
datos$player <- gsub("Á", "Á", datos$player)
datos$player <- gsub("Ă“", "Ó", datos$player)
datos$player <- gsub("Ø", "Ø", datos$player)
datos$player <- gsub("ć", "ć", datos$player)
datos$player <- gsub("š", "š", datos$player)
datos$player <- gsub("Ă©", "é", datos$player)
datos$player <- gsub("Ă«", "ë", datos$player)
datos$player <- gsub("á", "á", datos$player)
datos$player <- gsub("ĂŁ", "ã", datos$player)
datos$player <- gsub("â", "â", datos$player)
datos$player <- gsub("Ăł", "ó", datos$player)
datos$player <- gsub("Ăş", "ú", datos$player)
datos$player <- gsub("ñ", "ñ", datos$player)
datos$player <- gsub("Ă‘", "Ñ", datos$player)
datos$player <- gsub("ĂŻ", "ï", datos$player)
# This pattern starts with the same "Ă" as the two-character patterns above,
# so it has to run after them: it rewrites whatever "Ă" is still left.
datos$player <- gsub("Ă", "í", datos$player)
# Squad names only get the "é" repair.
datos$squad <- gsub("Ă©", "é", datos$squad)
return(datos)
}
######################################
# LECTURA de DATOS #
######################################
# Read the three season exports. read.csv2() is used with "." as the decimal
# mark and the first line as header.
allLeagues.1718 <- read.csv2(
  "Datos/Kaggle/transfermarkt_fbref_201718.csv",
  header = TRUE, dec = "."
)
allLeagues.1819 <- read.csv2(
  "Datos/Kaggle/transfermarkt_fbref_201819.csv",
  header = TRUE, dec = "."
)
allLeagues.1920 <- read.csv2(
  "Datos/Kaggle/transfermarkt_fbref_201920.csv",
  header = TRUE, dec = "."
)

# Season 17-18: keep the La Liga rows, show the dimensions, repair the names.
laLigaPlayersPos <- which("La Liga" == allLeagues.1718$league)
laLigaPlayers.1718 <- allLeagues.1718[laLigaPlayersPos, ]
dim(laLigaPlayers.1718)
laLigaPlayers.1718 <- aceptaTildes(laLigaPlayers.1718)

# Season 18-19: same steps.
laLigaPlayersPos <- which("La Liga" == allLeagues.1819$league)
laLigaPlayers.1819 <- allLeagues.1819[laLigaPlayersPos, ]
dim(laLigaPlayers.1819)
laLigaPlayers.1819 <- aceptaTildes(laLigaPlayers.1819)

# Season 19-20: same steps.
laLigaPlayersPos <- which("La Liga" == allLeagues.1920$league)
laLigaPlayers.1920 <- allLeagues.1920[laLigaPlayersPos, ]
dim(laLigaPlayers.1920)
laLigaPlayers.1920 <- aceptaTildes(laLigaPlayers.1920)

# The three filtered seasons, in chronological order, for the loops below.
allLaLigaPlayer <- list(
  laLigaPlayers.1718,
  laLigaPlayers.1819,
  laLigaPlayers.1920
)
##########################################################
# Jugadores que CAMBIARON de club en una misma temporada #
##########################################################
# Build a 6-column character matrix with one row per (player, season) for the
# players that appear on more than one row of a season's data:
#   Jugador, Fila 1, Fila 2, Equipo 1º, Equipo 2º, Temporada
# "Fila 1"/"Fila 2" are the first two row positions of the player inside that
# season's data frame, and "Equipo 1º"/"Equipo 2º" the squads on those rows.
#
# Starting from an empty 0 x 6 matrix (instead of NULL) keeps the colnames<-
# call valid even when no player is repeated in any season.
rowsPIn2CGlobal <- matrix(character(0), nrow = 0, ncol = 6)
for (i in seq_along(allLaLigaPlayer)) {
  seasonPlayers <- allLaLigaPlayer[[i]]
  season <- switch(i, "17-18", "18-19", "19-20")
  repeatedPlayers <- which(duplicated(seasonPlayers$player))
  # unique(): a player with three or more rows is flagged by duplicated() more
  # than once, but must be listed only once.
  playersIn2C <- unique(seasonPlayers$player[repeatedPlayers])
  playersIn2C <- playersIn2C[!is.na(playersIn2C)]
  for (name in playersIn2C) {
    # Only the first two occurrences are recorded so that every row of the
    # result has exactly six fields.
    rows <- which(seasonPlayers$player == name)[1:2]
    teams <- seasonPlayers$squad[rows]
    rowsPIn2CGlobal <- rbind(rowsPIn2CGlobal, c(name, rows, teams, season))
  }
}
colnames(rowsPIn2CGlobal) <- c("Jugador", "Fila 1", "Fila 2", "Equipo 1º", "Equipo 2º", "Temporada")
rowsPIn2CGlobal
####################################################
# Fusion de las muestras de jugadores en 2 equipos #
####################################################
# Collapses the rows of players who appear more than once in a season into a
# single row per player.
#
# Args:
#   seasonData:   data frame of one season; must have a `player` column.
#   transfers:    character matrix as built in rowsPIn2CGlobal (column 1 =
#                 player name, column 6 = season label). NULL or zero rows
#                 means there is nothing to merge.
#   season:       season label to process, e.g. "17-18".
#   firstStatCol: index of the first column that is summed; the columns before
#                 it are copied from the player's first row. Defaults to 13,
#                 the value previously hard-coded in this script.
# Returns:
#   `seasonData` without the original rows of the affected players and with
#   one merged row per affected player appended at the end.
#
# NOTE(review): every column from `firstStatCol` onwards is coerced with
# as.numeric() and summed, including any rate or percentage columns; confirm
# that summing is the intended aggregation for all of them.
mergeTwoClubRows <- function(seasonData, transfers, season, firstStatCol = 13) {
  if (is.null(transfers) || nrow(transfers) == 0) {
    return(seasonData)
  }
  inSeason <- which(transfers[, 6] == season)
  if (length(inSeason) == 0) {
    return(seasonData)
  }
  stopifnot(ncol(seasonData) >= firstStatCol)

  movedNames <- unique(transfers[inSeason, 1])
  isMoved <- seasonData$player %in% movedNames
  moved <- seasonData[isMoved, , drop = FALSE]
  if (nrow(moved) == 0) {
    return(seasonData)
  }

  # The first row of each player supplies the non-summed columns.
  merged <- moved[!duplicated(moved$player), , drop = FALSE]
  for (col in firstStatCol:ncol(seasonData)) {
    totals <- tapply(as.numeric(moved[[col]]), moved$player, sum)
    merged[[col]] <- as.numeric(unname(totals[merged$player]))
  }

  # Logical filtering instead of df[-which(...), ]: a negative index built
  # from an empty which() would drop every row.
  kept <- seasonData[!isMoved, , drop = FALSE]
  rbind(kept, merged)
}

laLigaPlayers.1718 <- mergeTwoClubRows(laLigaPlayers.1718, rowsPIn2CGlobal, "17-18")
laLigaPlayers.1819 <- mergeTwoClubRows(laLigaPlayers.1819, rowsPIn2CGlobal, "18-19")
laLigaPlayers.1920 <- mergeTwoClubRows(laLigaPlayers.1920, rowsPIn2CGlobal, "19-20")
##############
# INNER JOIN #
##############
library(dplyr)
# Build one wide table with the players present in all three seasons.
# Step 1: join 17-18 with 18-19 on `player`; columns that exist in both inputs
# are disambiguated with the ".1718" / ".1819" suffixes.
laLigaPlayers.1719 <- inner_join(laLigaPlayers.1718, laLigaPlayers.1819, by = c("player" = "player"), suffix = c(".1718", ".1819"))
# Step 2: 19-20 is joined with itself, with ".1920" given as the suffix for
# both sides.
# NOTE(review): presumably a trick to obtain ".1920"-suffixed column names.
# How dplyr resolves two identical suffixes decides the resulting names, and
# later code reads both suffixed columns (e.g. "passes_pct.1920") and
# unsuffixed ones (`train$X`, `laLigaPlayers$foot`); verify the column names
# with the installed dplyr version before changing this line.
auxLaLigaPlayers.1920 <- inner_join(laLigaPlayers.1920, laLigaPlayers.1920, by = c("player" = "player"), suffix = c(".1920", ".1920"))
# Step 3: keep only the players that also appear in 19-20.
laLigaPlayers <- inner_join(laLigaPlayers.1719, auxLaLigaPlayers.1920, by = c("player" = "player"))
# Veamos que jugadores jugaron en 2 CLUBES en algunas de las 3 temporadas contempladas
# Sanity check on the joined table: list the players that still occupy more
# than one row, together with the row positions they occupy.
repeatedPlayers <- which(duplicated(laLigaPlayers$player))
playersIn2Clubs <- laLigaPlayers$player[repeatedPlayers]
# One entry per distinct name. The names carried by the vector become the row
# names of the result, and do.call(rbind, list()) yields NULL when no player
# is repeated, instead of failing in rownames<- on a NULL object.
dupNames <- unique(playersIn2Clubs)
rowsPIn2C <- do.call(
  rbind,
  lapply(
    setNames(dupNames, dupNames),
    function(playerName) which(laLigaPlayers$player == playerName)
  )
)
rowsPIn2C
############################# ####################
# CONJUNTO de ENTRENAMIENTO # # CONJUNTO de TEST #
############################# ####################
# Reproducible random partition: each row is independently labelled 1 (with
# probability 0.8) or 2 (with probability 0.2).
set.seed(5682)
ind <- sample(
  x = 2,
  size = nrow(laLigaPlayers),
  replace = TRUE,
  prob = c(0.8, 0.2)
)
# Rows labelled 1 form the training set, rows labelled 2 the test set.
train <- laLigaPlayers[which(ind == 1), ]
test <- laLigaPlayers[which(ind == 2), ]
\ No newline at end of file
# Make the project root the working directory for the analysis script.
# NOTE(review): absolute, machine-specific path. No file is read or written
# through a relative path in the part of this script visible here; confirm
# the call is still needed.
setwd("/home/mariogt/TFGs/Estadistica/")
#####################################
# FUNCIONES #
#####################################
# Draws three bar charts side by side (one per season: .1718, .1819, .1920)
# with the number of rows whose position string contains GK, DF, MF and FW.
#
# Args:
#   data:     data frame (or tibble) with one column per season whose name
#             contains paste0(variable, ".1718"), ".1819" and ".1920".
#   variable: column-name stem, matched case-insensitively as a literal
#             substring (same rule as dplyr::contains()).
# Returns:
#   NULL, invisibly. Called for its side effects: the plots and the matching
#   column names written with cat().
#
# All three charts share the same y-axis limit so they can be compared. The
# graphics layout in force before the call is restored on exit.
plotFor3Seasons <- function(data, variable) {
  oldPar <- par(mfrow = c(1, 3))
  on.exit(par(oldPar), add = TRUE)

  seasons <- c(".1718", ".1819", ".1920")
  codes <- c("GK", "DF", "MF", "FW")
  labels <- c("Portero", "Defensa", "Mediocentro", "Delantero")

  dfs <- vector("list", length(seasons))
  for (i in seq_along(seasons)) {
    target <- paste0(variable, seasons[i])
    hits <- grep(tolower(target), tolower(names(data)), fixed = TRUE)
    if (length(hits) == 0) {
      stop("no column name contains '", target, "'", call. = FALSE)
    }
    cat(names(data)[hits])
    # Only the first matching column is used; a row can count towards several
    # positions when its string contains more than one code.
    values <- as.character(data[[hits[1]]])
    counts <- vapply(
      codes,
      function(code) sum(grepl(code, values, fixed = TRUE)),
      integer(1)
    )
    dfs[[i]] <- data.frame(position = labels, numberOf = unname(counts))
  }

  maxPInPos <- max(0, vapply(dfs, function(df) max(df$numberOf), integer(1)))

  for (i in seq_along(seasons)) {
    barplot(
      height = dfs[[i]]$numberOf,
      names.arg = dfs[[i]]$position,
      col = rgb(0.8, 0.1, 0.1, 0.6),
      xlab = "posiciones",
      ylab = "cantidad",
      # paste0() builds one title string; paste(c(...), sep = "") returned the
      # three pieces as separate elements.
      main = paste0("Distribución posiciones ", seasons[i], "(TRAIN)"),
      ylim = c(0, maxPInPos)
    )
  }
  invisible(NULL)
}
# Univariate summary of one per-season variable: a box plot per season (A)
# stacked over a line plot of the per-season mean (B).
#
# Args:
#   data:     wide data frame with one column per season whose name contains
#             paste0(variable, ".").
#   variable: column-name stem, e.g. "passes_intercepted".
#   ids:      identifier column(s), bound in front of the selected columns
#             with cbind(); must be row-compatible with `data`.
#   varLabel: short label inserted in both plot titles. Defaults to "GCA",
#             the text previously hard-coded.
#   yLabel:   y-axis label of both plots. Defaults to the text previously
#             hard-coded.
# Returns:
#   The arranged figure produced by ggarrange().
#
# The season columns are reshaped by name rather than by position, so the
# result does not depend on how many columns `ids` contributes.
univariantAnalysis <- function(data, variable, ids,
                               varLabel = "GCA",
                               yLabel = "Acciones de creación de gol") {
  info <- data %>% select(contains(paste0(variable, ".")))
  valueCols <- names(info)
  if (length(valueCols) == 0) {
    stop("no column name contains '", variable, ".'", call. = FALSE)
  }
  info <- cbind(ids, info)

  # WIDE to LONG: one row per (id, season); values land in column "variable".
  dl <- gather(info, "season", "variable", all_of(valueCols), factor_key = TRUE)
  dl$variable <- as.numeric(dl$variable)

  bp <- ggplot(dl, aes(x = factor(season), y = variable)) +
    geom_boxplot(alpha = 0.5, fill = "darkblue", color = "black", outlier.color = "red") +
    labs(
      title = paste0("Boxplot ", varLabel, " por temporada"),
      x = "Temporada",
      y = yLabel
    )

  g <- group_by(dl, season)
  g2 <- summarise(g, media = round(mean(variable), 1))
  gp <- ggplot(g2, aes(x = season, media, group = 1)) +
    geom_point(alpha = 0.5, color = "blue", size = 3) +
    geom_line(color = "red", size = 1) +
    labs(
      title = paste0("Promedio de ", varLabel, " por temporada"),
      x = "Temporada",
      y = yLabel
    )

  figure <- ggarrange(bp, gp,
    labels = c("A", "B"),
    ncol = 1, nrow = 2
  )
  figure
}
#############
# LIBRERIAS #
#############
library(tidyverse)
library(reshape)
library(tidyr) # gather() -> WIDE data to LONG data
library(ggplot2) # graphics
library(ggpubr) # multiple ggplots
##########################################
# ANALISIS UNIVARIANTE #
##########################################
# Get players identifiers
# One identifier per training row, kept as a single-column data frame.
# It is bound with cbind() in front of the selected season columns, and the
# code that follows addresses the reshaped value by position (columns 2:4
# before reshaping, column 3 after). That only holds when `ids` contributes
# exactly one column, so it must not be transposed into a 1 x n matrix.
ids <- data.frame(train$X)
########
# Edad #
########
##########
# Altura #
##########
################
# Pierna buena #
################
##################################
# Goles y asistencias por 90 min #
##################################
######
# xG #
######
########
# npxG #
########
######
# xA #
######
##################################
# Porcentaje de tiros a porteria #
##################################
#######################
# Porcentaje de pases #
#######################
# Pass completion percentage: one box plot per season.
passes_pct <- train %>% select(contains("passes_pct."))
passes_pct <- cbind(ids, passes_pct)
# WIDE to LONG: the season name goes to `season`, the value to `passes`.
data_long <- gather(passes_pct, season, passes, passes_pct.1718:passes_pct.1920, convert = TRUE, factor_key = TRUE)
data_long
# The value column is addressed by name; its position depends on how many
# columns `ids` contributes.
data_long[["passes"]] <- as.numeric(data_long[["passes"]])
ggplot(data_long, aes(x = factor(season), y = passes)) +
  geom_boxplot(alpha = 0.5, fill = "darkblue", color = "black", outlier.color = "red") +
  labs(
    title = "Boxplot porcentaje de pases por temporada", x = "Temporada",
    y = "Porcentaje de pases exitosos"
  )
##################################
# Distancia total mediante pases #
##################################
#############################
# Pases que derivan en tiro #
#############################
###################################
# Cambios de orientación de juego #
###################################
###################
# SCA passes dead #
###################
# Values of the sca_passes_dead.* columns: one box plot per season.
sca_passes_dead <- train %>% select(contains("sca_passes_dead."))
sca_passes_dead <- cbind(ids, sca_passes_dead)
# WIDE to LONG: the season name goes to `season`, the value to
# `sca_passes_dead`.
data_long <- gather(sca_passes_dead, season, sca_passes_dead, sca_passes_dead.1718:sca_passes_dead.1920, convert = TRUE, factor_key = TRUE)
data_long
# The value column is addressed by name; its position depends on how many
# columns `ids` contributes.
data_long[["sca_passes_dead"]] <- as.numeric(data_long[["sca_passes_dead"]])
ggplot(data_long, aes(x = factor(season), y = sca_passes_dead)) +
  geom_boxplot(alpha = 0.5, fill = "darkblue", color = "black", outlier.color = "red") +
  labs(
    title = "Boxplot SCA por temporada", x = "Temporada",
    y = "Acciones de creación de tiro"
  )
###################
# GCA passes dead #
###################
# NOTE(review): this call analyses "passes_intercepted" although it sits in
# the GCA section; it probably belongs under the "Pases interceptados" header
# further down. Confirm before moving it.
univariantAnalysis(train, "passes_intercepted", ids)

# Values of the gca_passes_dead.* columns: box plot per season, then the
# per-season mean.
gca_passes_dead <- train %>% select(contains("gca_passes_dead."))
gca_passes_dead <- cbind(ids, gca_passes_dead)
# NOTE(review): par(mfrow) configures base graphics only; ggplot2 draws with
# grid, so this does not stack the two ggplots below. Kept so that the global
# graphics state left by the script is unchanged.
par(mfrow = c(2, 1))
# WIDE to LONG: the season name goes to `season`, the value to
# `gca_passes_dead`.
data_long <- gather(gca_passes_dead, season, gca_passes_dead, gca_passes_dead.1718:gca_passes_dead.1920, factor_key = TRUE)
data_long
# The value column is addressed by name; its position depends on how many
# columns `ids` contributes.
data_long[["gca_passes_dead"]] <- as.numeric(data_long[["gca_passes_dead"]])
ggplot(data_long, aes(x = factor(season), y = gca_passes_dead)) +
  geom_boxplot(alpha = 0.5, fill = "darkblue", color = "black", outlier.color = "red") +
  labs(title = "Boxplot GCA por temporada", x = "Temporada", y = "Acciones de creación de gol")
g <- group_by(data_long, season)
g2 <- summarise(g, media = round(mean(gca_passes_dead), 1))
ggplot(g2, aes(x = season, media, group = 1)) +
  geom_point(alpha = 0.5, color = "blue", size = 3) +
  geom_line(color = "red", size = 1) +
  labs(title = "Promedio de GCA por temporada", x = "Temporada", y = "Acciones de creación de gol")
#######################
# Pases interceptados #
#######################
####################################
# Porcentaje de presiones exitosas #
####################################
##################################
# Porcentaje de regates exitosos #
##################################
####################
# Faltas cometidas #
####################
#######################################
# Porcentaje de juegos aereos ganados #
#######################################
######################
# Penalties atajados #
######################
##############################
# Goles en contra por 90 min #
##############################
###################################
# Porcentaje de ocasiones paradas #
###################################
# Store the `foot` column of the joined table as a factor.
# NOTE(review): `train` and `test` are taken from `laLigaPlayers` in the
# data-loading script, so this conversion does not reach them unless they are
# rebuilt afterwards. It also relies on a column named exactly "foot"
# existing after the joins; confirm both.
laLigaPlayers$foot <- as.factor(laLigaPlayers$foot)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment