library(DT)
library(mice)
library(readxl)
<- read_excel("C:/Users/wsand/Dropbox/2021-II/Ulibertadores/Mineria de datos/premier_league.xls") premier_league
Div = League Division
Date = Match Date (dd/mm/yy)
Time = Time of match kick-off
HomeTeam = Home Team
Away team = Away Team
FTHG and HG = Full Time Home Team Goals
FTAG and AG = Full-Time Away Team Goals
FTR and Res = Full-Time Result (H=Home Win, D=Draw, A=Away Win)
HTHG = Half Time Home Team Goals
HTAG = Half Time Away Team Goals
HTR = Half Time Result (H=Home Win, D=Draw, A=Away Win)
Attendance = Crowd Attendance
Referee = Match Referee
HS = Home Team Shots
AS = Away Team Shots
HST = Home Team Shots on Target
AST = Away Team Shots on Target
HHW = Home Team Hit Woodwork
AHW = Away Team Hit Woodwork
HC = Home Team Corners
AC = Away Team Corners
HF = Home Team Fouls Committed
AF = Away Team Fouls Committed
HFKC = Home Team Free Kicks Conceded
AFKC = Away Team Free Kicks Conceded
HO = Home Team Offsides
AO = Away Team Offsides
HY = Home Team Yellow Cards
AY = Away Team Yellow Cards
HR = Home Team Red Cards
AR = Away Team Red Cards
HBP = Home Team Bookings Points (10 = yellow, 25 = red)
ABP = Away Team Bookings Points (10 = yellow, 25 = red)
=premier_league
pleague::datatable(pleague) DT
dim(pleague)
## [1] 208 106
- La base de tados tiene 208 filas y 106 columnas
sum(is.na(pleague))
## [1] 72
- Se encontraron en total dentro de la base 72
NAs
Vamos a filtrar la base solo para las columnas que contienen NAs
library(tidyverse)
=colSums(is.na(pleague))
a=pleague[,as.numeric(a)>0] premier
::datatable(premier) DT
sum(is.na(premier))
## [1] 72
Después de averiguar las variables donde hay NAs
encontramos la cantidad por columna
colSums(is.na(premier))
## HomeTeam AwayTeam FTHG HTHG Referee HF AR PSA
## 1 14 10 9 8 10 10 10
str(premier)
## tibble [208 x 8] (S3: tbl_df/tbl/data.frame)
## $ HomeTeam: chr [1:208] "Liverpool" "West Ham" "Bournemouth" "Burnley" ...
## $ AwayTeam: chr [1:208] "Norwich" "Man City" "Sheffield United" "Southampton" ...
## $ FTHG : num [1:208] NA 0 1 3 0 0 3 0 0 4 ...
## $ HTHG : num [1:208] 4 NA 0 0 0 0 0 0 0 1 ...
## $ Referee : chr [1:208] "M Oliver" "M Dean" NA "G Scott" ...
## $ HF : num [1:208] 9 6 NA 6 16 15 13 3 12 15 ...
## $ AR : num [1:208] NA 0 0 0 1 0 0 0 0 0 ...
## $ PSA : chr [1:208] NA "1.26" "3.9" "2.81" ...
$PSA<-as.numeric(premier$PSA) premier
summary(premier)
## HomeTeam AwayTeam FTHG HTHG
## Length:208 Length:208 Min. :0.000 Min. :0.0000
## Class :character Class :character 1st Qu.:1.000 1st Qu.:0.0000
## Mode :character Mode :character Median :1.000 Median :0.0000
## Mean :1.485 Mean :0.6784
## 3rd Qu.:2.000 3rd Qu.:1.0000
## Max. :8.000 Max. :5.0000
## NA's :10 NA's :9
## Referee HF AR PSA
## Length:208 Min. : 0.00 Min. :0.00000 Min. : 1.140
## Class :character 1st Qu.: 8.00 1st Qu.:0.00000 1st Qu.: 2.310
## Mode :character Median :10.00 Median :0.00000 Median : 3.310
## Mean :10.32 Mean :0.05051 Mean : 4.940
## 3rd Qu.:13.00 3rd Qu.:0.00000 3rd Qu.: 5.173
## Max. :21.00 Max. :1.00000 Max. :34.710
## NA's :10 NA's :10 NA's :10
PARTE I
Ejercicio 1
- 1.Hacer imputación de datos con las siguientes opciones:
Omitir las filas con observaciones NA
= na.omit(premier)
premier_omitir_na ::datatable(premier_omitir_na) DT
Al eliminar las filas u observaciones nos quedariamos com 167 filas
Imputar con la media
= premier %>% mutate_at(c("FTHG", "HTHG", "HF", "AR","PSA"),
premier_imputados_media ~replace(., is.na(.), mean(.,
na.rm=TRUE))
)
Imputar con la mediana
= premier %>% mutate_at(c("FTHG", "HTHG", "HF", "AR","PSA"),
premier_imputados_mediana ~replace(., is.na(.), median(.,
na.rm=TRUE))
)
Reemplazar NAs por cero
= premier %>% mutate_at(c("FTHG", "HTHG", "HF", "AR","PSA"),
premier_imputados_cero ~replace(., is.na(.), 0)
)
Ejercicio 2
2.Analizar, empleando la función summary(), los estadísticos resultantes para cada opción de imputación. Seleccionar la mejor opción.
summary(premier_omitir_na)
## HomeTeam AwayTeam FTHG HTHG
## Length:164 Length:164 Min. :0.000 Min. :0.0000
## Class :character Class :character 1st Qu.:1.000 1st Qu.:0.0000
## Mode :character Mode :character Median :1.000 Median :0.0000
## Mean :1.433 Mean :0.6159
## 3rd Qu.:2.000 3rd Qu.:1.0000
## Max. :8.000 Max. :5.0000
## Referee HF AR PSA
## Length:164 Min. : 3.00 Min. :0.00000 Min. : 1.200
## Class :character 1st Qu.: 8.00 1st Qu.:0.00000 1st Qu.: 2.410
## Mode :character Median :10.00 Median :0.00000 Median : 3.525
## Mean :10.44 Mean :0.04878 Mean : 5.066
## 3rd Qu.:13.00 3rd Qu.:0.00000 3rd Qu.: 5.098
## Max. :21.00 Max. :1.00000 Max. :34.710
summary(premier_imputados_media)
## HomeTeam AwayTeam FTHG HTHG
## Length:208 Length:208 Min. :0.000 Min. :0.0000
## Class :character Class :character 1st Qu.:1.000 1st Qu.:0.0000
## Mode :character Mode :character Median :1.000 Median :0.0000
## Mean :1.485 Mean :0.6784
## 3rd Qu.:2.000 3rd Qu.:1.0000
## Max. :8.000 Max. :5.0000
## Referee HF AR PSA
## Length:208 Min. : 0.00 Min. :0.00000 Min. : 1.140
## Class :character 1st Qu.: 8.00 1st Qu.:0.00000 1st Qu.: 2.330
## Mode :character Median :10.00 Median :0.00000 Median : 3.470
## Mean :10.32 Mean :0.05051 Mean : 4.940
## 3rd Qu.:13.00 3rd Qu.:0.00000 3rd Qu.: 4.942
## Max. :21.00 Max. :1.00000 Max. :34.710
summary(premier_imputados_mediana)
## HomeTeam AwayTeam FTHG HTHG
## Length:208 Length:208 Min. :0.000 Min. :0.000
## Class :character Class :character 1st Qu.:1.000 1st Qu.:0.000
## Mode :character Mode :character Median :1.000 Median :0.000
## Mean :1.462 Mean :0.649
## 3rd Qu.:2.000 3rd Qu.:1.000
## Max. :8.000 Max. :5.000
## Referee HF AR PSA
## Length:208 Min. : 0.00 Min. :0.00000 Min. : 1.140
## Class :character 1st Qu.: 8.00 1st Qu.:0.00000 1st Qu.: 2.330
## Mode :character Median :10.00 Median :0.00000 Median : 3.310
## Mean :10.31 Mean :0.04808 Mean : 4.862
## 3rd Qu.:13.00 3rd Qu.:0.00000 3rd Qu.: 4.890
## Max. :21.00 Max. :1.00000 Max. :34.710
summary(premier_imputados_cero)
## HomeTeam AwayTeam FTHG HTHG
## Length:208 Length:208 Min. :0.000 Min. :0.000
## Class :character Class :character 1st Qu.:1.000 1st Qu.:0.000
## Mode :character Mode :character Median :1.000 Median :0.000
## Mean :1.413 Mean :0.649
## 3rd Qu.:2.000 3rd Qu.:1.000
## Max. :8.000 Max. :5.000
## Referee HF AR PSA
## Length:208 Min. : 0.000 Min. :0.00000 Min. : 0.000
## Class :character 1st Qu.: 7.000 1st Qu.:0.00000 1st Qu.: 2.150
## Mode :character Median :10.000 Median :0.00000 Median : 3.175
## Mean : 9.827 Mean :0.04808 Mean : 4.702
## 3rd Qu.:13.000 3rd Qu.:0.00000 3rd Qu.: 4.890
## Max. :21.000 Max. :1.00000 Max. :34.710
- Comparación para la variable
FTHG
Estadística | sin Imputar | FTHG omitir_na | FTHG media | FTHG mediana | FTHG cero |
---|---|---|---|---|---|
Min | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 |
1st Qu | 1.000 | 1.000 | 1.000 | 1.000 | 1.000 |
Median | 1.000 | 1.000 | 1.000 | 1.000 | 1.000 |
Mean | 1.485 | 1.433 | 1.485 | 1.462 | 1.413 |
3rd Qu | 2.000 | 2.000 | 2.000 | 2.000 | 2.000 |
Max | 8.00 | 8.000 | 8.000 | 8.000 | 8.000 |
- Comparación para la variable
HTHG
Estadística | sin imputar | HTHG omitir_na | HTHG media | HTHG mediana | FTHG cero |
---|---|---|---|---|---|
Min | 0.00 | 0.000 | 0.000 | 0.000 | 0.000 |
1st Qu | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 |
Median | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 |
Mean | 0.6784 | 0.6159 | 0.6784 | 0.649 | 0.649 |
3rd Qu | 1.000 | 1.000 | 1.000 | 1.000 | 1.000 |
Max | 5.000 | 5.000 | 5.000 | 5.000 | 5.000 |
- Comparación para la variable
HF
Estadística | sin imputar | HF omitir_na | HF media | HF mediana | HF cero |
---|---|---|---|---|---|
Min | 0.000 | 3.000 | 0.000 | 0.000 | 0.000 |
1st Qu | 8.000 | 8.000 | 8.000 | 8.000 | 7.000 |
Median | 10.000 | 10.000 | 10.000 | 10.000 | 10.000 |
Mean | 10.32 | 10.44 | 10.320 | 10.31 | 9.827 |
3rd Qu | 13.000 | 13.000 | 13.000 | 13.000 | 13.000 |
Max | 21.00 | 21.000 | 21.000 | 21.000 | 21.000 |
- Comparación para la variable
AR
Estadística | Sin Imputar | AR omitir_na | AR media | AR mediana | AR cero |
---|---|---|---|---|---|
Min | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 |
1st Qu | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 |
Median | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 |
Mean | 0.05051 | 0.04878 | 0.05051 | 0.04808 | 0.04808 |
3rd Qu | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 |
Max | 1.0000 | 1.000 | 1.000 | 1.000 | 1.000 |
Ejercicio 3
3.Con la librería tidyverse, el operador %>% y las funciones
filter()
yggplot()
realizar un análisis descriptivo del dataset seleccionado en el numeral 2. considerando:- La combinación de tres (3) variables entre si
$FTR<-pleague$FTR premier_imputados_mediana
library(GGally)
ggpairs(premier_imputados_mediana, columns=c("HTHG", "FTHG", "HF","AR","PSA"),
::aes(colour=FTR)) ggplot2
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
- Mínimo dos gráficas de dispersión
library(ggplot2)
ggplot(data=pleague, aes(x=HS, y=FTHG, color=FTR))+
geom_point()+
ggtitle( "HS vs FTHG")
- Mínimo dos gráficos de boxplot
library(plotly)
=ggplot(premier_imputados_mediana, aes(y=FTHG , col=FTR))+
p1geom_boxplot()+
ggtitle("Goles del equipo local ")
ggplotly(p1)
=ggplot(premier_imputados_mediana, aes(y=FTHG , col=HomeTeam))+
p2geom_boxplot()+
ggtitle("Goles del equipo local ")
ggplotly(p2)
- Variable
PSA
=ggplot(data=premier_imputados_mediana, aes(y=PSA))+
qgeom_boxplot(fill="steelblue")+
ggtitle("PSA")
ggplotly(q)
- Equipo con mas goles local
=premier_imputados_mediana %>% group_by(HomeTeam) %>% summarise(mediagoles=mean(FTHG))
a
=a %>% arrange(desc(mediagoles))
b=b[1:10,]
c$HomeTeam <- factor(c$HomeTeam, levels=c$HomeTeam) c
=ggplot(c, aes(x=HomeTeam, y=mediagoles, fill=HomeTeam))+
orgeom_bar(stat = "identity", )+
ggtitle("Los 5 equipos con más goles en media de local")
ggplotly(or)
Observamos que el equipo más goleador de local es el Liverpool con una media de 2.4 goles por partido
PARTE II
Ejercicio 4
- Imputar datos para una variable categórica empleando la moda
Para calcular la moda usaremos la siguiente función:
<- function(v) {
getmode <- unique(v)
uniqv which.max(tabulate(match(v, uniqv)))]
uniqv[ }
= premier
premier2 $HomeTeam<-ifelse(is.na(premier$HomeTeam),
premier2getmode(premier$HomeTeam),
$HomeTeam)
premier::datatable(premier2) DT
Ejercicio 5
- Imputar datos para una variable categórica con cadenas de texto
= premier2 %>% mutate(AwayTeam = replace_na(AwayTeam, "equipo_desconocido"))
premier2 ::datatable(premier2) DT
Ejercicio 6
- Hacer dos imputaciones con el paquete “mice”
en este caso se imputaran las variables FTHG
Y HTHG
<- c("FTHG", "HTHG")
columns
<- mice(premier2[,names(premier2) %in% columns], m=1,
imputed_data maxit = 1, method = "mean", seed = 2018, print=F)
<- mice::complete(imputed_data)
complete.data
$FTHG <- round(complete.data$FTHG) # Se redondea para que el resultado sea entero
premier2$HTHG <- round(complete.data$HTHG)
premier2::datatable(premier2) DT
colSums(is.na(premier2))
## HomeTeam AwayTeam FTHG HTHG Referee HF AR PSA
## 0 0 0 0 8 10 10 10
Ejercicio 7
- Convertir una variable categórica en numérica
Convertimos la variable HomeTeam
en variable numerica
library(caret) # contiene la función dummyVars
library(fastDummies)
=dummy_cols(premier2, select_columns = c("HomeTeam")) %>%
disci_dummieselect(-c("HomeTeam"))
::datatable(disci_dummie) DT
Ejercicio 8
- Discretizar una de las variables
<- cut(premier_league$HS, breaks = quantile(premier_league$HS, probs = c(0, 0.33, 0.66, 1)),
HS2 labels = c("local_defensivo", "local_no_ofensivo", "local_ofensivo"),
right = TRUE)
=data.frame(premier_league %>% select("HomeTeam", "HS"), HS2)
ofensivo::datatable(ofensivo) DT