library(DT)
library(mice)library(readxl)
premier_league <- read_excel("C:/Users/wsand/Dropbox/2021-II/Ulibertadores/Mineria de datos/premier_league.xls")Div = League Division
Date = Match Date (dd/mm/yy)
Time = Time of match kick-off
HomeTeam = Home Team
Away team = Away Team
FTHG and HG = Full Time Home Team Goals
FTAG and AG = Full-Time Away Team Goals
FTR and Res = Full-Time Result (H=Home Win, D=Draw, A=Away Win)
HTHG = Half Time Home Team Goals
HTAG = Half Time Away Team Goals
HTR = Half Time Result (H=Home Win, D=Draw, A=Away Win)
Attendance = Crowd Attendance
Referee = Match Referee
HS = Home Team Shots
AS = Away Team Shots
HST = Home Team Shots on Target
AST = Away Team Shots on Target
HHW = Home Team Hit Woodwork
AHW = Away Team Hit Woodwork
HC = Home Team Corners
AC = Away Team Corners
HF = Home Team Fouls Committed
AF = Away Team Fouls Committed
HFKC = Home Team Free Kicks Conceded
AFKC = Away Team Free Kicks Conceded
HO = Home Team Offsides
AO = Away Team Offsides
HY = Home Team Yellow Cards
AY = Away Team Yellow Cards
HR = Home Team Red Cards
AR = Away Team Red Cards
HBP = Home Team Bookings Points (10 = yellow, 25 = red)
ABP = Away Team Bookings Points (10 = yellow, 25 = red)
pleague=premier_league
DT::datatable(pleague)dim(pleague)## [1] 208 106
- La base de tados tiene 208 filas y 106 columnas
sum(is.na(pleague))## [1] 72
- Se encontraron en total dentro de la base 72
NAs
Vamos a filtrar la base solo para las columnas que contienen NAs
library(tidyverse)
a=colSums(is.na(pleague))
premier=pleague[,as.numeric(a)>0]DT::datatable(premier)sum(is.na(premier))## [1] 72
Después de averiguar las variables donde hay NAs encontramos la cantidad por columna
colSums(is.na(premier))## HomeTeam AwayTeam FTHG HTHG Referee HF AR PSA
## 1 14 10 9 8 10 10 10
str(premier)## tibble [208 x 8] (S3: tbl_df/tbl/data.frame)
## $ HomeTeam: chr [1:208] "Liverpool" "West Ham" "Bournemouth" "Burnley" ...
## $ AwayTeam: chr [1:208] "Norwich" "Man City" "Sheffield United" "Southampton" ...
## $ FTHG : num [1:208] NA 0 1 3 0 0 3 0 0 4 ...
## $ HTHG : num [1:208] 4 NA 0 0 0 0 0 0 0 1 ...
## $ Referee : chr [1:208] "M Oliver" "M Dean" NA "G Scott" ...
## $ HF : num [1:208] 9 6 NA 6 16 15 13 3 12 15 ...
## $ AR : num [1:208] NA 0 0 0 1 0 0 0 0 0 ...
## $ PSA : chr [1:208] NA "1.26" "3.9" "2.81" ...
premier$PSA<-as.numeric(premier$PSA)summary(premier)## HomeTeam AwayTeam FTHG HTHG
## Length:208 Length:208 Min. :0.000 Min. :0.0000
## Class :character Class :character 1st Qu.:1.000 1st Qu.:0.0000
## Mode :character Mode :character Median :1.000 Median :0.0000
## Mean :1.485 Mean :0.6784
## 3rd Qu.:2.000 3rd Qu.:1.0000
## Max. :8.000 Max. :5.0000
## NA's :10 NA's :9
## Referee HF AR PSA
## Length:208 Min. : 0.00 Min. :0.00000 Min. : 1.140
## Class :character 1st Qu.: 8.00 1st Qu.:0.00000 1st Qu.: 2.310
## Mode :character Median :10.00 Median :0.00000 Median : 3.310
## Mean :10.32 Mean :0.05051 Mean : 4.940
## 3rd Qu.:13.00 3rd Qu.:0.00000 3rd Qu.: 5.173
## Max. :21.00 Max. :1.00000 Max. :34.710
## NA's :10 NA's :10 NA's :10
PARTE I
Ejercicio 1
- 1.Hacer imputación de datos con las siguientes opciones:
Omitir las filas con observaciones NA
premier_omitir_na = na.omit(premier)
DT::datatable(premier_omitir_na)Al eliminar las filas u observaciones nos quedariamos com 167 filas
Imputar con la media
premier_imputados_media = premier %>% mutate_at(c("FTHG", "HTHG", "HF", "AR","PSA"),
~replace(., is.na(.), mean(.,
na.rm=TRUE))
)Imputar con la mediana
premier_imputados_mediana = premier %>% mutate_at(c("FTHG", "HTHG", "HF", "AR","PSA"),
~replace(., is.na(.), median(.,
na.rm=TRUE))
)Reemplazar NAs por cero
premier_imputados_cero = premier %>% mutate_at(c("FTHG", "HTHG", "HF", "AR","PSA"),
~replace(., is.na(.), 0)
)Ejercicio 2
2.Analizar, empleando la función summary(), los estadísticos resultantes para cada opción de imputación. Seleccionar la mejor opción.
summary(premier_omitir_na)## HomeTeam AwayTeam FTHG HTHG
## Length:164 Length:164 Min. :0.000 Min. :0.0000
## Class :character Class :character 1st Qu.:1.000 1st Qu.:0.0000
## Mode :character Mode :character Median :1.000 Median :0.0000
## Mean :1.433 Mean :0.6159
## 3rd Qu.:2.000 3rd Qu.:1.0000
## Max. :8.000 Max. :5.0000
## Referee HF AR PSA
## Length:164 Min. : 3.00 Min. :0.00000 Min. : 1.200
## Class :character 1st Qu.: 8.00 1st Qu.:0.00000 1st Qu.: 2.410
## Mode :character Median :10.00 Median :0.00000 Median : 3.525
## Mean :10.44 Mean :0.04878 Mean : 5.066
## 3rd Qu.:13.00 3rd Qu.:0.00000 3rd Qu.: 5.098
## Max. :21.00 Max. :1.00000 Max. :34.710
summary(premier_imputados_media)## HomeTeam AwayTeam FTHG HTHG
## Length:208 Length:208 Min. :0.000 Min. :0.0000
## Class :character Class :character 1st Qu.:1.000 1st Qu.:0.0000
## Mode :character Mode :character Median :1.000 Median :0.0000
## Mean :1.485 Mean :0.6784
## 3rd Qu.:2.000 3rd Qu.:1.0000
## Max. :8.000 Max. :5.0000
## Referee HF AR PSA
## Length:208 Min. : 0.00 Min. :0.00000 Min. : 1.140
## Class :character 1st Qu.: 8.00 1st Qu.:0.00000 1st Qu.: 2.330
## Mode :character Median :10.00 Median :0.00000 Median : 3.470
## Mean :10.32 Mean :0.05051 Mean : 4.940
## 3rd Qu.:13.00 3rd Qu.:0.00000 3rd Qu.: 4.942
## Max. :21.00 Max. :1.00000 Max. :34.710
summary(premier_imputados_mediana)## HomeTeam AwayTeam FTHG HTHG
## Length:208 Length:208 Min. :0.000 Min. :0.000
## Class :character Class :character 1st Qu.:1.000 1st Qu.:0.000
## Mode :character Mode :character Median :1.000 Median :0.000
## Mean :1.462 Mean :0.649
## 3rd Qu.:2.000 3rd Qu.:1.000
## Max. :8.000 Max. :5.000
## Referee HF AR PSA
## Length:208 Min. : 0.00 Min. :0.00000 Min. : 1.140
## Class :character 1st Qu.: 8.00 1st Qu.:0.00000 1st Qu.: 2.330
## Mode :character Median :10.00 Median :0.00000 Median : 3.310
## Mean :10.31 Mean :0.04808 Mean : 4.862
## 3rd Qu.:13.00 3rd Qu.:0.00000 3rd Qu.: 4.890
## Max. :21.00 Max. :1.00000 Max. :34.710
summary(premier_imputados_cero)## HomeTeam AwayTeam FTHG HTHG
## Length:208 Length:208 Min. :0.000 Min. :0.000
## Class :character Class :character 1st Qu.:1.000 1st Qu.:0.000
## Mode :character Mode :character Median :1.000 Median :0.000
## Mean :1.413 Mean :0.649
## 3rd Qu.:2.000 3rd Qu.:1.000
## Max. :8.000 Max. :5.000
## Referee HF AR PSA
## Length:208 Min. : 0.000 Min. :0.00000 Min. : 0.000
## Class :character 1st Qu.: 7.000 1st Qu.:0.00000 1st Qu.: 2.150
## Mode :character Median :10.000 Median :0.00000 Median : 3.175
## Mean : 9.827 Mean :0.04808 Mean : 4.702
## 3rd Qu.:13.000 3rd Qu.:0.00000 3rd Qu.: 4.890
## Max. :21.000 Max. :1.00000 Max. :34.710
- Comparación para la variable
FTHG
| Estadística | sin Imputar | FTHG omitir_na | FTHG media | FTHG mediana | FTHG cero |
|---|---|---|---|---|---|
| Min | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 |
| 1st Qu | 1.000 | 1.000 | 1.000 | 1.000 | 1.000 |
| Median | 1.000 | 1.000 | 1.000 | 1.000 | 1.000 |
| Mean | 1.485 | 1.433 | 1.485 | 1.462 | 1.413 |
| 3rd Qu | 2.000 | 2.000 | 2.000 | 2.000 | 2.000 |
| Max | 8.00 | 8.000 | 8.000 | 8.000 | 8.000 |
- Comparación para la variable
HTHG
| Estadística | sin imputar | HTHG omitir_na | HTHG media | HTHG mediana | FTHG cero |
|---|---|---|---|---|---|
| Min | 0.00 | 0.000 | 0.000 | 0.000 | 0.000 |
| 1st Qu | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 |
| Median | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 |
| Mean | 0.6784 | 0.6159 | 0.6784 | 0.649 | 0.649 |
| 3rd Qu | 1.000 | 1.000 | 1.000 | 1.000 | 1.000 |
| Max | 5.000 | 5.000 | 5.000 | 5.000 | 5.000 |
- Comparación para la variable
HF
| Estadística | sin imputar | HF omitir_na | HF media | HF mediana | HF cero |
|---|---|---|---|---|---|
| Min | 0.000 | 3.000 | 0.000 | 0.000 | 0.000 |
| 1st Qu | 8.000 | 8.000 | 8.000 | 8.000 | 7.000 |
| Median | 10.000 | 10.000 | 10.000 | 10.000 | 10.000 |
| Mean | 10.32 | 10.44 | 10.320 | 10.31 | 9.827 |
| 3rd Qu | 13.000 | 13.000 | 13.000 | 13.000 | 13.000 |
| Max | 21.00 | 21.000 | 21.000 | 21.000 | 21.000 |
- Comparación para la variable
AR
| Estadística | Sin Imputar | AR omitir_na | AR media | AR mediana | AR cero |
|---|---|---|---|---|---|
| Min | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 |
| 1st Qu | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 |
| Median | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 |
| Mean | 0.05051 | 0.04878 | 0.05051 | 0.04808 | 0.04808 |
| 3rd Qu | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 |
| Max | 1.0000 | 1.000 | 1.000 | 1.000 | 1.000 |
Ejercicio 3
3.Con la librería tidyverse, el operador %>% y las funciones
filter()yggplot()realizar un análisis descriptivo del dataset seleccionado en el numeral 2. considerando:- La combinación de tres (3) variables entre si
premier_imputados_mediana$FTR<-pleague$FTRlibrary(GGally)
ggpairs(premier_imputados_mediana, columns=c("HTHG", "FTHG", "HF","AR","PSA"),
ggplot2::aes(colour=FTR))## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
- Mínimo dos gráficas de dispersión
library(ggplot2)
ggplot(data=pleague, aes(x=HS, y=FTHG, color=FTR))+
geom_point()+
ggtitle( "HS vs FTHG")- Mínimo dos gráficos de boxplot
library(plotly)
p1=ggplot(premier_imputados_mediana, aes(y=FTHG , col=FTR))+
geom_boxplot()+
ggtitle("Goles del equipo local ")
ggplotly(p1)p2=ggplot(premier_imputados_mediana, aes(y=FTHG , col=HomeTeam))+
geom_boxplot()+
ggtitle("Goles del equipo local ")
ggplotly(p2)- Variable
PSA
q=ggplot(data=premier_imputados_mediana, aes(y=PSA))+
geom_boxplot(fill="steelblue")+
ggtitle("PSA")
ggplotly(q)- Equipo con mas goles local
a=premier_imputados_mediana %>% group_by(HomeTeam) %>% summarise(mediagoles=mean(FTHG))
b=a %>% arrange(desc(mediagoles))
c=b[1:10,]
c$HomeTeam <- factor(c$HomeTeam, levels=c$HomeTeam)or=ggplot(c, aes(x=HomeTeam, y=mediagoles, fill=HomeTeam))+
geom_bar(stat = "identity", )+
ggtitle("Los 5 equipos con más goles en media de local")
ggplotly(or)Observamos que el equipo más goleador de local es el Liverpool con una media de 2.4 goles por partido
PARTE II
Ejercicio 4
- Imputar datos para una variable categórica empleando la moda
Para calcular la moda usaremos la siguiente función:
getmode <- function(v) {
uniqv <- unique(v)
uniqv[which.max(tabulate(match(v, uniqv)))]
}premier2 = premier
premier2$HomeTeam<-ifelse(is.na(premier$HomeTeam),
getmode(premier$HomeTeam),
premier$HomeTeam)
DT::datatable(premier2)Ejercicio 5
- Imputar datos para una variable categórica con cadenas de texto
premier2 = premier2 %>% mutate(AwayTeam = replace_na(AwayTeam, "equipo_desconocido"))
DT::datatable(premier2)Ejercicio 6
- Hacer dos imputaciones con el paquete “mice”
en este caso se imputaran las variables FTHG Y HTHG
columns <- c("FTHG", "HTHG")
imputed_data <- mice(premier2[,names(premier2) %in% columns], m=1,
maxit = 1, method = "mean", seed = 2018, print=F)
complete.data <- mice::complete(imputed_data)
premier2$FTHG <- round(complete.data$FTHG) # Se redondea para que el resultado sea entero
premier2$HTHG <- round(complete.data$HTHG)
DT::datatable(premier2)colSums(is.na(premier2))## HomeTeam AwayTeam FTHG HTHG Referee HF AR PSA
## 0 0 0 0 8 10 10 10
Ejercicio 7
- Convertir una variable categórica en numérica
Convertimos la variable HomeTeam en variable numerica
library(caret) # contiene la función dummyVars
library(fastDummies)
disci_dummie=dummy_cols(premier2, select_columns = c("HomeTeam")) %>%
select(-c("HomeTeam"))
DT::datatable(disci_dummie)Ejercicio 8
- Discretizar una de las variables
HS2 <- cut(premier_league$HS, breaks = quantile(premier_league$HS, probs = c(0, 0.33, 0.66, 1)),
labels = c("local_defensivo", "local_no_ofensivo", "local_ofensivo"),
right = TRUE)
ofensivo=data.frame(premier_league %>% select("HomeTeam", "HS"), HS2)
DT::datatable(ofensivo)