library(DT)
library(mice)

library(readxl)
# Load the raw match data from the Excel workbook into `premier_league`.
# NOTE(review): this is an absolute, machine-specific path, so the script only
# runs where that exact file exists; consider a project-relative path.
premier_league <- read_excel("C:/Users/wsand/Dropbox/2021-II/Ulibertadores/Mineria de datos/premier_league.xls")

Div = League Division
Date = Match Date (dd/mm/yy)
Time = Time of match kick-off
HomeTeam = Home Team
AwayTeam = Away Team
FTHG and HG = Full Time Home Team Goals
FTAG and AG = Full-Time Away Team Goals
FTR and Res = Full-Time Result (H=Home Win, D=Draw, A=Away Win)
HTHG = Half Time Home Team Goals
HTAG = Half Time Away Team Goals
HTR = Half Time Result (H=Home Win, D=Draw, A=Away Win)
Attendance = Crowd Attendance
Referee = Match Referee
HS = Home Team Shots
AS = Away Team Shots
HST = Home Team Shots on Target
AST = Away Team Shots on Target
HHW = Home Team Hit Woodwork
AHW = Away Team Hit Woodwork
HC = Home Team Corners
AC = Away Team Corners
HF = Home Team Fouls Committed
AF = Away Team Fouls Committed
HFKC = Home Team Free Kicks Conceded
AFKC = Away Team Free Kicks Conceded
HO = Home Team Offsides
AO = Away Team Offsides
HY = Home Team Yellow Cards
AY = Away Team Yellow Cards
HR = Home Team Red Cards
AR = Away Team Red Cards
HBP = Home Team Bookings Points (10 = yellow, 25 = red)
ABP = Away Team Bookings Points (10 = yellow, 25 = red)

# Alias the imported data under a shorter working name and render it as an
# interactive table.
pleague <- premier_league
DT::datatable(pleague)

Show entries

Search:

	Div	Date	Time	HomeTeam	AwayTeam	FTHG	FTAG	FTR	HTHG	HTAG	HTR	Referee	HS	AS	HST	AST	HF	AF	HC	AC	HY	AY	AR	B365H	B365D	B365A	BWH	BWD	BWA	IWH	IWD	IWA	PSH	PSD	PSA	WHH	WHD	WHA	VCH	VCD	VCA	MaxH	MaxD	MaxA	AvgH	AvgD	AvgA	B365>2.5	B365<2.5	P>2.5	P<2.5	Max>2.5	Max<2.5	Avg>2.5	Avg<2.5	AHh	B365AHH	B365AHA	PAHH	PAHA	MaxAHH	MaxAHA	AvgAHH	AvgAHA	B365CH	B365CD	B365CA	BWCH	BWCD	BWCA	IWCH	IWCD	IWCA	PSCH	PSCD	PSCA	WHCH	WHCD	WHCA	VCCH	VCCD	VCCA	MaxCH	MaxCD	MaxCA	AvgCH	AvgCD	AvgCA	B365C>2.5	B365C<2.5	PC>2.5	PC<2.5	MaxC>2.5	MaxC<2.5	AvgC>2.5	AvgC<2.5	AHCh	B365CAHH	B365CAHA	PCAHH	PCAHA	MaxCAHH	MaxCAHA	AvgCAHH	AvgCAHA
1	E0	2019-08-09T00:00:00Z	1899-12-31T20:00:00Z	Liverpool	Norwich		1	H	4	0	H	M Oliver	15	12	7	5	9	9	11	2	0	2		1.14	10	19	1.14	8.25	18.5	1.15	8	18	1.15	9.59		1.12	8.5	21	1.14	9.5	23	1.16	10	23	1.14	8.75	19.83	1.4	3	1.4	3.11	1.45	3.11	1.41	2.92	-2.25	1.96	1.94	1.97	1.95	1.97	2	1.94	1.94	1.14	9.5	21	1.14	9	20	1.15	8	18	1.14	10.43	19.63	1.11	9.5	21	1.14	9.5	23	1.16	10.5	23	1.14	9.52	19.18	1.3	3.5	1.34	3.44	1.36	3.76	1.32	3.43	-2.25	1.91	1.99	1.94	1.98	1.99	2.07	1.9	1.99
2	E0	2019-08-10T00:00:00Z	1899-12-31T12:30:00Z	West Ham	Man City	0	5	A		1	A	M Dean	5	14	3	9	6	13	1	1	2	2	0	12	6.5	1.22	11.5	5.75	1.26	11	6.1	1.25	11.68	6.53	1.26	13	6	1.24	12	6.5	1.25	13	6.75	1.29	11.84	6.28	1.25	1.44	2.75	1.49	2.77	1.51	2.77	1.48	2.65	1.75	2	1.9	2.02	1.9	2.02	1.92	1.99	1.89	12	7	1.25	11	6	1.26	11	6.1	1.25	11.11	6.68	1.27	11	6.5	1.24	12	6.5	1.25	13	7	1.29	11.14	6.46	1.26	1.4	3	1.43	3.03	1.5	3.22	1.41	2.91	1.75	1.95	1.95	1.96	1.97	2.07	1.98	1.97	1.92
3	E0	2019-08-10T00:00:00Z	1899-12-31T15:00:00Z	Bournemouth	Sheffield United	1	1	D	0	0	D		13	8	3	3		19	3	4	2	1	0	1.95	3.6	3.6	1.95	3.6	3.9	1.97	3.55	3.8	2.04	3.57	3.9	2	3.5	3.8	2	3.6	4	2.06	3.65	4	2.01	3.53	3.83	1.9	1.9	1.96	1.96	2	1.99	1.9	1.93	-0.5	2.01	1.89	2.04	1.88	2.04	1.91	2	1.88	1.95	3.7	4.2	1.95	3.6	3.9	1.97	3.55	3.85	1.98	3.67	4.06	1.95	3.6	3.9	2	3.6	4	2.03	3.7	4.2	1.98	3.58	3.96	1.9	1.9	1.94	1.97	1.97	1.98	1.91	1.92	-0.5	1.95	1.95	1.98	1.95	2	1.96	1.96	1.92
4	E0	2019-08-10T00:00:00Z	1899-12-31T15:00:00Z	Burnley	Southampton	3	0	H	0	0	D	G Scott	10	11	4	3	6	12	2	7	0	0	0	2.62	3.2	2.75	2.65	3.2	2.75	2.65	3.2	2.75	2.71	3.31	2.81	2.7	3.2	2.75	2.7	3.3	2.8	2.8	3.33	2.85	2.68	3.22	2.78	2.1	1.72	2.17	1.77	2.2	1.78	2.12	1.73	0	1.92	1.98	1.93	2	1.94	2	1.91	1.98	2.7	3.25	2.9	2.65	3.1	2.85	2.6	3.2	2.85	2.71	3.19	2.9	2.62	3.2	2.8	2.7	3.25	2.9	2.72	3.26	2.95	2.65	3.18	2.88	2.1	1.72	2.19	1.76	2.25	1.78	2.17	1.71	0	1.87	2.03	1.89	2.03	1.9	2.07	1.86	2.02
5	E0	2019-08-10T00:00:00Z	1899-12-31T15:00:00Z		Everton	0	0	D	0	0	D	J Moss	6	10	2	3	16	14	6	2	2	1	1	3	3.25	2.37	3.2	3.2	2.35	3.1	3.2	2.4	3.21	3.37	2.39	3.1	3.3	2.35	3.2	3.3	2.45	3.21	3.4	2.52	3.13	3.27	2.4	2.2	1.66	2.23	1.74	2.25	1.74	2.18	1.7	0.25	1.85	2.05	1.88	2.05	1.88	2.09	1.84	2.04	3.4	3.5	2.25	3.3	3.3	2.25	3.4	3.3	2.2	3.37	3.45	2.27	3.3	3.3	2.25	3.4	3.3	2.25	3.55	3.5	2.34	3.41	3.37	2.23	2.2	1.66	2.22	1.74	2.28	1.77	2.17	1.71	0.25	1.82	2.08	1.97	1.96	2.03	2.08	1.96	1.93
6	E0	2019-08-10T00:00:00Z	1899-12-31T15:00:00Z	Watford	Brighton	0	3	A	0	1	A	C Pawson	11	5	3	3	15	11	5	2	0	1	0	1.9	3.4	4	1.9	3.4	4.33	1.93	3.4	4.25	1.98	3.44	4.37	1.95	3.4	4.2	1.95	3.5	4.33	2	3.5	4.6	1.94	3.41	4.26	2.1	1.72	2.19	1.76	2.24	1.76	2.16	1.71	-0.5	1.95	1.95	1.98	1.95	1.98	1.98	1.94	1.94	2.1	3.25	4.2	2.1	3.1	4	2.05	3.2	4	2.05	3.38	4.12	2.05	3.25	4	2.15	3.3	3.9	2.15	3.38	4.2	2.07	3.27	4.04	2.1	1.72	2.16	1.78	2.2	1.78	2.14	1.73	-0.5	2.04	1.86	2.05	1.88	2.12	1.91	2.05	1.84
7	E0	2019-08-10T00:00:00Z	1899-12-31T17:30:00Z	Tottenham		3	1	H	0	1	A	C Kavagh	31	7	7	4	13	9	14	0	1	0	0	1.3	5.25	10	1.3	5.5	10	1.3	5.5	9.6	1.3	5.84	10.96	1.29	5.5	10	1.3	5.5	12	1.33	5.95	12	1.3	5.53	10.51	1.66	2.2	1.64	2.4	1.7	2.4	1.65	2.26	-1.5	1.97	1.93	1.99	1.93	2	2	1.93	1.94	1.36	5.5	9	1.35	5	9	1.3	5.5	9.6	1.39	5.35	8.42	1.35	5.25	8	1.4	5.2	9	1.4	5.7	10	1.36	5.29	8.82	1.57	2.37	1.58	2.52	1.65	2.55	1.58	2.4	-1.5	2.1	1.7	2.18	1.77	2.21	1.87	2.08	1.8
8	E0	2019-08-11T00:00:00Z	1899-12-31T14:00:00Z	Leicester	Wolves	0	0	D	0	0	D	A Marriner	15	8	1	2	3	13	12	3	0	2	0	2.2	3.2	3.4	2.25	3.3	3.3	2.2	3.25	3.45	2.21	3.34	3.66	2.2	3.25	3.5	2.25	3.3	3.6	2.29	3.38	3.66	2.22	3.28	3.48	2.2	1.66	2.23	1.74	2.25	1.74	2.17	1.7	-0.25	1.9	2	1.9	2.04	1.95	2.04	1.91	1.98	2.4	3.25	3.3	2.35	3.2	3.3	2.35	3.15	3.2	2.5	3.12	3.3	2.35	3.1	3.3	2.45	3.2	3.3	2.55	3.25	3.58	2.41	3.14	3.29	2.3	1.61	2.45	1.63	2.45	1.71	2.33	1.62	-0.25	2.07	1.83	2.11	1.83	2.12	1.98	2.06	1.84
9	E0	2019-08-11T00:00:00Z	1899-12-31T14:00:00Z	Newcastle	Arsel	0	1	A	0	0	D	M Atkinson	9	8	2	2	12	7	5	3	1	3	0	4.5	3.75	1.72	4.5	3.75	1.78	4.4	3.85	1.77	4.58	3.93	1.81	4.5	3.75	1.78	4.6	3.9	1.8	4.7	4	1.83	4.49	3.82	1.79	1.8	2	1.83	2.1	1.83	2.14	1.77	2.07	0.75	1.85	2.05	1.86	2.07	1.88	2.08	1.85	2.03	3.4	3.6	2.2	3.3	3.5	2.2	3.25	3.5	2.2	3.36	3.56	2.25	3.5	3.4	2.15	3.4	3.5	2.25	3.76	3.65	2.25	3.36	3.51	2.2	1.8	2	1.83	2.09	1.85	2.17	1.79	2.05	0.25	1.99	1.91	1.99	1.95	2.17	1.97	2	1.89
10	E0	2019-08-11T00:00:00Z	1899-12-31T16:30:00Z	Man United	Chelsea	4	0	H	1	0	H	A Taylor	11	18	5	7	15	13	3	5	3	4	0	2.1	3.3	3.5	2.15	3.3	3.5	2.15	3.35	3.4	2.21	3.37	3.63	2.15	3.3	3.5	2.25	3.3	3.5	2.28	3.43	3.63	2.19	3.32	3.49	2	1.8	2.05	1.87	2.1	1.87	2.01	1.83	-0.25	1.9	2	1.9	2.04	1.92	2.04	1.89	2	2.05	3.5	4	2.1	3.3	3.8	2.05	3.3	3.85	2.04	3.44	4.14	2	3.4	4	2.05	3.4	4.1	2.2	3.5	4.4	2.05	3.36	3.99	2	1.8	2.05	1.88	2.07	2.04	1.99	1.84	-0.5	2.02	1.88	2.04	1.9	2.1	1.91	2.04	1.85

Showing 1 to 10 of 208 entries

Previous1 2 3 4 5…21Next

# Number of rows and columns in the full dataset.
dim(pleague)

## [1] 208 106

La base de datos tiene 208 filas y 106 columnas

# Total number of missing (NA) cells across the whole dataset.
sum(is.na(pleague))

## [1] 72

Se encontraron en total dentro de la base 72 NAs

Vamos a filtrar la base solo para las columnas que contienen NAs

library(tidyverse)

# Count the missing values in every column and keep only the columns that
# contain at least one NA.
na_por_columna <- colSums(is.na(pleague))
tiene_na <- unname(na_por_columna) > 0
premier <- pleague[, tiene_na]

DT::datatable(premier)

Show entries

Search:

	HomeTeam	AwayTeam	FTHG	HTHG	Referee	HF	AR	PSA
1	Liverpool	Norwich		4	M Oliver	9
2	West Ham	Man City	0		M Dean	6	0	1.26
3	Bournemouth	Sheffield United	1	0			0	3.9
4	Burnley	Southampton	3	0	G Scott	6	0	2.81
5		Everton	0	0	J Moss	16	1	2.39
6	Watford	Brighton	0	0	C Pawson	15	0	4.37
7	Tottenham		3	0	C Kavagh	13	0	10.96
8	Leicester	Wolves	0	0	A Marriner	3	0	3.66
9	Newcastle	Arsel	0	0	M Atkinson	12	0	1.81
10	Man United	Chelsea	4	1	A Taylor	15	0	3.63

Showing 1 to 10 of 208 entries

Previous1 2 3 4 5…21Next

# Sanity check: the reduced table should hold the same NA total as the full
# dataset, because only NA-free columns were dropped.
sum(is.na(premier))

## [1] 72

Después de averiguar las variables donde hay NAs encontramos la cantidad por columna

# Missing values per column of the reduced table.
colSums(is.na(premier))

## HomeTeam AwayTeam     FTHG     HTHG  Referee       HF       AR      PSA 
##        1       14       10        9        8       10       10       10

# Inspect the column types of the reduced table.
str(premier)

## tibble [208 x 8] (S3: tbl_df/tbl/data.frame)
##  $ HomeTeam: chr [1:208] "Liverpool" "West Ham" "Bournemouth" "Burnley" ...
##  $ AwayTeam: chr [1:208] "Norwich" "Man City" "Sheffield United" "Southampton" ...
##  $ FTHG    : num [1:208] NA 0 1 3 0 0 3 0 0 4 ...
##  $ HTHG    : num [1:208] 4 NA 0 0 0 0 0 0 0 1 ...
##  $ Referee : chr [1:208] "M Oliver" "M Dean" NA "G Scott" ...
##  $ HF      : num [1:208] 9 6 NA 6 16 15 13 3 12 15 ...
##  $ AR      : num [1:208] NA 0 0 0 1 0 0 0 0 0 ...
##  $ PSA     : chr [1:208] NA "1.26" "3.9" "2.81" ...

# PSA was read in as character; convert it to numeric so it can be summarised
# and imputed together with the other numeric columns.
premier$PSA<-as.numeric(premier$PSA)

# Descriptive statistics (including NA counts) before any imputation.
summary(premier)

##    HomeTeam           AwayTeam              FTHG            HTHG       
##  Length:208         Length:208         Min.   :0.000   Min.   :0.0000  
##  Class :character   Class :character   1st Qu.:1.000   1st Qu.:0.0000  
##  Mode  :character   Mode  :character   Median :1.000   Median :0.0000  
##                                        Mean   :1.485   Mean   :0.6784  
##                                        3rd Qu.:2.000   3rd Qu.:1.0000  
##                                        Max.   :8.000   Max.   :5.0000  
##                                        NA's   :10      NA's   :9       
##    Referee                HF              AR               PSA        
##  Length:208         Min.   : 0.00   Min.   :0.00000   Min.   : 1.140  
##  Class :character   1st Qu.: 8.00   1st Qu.:0.00000   1st Qu.: 2.310  
##  Mode  :character   Median :10.00   Median :0.00000   Median : 3.310  
##                     Mean   :10.32   Mean   :0.05051   Mean   : 4.940  
##                     3rd Qu.:13.00   3rd Qu.:0.00000   3rd Qu.: 5.173  
##                     Max.   :21.00   Max.   :1.00000   Max.   :34.710  
##                     NA's   :10      NA's   :10        NA's   :10

PARTE I

Ejercicio 1

1.Hacer imputación de datos con las siguientes opciones:

Omitir las filas con observaciones NA

# Option 1: listwise deletion, i.e. discard every row that has at least one NA.
premier_omitir_na <- premier %>% na.omit()
DT::datatable(premier_omitir_na)

Show entries

Search:

	HomeTeam	AwayTeam	FTHG	HTHG	Referee	HF	PSA
1	Burnley	Southampton	3	0	G Scott	6	2.81
2	Watford	Brighton	0	0	C Pawson	15	4.37
3	Leicester	Wolves	0	0	A Marriner	3	3.66
4	Newcastle	Arsel	0	0	M Atkinson	12	1.81
5	Man United	Chelsea	4	1	A Taylor	15	3.63
6	Arsel	Burnley	2	1	M Dean	13	9.35
7	Aston Villa	Bournemouth	1	0	M Atkinson	10	3.17
8	Brighton	West Ham	1	0	A Taylor	11	2.92
9	Everton	Watford	1	1	L Mason	11	4.87
10	Norwich	Newcastle	3	1	S Attwell	9	3.44

Showing 1 to 10 of 164 entries

Previous1 2 3 4 5…17Next

Al eliminar las filas u observaciones con NAs nos quedaríamos con 164 filas

Imputar con la media

# Option 2: mean imputation. Each NA in the listed numeric columns is replaced
# by that column's mean, computed over the observed (non-NA) values.
# across() + all_of() replaces the superseded mutate_at().
premier_imputados_media <- premier %>%
  mutate(across(
    all_of(c("FTHG", "HTHG", "HF", "AR", "PSA")),
    ~ replace(.x, is.na(.x), mean(.x, na.rm = TRUE))
  ))

Imputar con la mediana

# Option 3: median imputation. Each NA in the listed numeric columns is
# replaced by that column's median, computed over the observed values.
# across() + all_of() replaces the superseded mutate_at().
premier_imputados_mediana <- premier %>%
  mutate(across(
    all_of(c("FTHG", "HTHG", "HF", "AR", "PSA")),
    ~ replace(.x, is.na(.x), median(.x, na.rm = TRUE))
  ))

Reemplazar NAs por cero

# Option 4: zero imputation. Each NA in the listed numeric columns is replaced
# by 0.
# across() + all_of() replaces the superseded mutate_at().
premier_imputados_cero <- premier %>%
  mutate(across(
    all_of(c("FTHG", "HTHG", "HF", "AR", "PSA")),
    ~ replace(.x, is.na(.x), 0)
  ))

Ejercicio 2

2.Analizar, empleando la función summary(), los estadísticos resultantes para cada opción de imputación. Seleccionar la mejor opción.

# Summary statistics after dropping every row that contains an NA.
summary(premier_omitir_na)

##    HomeTeam           AwayTeam              FTHG            HTHG       
##  Length:164         Length:164         Min.   :0.000   Min.   :0.0000  
##  Class :character   Class :character   1st Qu.:1.000   1st Qu.:0.0000  
##  Mode  :character   Mode  :character   Median :1.000   Median :0.0000  
##                                        Mean   :1.433   Mean   :0.6159  
##                                        3rd Qu.:2.000   3rd Qu.:1.0000  
##                                        Max.   :8.000   Max.   :5.0000  
##    Referee                HF              AR               PSA        
##  Length:164         Min.   : 3.00   Min.   :0.00000   Min.   : 1.200  
##  Class :character   1st Qu.: 8.00   1st Qu.:0.00000   1st Qu.: 2.410  
##  Mode  :character   Median :10.00   Median :0.00000   Median : 3.525  
##                     Mean   :10.44   Mean   :0.04878   Mean   : 5.066  
##                     3rd Qu.:13.00   3rd Qu.:0.00000   3rd Qu.: 5.098  
##                     Max.   :21.00   Max.   :1.00000   Max.   :34.710

# Summary statistics after mean imputation.
summary(premier_imputados_media)

##    HomeTeam           AwayTeam              FTHG            HTHG       
##  Length:208         Length:208         Min.   :0.000   Min.   :0.0000  
##  Class :character   Class :character   1st Qu.:1.000   1st Qu.:0.0000  
##  Mode  :character   Mode  :character   Median :1.000   Median :0.0000  
##                                        Mean   :1.485   Mean   :0.6784  
##                                        3rd Qu.:2.000   3rd Qu.:1.0000  
##                                        Max.   :8.000   Max.   :5.0000  
##    Referee                HF              AR               PSA        
##  Length:208         Min.   : 0.00   Min.   :0.00000   Min.   : 1.140  
##  Class :character   1st Qu.: 8.00   1st Qu.:0.00000   1st Qu.: 2.330  
##  Mode  :character   Median :10.00   Median :0.00000   Median : 3.470  
##                     Mean   :10.32   Mean   :0.05051   Mean   : 4.940  
##                     3rd Qu.:13.00   3rd Qu.:0.00000   3rd Qu.: 4.942  
##                     Max.   :21.00   Max.   :1.00000   Max.   :34.710

# Summary statistics after median imputation.
summary(premier_imputados_mediana)

##    HomeTeam           AwayTeam              FTHG            HTHG      
##  Length:208         Length:208         Min.   :0.000   Min.   :0.000  
##  Class :character   Class :character   1st Qu.:1.000   1st Qu.:0.000  
##  Mode  :character   Mode  :character   Median :1.000   Median :0.000  
##                                        Mean   :1.462   Mean   :0.649  
##                                        3rd Qu.:2.000   3rd Qu.:1.000  
##                                        Max.   :8.000   Max.   :5.000  
##    Referee                HF              AR               PSA        
##  Length:208         Min.   : 0.00   Min.   :0.00000   Min.   : 1.140  
##  Class :character   1st Qu.: 8.00   1st Qu.:0.00000   1st Qu.: 2.330  
##  Mode  :character   Median :10.00   Median :0.00000   Median : 3.310  
##                     Mean   :10.31   Mean   :0.04808   Mean   : 4.862  
##                     3rd Qu.:13.00   3rd Qu.:0.00000   3rd Qu.: 4.890  
##                     Max.   :21.00   Max.   :1.00000   Max.   :34.710

# Summary statistics after replacing NAs with zero.
summary(premier_imputados_cero)

##    HomeTeam           AwayTeam              FTHG            HTHG      
##  Length:208         Length:208         Min.   :0.000   Min.   :0.000  
##  Class :character   Class :character   1st Qu.:1.000   1st Qu.:0.000  
##  Mode  :character   Mode  :character   Median :1.000   Median :0.000  
##                                        Mean   :1.413   Mean   :0.649  
##                                        3rd Qu.:2.000   3rd Qu.:1.000  
##                                        Max.   :8.000   Max.   :5.000  
##    Referee                HF               AR               PSA        
##  Length:208         Min.   : 0.000   Min.   :0.00000   Min.   : 0.000  
##  Class :character   1st Qu.: 7.000   1st Qu.:0.00000   1st Qu.: 2.150  
##  Mode  :character   Median :10.000   Median :0.00000   Median : 3.175  
##                     Mean   : 9.827   Mean   :0.04808   Mean   : 4.702  
##                     3rd Qu.:13.000   3rd Qu.:0.00000   3rd Qu.: 4.890  
##                     Max.   :21.000   Max.   :1.00000   Max.   :34.710

Comparación para la variable FTHG

Estadística	sin Imputar	FTHG omitir_na	FTHG media	FTHG mediana	FTHG cero
Min	0.000	0.000	0.000	0.000	0.000
1st Qu	1.000	1.000	1.000	1.000	1.000
Median	1.000	1.000	1.000	1.000	1.000
Mean	1.485	1.433	1.485	1.462	1.413
3rd Qu	2.000	2.000	2.000	2.000	2.000
Max	8.00	8.000	8.000	8.000	8.000

Comparación para la variable HTHG

Estadística	sin imputar	HTHG omitir_na	HTHG media	HTHG mediana	HTHG cero
Min	0.00	0.000	0.000	0.000	0.000
1st Qu	0.000	0.000	0.000	0.000	0.000
Median	0.000	0.000	0.000	0.000	0.000
Mean	0.6784	0.6159	0.6784	0.649	0.649
3rd Qu	1.000	1.000	1.000	1.000	1.000
Max	5.000	5.000	5.000	5.000	5.000

Comparación para la variable HF

Estadística	sin imputar	HF omitir_na	HF media	HF mediana	HF cero
Min	0.000	3.000	0.000	0.000	0.000
1st Qu	8.000	8.000	8.000	8.000	7.000
Median	10.000	10.000	10.000	10.000	10.000
Mean	10.32	10.44	10.320	10.31	9.827
3rd Qu	13.000	13.000	13.000	13.000	13.000
Max	21.00	21.000	21.000	21.000	21.000

Comparación para la variable AR

Estadística	Sin Imputar	AR omitir_na	AR media	AR mediana	AR cero
Min	0.000	0.000	0.000	0.000	0.000
1st Qu	0.000	0.000	0.000	0.000	0.000
Median	0.000	0.000	0.000	0.000	0.000
Mean	0.05051	0.04878	0.05051	0.04808	0.04808
3rd Qu	0.000	0.000	0.000	0.000	0.000
Max	1.0000	1.000	1.000	1.000	1.000

Ejercicio 3

3.Con la librería tidyverse, el operador %>% y las funciones filter() y ggplot() realizar un análisis descriptivo del dataset seleccionado en el numeral 2. considerando:
- La combinación de tres (3) variables entre si

# Attach the full-time result from the complete dataset to the median-imputed
# table so it can be used as the colour grouping below.
premier_imputados_mediana$FTR <- pleague$FTR

library(GGally)

# Pairwise plot matrix of the five numeric variables, coloured by FTR.
columnas_numericas <- c("HTHG", "FTHG", "HF", "AR", "PSA")
ggpairs(
  premier_imputados_mediana,
  mapping = ggplot2::aes(colour = FTR),
  columns = columnas_numericas
)

## Warning in cor(x, y): the standard deviation is zero

## Warning in cor(x, y): the standard deviation is zero

## Warning in cor(x, y): the standard deviation is zero

## Warning in cor(x, y): the standard deviation is zero

Mínimo dos gráficas de dispersión

library(ggplot2)

# Scatter plot of home-team shots (HS) against full-time home goals (FTHG),
# with one colour per full-time result (FTR).
ggplot(pleague, aes(HS, FTHG, colour = FTR)) +
  geom_point() +
  ggtitle("HS vs FTHG")

Mínimo dos gráficos de boxplot

library(plotly)

# Box plot of full-time home goals with one box per full-time result,
# rendered as an interactive plotly widget.
p1 <- ggplot(premier_imputados_mediana, aes(y = FTHG, colour = FTR)) +
  geom_boxplot() +
  ggtitle("Goles del equipo local ")

ggplotly(p1)

# Same variable, this time with one box per home team.
p2 <- ggplot(premier_imputados_mediana, aes(y = FTHG, colour = HomeTeam)) +
  geom_boxplot() +
  ggtitle("Goles del equipo local ")

ggplotly(p2)

Variable PSA

# Interactive box plot of the PSA column after median imputation.
q <- ggplot(premier_imputados_mediana, aes(y = PSA)) +
  geom_boxplot(fill = "steelblue") +
  ggtitle("PSA")
ggplotly(q)

Equipo con mas goles local

# Rank home teams by their mean full-time home goals and plot the best ones.
# The number of teams shown is defined once and reused in the title, so the
# caption can no longer disagree with the number of bars.
n_top <- 10

goles_local <- premier_imputados_mediana %>%
  group_by(HomeTeam) %>%
  summarise(mediagoles = mean(FTHG)) %>%
  arrange(desc(mediagoles))

# head() never pads with NA rows when fewer than n_top teams are available.
top_local <- head(goles_local, n_top)

# Fix the factor level order so the bars follow the ranking instead of the
# default alphabetical order.
top_local$HomeTeam <- factor(top_local$HomeTeam, levels = top_local$HomeTeam)

or <- ggplot(top_local, aes(x = HomeTeam, y = mediagoles, fill = HomeTeam)) +
  geom_col() +
  ggtitle(paste0(
    "Los ", nrow(top_local), " equipos con más goles en media de local"
  ))
ggplotly(or)

Observamos que el equipo más goleador de local es el Liverpool con una media de 2.4 goles por partido

PARTE II

Ejercicio 4

1. Imputar datos para una variable categórica empleando la moda

Para calcular la moda usaremos la siguiente función:

#' Most frequent value (statistical mode) of a vector, ignoring NAs
#'
#' Missing values are excluded before counting, so NA is never reported as the
#' mode even when it is the most frequent entry. This matters because the
#' result is used to fill in NAs. Ties are resolved in favour of the value
#' that appears first in `v`.
#'
#' @param v An atomic vector (character, numeric, logical, factor, ...).
#' @return A length-one vector of the same type as `v` holding the mode, or an
#'   NA of that type when `v` is empty or contains only NAs.
getmode <- function(v) {
  observed <- v[!is.na(v)]
  if (length(observed) == 0L) {
    # Indexing with NA yields a single NA that keeps the type of `v`.
    return(v[NA_integer_])
  }
  uniqv <- unique(observed)
  # match() maps each value to its position in uniqv; tabulate() counts them.
  uniqv[which.max(tabulate(match(observed, uniqv)))]
}

# Impute the missing home team with the most frequent home team, working on
# a separate copy of the table.
premier2 <- premier
falta_local <- is.na(premier2$HomeTeam)
premier2$HomeTeam[falta_local] <- getmode(premier$HomeTeam)
DT::datatable(premier2)

Show entries

Search:

	HomeTeam	AwayTeam	FTHG	HTHG	Referee	HF	AR	PSA
1	Liverpool	Norwich		4	M Oliver	9
2	West Ham	Man City	0		M Dean	6	0	1.26
3	Bournemouth	Sheffield United	1	0			0	3.9
4	Burnley	Southampton	3	0	G Scott	6	0	2.81
5	Burnley	Everton	0	0	J Moss	16	1	2.39
6	Watford	Brighton	0	0	C Pawson	15	0	4.37
7	Tottenham		3	0	C Kavagh	13	0	10.96
8	Leicester	Wolves	0	0	A Marriner	3	0	3.66
9	Newcastle	Arsel	0	0	M Atkinson	12	0	1.81
10	Man United	Chelsea	4	1	A Taylor	15	0	3.63

Showing 1 to 10 of 208 entries

Previous1 2 3 4 5…21Next

Ejercicio 5

Imputar datos para una variable categórica con cadenas de texto

# Fill missing away teams with an explicit placeholder label.
premier2 <- replace_na(premier2, list(AwayTeam = "equipo_desconocido"))
DT::datatable(premier2)

Show entries

Search:

	HomeTeam	AwayTeam	FTHG	HTHG	Referee	HF	AR	PSA
1	Liverpool	Norwich		4	M Oliver	9
2	West Ham	Man City	0		M Dean	6	0	1.26
3	Bournemouth	Sheffield United	1	0			0	3.9
4	Burnley	Southampton	3	0	G Scott	6	0	2.81
5	Burnley	Everton	0	0	J Moss	16	1	2.39
6	Watford	Brighton	0	0	C Pawson	15	0	4.37
7	Tottenham	equipo_desconocido	3	0	C Kavagh	13	0	10.96
8	Leicester	Wolves	0	0	A Marriner	3	0	3.66
9	Newcastle	Arsel	0	0	M Atkinson	12	0	1.81
10	Man United	Chelsea	4	1	A Taylor	15	0	3.63

Showing 1 to 10 of 208 entries

Previous1 2 3 4 5…21Next

Ejercicio 6

Hacer dos imputaciones con el paquete “mice”

En este caso se imputarán las variables FTHG y HTHG

# Columns to impute with mice.
columns <- c("FTHG", "HTHG")

# Single mean-imputation pass (m = 1 completed dataset, maxit = 1 iteration)
# with a fixed seed. `printFlag = FALSE` is spelled out in full instead of
# relying on partial matching of `print` and on the reassignable `F`.
imputed_data <- mice(
  premier2[, names(premier2) %in% columns],
  m = 1,
  maxit = 1,
  method = "mean",
  seed = 2018,
  printFlag = FALSE
)

# Extract the completed data set from the mice result.
complete.data <- mice::complete(imputed_data)

# Goals are counts, so the imputed means are rounded to whole numbers.
premier2$FTHG <- round(complete.data$FTHG)
premier2$HTHG <- round(complete.data$HTHG)
DT::datatable(premier2)

Show entries

Search:

	HomeTeam	AwayTeam	FTHG	HTHG	Referee	HF	AR	PSA
1	Liverpool	Norwich	1	4	M Oliver	9
2	West Ham	Man City	0	1	M Dean	6	0	1.26
3	Bournemouth	Sheffield United	1	0			0	3.9
4	Burnley	Southampton	3	0	G Scott	6	0	2.81
5	Burnley	Everton	0	0	J Moss	16	1	2.39
6	Watford	Brighton	0	0	C Pawson	15	0	4.37
7	Tottenham	equipo_desconocido	3	0	C Kavagh	13	0	10.96
8	Leicester	Wolves	0	0	A Marriner	3	0	3.66
9	Newcastle	Arsel	0	0	M Atkinson	12	0	1.81
10	Man United	Chelsea	4	1	A Taylor	15	0	3.63

Showing 1 to 10 of 208 entries

Previous1 2 3 4 5…21Next

# Remaining missing values per column after the imputations applied so far.
colSums(is.na(premier2))

## HomeTeam AwayTeam     FTHG     HTHG  Referee       HF       AR      PSA 
##        0        0        0        0        8       10       10       10

Ejercicio 7

Convertir una variable categórica en numérica

Convertimos la variable HomeTeam en variable numérica

library(caret) # provides the dummyVars function
library(fastDummies)

# One-hot encode HomeTeam into one indicator column per team, then drop the
# original text column.
disci_dummie <- premier2 %>%
  dummy_cols(select_columns = "HomeTeam") %>%
  select(-HomeTeam)
DT::datatable(disci_dummie)

Show entries

Search:

	AwayTeam	FTHG	HTHG	Referee	HF	AR	PSA	HomeTeam_Bournemouth	HomeTeam_Burnley	HomeTeam_Leicester	HomeTeam_Liverpool	HomeTeam_Man United	HomeTeam_Newcastle	HomeTeam_Tottenham	HomeTeam_Watford	HomeTeam_West Ham
1	Norwich	1	4	M Oliver	9			0	0	0	1	0	0	0	0	0
2	Man City	0	1	M Dean	6	0	1.26	0	0	0	0	0	0	0	0	1
3	Sheffield United	1	0			0	3.9	1	0	0	0	0	0	0	0	0
4	Southampton	3	0	G Scott	6	0	2.81	0	1	0	0	0	0	0	0	0
5	Everton	0	0	J Moss	16	1	2.39	0	1	0	0	0	0	0	0	0
6	Brighton	0	0	C Pawson	15	0	4.37	0	0	0	0	0	0	0	1	0
7	equipo_desconocido	3	0	C Kavagh	13	0	10.96	0	0	0	0	0	0	1	0	0
8	Wolves	0	0	A Marriner	3	0	3.66	0	0	1	0	0	0	0	0	0
9	Arsel	0	0	M Atkinson	12	0	1.81	0	0	0	0	0	1	0	0	0
10	Chelsea	4	1	A Taylor	15	0	3.63	0	0	0	0	1	0	0	0	0

Showing 1 to 10 of 208 entries

Previous1 2 3 4 5…21Next

Ejercicio 8

1. Discretizar una de las variables

# Discretise home-team shots (HS) into three labelled bins, cut at the 33% and
# 66% sample quantiles.
# include.lowest = TRUE is required because the first break is the sample
# minimum: with right-closed intervals the minimum would otherwise fall
# outside the first bin and be turned into NA.
HS2 <- cut(
  premier_league$HS,
  breaks = quantile(premier_league$HS, probs = c(0, 0.33, 0.66, 1)),
  labels = c("local_defensivo", "local_no_ofensivo", "local_ofensivo"),
  right = TRUE,
  include.lowest = TRUE
)

# Show the home team and raw shot count next to the resulting category.
ofensivo <- data.frame(premier_league %>% select("HomeTeam", "HS"), HS2)
DT::datatable(ofensivo)

Show entries

Search:

	HomeTeam	HS	HS2
1	Liverpool	15	local_no_ofensivo
2	West Ham	5	local_defensivo
3	Bournemouth	13	local_no_ofensivo
4	Burnley	10	local_defensivo
5		6	local_defensivo
6	Watford	11	local_defensivo
7	Tottenham	31	local_ofensivo
8	Leicester	15	local_no_ofensivo
9	Newcastle	9	local_defensivo
10	Man United	11	local_defensivo

Showing 1 to 10 of 208 entries

Previous1 2 3 4 5…21Next

Parcial 1

Santiago Rodriguez, John Gonzalez, Wilson Sandoval

2021-09-02

PARTE I

Ejercicio 1

Omitir las filas con observaciones NA

Imputar con la media

Imputar con la mediana

Reemplazar NAs por cero

Ejercicio 2

Ejercicio 3

PARTE II

Ejercicio 4

Ejercicio 5

Ejercicio 6

Ejercicio 7

Ejercicio 8