Proceso
- Cargar librerías
- Cargar los datos (conjunto FIFA)
- Definir variables independientes.
- Limpiar el conjunto de datos.
- Partir en datos de entrenamiento y validación
- Crear modelo de regresión lineal múltiple
\(y = b_0 + b_1x_1 + b_2x_2 + b_3x_3 + ... + b_kx_k\)
- Predecir con el conjunto de datos de validación
- Interpretar el caso, 180 a 200 palabras
- Cargar librerías
library(readr)
library(dplyr)
library(caret)
library(knitr)
- Cargar los datos (conjunto FIFA)
# Read the raw FIFA player table from the local CSV export, declaring the
# file's text as UTF-8.
datos <- read.csv(
  file = "C:/Users/cinth/Documents/ITD/Analisis inteligente de datos/Datos/data.csv",
  encoding = "UTF-8"
)

# Keep the response column (Value) together with the candidate predictor
# columns; every other column of the raw table is left out.
datos_slctd <- datos %>%
  select(
    Value, Age, Overall, Potential,
    Wage, Special, International.Reputation, Weak.Foot,
    Skill.Moves, Jersey.Number, Height, Weight,
    Crossing, Finishing, HeadingAccuracy, ShortPassing,
    Volleys, Dribbling, Curve, FKAccuracy,
    LongPassing, BallControl, Acceleration, SprintSpeed,
    Agility, Reactions, Balance, ShotPower,
    Jumping, Stamina, Strength, LongShots,
    Aggression, Interceptions, Positioning, Vision,
    Penalties, Composure, Marking, StandingTackle,
    SlidingTackle
  )
# Show the first ten rows of the selected columns as a table.
# kable() expects `row.names` to be a single logical (or NA); a vector such as
# 1:nrow(...) ends up inside an `if ()` condition and fails on R >= 4.2.
# TRUE prints the row labels.
kable(
  head(datos_slctd, 10),
  caption = "Variables numéricas seleccionadas (primeros díez registros)",
  row.names = TRUE
)
Variables numéricas seleccionadas (primeros díez registros)
| 1 |
€110.5M |
31 |
94 |
94 |
€565K |
2202 |
5 |
4 |
4 |
10 |
5’7 |
159lbs |
84 |
95 |
70 |
90 |
86 |
97 |
93 |
94 |
87 |
96 |
91 |
86 |
91 |
95 |
95 |
85 |
68 |
72 |
59 |
94 |
48 |
22 |
94 |
94 |
75 |
96 |
33 |
28 |
26 |
| 2 |
€77M |
33 |
94 |
94 |
€405K |
2228 |
5 |
4 |
5 |
7 |
6’2 |
183lbs |
84 |
94 |
89 |
81 |
87 |
88 |
81 |
76 |
77 |
94 |
89 |
91 |
87 |
96 |
70 |
95 |
95 |
88 |
79 |
93 |
63 |
29 |
95 |
82 |
85 |
95 |
28 |
31 |
23 |
| 3 |
€118.5M |
26 |
92 |
93 |
€290K |
2143 |
5 |
5 |
5 |
10 |
5’9 |
150lbs |
79 |
87 |
62 |
84 |
84 |
96 |
88 |
87 |
78 |
95 |
94 |
90 |
96 |
94 |
84 |
80 |
61 |
81 |
49 |
82 |
56 |
36 |
89 |
87 |
81 |
94 |
27 |
24 |
33 |
| 4 |
€72M |
27 |
91 |
93 |
€260K |
1471 |
4 |
3 |
1 |
1 |
6’4 |
168lbs |
17 |
13 |
21 |
50 |
13 |
18 |
21 |
19 |
51 |
42 |
57 |
58 |
60 |
90 |
43 |
31 |
67 |
43 |
64 |
12 |
38 |
30 |
12 |
68 |
40 |
68 |
15 |
21 |
13 |
| 5 |
€102M |
27 |
91 |
92 |
€355K |
2281 |
4 |
5 |
4 |
7 |
5’11 |
154lbs |
93 |
82 |
55 |
92 |
82 |
86 |
85 |
83 |
91 |
91 |
78 |
76 |
79 |
91 |
77 |
91 |
63 |
90 |
75 |
91 |
76 |
61 |
87 |
94 |
79 |
88 |
68 |
58 |
51 |
| 6 |
€93M |
27 |
91 |
91 |
€340K |
2142 |
4 |
4 |
4 |
10 |
5’8 |
163lbs |
81 |
84 |
61 |
89 |
80 |
95 |
83 |
79 |
83 |
94 |
94 |
88 |
95 |
90 |
94 |
82 |
56 |
83 |
66 |
80 |
54 |
41 |
87 |
89 |
86 |
91 |
34 |
27 |
22 |
| 7 |
€67M |
32 |
91 |
91 |
€420K |
2280 |
4 |
4 |
4 |
10 |
5’8 |
146lbs |
86 |
72 |
55 |
93 |
76 |
90 |
85 |
78 |
88 |
93 |
80 |
72 |
93 |
90 |
94 |
79 |
68 |
89 |
58 |
82 |
62 |
83 |
79 |
92 |
82 |
84 |
60 |
76 |
73 |
| 8 |
€80M |
31 |
91 |
91 |
€455K |
2346 |
5 |
4 |
3 |
9 |
6’0 |
190lbs |
77 |
93 |
77 |
82 |
88 |
87 |
86 |
84 |
64 |
90 |
86 |
75 |
82 |
92 |
83 |
86 |
69 |
90 |
83 |
85 |
87 |
41 |
92 |
84 |
85 |
85 |
62 |
45 |
38 |
| 9 |
€51M |
32 |
91 |
91 |
€380K |
2201 |
4 |
3 |
3 |
15 |
6’0 |
181lbs |
66 |
60 |
91 |
78 |
66 |
63 |
74 |
72 |
77 |
84 |
76 |
75 |
78 |
85 |
66 |
79 |
93 |
84 |
83 |
59 |
88 |
90 |
60 |
63 |
75 |
82 |
87 |
92 |
91 |
| 10 |
€68M |
25 |
90 |
93 |
€94K |
1331 |
3 |
3 |
1 |
1 |
6’2 |
192lbs |
13 |
11 |
15 |
29 |
13 |
12 |
13 |
14 |
26 |
16 |
43 |
60 |
67 |
86 |
49 |
22 |
76 |
41 |
78 |
12 |
34 |
19 |
11 |
70 |
11 |
70 |
27 |
12 |
18 |
# Show the last ten rows of the selected columns as a table.
# kable() expects `row.names` to be a single logical (or NA); a vector such as
# 1:nrow(...) ends up inside an `if ()` condition and fails on R >= 4.2.
# TRUE prints the row labels.
kable(
  tail(datos_slctd, 10),
  caption = "Variables numéricas seleccionadas (últimos díez registros)",
  row.names = TRUE
)
Variables numéricas seleccionadas (últimos díez registros)
| 18198 |
€60K |
18 |
47 |
61 |
€1K |
1362 |
1 |
3 |
2 |
14 |
5’10 |
141lbs |
44 |
44 |
36 |
53 |
43 |
50 |
48 |
46 |
52 |
51 |
68 |
62 |
58 |
41 |
62 |
50 |
55 |
50 |
38 |
37 |
37 |
28 |
39 |
48 |
49 |
52 |
41 |
47 |
38 |
| 18199 |
€60K |
18 |
47 |
70 |
€1K |
792 |
1 |
2 |
1 |
22 |
5’11 |
154lbs |
14 |
8 |
14 |
19 |
8 |
10 |
13 |
10 |
21 |
11 |
18 |
24 |
22 |
36 |
47 |
26 |
56 |
20 |
38 |
5 |
25 |
6 |
5 |
37 |
14 |
34 |
15 |
11 |
13 |
| 18200 |
€70K |
18 |
47 |
69 |
€1K |
1303 |
1 |
3 |
2 |
65 |
5’6 |
150lbs |
31 |
31 |
41 |
51 |
26 |
46 |
35 |
31 |
55 |
47 |
60 |
63 |
53 |
46 |
55 |
49 |
57 |
42 |
43 |
30 |
53 |
49 |
35 |
40 |
36 |
40 |
48 |
49 |
49 |
| 18201 |
€60K |
18 |
47 |
62 |
€1K |
1203 |
1 |
2 |
2 |
21 |
5’9 |
157lbs |
28 |
47 |
47 |
42 |
37 |
39 |
32 |
25 |
30 |
41 |
65 |
48 |
64 |
54 |
80 |
44 |
77 |
31 |
31 |
51 |
26 |
16 |
46 |
37 |
58 |
50 |
15 |
17 |
14 |
| 18202 |
€60K |
18 |
47 |
68 |
€1K |
1098 |
1 |
3 |
2 |
29 |
6’1 |
168lbs |
22 |
23 |
45 |
25 |
27 |
21 |
21 |
27 |
27 |
32 |
52 |
52 |
39 |
43 |
48 |
39 |
74 |
39 |
52 |
16 |
44 |
45 |
20 |
31 |
38 |
43 |
44 |
47 |
53 |
| 18203 |
€60K |
19 |
47 |
65 |
€1K |
1307 |
1 |
2 |
2 |
22 |
5’9 |
134lbs |
34 |
38 |
40 |
49 |
25 |
42 |
30 |
34 |
45 |
43 |
54 |
57 |
60 |
49 |
76 |
43 |
55 |
40 |
47 |
38 |
46 |
46 |
39 |
52 |
43 |
45 |
40 |
48 |
47 |
| 18204 |
€60K |
19 |
47 |
63 |
€1K |
1098 |
1 |
2 |
2 |
21 |
6’3 |
170lbs |
23 |
52 |
52 |
43 |
36 |
39 |
32 |
20 |
25 |
40 |
41 |
39 |
38 |
40 |
52 |
41 |
47 |
43 |
67 |
42 |
47 |
16 |
46 |
33 |
43 |
42 |
22 |
15 |
19 |
| 18205 |
€60K |
16 |
47 |
67 |
€1K |
1189 |
1 |
3 |
2 |
33 |
5’8 |
148lbs |
25 |
40 |
46 |
38 |
38 |
45 |
38 |
27 |
28 |
44 |
70 |
69 |
50 |
47 |
58 |
45 |
60 |
55 |
32 |
45 |
32 |
15 |
48 |
43 |
55 |
41 |
32 |
13 |
11 |
| 18206 |
€60K |
17 |
47 |
66 |
€1K |
1228 |
1 |
3 |
2 |
34 |
5’10 |
154lbs |
44 |
50 |
39 |
42 |
40 |
51 |
34 |
32 |
32 |
52 |
61 |
60 |
52 |
21 |
71 |
64 |
42 |
40 |
48 |
34 |
33 |
22 |
44 |
47 |
50 |
46 |
20 |
25 |
27 |
| 18207 |
€60K |
16 |
46 |
66 |
€1K |
1321 |
1 |
3 |
2 |
33 |
5’10 |
176lbs |
41 |
34 |
46 |
48 |
30 |
43 |
40 |
34 |
44 |
51 |
57 |
55 |
55 |
51 |
63 |
43 |
62 |
47 |
60 |
32 |
56 |
42 |
34 |
49 |
33 |
43 |
40 |
43 |
50 |
- Definir variables independientes.
# Report which column plays the role of the response variable.
paste0("La variable dependiente 'y' es Value")
[1] "La variable dependiente 'y' es Value"
# Report that the predictors are the columns selected earlier in the script.
paste0("Las variables independientes X1, X2, X3 ... Xn, son las elegidas anteriormente")
[1] "Las variables independientes X1, X2, X3 ... Xn, son las elegidas anteriormente"
- Limpiar el conjunto de datos.
# Parse the numeric part of a money string by dropping its first and last
# character, e.g. "€110.5M" -> 110.5 and "€60K" -> 60.
#
# m_k: character vector of amounts with a one-character prefix and a
#      one-character suffix.
# Returns a numeric vector of the same length. The suffix is NOT applied
# here: the caller multiplies by the matching magnitude. Strings with nothing
# between prefix and suffix (e.g. "€0") yield NA.
#
# Side effect: sets options(scipen = 999), so numbers printed afterwards in
# the session avoid scientific notation.
mk_to_pesos <- function(m_k) {
  options(scipen = 999)
  as.numeric(substr(m_k, start = 2, stop = nchar(m_k) - 1))
}
# Convert a feet/inches height string such as "5'7" or "5'11" to metres.
#
# height: character vector where character 1 is the feet digit and
#         characters 3-4 hold the inches (character 2 is the separator).
# Returns a numeric vector of heights in metres rounded to two decimals;
# strings that cannot be parsed give NA.
feet_inch_to_m <- function(height) {
  feet <- as.numeric(substr(height, 1, 1))
  inches <- as.numeric(substr(height, 3, 4))
  # 1 ft = 30.48 cm, 1 in = 2.54 cm; total in centimetres first.
  centimetres <- feet * 30.48 + inches * 2.54
  round(centimetres / 100, 2)
}
# Convert a weight string in pounds (e.g. "159lbs") to kilograms.
#
# pounds: character vector of weights, optionally ending in "lbs".
# Returns a numeric vector of weights in kilograms rounded to two decimals;
# empty or non-numeric strings give NA.
lb_to_kg <- function(pounds) {
  # Remove the unit suffix rather than taking a fixed three-character prefix,
  # so weights with fewer or more than three digits are parsed correctly.
  libras <- as.numeric(sub("\\s*lbs$", "", pounds))
  round(libras * 0.453592, 2)
}
# Return `words` with all of its names converted to lower case.
#
# words: any named object (here a data frame, so the column names change).
# Returns the same object with lower-cased names; the values are untouched.
a_minusculas <- function(words) {
  setNames(words, tolower(names(words)))
}
# Clean the selected columns in one pipeline:
#   1. Valor   : Value as a plain number (suffix "M" -> x 1,000,000,
#                anything else -> x 1,000); rows where Valor is NA or not
#                positive are dropped by filter().
#   2. Salario : Wage converted with the same rule.
#   3. Height_m / Weight_kg : height in metres and weight in kilograms.
#   4. Column names are lower-cased and the original text columns removed.
datos_slctd <- datos_slctd %>%
  mutate(
    Valor = ifelse(
      substr(Value, nchar(Value), nchar(Value)) == "M",
      mk_to_pesos(Value) * 1000000,
      mk_to_pesos(Value) * 1000
    )
  ) %>%
  filter(Valor > 0) %>%
  mutate(
    Salario = ifelse(
      substr(Wage, nchar(Wage), nchar(Wage)) == "M",
      mk_to_pesos(Wage) * 1000000,
      mk_to_pesos(Wage) * 1000
    )
  ) %>%
  # NOTE(review): this repeats the filter on Valor, so it removes nothing new;
  # possibly Salario > 0 was intended - confirm before changing, since it
  # would alter the rows used by the model.
  filter(Valor > 0) %>%
  mutate(Height_m = feet_inch_to_m(Height)) %>%
  mutate(Weight_kg = lb_to_kg(Weight)) %>%
  a_minusculas() %>%
  select(-value, -wage, -height, -weight)
# Show the first ten rows of the cleaned data set as a table.
# kable() expects `row.names` to be a single logical (or NA); a vector such as
# 1:nrow(...) ends up inside an `if ()` condition and fails on R >= 4.2.
# TRUE prints the row labels.
kable(
  head(datos_slctd, 10),
  caption = "Variables numéricas seleccionadas (primeros díez registros)",
  row.names = TRUE
)
Variables numéricas seleccionadas (primeros díez registros)
| 1 |
31 |
94 |
94 |
2202 |
5 |
4 |
4 |
10 |
84 |
95 |
70 |
90 |
86 |
97 |
93 |
94 |
87 |
96 |
91 |
86 |
91 |
95 |
95 |
85 |
68 |
72 |
59 |
94 |
48 |
22 |
94 |
94 |
75 |
96 |
33 |
28 |
26 |
110500000 |
565000 |
1.70 |
72.12 |
| 2 |
33 |
94 |
94 |
2228 |
5 |
4 |
5 |
7 |
84 |
94 |
89 |
81 |
87 |
88 |
81 |
76 |
77 |
94 |
89 |
91 |
87 |
96 |
70 |
95 |
95 |
88 |
79 |
93 |
63 |
29 |
95 |
82 |
85 |
95 |
28 |
31 |
23 |
77000000 |
405000 |
1.88 |
83.01 |
| 3 |
26 |
92 |
93 |
2143 |
5 |
5 |
5 |
10 |
79 |
87 |
62 |
84 |
84 |
96 |
88 |
87 |
78 |
95 |
94 |
90 |
96 |
94 |
84 |
80 |
61 |
81 |
49 |
82 |
56 |
36 |
89 |
87 |
81 |
94 |
27 |
24 |
33 |
118500000 |
290000 |
1.75 |
68.04 |
| 4 |
27 |
91 |
93 |
1471 |
4 |
3 |
1 |
1 |
17 |
13 |
21 |
50 |
13 |
18 |
21 |
19 |
51 |
42 |
57 |
58 |
60 |
90 |
43 |
31 |
67 |
43 |
64 |
12 |
38 |
30 |
12 |
68 |
40 |
68 |
15 |
21 |
13 |
72000000 |
260000 |
1.93 |
76.20 |
| 5 |
27 |
91 |
92 |
2281 |
4 |
5 |
4 |
7 |
93 |
82 |
55 |
92 |
82 |
86 |
85 |
83 |
91 |
91 |
78 |
76 |
79 |
91 |
77 |
91 |
63 |
90 |
75 |
91 |
76 |
61 |
87 |
94 |
79 |
88 |
68 |
58 |
51 |
102000000 |
355000 |
1.80 |
69.85 |
| 6 |
27 |
91 |
91 |
2142 |
4 |
4 |
4 |
10 |
81 |
84 |
61 |
89 |
80 |
95 |
83 |
79 |
83 |
94 |
94 |
88 |
95 |
90 |
94 |
82 |
56 |
83 |
66 |
80 |
54 |
41 |
87 |
89 |
86 |
91 |
34 |
27 |
22 |
93000000 |
340000 |
1.73 |
73.94 |
| 7 |
32 |
91 |
91 |
2280 |
4 |
4 |
4 |
10 |
86 |
72 |
55 |
93 |
76 |
90 |
85 |
78 |
88 |
93 |
80 |
72 |
93 |
90 |
94 |
79 |
68 |
89 |
58 |
82 |
62 |
83 |
79 |
92 |
82 |
84 |
60 |
76 |
73 |
67000000 |
420000 |
1.73 |
66.22 |
| 8 |
31 |
91 |
91 |
2346 |
5 |
4 |
3 |
9 |
77 |
93 |
77 |
82 |
88 |
87 |
86 |
84 |
64 |
90 |
86 |
75 |
82 |
92 |
83 |
86 |
69 |
90 |
83 |
85 |
87 |
41 |
92 |
84 |
85 |
85 |
62 |
45 |
38 |
80000000 |
455000 |
1.83 |
86.18 |
| 9 |
32 |
91 |
91 |
2201 |
4 |
3 |
3 |
15 |
66 |
60 |
91 |
78 |
66 |
63 |
74 |
72 |
77 |
84 |
76 |
75 |
78 |
85 |
66 |
79 |
93 |
84 |
83 |
59 |
88 |
90 |
60 |
63 |
75 |
82 |
87 |
92 |
91 |
51000000 |
380000 |
1.83 |
82.10 |
| 10 |
25 |
90 |
93 |
1331 |
3 |
3 |
1 |
1 |
13 |
11 |
15 |
29 |
13 |
12 |
13 |
14 |
26 |
16 |
43 |
60 |
67 |
86 |
49 |
22 |
76 |
41 |
78 |
12 |
34 |
19 |
11 |
70 |
11 |
70 |
27 |
12 |
18 |
68000000 |
94000 |
1.88 |
87.09 |
# Show the last ten rows of the cleaned data set as a table.
# kable() expects `row.names` to be a single logical (or NA); a vector such as
# 1:nrow(...) ends up inside an `if ()` condition and fails on R >= 4.2.
# TRUE prints the row labels.
kable(
  tail(datos_slctd, 10),
  caption = "Variables numéricas seleccionadas (últimos díez registros)",
  row.names = TRUE
)
Variables numéricas seleccionadas (últimos díez registros)
| 17946 |
18 |
47 |
61 |
1362 |
1 |
3 |
2 |
14 |
44 |
44 |
36 |
53 |
43 |
50 |
48 |
46 |
52 |
51 |
68 |
62 |
58 |
41 |
62 |
50 |
55 |
50 |
38 |
37 |
37 |
28 |
39 |
48 |
49 |
52 |
41 |
47 |
38 |
60000 |
1000 |
1.78 |
63.96 |
| 17947 |
18 |
47 |
70 |
792 |
1 |
2 |
1 |
22 |
14 |
8 |
14 |
19 |
8 |
10 |
13 |
10 |
21 |
11 |
18 |
24 |
22 |
36 |
47 |
26 |
56 |
20 |
38 |
5 |
25 |
6 |
5 |
37 |
14 |
34 |
15 |
11 |
13 |
60000 |
1000 |
1.80 |
69.85 |
| 17948 |
18 |
47 |
69 |
1303 |
1 |
3 |
2 |
65 |
31 |
31 |
41 |
51 |
26 |
46 |
35 |
31 |
55 |
47 |
60 |
63 |
53 |
46 |
55 |
49 |
57 |
42 |
43 |
30 |
53 |
49 |
35 |
40 |
36 |
40 |
48 |
49 |
49 |
70000 |
1000 |
1.68 |
68.04 |
| 17949 |
18 |
47 |
62 |
1203 |
1 |
2 |
2 |
21 |
28 |
47 |
47 |
42 |
37 |
39 |
32 |
25 |
30 |
41 |
65 |
48 |
64 |
54 |
80 |
44 |
77 |
31 |
31 |
51 |
26 |
16 |
46 |
37 |
58 |
50 |
15 |
17 |
14 |
60000 |
1000 |
1.75 |
71.21 |
| 17950 |
18 |
47 |
68 |
1098 |
1 |
3 |
2 |
29 |
22 |
23 |
45 |
25 |
27 |
21 |
21 |
27 |
27 |
32 |
52 |
52 |
39 |
43 |
48 |
39 |
74 |
39 |
52 |
16 |
44 |
45 |
20 |
31 |
38 |
43 |
44 |
47 |
53 |
60000 |
1000 |
1.85 |
76.20 |
| 17951 |
19 |
47 |
65 |
1307 |
1 |
2 |
2 |
22 |
34 |
38 |
40 |
49 |
25 |
42 |
30 |
34 |
45 |
43 |
54 |
57 |
60 |
49 |
76 |
43 |
55 |
40 |
47 |
38 |
46 |
46 |
39 |
52 |
43 |
45 |
40 |
48 |
47 |
60000 |
1000 |
1.75 |
60.78 |
| 17952 |
19 |
47 |
63 |
1098 |
1 |
2 |
2 |
21 |
23 |
52 |
52 |
43 |
36 |
39 |
32 |
20 |
25 |
40 |
41 |
39 |
38 |
40 |
52 |
41 |
47 |
43 |
67 |
42 |
47 |
16 |
46 |
33 |
43 |
42 |
22 |
15 |
19 |
60000 |
1000 |
1.91 |
77.11 |
| 17953 |
16 |
47 |
67 |
1189 |
1 |
3 |
2 |
33 |
25 |
40 |
46 |
38 |
38 |
45 |
38 |
27 |
28 |
44 |
70 |
69 |
50 |
47 |
58 |
45 |
60 |
55 |
32 |
45 |
32 |
15 |
48 |
43 |
55 |
41 |
32 |
13 |
11 |
60000 |
1000 |
1.73 |
67.13 |
| 17954 |
17 |
47 |
66 |
1228 |
1 |
3 |
2 |
34 |
44 |
50 |
39 |
42 |
40 |
51 |
34 |
32 |
32 |
52 |
61 |
60 |
52 |
21 |
71 |
64 |
42 |
40 |
48 |
34 |
33 |
22 |
44 |
47 |
50 |
46 |
20 |
25 |
27 |
60000 |
1000 |
1.78 |
69.85 |
| 17955 |
16 |
46 |
66 |
1321 |
1 |
3 |
2 |
33 |
41 |
34 |
46 |
48 |
30 |
43 |
40 |
34 |
44 |
51 |
57 |
55 |
55 |
51 |
63 |
43 |
62 |
47 |
60 |
32 |
56 |
42 |
34 |
49 |
33 |
43 |
40 |
43 |
50 |
60000 |
1000 |
1.78 |
79.83 |
- Partir en datos de entrenamiento y validación (70 - 30)
# Fix the random seed so the split below is reproducible.
set.seed(2020)

# Draw the row indices of the training set: a single partition holding 70%
# of the rows, sampled by caret on the `overall` column and returned as a
# matrix of indices (list = FALSE).
conjunto_entrenamiento <- createDataPartition(
  y = datos_slctd[["overall"]],
  times = 1,
  p = 0.7,
  list = FALSE
)

# Rows outside the partition form the validation set; the rest train the model.
datos_validacion <- datos_slctd[-conjunto_entrenamiento, ]
datos_entrenamiento <- datos_slctd[conjunto_entrenamiento, ]
# Show the first ten rows of the training set as a table.
# kable() expects `row.names` to be a single logical (or NA); a vector such as
# 1:nrow(...) ends up inside an `if ()` condition and fails on R >= 4.2.
# TRUE prints the row labels.
kable(
  head(datos_entrenamiento, 10),
  caption = "Datos de entrenamiento (primeros díez registros)",
  row.names = TRUE
)
Datos de entrenamiento (primeros díez registros)
| 1 |
31 |
94 |
94 |
2202 |
5 |
4 |
4 |
10 |
84 |
95 |
70 |
90 |
86 |
97 |
93 |
94 |
87 |
96 |
91 |
86 |
91 |
95 |
95 |
85 |
68 |
72 |
59 |
94 |
48 |
22 |
94 |
94 |
75 |
96 |
33 |
28 |
26 |
110500000 |
565000 |
1.70 |
72.12 |
| 2 |
33 |
94 |
94 |
2228 |
5 |
4 |
5 |
7 |
84 |
94 |
89 |
81 |
87 |
88 |
81 |
76 |
77 |
94 |
89 |
91 |
87 |
96 |
70 |
95 |
95 |
88 |
79 |
93 |
63 |
29 |
95 |
82 |
85 |
95 |
28 |
31 |
23 |
77000000 |
405000 |
1.88 |
83.01 |
| 3 |
26 |
92 |
93 |
2143 |
5 |
5 |
5 |
10 |
79 |
87 |
62 |
84 |
84 |
96 |
88 |
87 |
78 |
95 |
94 |
90 |
96 |
94 |
84 |
80 |
61 |
81 |
49 |
82 |
56 |
36 |
89 |
87 |
81 |
94 |
27 |
24 |
33 |
118500000 |
290000 |
1.75 |
68.04 |
| 5 |
27 |
91 |
92 |
2281 |
4 |
5 |
4 |
7 |
93 |
82 |
55 |
92 |
82 |
86 |
85 |
83 |
91 |
91 |
78 |
76 |
79 |
91 |
77 |
91 |
63 |
90 |
75 |
91 |
76 |
61 |
87 |
94 |
79 |
88 |
68 |
58 |
51 |
102000000 |
355000 |
1.80 |
69.85 |
| 6 |
27 |
91 |
91 |
2142 |
4 |
4 |
4 |
10 |
81 |
84 |
61 |
89 |
80 |
95 |
83 |
79 |
83 |
94 |
94 |
88 |
95 |
90 |
94 |
82 |
56 |
83 |
66 |
80 |
54 |
41 |
87 |
89 |
86 |
91 |
34 |
27 |
22 |
93000000 |
340000 |
1.73 |
73.94 |
| 7 |
32 |
91 |
91 |
2280 |
4 |
4 |
4 |
10 |
86 |
72 |
55 |
93 |
76 |
90 |
85 |
78 |
88 |
93 |
80 |
72 |
93 |
90 |
94 |
79 |
68 |
89 |
58 |
82 |
62 |
83 |
79 |
92 |
82 |
84 |
60 |
76 |
73 |
67000000 |
420000 |
1.73 |
66.22 |
| 8 |
31 |
91 |
91 |
2346 |
5 |
4 |
3 |
9 |
77 |
93 |
77 |
82 |
88 |
87 |
86 |
84 |
64 |
90 |
86 |
75 |
82 |
92 |
83 |
86 |
69 |
90 |
83 |
85 |
87 |
41 |
92 |
84 |
85 |
85 |
62 |
45 |
38 |
80000000 |
455000 |
1.83 |
86.18 |
| 11 |
29 |
90 |
90 |
2152 |
4 |
4 |
4 |
9 |
62 |
91 |
85 |
83 |
89 |
85 |
77 |
86 |
65 |
89 |
77 |
78 |
78 |
90 |
78 |
88 |
84 |
78 |
84 |
84 |
80 |
39 |
91 |
77 |
88 |
86 |
34 |
42 |
19 |
77000000 |
205000 |
1.83 |
79.83 |
| 12 |
28 |
90 |
90 |
2190 |
4 |
5 |
3 |
8 |
88 |
76 |
54 |
92 |
82 |
81 |
86 |
84 |
93 |
90 |
64 |
62 |
70 |
89 |
71 |
87 |
30 |
75 |
73 |
92 |
60 |
82 |
79 |
86 |
73 |
85 |
72 |
79 |
69 |
76500000 |
355000 |
1.83 |
76.20 |
| 14 |
32 |
90 |
90 |
2115 |
4 |
2 |
4 |
21 |
84 |
76 |
54 |
93 |
82 |
89 |
82 |
77 |
87 |
94 |
70 |
64 |
92 |
90 |
90 |
72 |
64 |
78 |
52 |
75 |
57 |
50 |
89 |
92 |
75 |
93 |
59 |
53 |
29 |
60000000 |
285000 |
1.73 |
67.13 |
# Show the first ten rows of the validation set as a table.
# The caption says "primeros" because head() returns the first rows.
# kable() expects `row.names` to be a single logical (or NA); a vector such as
# 1:nrow(...) ends up inside an `if ()` condition and fails on R >= 4.2.
# TRUE prints the row labels.
kable(
  head(datos_validacion, 10),
  caption = "Datos de validación (primeros díez registros)",
  row.names = TRUE
)
Datos de validación (últimos díez registros)
| 4 |
27 |
91 |
93 |
1471 |
4 |
3 |
1 |
1 |
17 |
13 |
21 |
50 |
13 |
18 |
21 |
19 |
51 |
42 |
57 |
58 |
60 |
90 |
43 |
31 |
67 |
43 |
64 |
12 |
38 |
30 |
12 |
68 |
40 |
68 |
15 |
21 |
13 |
72000000 |
260000 |
1.93 |
76.20 |
| 9 |
32 |
91 |
91 |
2201 |
4 |
3 |
3 |
15 |
66 |
60 |
91 |
78 |
66 |
63 |
74 |
72 |
77 |
84 |
76 |
75 |
78 |
85 |
66 |
79 |
93 |
84 |
83 |
59 |
88 |
90 |
60 |
63 |
75 |
82 |
87 |
92 |
91 |
51000000 |
380000 |
1.83 |
82.10 |
| 10 |
25 |
90 |
93 |
1331 |
3 |
3 |
1 |
1 |
13 |
11 |
15 |
29 |
13 |
12 |
13 |
14 |
26 |
16 |
43 |
60 |
67 |
86 |
49 |
22 |
76 |
41 |
78 |
12 |
34 |
19 |
11 |
70 |
11 |
70 |
27 |
12 |
18 |
68000000 |
94000 |
1.88 |
87.09 |
| 13 |
32 |
90 |
90 |
1946 |
3 |
3 |
2 |
10 |
55 |
42 |
92 |
79 |
47 |
53 |
49 |
51 |
70 |
76 |
68 |
68 |
58 |
85 |
54 |
67 |
91 |
66 |
88 |
43 |
89 |
88 |
48 |
52 |
50 |
82 |
90 |
89 |
89 |
44000000 |
125000 |
1.88 |
78.02 |
| 16 |
24 |
89 |
94 |
2092 |
3 |
3 |
4 |
21 |
82 |
84 |
68 |
87 |
88 |
92 |
88 |
88 |
75 |
92 |
87 |
83 |
91 |
86 |
85 |
82 |
75 |
80 |
65 |
88 |
48 |
32 |
84 |
87 |
86 |
84 |
23 |
20 |
20 |
89000000 |
205000 |
1.78 |
74.84 |
| 22 |
31 |
89 |
89 |
2161 |
4 |
4 |
3 |
21 |
70 |
89 |
89 |
78 |
90 |
80 |
77 |
76 |
52 |
82 |
75 |
76 |
77 |
91 |
59 |
87 |
88 |
92 |
78 |
79 |
84 |
48 |
93 |
77 |
85 |
82 |
52 |
45 |
39 |
60000000 |
200000 |
1.85 |
77.11 |
| 23 |
32 |
89 |
89 |
1473 |
5 |
4 |
1 |
1 |
15 |
13 |
25 |
55 |
11 |
30 |
14 |
11 |
59 |
48 |
54 |
60 |
51 |
84 |
35 |
25 |
77 |
43 |
80 |
16 |
29 |
30 |
12 |
70 |
47 |
70 |
17 |
10 |
11 |
38000000 |
130000 |
1.93 |
92.08 |
| 25 |
33 |
89 |
89 |
1841 |
4 |
3 |
2 |
3 |
58 |
33 |
83 |
59 |
45 |
58 |
60 |
31 |
59 |
57 |
63 |
75 |
54 |
82 |
55 |
78 |
89 |
65 |
89 |
49 |
92 |
88 |
28 |
50 |
50 |
84 |
93 |
93 |
90 |
27000000 |
215000 |
1.88 |
84.82 |
| 30 |
27 |
88 |
88 |
2017 |
3 |
3 |
4 |
10 |
86 |
77 |
56 |
85 |
74 |
90 |
87 |
77 |
78 |
93 |
94 |
86 |
94 |
83 |
93 |
75 |
53 |
75 |
44 |
84 |
34 |
26 |
83 |
87 |
61 |
83 |
51 |
24 |
22 |
62000000 |
165000 |
1.63 |
58.97 |
| 31 |
26 |
88 |
91 |
2137 |
3 |
3 |
4 |
22 |
75 |
79 |
55 |
89 |
65 |
94 |
88 |
76 |
83 |
95 |
75 |
69 |
87 |
77 |
90 |
69 |
64 |
70 |
59 |
87 |
58 |
64 |
78 |
89 |
76 |
86 |
60 |
64 |
51 |
73500000 |
315000 |
1.75 |
78.93 |
- Crear modelo de regresión lineal múltiple \(y = b_0 + b_1x_1 + b_2x_2 + b_3x_3 + ... + b_kx_k\)
# Fit the full multiple linear regression: `valor` explained by every other
# column of the training set, then print the coefficient table and fit
# statistics.
modelo_regresion_multiple <- lm(
  valor ~ .,
  data = datos_entrenamiento
)
summary(modelo_regresion_multiple)
Call:
lm(formula = valor ~ ., data = datos_entrenamiento)
Residuals:
Min 1Q Median 3Q Max
-19161476 -832836 -60785 700651 56366387
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -6237527.444 1293049.221 -4.824 0.00000142445759378
age -260441.896 10830.032 -24.048 < 0.0000000000000002
overall 263151.521 12324.511 21.352 < 0.0000000000000002
potential -45061.430 9216.102 -4.889 0.00000102388538355
special -4971.156 1050.494 -4.732 0.00000224521357859
international.reputation 2096975.406 80512.189 26.045 < 0.0000000000000002
weak.foot 64861.448 37182.465 1.744 0.081112
skill.moves 227777.379 60540.118 3.762 0.000169
jersey.number -5180.184 1478.922 -3.503 0.000462
crossing -4564.702 3279.621 -1.392 0.163996
finishing 19777.935 3898.800 5.073 0.00000039756985080
headingaccuracy -3149.386 2946.924 -1.069 0.285224
shortpassing -403.627 5483.003 -0.074 0.941319
volleys 10266.460 3483.094 2.948 0.003209
dribbling -4145.766 4894.856 -0.847 0.397031
curve -914.425 3441.961 -0.266 0.790498
fkaccuracy 19978.179 3058.942 6.531 0.00000000006782730
longpassing 17968.651 4230.417 4.247 0.00002177436178140
ballcontrol -23414.482 5821.108 -4.022 0.00005796128114699
acceleration 7772.372 4605.084 1.688 0.091478
sprintspeed -1521.284 4289.824 -0.355 0.722876
agility -1031.663 3552.756 -0.290 0.771528
reactions 34476.184 5330.003 6.468 0.00000000010278779
balance 5856.472 3586.184 1.633 0.102480
shotpower -9906.667 3540.906 -2.798 0.005153
jumping 549.760 2690.688 0.204 0.838107
stamina 22668.921 2876.934 7.880 0.00000000000000356
strength 3944.760 3414.156 1.155 0.247943
longshots 771.132 3840.908 0.201 0.840883
aggression 3220.423 2690.702 1.197 0.231379
interceptions -4011.602 3769.712 -1.064 0.287274
positioning 9072.650 3724.818 2.436 0.014876
vision 10036.751 3659.156 2.743 0.006098
penalties -7407.033 3273.003 -2.263 0.023648
composure 896.362 3767.794 0.238 0.811962
marking 9743.854 2960.754 3.291 0.001001
standingtackle 21022.779 5464.713 3.847 0.000120
slidingtackle -19840.500 5183.721 -3.827 0.000130
salario 160.301 1.456 110.081 < 0.0000000000000002
height_m -1159502.240 664549.743 -1.745 0.081045
weight_kg 1722.922 5678.866 0.303 0.761596
(Intercept) ***
age ***
overall ***
potential ***
special ***
international.reputation ***
weak.foot .
skill.moves ***
jersey.number ***
crossing
finishing ***
headingaccuracy
shortpassing
volleys **
dribbling
curve
fkaccuracy ***
longpassing ***
ballcontrol ***
acceleration .
sprintspeed
agility
reactions ***
balance
shotpower **
jumping
stamina ***
strength
longshots
aggression
interceptions
positioning *
vision **
penalties *
composure
marking **
standingtackle ***
slidingtackle ***
salario ***
height_m .
weight_kg
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 2527000 on 12504 degrees of freedom
(25 observations deleted due to missingness)
Multiple R-squared: 0.8088, Adjusted R-squared: 0.8082
F-statistic: 1323 on 40 and 12504 DF, p-value: < 0.00000000000000022
Optimización del modelo
- Selección de variables: se descartan las que el primer modelo no identifica como estadísticamente significativas.
# Refit the regression on a reduced set of thirteen predictors and print the
# summary of the reduced model.
modelo_regresion_multiple_2 <- lm(
  valor ~ age + overall + potential + special + international.reputation +
    jersey.number + finishing + fkaccuracy + longpassing + ballcontrol +
    reactions + stamina + salario,
  data = datos_entrenamiento
)
summary(modelo_regresion_multiple_2)
Call:
lm(formula = valor ~ age + overall + potential + special + international.reputation +
jersey.number + finishing + fkaccuracy + longpassing + ballcontrol +
reactions + stamina + salario, data = datos_entrenamiento)
Residuals:
Min 1Q Median 3Q Max
-18997420 -824040 -75785 697276 56424654
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -7857348.39 452119.77 -17.379 < 0.0000000000000002
age -266566.73 10213.53 -26.099 < 0.0000000000000002
overall 254603.39 10507.57 24.230 < 0.0000000000000002
potential -44787.95 9087.82 -4.928 0.0000008398819053
special -3549.32 372.39 -9.531 < 0.0000000000000002
international.reputation 2102462.29 80084.70 26.253 < 0.0000000000000002
jersey.number -5350.31 1478.81 -3.618 0.000298
finishing 22222.02 2169.52 10.243 < 0.0000000000000002
fkaccuracy 18559.56 2463.89 7.533 0.0000000000000531
longpassing 16414.29 3085.71 5.319 0.0000001058745305
ballcontrol -19753.30 4063.23 -4.861 0.0000011792807544
reactions 36293.34 5005.88 7.250 0.0000000000004408
stamina 22403.91 2712.00 8.261 < 0.0000000000000002
salario 160.60 1.46 110.037 < 0.0000000000000002
(Intercept) ***
age ***
overall ***
potential ***
special ***
international.reputation ***
jersey.number ***
finishing ***
fkaccuracy ***
longpassing ***
ballcontrol ***
reactions ***
stamina ***
salario ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 2538000 on 12531 degrees of freedom
(25 observations deleted due to missingness)
Multiple R-squared: 0.8068, Adjusted R-squared: 0.8066
F-statistic: 4026 on 13 and 12531 DF, p-value: < 0.00000000000000022
- Predecir con el conjunto de datos de validación
- Tomaremos el modelo optimizado y haremos la predicción con la función \(\textit{predict()}\)
# Predict `valor` for every row of the validation set with the reduced model.
prediccion <- predict(
  modelo_regresion_multiple_2,
  newdata = datos_validacion
)
- Agregar al conjunto de datos las predicciones, para comparar con los datos de validación.
# Attach the predictions to the validation rows as column `predicho`, so the
# predicted and actual values can be compared side by side.
datos_validacion_df <- datos_validacion %>%
  mutate(predicho = prediccion)

# Show the first ten validation rows together with their predictions.
datos_validacion_df %>%
  head(10) %>%
  kable(caption = "Primeros díez registros con sus respectivas predicciones")
Primeros díez registros con sus respectivas predicciones
| 27 |
91 |
93 |
1471 |
4 |
3 |
1 |
1 |
17 |
13 |
21 |
50 |
13 |
18 |
21 |
19 |
51 |
42 |
57 |
58 |
60 |
90 |
43 |
31 |
67 |
43 |
64 |
12 |
38 |
30 |
12 |
68 |
40 |
68 |
15 |
21 |
13 |
72000000 |
260000 |
1.93 |
76.20 |
53767689 |
| 32 |
91 |
91 |
2201 |
4 |
3 |
3 |
15 |
66 |
60 |
91 |
78 |
66 |
63 |
74 |
72 |
77 |
84 |
76 |
75 |
78 |
85 |
66 |
79 |
93 |
84 |
83 |
59 |
88 |
90 |
60 |
63 |
75 |
82 |
87 |
92 |
91 |
51000000 |
380000 |
1.83 |
82.10 |
71493065 |
| 25 |
90 |
93 |
1331 |
3 |
3 |
1 |
1 |
13 |
11 |
15 |
29 |
13 |
12 |
13 |
14 |
26 |
16 |
43 |
60 |
67 |
86 |
49 |
22 |
76 |
41 |
78 |
12 |
34 |
19 |
11 |
70 |
11 |
70 |
27 |
12 |
18 |
68000000 |
94000 |
1.88 |
87.09 |
25556761 |
| 32 |
90 |
90 |
1946 |
3 |
3 |
2 |
10 |
55 |
42 |
92 |
79 |
47 |
53 |
49 |
51 |
70 |
76 |
68 |
68 |
58 |
85 |
54 |
67 |
91 |
66 |
88 |
43 |
89 |
88 |
48 |
52 |
50 |
82 |
90 |
89 |
89 |
44000000 |
125000 |
1.88 |
78.02 |
28009254 |
| 24 |
89 |
94 |
2092 |
3 |
3 |
4 |
21 |
82 |
84 |
68 |
87 |
88 |
92 |
88 |
88 |
75 |
92 |
87 |
83 |
91 |
86 |
85 |
82 |
75 |
80 |
65 |
88 |
48 |
32 |
84 |
87 |
86 |
84 |
23 |
20 |
20 |
89000000 |
205000 |
1.78 |
74.84 |
43715121 |
| 31 |
89 |
89 |
2161 |
4 |
4 |
3 |
21 |
70 |
89 |
89 |
78 |
90 |
80 |
77 |
76 |
52 |
82 |
75 |
76 |
77 |
91 |
59 |
87 |
88 |
92 |
78 |
79 |
84 |
48 |
93 |
77 |
85 |
82 |
52 |
45 |
39 |
60000000 |
200000 |
1.85 |
77.11 |
43286358 |
| 32 |
89 |
89 |
1473 |
5 |
4 |
1 |
1 |
15 |
13 |
25 |
55 |
11 |
30 |
14 |
11 |
59 |
48 |
54 |
60 |
51 |
84 |
35 |
25 |
77 |
43 |
80 |
16 |
29 |
30 |
12 |
70 |
47 |
70 |
17 |
10 |
11 |
38000000 |
130000 |
1.93 |
92.08 |
32968482 |
| 33 |
89 |
89 |
1841 |
4 |
3 |
2 |
3 |
58 |
33 |
83 |
59 |
45 |
58 |
60 |
31 |
59 |
57 |
63 |
75 |
54 |
82 |
55 |
78 |
89 |
65 |
89 |
49 |
92 |
88 |
28 |
50 |
50 |
84 |
93 |
93 |
90 |
27000000 |
215000 |
1.88 |
84.82 |
43991912 |
| 27 |
88 |
88 |
2017 |
3 |
3 |
4 |
10 |
86 |
77 |
56 |
85 |
74 |
90 |
87 |
77 |
78 |
93 |
94 |
86 |
94 |
83 |
93 |
75 |
53 |
75 |
44 |
84 |
34 |
26 |
83 |
87 |
61 |
83 |
51 |
24 |
22 |
62000000 |
165000 |
1.63 |
58.97 |
36279405 |
| 26 |
88 |
91 |
2137 |
3 |
3 |
4 |
22 |
75 |
79 |
55 |
89 |
65 |
94 |
88 |
76 |
83 |
95 |
75 |
69 |
87 |
77 |
90 |
69 |
64 |
70 |
59 |
87 |
58 |
64 |
78 |
89 |
76 |
86 |
60 |
64 |
51 |
73500000 |
315000 |
1.75 |
78.93 |
59750432 |
# Show the last ten validation rows together with their predictions.
datos_validacion_df %>%
  tail(10) %>%
  kable(caption = "Últimos díez registros con sus respectivas predicciones")
Últimos díez registros con sus respectivas predicciones
| 5376 |
19 |
48 |
59 |
1152 |
1 |
3 |
2 |
28 |
28 |
23 |
45 |
27 |
24 |
25 |
21 |
26 |
29 |
28 |
63 |
50 |
47 |
42 |
76 |
35 |
65 |
49 |
49 |
21 |
45 |
46 |
25 |
29 |
37 |
43 |
42 |
54 |
54 |
40000 |
1000 |
1.73 |
72.12 |
-1780511 |
| 5377 |
18 |
48 |
63 |
1370 |
1 |
3 |
3 |
24 |
38 |
34 |
49 |
54 |
34 |
49 |
33 |
37 |
50 |
41 |
67 |
61 |
61 |
46 |
59 |
50 |
56 |
64 |
55 |
35 |
52 |
39 |
46 |
45 |
36 |
44 |
44 |
40 |
41 |
60000 |
1000 |
1.85 |
73.03 |
-1427709 |
| 5378 |
17 |
48 |
66 |
1296 |
1 |
2 |
2 |
32 |
45 |
46 |
46 |
38 |
27 |
46 |
28 |
24 |
34 |
38 |
61 |
57 |
56 |
47 |
66 |
39 |
59 |
60 |
48 |
33 |
53 |
46 |
43 |
37 |
33 |
38 |
43 |
49 |
45 |
50000 |
1000 |
1.80 |
60.78 |
-1306960 |
| 5379 |
18 |
48 |
55 |
1368 |
1 |
3 |
2 |
33 |
33 |
24 |
42 |
54 |
33 |
44 |
34 |
36 |
50 |
47 |
61 |
57 |
57 |
44 |
58 |
47 |
64 |
59 |
66 |
31 |
53 |
49 |
35 |
46 |
37 |
42 |
47 |
49 |
53 |
40000 |
1000 |
1.85 |
81.19 |
-1654365 |
| 5380 |
18 |
47 |
67 |
1285 |
1 |
3 |
2 |
32 |
32 |
32 |
45 |
48 |
31 |
41 |
32 |
43 |
47 |
37 |
53 |
55 |
31 |
47 |
61 |
41 |
54 |
61 |
55 |
34 |
44 |
44 |
51 |
54 |
34 |
46 |
35 |
44 |
47 |
60000 |
1000 |
1.75 |
79.83 |
-1536809 |
| 5381 |
18 |
47 |
64 |
1191 |
1 |
2 |
2 |
4 |
36 |
25 |
40 |
27 |
27 |
46 |
31 |
25 |
23 |
29 |
64 |
58 |
55 |
54 |
81 |
22 |
56 |
54 |
40 |
22 |
48 |
49 |
35 |
30 |
32 |
32 |
41 |
48 |
48 |
50000 |
1000 |
1.73 |
66.22 |
-1547318 |
| 5382 |
19 |
47 |
61 |
1333 |
1 |
3 |
2 |
26 |
31 |
28 |
40 |
53 |
31 |
46 |
39 |
37 |
48 |
48 |
58 |
58 |
60 |
48 |
79 |
42 |
63 |
35 |
51 |
30 |
55 |
44 |
28 |
51 |
44 |
35 |
41 |
44 |
54 |
60000 |
1000 |
1.70 |
66.22 |
-2620240 |
| 5383 |
18 |
47 |
69 |
1303 |
1 |
3 |
2 |
65 |
31 |
31 |
41 |
51 |
26 |
46 |
35 |
31 |
55 |
47 |
60 |
63 |
53 |
46 |
55 |
49 |
57 |
42 |
43 |
30 |
53 |
49 |
35 |
40 |
36 |
40 |
48 |
49 |
49 |
70000 |
1000 |
1.68 |
68.04 |
-2639956 |
| 5384 |
19 |
47 |
65 |
1307 |
1 |
2 |
2 |
22 |
34 |
38 |
40 |
49 |
25 |
42 |
30 |
34 |
45 |
43 |
54 |
57 |
60 |
49 |
76 |
43 |
55 |
40 |
47 |
38 |
46 |
46 |
39 |
52 |
43 |
45 |
40 |
48 |
47 |
60000 |
1000 |
1.75 |
60.78 |
-2321330 |
| 5385 |
16 |
46 |
66 |
1321 |
1 |
3 |
2 |
33 |
41 |
34 |
46 |
48 |
30 |
43 |
40 |
34 |
44 |
51 |
57 |
55 |
55 |
51 |
63 |
43 |
62 |
47 |
60 |
32 |
56 |
42 |
34 |
49 |
33 |
43 |
40 |
43 |
50 |
60000 |
1000 |
1.78 |
79.83 |
-1963480 |