library(data.table)
library(ggplot2)
library(caret)
library(jtools)
library(scales)
library(dplyr)
library(lattice)
library(MASS)
library(faraway)
library(cvTools)
library(lmtest)
library(faraway)
library(latex2exp)
library(cvTools)
library(nortest)
library(tidyr)
library(plotly)

#Cargo de bases

fifa<- data.table(read.csv('Fifa_hombres.csv', header = T, sep = "," , encoding = 'Latin-1'))

fifa<-fifa %>% drop_na()

Plots de variables

Plots de Fifa

ggplot() +
  geom_point(data = fifa, mapping = aes(x = fifa$age, y = fifa$potential, color=fifa$value_eur)) +
  
  geom_smooth(data = fifa, mapping = aes(x = fifa$age,  y = fifa$potential), method = 'loess', se = T, fullrange = TRUE, color = "#D284F9")+
  
  theme_minimal() +
  labs(x = 'Age', y = 'Overall') +
  theme(legend.position = 'bottom')
## `geom_smooth()` using formula = 'y ~ x'

Grafico para análizar individuos

edad_jugadores <- ggplot(fifa, aes(x=age, y=potential, text=paste("Nombre:",long_name, "\n","Potencial:",potential,"\n","Edad:",age,"\n", "Valoración:",value_eur,"\n","Altura:",height_cm ))) +  geom_point(color="blue")
ggplotly(edad_jugadores, tooltip = "text")

Testeo de Variables

Testing Normality Overall

qqnorm(fifa$overall)

 ad.test(fifa$overall)
## 
##  Anderson-Darling normality test
## 
## data:  fifa$overall
## A = 22.415, p-value < 2.2e-16

Testing Normality Potential

qqnorm(fifa$potential)

 ad.test(fifa$potential)
## 
##  Anderson-Darling normality test
## 
## data:  fifa$potential
## A = 37.972, p-value < 2.2e-16

Testing Normality Age

qqnorm(fifa$age)

 ad.test(fifa$age)
## 
##  Anderson-Darling normality test
## 
## data:  fifa$age
## A = 114.76, p-value < 2.2e-16

Implementación de modelo

Prueba de un modelo basico

# Implementación de la regresión
f1=formula(potential~ overall+ value_eur+ age+  league_level +pace +physic +nationality_id)


reg1 = lm(formula=f1, data=fifa)

sum01<-summary((reg1))
sum01
## 
## Call:
## lm(formula = f1, data = fifa)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10.2013  -1.6786  -0.2778   1.4779  11.2897 
## 
## Coefficients:
##                  Estimate Std. Error  t value Pr(>|t|)    
## (Intercept)     4.300e+01  2.739e-01  156.960  < 2e-16 ***
## overall         8.802e-01  5.036e-03  174.791  < 2e-16 ***
## value_eur       2.515e-08  3.229e-09    7.791 7.07e-15 ***
## age            -9.299e-01  5.540e-03 -167.856  < 2e-16 ***
## league_level   -2.655e-01  2.920e-02   -9.091  < 2e-16 ***
## pace           -4.536e-02  2.071e-03  -21.902  < 2e-16 ***
## physic         -4.374e-02  2.627e-03  -16.651  < 2e-16 ***
## nationality_id -5.027e-03  4.318e-04  -11.642  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.523 on 16012 degrees of freedom
## Multiple R-squared:  0.8258, Adjusted R-squared:  0.8257 
## F-statistic: 1.084e+04 on 7 and 16012 DF,  p-value: < 2.2e-16

Plot de los residuos

plot(sum01$residuals)

graph1 <- ggplot() +

  geom_point(fifa, mapping = aes(x=fifa$age , y=fifa$potential, color=fifa$potential)) +
  geom_smooth(fifa, mapping = aes(x=age, y=predict(reg1)), color='green', method='loess',se=F, fullrange = T) +
  theme_minimal() +
  labs(x='Age',y='Potential',color=NULL) +
  theme(legend.position = 'bottom') +
  scale_y_continuous(labels = number_format(scale = 1))
graph1

Pruebas de modelo basico

k-folds

set.seed(12345) ## setear una semilla
training.samples <- createDataPartition(fifa$potential,p = 0.8, list = FALSE)
train.data  <- fifa[training.samples, ]
test.data <- fifa[-training.samples, ]

model <- lm(f1, data = train.data)

predictions01 <- predict(model,train.data)
predictions02 <- predict(model,test.data)

data.table(' ' = c('Dentro de muestra','Fuera de muestra'),
           R2 = c(R2(predictions01, train.data$potential),
                  R2(predictions02, test.data$potential)),
           RMSE = c(RMSE(predictions01, train.data$potential),
                    RMSE(predictions02, test.data$potential)),
           MAE = c(MAE(predictions01, train.data$potential),
                   MAE(predictions02, test.data$potential)))
##                             R2     RMSE      MAE
## 1: Dentro de muestra 0.8247396 2.522858 1.957855
## 2:  Fuera de muestra 0.8299166 2.519620 1.971272

LOO

set.seed(123456)## setear una semilla
setupLOO <- trainControl(method = "LOOCV")

predLOO <- train(f1,data=fifa[1:2000],method="lm",trControl= setupLOO)
print(predLOO)
## Linear Regression 
## 
## 2000 samples
##    7 predictor
## 
## No pre-processing
## Resampling: Leave-One-Out Cross-Validation 
## Summary of sample sizes: 1999, 1999, 1999, 1999, 1999, 1999, ... 
## Resampling results:
## 
##   RMSE      Rsquared  MAE     
##   1.696503  0.83278   1.331941
## 
## Tuning parameter 'intercept' was held constant at a value of TRUE