library(data.table)
library(ggplot2)
library(caret)
library(jtools)
library(scales)
library(dplyr)
library(lattice)
library(MASS)
library(faraway)
library(cvTools)
library(lmtest)
library(faraway)
library(latex2exp)
library(cvTools)
library(nortest)
library(tidyr)
library(plotly)

#Cargo de bases

fifa<- data.table(read.csv('Fifa_hombres.csv', header = T, sep = "," , encoding = 'Latin-1'))

fifa<-fifa %>% drop_na()

Plots de variables

Plots de Fifa

ggplot() +
  geom_point(data = fifa, mapping = aes(x = fifa$age, y = fifa$potential, color=fifa$value_eur)) +
  
  geom_smooth(data = fifa, mapping = aes(x = fifa$age,  y = fifa$potential), method = 'loess', se = T, fullrange = TRUE, color = "#D284F9")+
  
  theme_minimal() +
  labs(x = 'Age', y = 'Overall') +
  theme(legend.position = 'bottom')

## `geom_smooth()` using formula = 'y ~ x'

Grafico para análizar individuos

edad_jugadores <- ggplot(fifa, aes(x=age, y=potential, text=paste("Nombre:",long_name, "\n","Potencial:",potential,"\n","Edad:",age,"\n", "Valoración:",value_eur,"\n","Altura:",height_cm ))) +  geom_point(color="blue")

ggplotly(edad_jugadores, tooltip = "text")

Testeo de Variables

Testing Normality Overall

qqnorm(fifa$overall)

 ad.test(fifa$overall)

## 
##  Anderson-Darling normality test
## 
## data:  fifa$overall
## A = 22.415, p-value < 2.2e-16

Testing Normality Potential

qqnorm(fifa$potential)

 ad.test(fifa$potential)

## 
##  Anderson-Darling normality test
## 
## data:  fifa$potential
## A = 37.972, p-value < 2.2e-16

Testing Normality Age

qqnorm(fifa$age)

 ad.test(fifa$age)

## 
##  Anderson-Darling normality test
## 
## data:  fifa$age
## A = 114.76, p-value < 2.2e-16

Implementación de modelo

Prueba de un modelo basico

# Implementación de la regresión
f1=formula(potential~ overall+ value_eur+ age+  league_level +pace +physic +nationality_id)


reg1 = lm(formula=f1, data=fifa)

sum01<-summary((reg1))
sum01

## 
## Call:
## lm(formula = f1, data = fifa)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10.2013  -1.6786  -0.2778   1.4779  11.2897 
## 
## Coefficients:
##                  Estimate Std. Error  t value Pr(>|t|)    
## (Intercept)     4.300e+01  2.739e-01  156.960  < 2e-16 ***
## overall         8.802e-01  5.036e-03  174.791  < 2e-16 ***
## value_eur       2.515e-08  3.229e-09    7.791 7.07e-15 ***
## age            -9.299e-01  5.540e-03 -167.856  < 2e-16 ***
## league_level   -2.655e-01  2.920e-02   -9.091  < 2e-16 ***
## pace           -4.536e-02  2.071e-03  -21.902  < 2e-16 ***
## physic         -4.374e-02  2.627e-03  -16.651  < 2e-16 ***
## nationality_id -5.027e-03  4.318e-04  -11.642  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.523 on 16012 degrees of freedom
## Multiple R-squared:  0.8258, Adjusted R-squared:  0.8257 
## F-statistic: 1.084e+04 on 7 and 16012 DF,  p-value: < 2.2e-16

Plot de los residuos

plot(sum01$residuals)

graph1 <- ggplot() +

  geom_point(fifa, mapping = aes(x=fifa$age , y=fifa$potential, color=fifa$potential)) +
  geom_smooth(fifa, mapping = aes(x=age, y=predict(reg1)), color='green', method='loess',se=F, fullrange = T) +
  theme_minimal() +
  labs(x='Age',y='Potential',color=NULL) +
  theme(legend.position = 'bottom') +
  scale_y_continuous(labels = number_format(scale = 1))
graph1

Pruebas de modelo basico

k-folds

set.seed(12345) ## setear una semilla
training.samples <- createDataPartition(fifa$potential,p = 0.8, list = FALSE)
train.data  <- fifa[training.samples, ]
test.data <- fifa[-training.samples, ]

model <- lm(f1, data = train.data)

predictions01 <- predict(model,train.data)
predictions02 <- predict(model,test.data)

data.table(' ' = c('Dentro de muestra','Fuera de muestra'),
           R2 = c(R2(predictions01, train.data$potential),
                  R2(predictions02, test.data$potential)),
           RMSE = c(RMSE(predictions01, train.data$potential),
                    RMSE(predictions02, test.data$potential)),
           MAE = c(MAE(predictions01, train.data$potential),
                   MAE(predictions02, test.data$potential)))

##                             R2     RMSE      MAE
## 1: Dentro de muestra 0.8247396 2.522858 1.957855
## 2:  Fuera de muestra 0.8299166 2.519620 1.971272

LOO

set.seed(123456)## setear una semilla
setupLOO <- trainControl(method = "LOOCV")

predLOO <- train(f1,data=fifa[1:2000],method="lm",trControl= setupLOO)
print(predLOO)

## Linear Regression 
## 
## 2000 samples
##    7 predictor
## 
## No pre-processing
## Resampling: Leave-One-Out Cross-Validation 
## Summary of sample sizes: 1999, 1999, 1999, 1999, 1999, 1999, ... 
## Resampling results:
## 
##   RMSE      Rsquared  MAE     
##   1.696503  0.83278   1.331941
## 
## Tuning parameter 'intercept' was held constant at a value of TRUE

Untitled

Sebastián Lillo

2023-07-02

Plots de variables

Plots de Fifa

Grafico para análizar individuos

Testeo de Variables

Testing Normality Overall

Testing Normality Potential

Testing Normality Age

Implementación de modelo

Prueba de un modelo basico

Plot de los residuos

Pruebas de modelo basico

k-folds

LOO