library(data.table)
library(ggplot2)
library(caret)
library(jtools)
library(scales)
library(dplyr)
library(lattice)
library(MASS)
library(faraway)
library(cvTools)
library(lmtest)
library(faraway)
library(latex2exp)
library(cvTools)
library(nortest)
library(tidyr)
library(plotly)
#Cargo de bases
fifa<- data.table(read.csv('Fifa_hombres.csv', header = T, sep = "," , encoding = 'Latin-1'))
fifa<-fifa %>% drop_na()
ggplot() +
geom_point(data = fifa, mapping = aes(x = fifa$age, y = fifa$potential, color=fifa$value_eur)) +
geom_smooth(data = fifa, mapping = aes(x = fifa$age, y = fifa$potential), method = 'loess', se = T, fullrange = TRUE, color = "#D284F9")+
theme_minimal() +
labs(x = 'Age', y = 'Overall') +
theme(legend.position = 'bottom')
## `geom_smooth()` using formula = 'y ~ x'
edad_jugadores <- ggplot(fifa, aes(x=age, y=potential, text=paste("Nombre:",long_name, "\n","Potencial:",potential,"\n","Edad:",age,"\n", "Valoración:",value_eur,"\n","Altura:",height_cm ))) + geom_point(color="blue")
ggplotly(edad_jugadores, tooltip = "text")
qqnorm(fifa$overall)
ad.test(fifa$overall)
##
## Anderson-Darling normality test
##
## data: fifa$overall
## A = 22.415, p-value < 2.2e-16
qqnorm(fifa$potential)
ad.test(fifa$potential)
##
## Anderson-Darling normality test
##
## data: fifa$potential
## A = 37.972, p-value < 2.2e-16
qqnorm(fifa$age)
ad.test(fifa$age)
##
## Anderson-Darling normality test
##
## data: fifa$age
## A = 114.76, p-value < 2.2e-16
# Implementación de la regresión
f1=formula(potential~ overall+ value_eur+ age+ league_level +pace +physic +nationality_id)
reg1 = lm(formula=f1, data=fifa)
sum01<-summary((reg1))
sum01
##
## Call:
## lm(formula = f1, data = fifa)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.2013 -1.6786 -0.2778 1.4779 11.2897
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.300e+01 2.739e-01 156.960 < 2e-16 ***
## overall 8.802e-01 5.036e-03 174.791 < 2e-16 ***
## value_eur 2.515e-08 3.229e-09 7.791 7.07e-15 ***
## age -9.299e-01 5.540e-03 -167.856 < 2e-16 ***
## league_level -2.655e-01 2.920e-02 -9.091 < 2e-16 ***
## pace -4.536e-02 2.071e-03 -21.902 < 2e-16 ***
## physic -4.374e-02 2.627e-03 -16.651 < 2e-16 ***
## nationality_id -5.027e-03 4.318e-04 -11.642 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.523 on 16012 degrees of freedom
## Multiple R-squared: 0.8258, Adjusted R-squared: 0.8257
## F-statistic: 1.084e+04 on 7 and 16012 DF, p-value: < 2.2e-16
plot(sum01$residuals)
graph1 <- ggplot() +
geom_point(fifa, mapping = aes(x=fifa$age , y=fifa$potential, color=fifa$potential)) +
geom_smooth(fifa, mapping = aes(x=age, y=predict(reg1)), color='green', method='loess',se=F, fullrange = T) +
theme_minimal() +
labs(x='Age',y='Potential',color=NULL) +
theme(legend.position = 'bottom') +
scale_y_continuous(labels = number_format(scale = 1))
graph1
set.seed(12345) ## setear una semilla
training.samples <- createDataPartition(fifa$potential,p = 0.8, list = FALSE)
train.data <- fifa[training.samples, ]
test.data <- fifa[-training.samples, ]
model <- lm(f1, data = train.data)
predictions01 <- predict(model,train.data)
predictions02 <- predict(model,test.data)
data.table(' ' = c('Dentro de muestra','Fuera de muestra'),
R2 = c(R2(predictions01, train.data$potential),
R2(predictions02, test.data$potential)),
RMSE = c(RMSE(predictions01, train.data$potential),
RMSE(predictions02, test.data$potential)),
MAE = c(MAE(predictions01, train.data$potential),
MAE(predictions02, test.data$potential)))
## R2 RMSE MAE
## 1: Dentro de muestra 0.8247396 2.522858 1.957855
## 2: Fuera de muestra 0.8299166 2.519620 1.971272
set.seed(123456)## setear una semilla
setupLOO <- trainControl(method = "LOOCV")
predLOO <- train(f1,data=fifa[1:2000],method="lm",trControl= setupLOO)
print(predLOO)
## Linear Regression
##
## 2000 samples
## 7 predictor
##
## No pre-processing
## Resampling: Leave-One-Out Cross-Validation
## Summary of sample sizes: 1999, 1999, 1999, 1999, 1999, 1999, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 1.696503 0.83278 1.331941
##
## Tuning parameter 'intercept' was held constant at a value of TRUE