knitr::opts_chunk$set(echo = TRUE)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.3 ✓ purrr 0.3.4
## ✓ tibble 3.0.6 ✓ dplyr 1.0.3
## ✓ tidyr 1.1.2 ✓ stringr 1.4.0
## ✓ readr 1.4.0 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(broom)
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
setwd("~/Desktop/DATA110")
df <- read_csv("data.csv")
## Warning: Missing column names filled in: 'X1' [1]
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## .default = col_character(),
## X1 = col_double(),
## ID = col_double(),
## Age = col_double(),
## Overall = col_double(),
## Potential = col_double(),
## Special = col_double(),
## `International Reputation` = col_double(),
## `Weak Foot` = col_double(),
## `Skill Moves` = col_double(),
## `Jersey Number` = col_double(),
## Crossing = col_double(),
## Finishing = col_double(),
## HeadingAccuracy = col_double(),
## ShortPassing = col_double(),
## Volleys = col_double(),
## Dribbling = col_double(),
## Curve = col_double(),
## FKAccuracy = col_double(),
## LongPassing = col_double(),
## BallControl = col_double()
## # ... with 24 more columns
## )
## ℹ Use `spec()` for the full column specifications.
names(df) <- str_replace_all(names(df), c(" " = "_"))
euro <- "\u20AC"
df$Wage <- gsub(euro, '', df$Wage)   # strip the euro symbol
df$Wage <- gsub("K", '', df$Wage)    # strip the "K" (thousands) suffix
df <- df %>%
  mutate(Wage = as.double(Wage)) %>% # convert to numeric before filtering
  filter(Wage > 0)
#unique(df$Age)
#unique(df$Wage)
p1 <- df %>%
ggplot(mapping = aes(x=factor(Age), y=Wage)) +
geom_point() +
labs(title="Wage as a function of Age",
x = "Player Age",
y = "Player Wage")
p1
copy <- df %>%
filter(!is.na(International_Reputation))
copy$International_Reputation[copy$International_Reputation == 1]<- "Below Avg Rank"
copy$International_Reputation[copy$International_Reputation == 2]<- "Avg Rank"
copy$International_Reputation[copy$International_Reputation == 3]<- "Above Avg Rank"
copy$International_Reputation[copy$International_Reputation == 4]<- "High Rank"
copy$International_Reputation[copy$International_Reputation == 5]<- "Highest Rank"
copy$International_Reputation<-factor(copy$International_Reputation, levels=c("Below Avg Rank", "Avg Rank","Above Avg Rank", "High Rank", "Highest Rank"))
#copy
#df
p2 <- copy %>%
ggplot(aes(Age, Wage, color = International_Reputation)) +
geom_point() +
ggtitle("Wage as a Function of Age") +
xlab("Player Age") +
ylab("Player Wage")
p2
#unique(newtable$International_Reputation)
linreg <- df%>%
filter(!is.na(International_Reputation))
lm_obj <- lm(Wage ~ Age + International_Reputation, data = linreg)
stats <- broom::tidy(lm_obj)
stats
## # A tibble: 3 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) -29.0 0.695 -41.8 0
## 2 Age -0.134 0.0271 -4.94 0.000000782
## 3 International_Reputation 38.0 0.321 118. 0
#cor(df$Age, df$International_Reputation)
p3 <- lm_obj %>%
augment() %>%
ggplot(aes(x = factor(Age), y = .resid)) +
geom_point() +
labs(title="Residuals vs Age",
x = "Age",
y = "Residuals")
p3
p4 <- lm_obj %>%
augment() %>%
ggplot(aes(x = International_Reputation, y = .resid)) +
geom_point() +
labs(title="Residuals vs International Reputation",
x = "International Reputation",
y = "Residuals")
p4
p5 <- lm_obj %>%
augment() %>%
ggplot(aes(x = factor(.fitted), y = .resid)) +
geom_point() +
labs(title="Residuals vs Fitted",
x = "Fitted",
y = "Residuals")
p5
Interpreting these plots can be subjective, but the most straightforward one to evaluate is probably the residuals vs. fitted plot, which reveals heteroscedasticity: the residuals are not uniformly spread across the range of fitted values. I suspect that Wage depends not only on Age and International_Reputation but on some of the other factors as well.
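To back up that visual impression with a formal check, a heteroscedasticity test can be run on the fitted model. Here is a minimal sketch using the Breusch-Pagan test from the lmtest package (an additional package, not loaded above, and assumed to be installed):
library(lmtest)
# Breusch-Pagan test on the model fitted above; a small p-value is consistent
# with the non-constant residual spread visible in the plots.
bptest(lm_obj)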
As EDA is an iterative process, there are several things we can do when this happens: rebuild the model with different independent variables, transform variables that have a non-linear relationship, or fit a non-linear regression model while being careful not to overfit. The transformation route is sketched below.
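As a minimal sketch of that transformation route (illustrative only, and not evaluated further here), the same model can be refit with a log-transformed Wage, which often stabilizes the variance of right-skewed pay data:
# All wages are > 0 after the filtering above, so the log is defined
lm_log <- lm(log(Wage) ~ Age + International_Reputation, data = linreg)
broom::tidy(lm_log)
# Residuals vs fitted for the transformed model, for comparison with p5
lm_log %>%
  augment() %>%
  ggplot(aes(x = .fitted, y = .resid)) +
  geom_point() +
  labs(title = "Residuals vs Fitted (log Wage)",
       x = "Fitted",
       y = "Residuals")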
In the previous section we observed that other factors might be at play, so my goal here is to run an experiment: first predict the Wage bracket from a single factor (Position), and then from multiple factors (Potential, Position, Age, International_Reputation and Club). I am splitting Wage into Bottom (under €5K) and Top brackets for a random forest classification, then using 70% of the data to train and 30% to test each model.
temp <- df
temp <- temp[!is.na(temp$Position), ]
#unique(df$International_Reputation)
#unique(temp$International_Reputation)
#change club to as.numeric
model1 <- temp %>%
mutate(Result = ifelse(Wage < 5, "Bottom", "Top")) %>%
select(Position, Result)
model1
## # A tibble: 17,918 x 2
## Position Result
## <chr> <chr>
## 1 RF Top
## 2 ST Top
## 3 LW Top
## 4 GK Top
## 5 RCM Top
## 6 LF Top
## 7 RCM Top
## 8 RS Top
## 9 RCB Top
## 10 GK Top
## # … with 17,908 more rows
model2 <- temp %>%
mutate(Result = ifelse(Wage < 5, "Bottom", "Top")) %>%
mutate(Position = as.numeric(factor(Position))) %>%
mutate(Club = as.numeric(factor(Club))) %>%
select(Potential, Position, Age, International_Reputation, Club, Result)
model2
## # A tibble: 17,918 x 6
## Potential Position Age International_Reputation Club Result
## <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
## 1 94 22 31 5 213 Top
## 2 94 27 33 5 328 Top
## 3 93 15 26 5 435 Top
## 4 93 6 27 4 375 Top
## 5 92 20 27 4 374 Top
## 6 91 12 27 4 136 Top
## 7 91 20 32 4 472 Top
## 8 91 24 31 5 213 Top
## 9 91 19 32 4 472 Top
## 10 93 6 25 3 60 Top
## # … with 17,908 more rows
in_validation1 <- sample(nrow(model1), as.integer(nrow(model1)*70/100))
validation_set1 <- model1[-in_validation1,]
training_set1 <- model1[in_validation1,]
in_validation2 <- sample(nrow(model2), as.integer(nrow(model2)*70/100))
validation_set2 <- model2[-in_validation2,]
training_set2 <- model2[in_validation2,]
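A small caveat about the split above: sample() is not seeded, so the 70/30 split (and with it the confusion matrices further down) will change on every knit. Putting a call like the following before the sample() lines would make the results reproducible; the seed value here is arbitrary and purely illustrative.
# Illustrative only: any fixed seed, placed before sample(), makes the split reproducible
set.seed(110)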
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
rf_model1 <- train(Result~.,
data = training_set1,
na.action = na.pass,
method="rf",
trControl=trainControl(method="none"))
rf_model1
## Random Forest
##
## 12542 samples
## 1 predictor
## 2 classes: 'Bottom', 'Top'
##
## No pre-processing
## Resampling: None
rf_model2 <- train(Result~.,
data = training_set2,
na.action = na.pass,
method="rf",
trControl=trainControl(method="none"))
rf_model2
## Random Forest
##
## 12542 samples
## 5 predictor
## 2 classes: 'Bottom', 'Top'
##
## No pre-processing
## Resampling: None
test_predictions1 <- predict(rf_model1, newdata = validation_set1)
table(pred=test_predictions1,
observed=validation_set1$Result)
## observed
## pred Bottom Top
## Bottom 2939 1889
## Top 251 297
test_predictions2 <- predict(rf_model2, newdata = validation_set2)
table(pred=test_predictions2,
observed=validation_set2$Result)
## observed
## pred Bottom Top
## Bottom 2904 598
## Top 293 1581
Precision1 <- 2918/(2918 + 1915)
Recall1 <- 2918/(2918 + 250)
Precision1
## [1] 0.6037658
Recall1
## [1] 0.9210859
Precision2 <- 3032/(3032 + 639)
Recall2 <-3032/(3032 + 225)
Precision2
## [1] 0.825933
Recall2
## [1] 0.930918
f1score1 <- 2*(Precision1 * Recall1)/(Precision1 + Recall1)
f1score1
## [1] 0.7294088
f1score2 <- 2*(Precision2 * Recall2)/(Precision2 + Recall2)
f1score2
## [1] 0.8752887
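The precision and recall above are computed from hard-coded counts, which can drift out of sync with the confusion matrices whenever the unseeded split changes (as appears to have happened here). As a minimal sketch, the same metrics can be derived directly from the confusion matrices so they always agree with the printed tables; Bottom is treated as the positive class, matching the manual calculations above.
# Store the confusion matrices so the metrics are derived from them directly
cm1 <- table(pred = test_predictions1, observed = validation_set1$Result)
cm2 <- table(pred = test_predictions2, observed = validation_set2$Result)
# Precision, recall and F1 for a chosen positive class ("Bottom" here)
metrics <- function(cm, positive = "Bottom") {
  tp <- cm[positive, positive]
  fp <- sum(cm[positive, ]) - tp   # predicted positive, actually the other class
  fn <- sum(cm[, positive]) - tp   # actually positive, predicted the other class
  precision <- tp / (tp + fp)
  recall <- tp / (tp + fn)
  c(precision = precision,
    recall = recall,
    f1 = 2 * precision * recall / (precision + recall))
}
metrics(cm1)
metrics(cm2)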