For this exercise, I examine the association between player Wage and other variables in the dataset, such as Age, International Reputation, and Potential. I start with some exploratory analysis and plots to get a sense of the data before building regression and classification models and evaluating their results.

I have divided the file into four sections: data cleaning, plots to visualize the data as part of EDA, linear regression, and random forest.

knitr::opts_chunk$set(echo = TRUE)

Loading necessary libraries

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.3     ✓ purrr   0.3.4
## ✓ tibble  3.0.6     ✓ dplyr   1.0.3
## ✓ tidyr   1.1.2     ✓ stringr 1.4.0
## ✓ readr   1.4.0     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(broom)
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union

Loading the dataset

setwd("~/Desktop/DATA110")
df <- read_csv("data.csv")
## Warning: Missing column names filled in: 'X1' [1]
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   .default = col_character(),
##   X1 = col_double(),
##   ID = col_double(),
##   Age = col_double(),
##   Overall = col_double(),
##   Potential = col_double(),
##   Special = col_double(),
##   `International Reputation` = col_double(),
##   `Weak Foot` = col_double(),
##   `Skill Moves` = col_double(),
##   `Jersey Number` = col_double(),
##   Crossing = col_double(),
##   Finishing = col_double(),
##   HeadingAccuracy = col_double(),
##   ShortPassing = col_double(),
##   Volleys = col_double(),
##   Dribbling = col_double(),
##   Curve = col_double(),
##   FKAccuracy = col_double(),
##   LongPassing = col_double(),
##   BallControl = col_double()
##   # ... with 24 more columns
## )
## ℹ Use `spec()` for the full column specifications.

PART I: DATA CLEANING

Replacing all spaces in column names with underscores

Removing the euro symbol and the trailing character "K" from Wage (the values remain expressed in thousands of euros)

Converting Wage to a double and retaining only rows where Wage > 0

# Replace spaces in column names with underscores
names(df) <- str_replace_all(names(df), c(" " = "_"))

euro <- "\u20AC"

# Strip the euro symbol and the trailing "K" from Wage
df$Wage <- gsub(euro, "", df$Wage)
df$Wage <- gsub("K", "", df$Wage)

# Convert Wage to a double (in thousands of euros) and keep only positive wages
df <- df %>%
  mutate(Wage = as.double(Wage)) %>%
  filter(Wage > 0)

#unique(df$Age)
#unique(df$Wage)
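As a side note, readr's parse_number() would do the same cleaning in one call on the raw "€565K"-style strings; a quick sketch on a hypothetical example value (it still leaves Wage expressed in thousands of euros):

# Sketch: parse_number() drops the currency symbol and the trailing "K"
readr::parse_number("\u20AC565K")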

PART II: PLOTS

Scatterplot of Wage as a function of Age

p1 <- df %>%
  ggplot(mapping = aes(x=factor(Age), y=Wage)) +
    geom_point() +
    labs(title="Wage as a function of Age",
         x = "Player Age",
         y = "Player Wage")
p1

Making a duplicate dataset and assigning levels to International_Reputation

copy <- df %>%
   filter(!is.na(International_Reputation))

copy$International_Reputation[copy$International_Reputation == 1]<- "Below Avg Rank"
copy$International_Reputation[copy$International_Reputation == 2]<- "Avg Rank"
copy$International_Reputation[copy$International_Reputation == 3]<- "Above Avg Rank"
copy$International_Reputation[copy$International_Reputation == 4]<- "High Rank"
copy$International_Reputation[copy$International_Reputation == 5]<- "Highest Rank"

copy$International_Reputation<-factor(copy$International_Reputation, levels=c("Below Avg Rank", "Avg Rank","Above Avg Rank", "High Rank", "Highest Rank"))

#copy
#df
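The same recoding can be written in one step; a sketch equivalent to the five manual reassignments above:

# One-step alternative (sketch): map the numeric codes 1-5 straight to ordered labels
copy <- df %>%
  filter(!is.na(International_Reputation)) %>%
  mutate(International_Reputation = factor(International_Reputation,
                                           levels = 1:5,
                                           labels = c("Below Avg Rank", "Avg Rank",
                                                      "Above Avg Rank", "High Rank",
                                                      "Highest Rank")))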

Scatterplot of Wage with respect to Age

Adding International_Reputation as a third variable (mapped to color) to see whether it helps explain the relationship

p2 <- copy %>%
  ggplot(aes(Age, Wage, color = International_Reputation)) + 
  geom_point() +
  ggtitle("Wage as a Function of Age") +
  xlab("Player Age") +
  ylab("Player Wage")
p2  

PART III: LINEAR REGRESSION

My goal here is to fit a linear regression to test the null hypothesis that there is no relationship between Wage and the predictors Age and International_Reputation.


linreg <- df%>%
  filter(!is.na(International_Reputation))

lm_obj <- lm(Wage ~ Age + International_Reputation, data = linreg)
stats <- broom::tidy(lm_obj)
stats
## # A tibble: 3 x 5
##   term                     estimate std.error statistic     p.value
##   <chr>                       <dbl>     <dbl>     <dbl>       <dbl>
## 1 (Intercept)               -29.0      0.695     -41.8  0          
## 2 Age                        -0.134    0.0271     -4.94 0.000000782
## 3 International_Reputation   38.0      0.321     118.   0
#cor(df$Age, df$International_Reputation)

The p-value for both predictors is well below 0.05, so we reject the null hypothesis of no relationship.
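As a quick sanity check on what the fitted model implies, here is a sketch predicting the weekly wage (in thousands of euros) for a hypothetical 25-year-old with an international reputation of 3; using the rounded coefficients above, this is roughly -29.0 - 0.134*25 + 38.0*3 ≈ 82.

# Sketch: model-implied weekly wage (in thousands of euros) for a hypothetical player
predict(lm_obj, newdata = data.frame(Age = 25, International_Reputation = 3))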

For this conclusion to be trustworthy, the standard assumptions of linear regression need to hold (a quick diagnostic sketch follows the list):

  1. Linear relationship: there exists a linear relationship between the independent variable, x, and the dependent variable, y
  2. Independence: the residuals are independent
  3. Homoscedasticity: the residuals have constant variance at every level of x
  4. Normality: the residuals of the model are normally distributed
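Base R's plot() method for lm objects gives a quick look at several of these at once; a sketch (the plots below build similar views by hand with ggplot2):

# Sketch: residuals vs fitted (linearity, homoscedasticity) and normal Q-Q (normality)
plot(lm_obj, which = 1:2)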

Residual plot for one of the two independent variables (Age)

p3 <- lm_obj %>%
  augment() %>%
  ggplot(aes(x = factor(Age), y = .resid)) +
    geom_point() +
    labs(title="Residuals vs Age",
         x = "Age",
         y = "Residuals")
p3

Residual plot for one of the two independent variables (International_Reputation)

p4 <- lm_obj %>%
  augment() %>%
  ggplot(aes(x = International_Reputation, y = .resid)) +
    geom_point() +
    labs(title="Residuals vs International Reputation",
         x = "International Reputation",
         y = "Residuals")
p4

Residual plot for fitted values

p5 <- lm_obj %>%
  augment() %>%
  # fitted values are continuous, so keep them on a continuous axis
  ggplot(aes(x = .fitted, y = .resid)) +
    geom_point() +
    labs(title="Residuals vs Fitted",
         x = "Fitted",
         y = "Residuals")
p5

Interpreting these plots can be subjective, but the most straightforward one to evaluate is the residuals vs fitted plot, which reveals heteroscedasticity: the residuals do not have constant variance across the fitted values. I suspect that Wage depends not just on Age and International_Reputation but on other factors as well.

As EDA is an iterative process, there are several things we can do when this happens: rebuild the model with different independent variables, transform the skewed or non-linear data, or fit a non-linear regression model while guarding against overfitting. One example of the transformation route is sketched below.
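A minimal sketch of the transformation route (not part of the analysis above): refitting the same predictors on log(Wage), which often stabilizes residual variance for skewed, strictly positive outcomes such as wages.

# Sketch: same predictors, but modeling log(Wage); Wage > 0 is guaranteed by the cleaning step
log_lm <- lm(log(Wage) ~ Age + International_Reputation, data = linreg)
broom::tidy(log_lm)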

PART IV: RANDOM FOREST

In the previous section we observed that other factors might be at play, so my goal here is to run an experiment: predict the Wage bracket from a single predictor (Position), and then from multiple predictors (Potential, Position, Age, International_Reputation, and Club). I split Wage into Bottom and Top brackets (below or above 5, i.e. €5K per week) for a random forest classification, then use 70% of the data to train and 30% to test each model.

temp <- df
temp <- temp[!is.na(temp$Position), ]

#unique(df$International_Reputation)
#unique(temp$International_Reputation)
#change club to as.numeric

model1 <- temp %>%
  mutate(Result = ifelse(Wage < 5, "Bottom", "Top")) %>%
  select(Position, Result)
model1
## # A tibble: 17,918 x 2
##    Position Result
##    <chr>    <chr> 
##  1 RF       Top   
##  2 ST       Top   
##  3 LW       Top   
##  4 GK       Top   
##  5 RCM      Top   
##  6 LF       Top   
##  7 RCM      Top   
##  8 RS       Top   
##  9 RCB      Top   
## 10 GK       Top   
## # … with 17,908 more rows
model2 <- temp %>%
  mutate(Result = ifelse(Wage < 5, "Bottom", "Top")) %>%
  mutate(Position = as.numeric(factor(Position))) %>%
  mutate(Club = as.numeric(factor(Club))) %>%
  select(Potential, Position, Age, International_Reputation, Club, Result)
model2
## # A tibble: 17,918 x 6
##    Potential Position   Age International_Reputation  Club Result
##        <dbl>    <dbl> <dbl>                    <dbl> <dbl> <chr> 
##  1        94       22    31                        5   213 Top   
##  2        94       27    33                        5   328 Top   
##  3        93       15    26                        5   435 Top   
##  4        93        6    27                        4   375 Top   
##  5        92       20    27                        4   374 Top   
##  6        91       12    27                        4   136 Top   
##  7        91       20    32                        4   472 Top   
##  8        91       24    31                        5   213 Top   
##  9        91       19    32                        4   472 Top   
## 10        93        6    25                        3    60 Top   
## # … with 17,908 more rows
# Draw 70% of the row indices for training; the remaining 30% form the validation set
in_validation1 <- sample(nrow(model1), as.integer(nrow(model1)*70/100))
validation_set1 <- model1[-in_validation1,]
training_set1 <- model1[in_validation1,]

in_validation2 <- sample(nrow(model2), as.integer(nrow(model2)*70/100))
validation_set2 <- model2[-in_validation2,]
training_set2 <- model2[in_validation2,]
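One caveat: sample() is not seeded here, so the split (and every count reported below) will differ from run to run. A sketch of how the split could be made reproducible (the seed value is arbitrary):

# Sketch: fixing the RNG seed before sampling makes the 70/30 split repeatable
set.seed(110)
in_validation1 <- sample(nrow(model1), as.integer(nrow(model1) * 0.7))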

Training my two models: Wage bracket modeled on Position alone, and Wage bracket modeled on the multiple predictors above

library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
rf_model1 <- train(Result~.,
                data = training_set1,
                 na.action = na.pass,
                method="rf",
                trControl=trainControl(method="none"))
rf_model1
## Random Forest 
## 
## 12542 samples
##     1 predictor
##     2 classes: 'Bottom', 'Top' 
## 
## No pre-processing
## Resampling: None
rf_model2 <- train(Result~.,
                data = training_set2,
                na.action = na.pass,
                method="rf",
                trControl=trainControl(method="none"))
rf_model2
## Random Forest 
## 
## 12542 samples
##     5 predictor
##     2 classes: 'Bottom', 'Top' 
## 
## No pre-processing
## Resampling: None
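Note that trainControl(method = "none") skips resampling entirely, which is why caret reports no accuracy estimates above. A sketch of how 5-fold cross-validation could be requested instead (slower, but it returns resampled performance estimates and tunes mtry):

# Sketch: the multi-predictor model refit with 5-fold cross-validation
rf_cv <- train(Result ~ .,
               data = training_set2,
               method = "rf",
               trControl = trainControl(method = "cv", number = 5))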

Resulting table for the first model

test_predictions1 <- predict(rf_model1, newdata = validation_set1)

table(pred=test_predictions1,
      observed=validation_set1$Result)
##         observed
## pred     Bottom  Top
##   Bottom   2939 1889
##   Top       251  297

Resulting table for the second model

test_predictions2 <- predict(rf_model2, newdata = validation_set2)

table(pred=test_predictions2,
      observed=validation_set2$Result)
##         observed
## pred     Bottom  Top
##   Bottom   2904  598
##   Top       293 1581

Calculating Precision and Recall for the first model, treating "Bottom" as the positive class

Precision1 <- 2918/(2918 + 1915)
Recall1 <- 2918/(2918 + 250)
Precision1
## [1] 0.6037658
Recall1
## [1] 0.9210859

Calculating Precision and Recall for the second model (again with "Bottom" as the positive class)

Precision2 <- 3032/(3032 + 639)
Recall2 <- 3032/(3032 + 225)
Precision2
## [1] 0.825933
Recall2
## [1] 0.930918

Calculating f1 scores for both models

f1score1 <- 2*(Precision1 * Recall1)/(Precision1 + Recall1)
f1score1
## [1] 0.7294088
f1score2 <- 2*(Precision2 * Recall2)/(Precision2 + Recall2)
f1score2
## [1] 0.8752887
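Since the counts used above were typed in by hand from one particular random split, a more robust approach is to read them straight out of the confusion matrix; a sketch, again treating "Bottom" as the positive class:

# Sketch: precision, recall and F1 computed directly from the confusion matrix
cm <- table(pred = test_predictions2, observed = validation_set2$Result)
precision <- cm["Bottom", "Bottom"] / sum(cm["Bottom", ])
recall    <- cm["Bottom", "Bottom"] / sum(cm[, "Bottom"])
f1        <- 2 * precision * recall / (precision + recall)
c(precision = precision, recall = recall, f1 = f1)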

We observe that the model with multiple predictors performs noticeably better than the single-predictor model (an F1 score of about 0.88 versus 0.73)!