library(keras)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(reticulate)
library(tensorflow)
library(neuralnet)
##
## Attaching package: 'neuralnet'
## The following object is masked from 'package:dplyr':
##
## compute
library(dplyr)
library(magrittr)
library(ggplot2)
library(ggcorrplot)
library(DescTools)
library(readr)
use_python("/Users/osim/miniforge3/envs/python3106/bin/python3")
#Recheck python config
py_discover_config()$python
## [1] "/Users/osim/miniforge3/envs/python3106/bin/python3"
This data set has 3,907 observations across 7 predictor variables. The target variable is Wage which is stated in USD. The objective is to use deep neural network to forecast Wage for footballers taking into account information such as age, club, position and others, and compare this with results obtained from OLS Regression.
In this exercise we will explore the usage of categorical predictor variable in DNN, and test if it performs better than OLS. We will apply one hot encoding (OHE) technique as well as categorical variable embeddings. We will also briefly touch on dimensionality reduction in categorical variables by using the embedding technique.
#Read data
salary <- read_csv("SalaryPrediction.csv")
## Rows: 3907 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): Club, League, Nation, Position
## dbl (3): Age, Apps, Caps
## num (1): Wage
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#Check the first few lines
head(salary)
## # A tibble: 6 × 8
## Wage Age Club League Nation Position Apps Caps
## <dbl> <dbl> <chr> <chr> <chr> <chr> <dbl> <dbl>
## 1 46427000 23 PSG Ligue 1 Uber Eats FRA Forward 190 57
## 2 42125000 30 PSG Ligue 1 Uber Eats BRA Midfilder 324 119
## 3 34821000 35 PSG Ligue 1 Uber Eats ARG Forward 585 162
## 4 19959000 31 R. Madrid La Liga BEL Forward 443 120
## 5 19500000 31 Man UFC Premier League ESP Goalkeeper 480 45
## 6 18810000 30 R. Madrid La Liga AUT Defender 371 94
Information on dataset
The following will need to be converted to factors: - Club (116 unique values) - Nation (114 unique values) - Position (4 unique values)
While technically league is also a factor, league is dependent on club and we may consider dropping league out of the dataset as it provide no new information for our target variable
salary<-salary %>%
mutate(Club = as.factor(Club),
Nation= as.factor(Nation),
Position=as.factor(Position)) %>%
select(-League)
str(salary)
## tibble [3,907 × 7] (S3: tbl_df/tbl/data.frame)
## $ Wage : num [1:3907] 46427000 42125000 34821000 19959000 19500000 ...
## $ Age : num [1:3907] 23 30 35 31 31 30 29 30 27 29 ...
## $ Club : Factor w/ 116 levels "1. FC Koln","A. Bilbao",..: 75 75 75 77 64 77 54 59 31 64 ...
## $ Nation : Factor w/ 114 levels "ALB","ALG","ANG",..: 41 14 4 9 38 7 9 35 36 41 ...
## $ Position: Factor w/ 4 levels "Defender","Forward",..: 2 4 2 2 3 1 2 2 4 1 ...
## $ Apps : num [1:3907] 190 324 585 443 480 371 427 367 326 287 ...
## $ Caps : num [1:3907] 57 119 162 120 45 94 102 85 77 86 ...
range(salary$Age)
## [1] 18 41
Lets group the age into 3 brackets - Prime: Age 18 - 25 - Middle: Age 25 - 32 - Senior: Age 35 - 41
# Create a vector of age breaks
age_breaks <- c(18, 25, 35, 41, Inf)
# Create labels for the categories
age_labels <- c("Prime", "Middle", "Middle", "Senior")
# Use cut to categorize the Age column
salary$Age_Category <- cut(salary$Age, breaks = age_breaks, labels = age_labels, right = FALSE)
salary$Age_Category <- as.factor(salary$Age_Category)
# View the resulting dataframe
head(salary)
## # A tibble: 6 × 8
## Wage Age Club Nation Position Apps Caps Age_Category
## <dbl> <dbl> <fct> <fct> <fct> <dbl> <dbl> <fct>
## 1 46427000 23 PSG FRA Forward 190 57 Prime
## 2 42125000 30 PSG BRA Midfilder 324 119 Middle
## 3 34821000 35 PSG ARG Forward 585 162 Middle
## 4 19959000 31 R. Madrid BEL Forward 443 120 Middle
## 5 19500000 31 Man UFC ESP Goalkeeper 480 45 Middle
## 6 18810000 30 R. Madrid AUT Defender 371 94 Middle
Wages: The distribution of wages in our dataset is highly skewed to the right. The distribution can be summarized from boxplot below. As we can see, we have some outliers in the > USD 50 million per year range. Most footballers earn somewhere along USD 1 million per year, with the lowest earning only USD 1,400 per year
Age: Most footballers are around 24 years old, with the youngest being 18 years old and the oldest being 41 years old. What’s interesting about the age is that there are only 4 players we have classified as senior i.e. above age 35. This means our dataset may not have enough representation from the senior category and may not be able to generalize when presented with unseen data with a senior footballer.
Club: MRT dominates the club represented in the dataset
Nation: Most of the footballers are from the European region, with Spain being the most dominant, followed by Portugal, England, France, Germany and Italy.
summary(salary)
## Wage Age Club Nation
## Min. : 1400 Min. :18.00 MRT : 64 ESP : 452
## 1st Qu.: 75500 1st Qu.:20.00 BRG : 61 POR : 428
## Median : 399000 Median :24.00 VIZ : 57 ENG : 410
## Mean : 1367959 Mean :24.12 Chelsea : 54 FRA : 353
## 3rd Qu.: 1560000 3rd Qu.:28.00 Estoril Praia: 51 GER : 298
## Max. :46427000 Max. :41.00 Gil Vicente : 51 ITA : 278
## (Other) :3569 (Other):1688
## Position Apps Caps Age_Category
## Defender :1490 Min. : 0.0 Min. : 0.000 Prime :2205
## Forward : 827 1st Qu.: 15.0 1st Qu.: 0.000 Middle:1698
## Goalkeeper: 430 Median :115.0 Median : 0.000 Senior: 4
## Midfilder :1160 Mean :140.1 Mean : 8.927
## 3rd Qu.:224.5 3rd Qu.: 6.000
## Max. :715.0 Max. :180.000
##
ggplot(salary, aes(y = Wage)) +
geom_boxplot() +
labs(y = "Wage Distribution") +
theme_minimal()
Numeric variables include: - Wage (target variable) - Age - Apps - Caps
salary %>%
select_if(is.numeric) %>%
cor() %>%
ggcorrplot()
Insights - Wages are not strongly correlated with age and appearances, suggesting that other factors may influence wages (perhaps one of the categorical variables) - Age and appearances are strongly correlated, as the more senior the footballer is, the number of appearances tend to be higher. However, better salaried players are often the better players, tend to have more appearances despite their age. We will later see that the relationship between wages and age are not linear - All the numeric variables are correlated positively with wages
Conventionally, football lovers understand that wages are largely determined by the club a player belongs to. According to our datasets, footballers from PSG make the most money on average, followed by Real Madrid and FC Bayern.
However, if we were to change the centredness measure from ‘mean’ to ‘median’, we get a totally different picture. The club with the highest median salary is FC Bayern, suggesting presence of outliers with distribution being skewed to the right.
salary %>%
group_by(Club) %>%
summarise(avg_wages= mean(Wage), median_wage = median(Wage)) %>%
arrange(desc(avg_wages))
## # A tibble: 116 × 3
## Club avg_wages median_wage
## <fct> <dbl> <dbl>
## 1 PSG 11046675 5908000
## 2 R. Madrid 9128783. 7984000
## 3 FC Bayern 8708043. 9457000
## 4 Barcelona 6313604. 6931500
## 5 Man UFC 4622174. 2184000
## 6 Man City 4526184. 2080000
## 7 Chelsea 4246630. 2600000
## 8 Juventus 4120172. 2683500
## 9 A. Madrid 4096265. 3884500
## 10 Borussia Dortmund 3907577. 3438000
## # ℹ 106 more rows
How about age?
Naturally as a player ages, his salary growth may be slower, but in absolute terms our dataset shows that footballers’ salary peaks at age 25 - 35. This is consistent when measured between both mean and median.
salary %>%
group_by(Age_Category) %>%
summarise(avg_wages = mean(Wage), median_wage = median(Wage)) %>%
arrange(desc(avg_wages))
## # A tibble: 3 × 3
## Age_Category avg_wages median_wage
## <fct> <dbl> <dbl>
## 1 Middle 2237448. 1103500
## 2 Senior 762250 546500
## 3 Prime 699493. 112000
How about position?
Midfielders are most likely to score the highest wages compared to their counterparts, with goalkeepers being the lowest
salary %>%
group_by(Position) %>%
summarise(avg_wages = mean(Wage), median_wage = median(Wage)) %>%
arrange(desc(avg_wages))
## # A tibble: 4 × 3
## Position avg_wages median_wage
## <fct> <dbl> <dbl>
## 1 Midfilder 1603824. 528000
## 2 Forward 1342720. 273000
## 3 Defender 1299439. 440000
## 4 Goalkeeper 1017645. 227500
To reduce the skewness of the data we will handle outliers in the Wage column with Winsorize from DescTools
library(DescTools)
lower_bound <- 0.10
upper_bound <- 0.97
#Before outlier handling
salary$Wage %>%
plot()
#After outlier handling
salary$Wage <- DescTools::Winsorize(salary$Wage, probs = c(lower_bound, upper_bound))
salary$Wage %>%
plot
Football lovers believe that clubs are often the factor that explains the most differences in wages. As such we will want to ensure that both the train and test dataset has more or less equal representation of each clubs.
library(rsample)
set.seed(100)
index <- initial_split(data= salary, prop=0.8, strata="Club")
## Warning: Too little data to stratify.
## • Resampling will be unstratified.
data_train <- training(index)
data_test <- testing(index)
The split of our training and testing dataset is as follows
dim(data_train)
## [1] 3125 8
dim(data_test)
## [1] 782 8
It’s generally a good practice to normalize the target variable when you are working with neural networks, especially if the scale of the target variable varies significantly. Normalization can help the training process by making it easier for the model to converge.
\[X_{scaled}= \frac{X−X_{min}} {X_{max}−X_{min}}\]
#normalization of numerical variables using scale(). Note that in scale() we will use mean and standard deviation instead of the formula above
data_train_norm <- data_train %>%
mutate(Wage = scale(Wage),
Age= scale(Age),
Apps = scale(Apps),
Caps = scale(Caps))
data_test_norm <- data_test %>%
mutate(Wage = scale(Wage),
Age= scale(Age),
Apps = scale(Apps),
Caps = scale(Caps))
# We will use numpy from Python to convert numerical data into matrix format
np<- import("numpy")
arr<- np$array(1:10)
train_x <- data_train_norm %>% select_if(is.numeric) %>% select(-Wage) %>% as.matrix()
train_y <- data_train_norm[,"Wage"] %>% as.matrix()
test_x <- data_test_norm %>% select_if(is.numeric) %>% select(-Wage) %>% as.matrix()
test_y <- data_test_norm[,"Wage"] %>% as.matrix()
#Convert categorical data to matrix / array type
categorical_train_x <- data_train_norm %>%
select_if(~ !is.numeric(.)) %>%
as.matrix()
categorical_test_x <- data_test_norm %>%
select_if(~ !is.numeric(.)) %>%
as.matrix()
head(categorical_train_x)
## Club Nation Position Age_Category
## [1,] "PSG" "FRA" "Forward" "Prime"
## [2,] "PSG" "BRA" "Midfilder" "Middle"
## [3,] "PSG" "ARG" "Forward" "Middle"
## [4,] "R. Madrid" "BEL" "Forward" "Middle"
## [5,] "Man UFC" "ESP" "Goalkeeper" "Middle"
## [6,] "R. Madrid" "AUT" "Defender" "Middle"
For now we will not be binding the numerical and categorical predictors. We will start with estimating wage using normalized numerical predictors
#final check on number of rows
nrow(train_x)
## [1] 3125
nrow(train_y)
## [1] 3125
nrow(categorical_train_x)
## [1] 3125
nrow(test_x)
## [1] 782
nrow(test_y)
## [1] 782
nrow(categorical_test_x)
## [1] 782
This is the final step in Cross Validation step. As we will be using keras library, we need to convert the predictor dataset into array. It is not necessary to apply the same treatment to the response variable
train_x <- array_reshape(x=train_x, dim=dim(train_x))
test_x <- array_reshape(x=test_x, dim=dim(test_x))
train_y <- array_reshape(x=train_y, dim=dim(train_y))
test_y <- array_reshape(x=test_y, dim=dim(test_y))
head(train_x)
## [,1] [,2] [,3]
## [1,] -0.2437054 0.3701913 2.333809
## [2,] 1.1752605 1.3938895 5.345201
## [3,] 2.1888076 3.3878092 7.433747
## [4,] 1.3779699 2.3029947 5.393772
## [5,] 1.3779699 2.5856576 1.750959
## [6,] 1.1752605 1.7529479 4.130930
input_dim <- ncol(train_x[,1:3])
num_class <- 1
library(tensorflow)
set_random_seed(100)
model <- keras_model_sequential() %>%
layer_dense(units = 256*2, activation = "relu", input_shape = input_dim) %>%
layer_dense(units = 64, activation = "relu") %>%
layer_dense(units = 1, activation = "linear")
# View the model spec
model
## Model: "sequential"
## ________________________________________________________________________________
## Layer (type) Output Shape Param #
## ================================================================================
## dense_2 (Dense) (None, 512) 2048
## dense_1 (Dense) (None, 64) 32832
## dense (Dense) (None, 1) 65
## ================================================================================
## Total params: 34945 (136.50 KB)
## Trainable params: 34945 (136.50 KB)
## Non-trainable params: 0 (0.00 Byte)
## ________________________________________________________________________________
For the loss function, we will use Mean Squared Error (MSE). This is the function that we are seeking to minimize during the forwardpass and backpropagation loop.
For the coefficient / weight update method, we will apply Adamax. For learning rate, we will use industry best practice rate of 0.001. For accuracy metrics, we will also be using the ‘mean_squaed_error’ given this is a regression problem.
optimizers <- keras::keras$optimizers
model %>% compile(
loss = 'mean_squared_error',
optimizer = optimizer_adamax(learning_rate = 0.001),
metrics = list(mean_squared_error = 'mean_squared_error')
)
history_num <- model %>% fit(x=train_x[,1:3], y=train_y, epoch=10, validation_data = list(test_x[,1:3], test_y), batch_size = 150, verbose=2)
## Epoch 1/10
## 21/21 - 0s - loss: 0.6770 - mean_squared_error: 0.6770 - val_loss: 0.6125 - val_mean_squared_error: 0.6125 - 338ms/epoch - 16ms/step
## Epoch 2/10
## 21/21 - 0s - loss: 0.6029 - mean_squared_error: 0.6029 - val_loss: 0.5799 - val_mean_squared_error: 0.5799 - 48ms/epoch - 2ms/step
## Epoch 3/10
## 21/21 - 0s - loss: 0.5794 - mean_squared_error: 0.5794 - val_loss: 0.5590 - val_mean_squared_error: 0.5590 - 39ms/epoch - 2ms/step
## Epoch 4/10
## 21/21 - 0s - loss: 0.5612 - mean_squared_error: 0.5612 - val_loss: 0.5483 - val_mean_squared_error: 0.5483 - 39ms/epoch - 2ms/step
## Epoch 5/10
## 21/21 - 0s - loss: 0.5507 - mean_squared_error: 0.5507 - val_loss: 0.5403 - val_mean_squared_error: 0.5403 - 37ms/epoch - 2ms/step
## Epoch 6/10
## 21/21 - 0s - loss: 0.5424 - mean_squared_error: 0.5424 - val_loss: 0.5330 - val_mean_squared_error: 0.5330 - 39ms/epoch - 2ms/step
## Epoch 7/10
## 21/21 - 0s - loss: 0.5376 - mean_squared_error: 0.5376 - val_loss: 0.5307 - val_mean_squared_error: 0.5307 - 60ms/epoch - 3ms/step
## Epoch 8/10
## 21/21 - 0s - loss: 0.5362 - mean_squared_error: 0.5362 - val_loss: 0.5249 - val_mean_squared_error: 0.5249 - 40ms/epoch - 2ms/step
## Epoch 9/10
## 21/21 - 0s - loss: 0.5318 - mean_squared_error: 0.5318 - val_loss: 0.5243 - val_mean_squared_error: 0.5243 - 41ms/epoch - 2ms/step
## Epoch 10/10
## 21/21 - 0s - loss: 0.5284 - mean_squared_error: 0.5284 - val_loss: 0.5229 - val_mean_squared_error: 0.5229 - 40ms/epoch - 2ms/step
plot(history_num)
For comparison we will also fit the same data with OLS.
ols_num <- lm(Wage~Age+Apps+Caps, data_train_norm)
summary(ols_num)
##
## Call:
## lm(formula = Wage ~ Age + Apps + Caps, data = data_train_norm)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.2670 -0.3152 -0.2078 0.1223 3.9820
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.772e-15 1.410e-02 0.000 1
## Age -3.651e-01 3.722e-02 -9.809 <2e-16 ***
## Apps 5.950e-01 3.908e-02 15.224 <2e-16 ***
## Caps 4.133e-01 1.668e-02 24.782 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7882 on 3121 degrees of freedom
## Multiple R-squared: 0.3794, Adjusted R-squared: 0.3788
## F-statistic: 636 on 3 and 3121 DF, p-value: < 2.2e-16
We will use one hot encoding (OHE) on the Positions variable, which has 4 unique variables. During the EDA stage we have established that positions do play an important role in determining Wages.
unique(data_train_norm$Position)
## [1] Forward Midfilder Goalkeeper Defender
## Levels: Defender Forward Goalkeeper Midfilder
position_train_x <- data_train_norm %>%
select(Position, Age, Apps, Caps) %>%
mutate(Position=as.integer(Position))
position_test_x <- data_test_norm %>%
select(Position, Age, Apps, Caps) %>%
mutate(Position=as.integer(Position))
#Convert to one hot encoding
position_ohe_train <- to_categorical(position_train_x$Position)
position_ohe_test <- to_categorical(position_test_x$Position)
#Add the numerical variables in
position_ohe_train <- cbind(position_ohe_train, position_train_x$Apps, position_train_x$Caps, position_train_x$Age)
position_ohe_test <- cbind(position_ohe_test, position_test_x$Apps, position_test_x$Caps, position_test_x$Age)
library(tensorflow)
set_random_seed(100)
model_cat <- keras_model_sequential() %>%
layer_dense(units = 256, activation = "linear", input_shape = ncol(position_ohe_train)) %>%
layer_dense(units = 32, activation = "tanh") %>%
layer_dense(units = 1, activation = "linear")
# View the model spec
model_cat
## Model: "sequential_1"
## ________________________________________________________________________________
## Layer (type) Output Shape Param #
## ================================================================================
## dense_5 (Dense) (None, 256) 2304
## dense_4 (Dense) (None, 32) 8224
## dense_3 (Dense) (None, 1) 33
## ================================================================================
## Total params: 10561 (41.25 KB)
## Trainable params: 10561 (41.25 KB)
## Non-trainable params: 0 (0.00 Byte)
## ________________________________________________________________________________
# Compile the model
model_cat %>% compile(
loss = 'mse',
optimizer = optimizer_adamax(learning_rate = 0.001),
metrics = list(mean_squared_error = 'mean_squared_error')
)
# Train the model
history_cat <- model_cat %>% fit(x=position_ohe_train, y=train_y, epoch=10, validation_data = list(position_ohe_test, test_y), batch_size = 150, verbose=2)
## Epoch 1/10
## 21/21 - 0s - loss: 0.7300 - mean_squared_error: 0.7300 - val_loss: 0.6195 - val_mean_squared_error: 0.6195 - 282ms/epoch - 13ms/step
## Epoch 2/10
## 21/21 - 0s - loss: 0.6109 - mean_squared_error: 0.6109 - val_loss: 0.5867 - val_mean_squared_error: 0.5867 - 34ms/epoch - 2ms/step
## Epoch 3/10
## 21/21 - 0s - loss: 0.5934 - mean_squared_error: 0.5934 - val_loss: 0.5762 - val_mean_squared_error: 0.5762 - 33ms/epoch - 2ms/step
## Epoch 4/10
## 21/21 - 0s - loss: 0.5846 - mean_squared_error: 0.5846 - val_loss: 0.5687 - val_mean_squared_error: 0.5687 - 34ms/epoch - 2ms/step
## Epoch 5/10
## 21/21 - 0s - loss: 0.5768 - mean_squared_error: 0.5768 - val_loss: 0.5642 - val_mean_squared_error: 0.5642 - 32ms/epoch - 2ms/step
## Epoch 6/10
## 21/21 - 0s - loss: 0.5717 - mean_squared_error: 0.5717 - val_loss: 0.5576 - val_mean_squared_error: 0.5576 - 32ms/epoch - 2ms/step
## Epoch 7/10
## 21/21 - 0s - loss: 0.5674 - mean_squared_error: 0.5674 - val_loss: 0.5527 - val_mean_squared_error: 0.5527 - 31ms/epoch - 1ms/step
## Epoch 8/10
## 21/21 - 0s - loss: 0.5637 - mean_squared_error: 0.5637 - val_loss: 0.5502 - val_mean_squared_error: 0.5502 - 31ms/epoch - 1ms/step
## Epoch 9/10
## 21/21 - 0s - loss: 0.5587 - mean_squared_error: 0.5587 - val_loss: 0.5467 - val_mean_squared_error: 0.5467 - 32ms/epoch - 2ms/step
## Epoch 10/10
## 21/21 - 0s - loss: 0.5543 - mean_squared_error: 0.5543 - val_loss: 0.5447 - val_mean_squared_error: 0.5447 - 31ms/epoch - 1ms/step
plot(history_cat)
#### Fit OLS for comparison
ols_cat <- lm(Wage~Position+Age+Apps+Caps, data_train_norm)
summary(ols_cat)
##
## Call:
## lm(formula = Wage ~ Position + Age + Apps + Caps, data = data_train_norm)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.3070 -0.3282 -0.2000 0.1197 3.9663
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.01637 0.02268 0.722 0.4705
## PositionForward -0.09896 0.03869 -2.558 0.0106 *
## PositionGoalkeeper -0.02988 0.04896 -0.610 0.5417
## PositionMidfilder 0.02538 0.03479 0.729 0.4658
## Age -0.36788 0.03911 -9.407 <2e-16 ***
## Apps 0.59608 0.04056 14.695 <2e-16 ***
## Caps 0.41352 0.01666 24.821 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7872 on 3118 degrees of freedom
## Multiple R-squared: 0.3814, Adjusted R-squared: 0.3803
## F-statistic: 320.5 on 6 and 3118 DF, p-value: < 2.2e-16
The issue with Club as categorical variable is as follows:
head(unique(data_train_norm$Club))
## [1] PSG R. Madrid Man UFC Inter Liverpool Man City
## 116 Levels: 1. FC Koln A. Bilbao A. Madrid AC Ajaccio AJ Auxerre ... Wolves
The solution will be to use embeddings which is provided within the keras package. This will help us to:
For us to work with the Club variable, we will need to encode the variable first using OHE.
club_train_x <- data_train_norm %>%
select(Club) %>%
mutate(Club=as.integer(Club))
club_test_x <- data_test_norm %>%
select(Club) %>%
mutate(Club=as.integer(Club))
#Convert to one hot encoding
club_ohe_train <- to_categorical(club_train_x)
club_ohe_test <- to_categorical(club_test_x)
head(club_ohe_train)
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13] [,14]
## [1,] 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [2,] 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [3,] 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [4,] 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [5,] 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [6,] 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [,15] [,16] [,17] [,18] [,19] [,20] [,21] [,22] [,23] [,24] [,25] [,26]
## [1,] 0 0 0 0 0 0 0 0 0 0 0 0
## [2,] 0 0 0 0 0 0 0 0 0 0 0 0
## [3,] 0 0 0 0 0 0 0 0 0 0 0 0
## [4,] 0 0 0 0 0 0 0 0 0 0 0 0
## [5,] 0 0 0 0 0 0 0 0 0 0 0 0
## [6,] 0 0 0 0 0 0 0 0 0 0 0 0
## [,27] [,28] [,29] [,30] [,31] [,32] [,33] [,34] [,35] [,36] [,37] [,38]
## [1,] 0 0 0 0 0 0 0 0 0 0 0 0
## [2,] 0 0 0 0 0 0 0 0 0 0 0 0
## [3,] 0 0 0 0 0 0 0 0 0 0 0 0
## [4,] 0 0 0 0 0 0 0 0 0 0 0 0
## [5,] 0 0 0 0 0 0 0 0 0 0 0 0
## [6,] 0 0 0 0 0 0 0 0 0 0 0 0
## [,39] [,40] [,41] [,42] [,43] [,44] [,45] [,46] [,47] [,48] [,49] [,50]
## [1,] 0 0 0 0 0 0 0 0 0 0 0 0
## [2,] 0 0 0 0 0 0 0 0 0 0 0 0
## [3,] 0 0 0 0 0 0 0 0 0 0 0 0
## [4,] 0 0 0 0 0 0 0 0 0 0 0 0
## [5,] 0 0 0 0 0 0 0 0 0 0 0 0
## [6,] 0 0 0 0 0 0 0 0 0 0 0 0
## [,51] [,52] [,53] [,54] [,55] [,56] [,57] [,58] [,59] [,60] [,61] [,62]
## [1,] 0 0 0 0 0 0 0 0 0 0 0 0
## [2,] 0 0 0 0 0 0 0 0 0 0 0 0
## [3,] 0 0 0 0 0 0 0 0 0 0 0 0
## [4,] 0 0 0 0 0 0 0 0 0 0 0 0
## [5,] 0 0 0 0 0 0 0 0 0 0 0 0
## [6,] 0 0 0 0 0 0 0 0 0 0 0 0
## [,63] [,64] [,65] [,66] [,67] [,68] [,69] [,70] [,71] [,72] [,73] [,74]
## [1,] 0 0 0 0 0 0 0 0 0 0 0 0
## [2,] 0 0 0 0 0 0 0 0 0 0 0 0
## [3,] 0 0 0 0 0 0 0 0 0 0 0 0
## [4,] 0 0 0 0 0 0 0 0 0 0 0 0
## [5,] 0 0 1 0 0 0 0 0 0 0 0 0
## [6,] 0 0 0 0 0 0 0 0 0 0 0 0
## [,75] [,76] [,77] [,78] [,79] [,80] [,81] [,82] [,83] [,84] [,85] [,86]
## [1,] 0 1 0 0 0 0 0 0 0 0 0 0
## [2,] 0 1 0 0 0 0 0 0 0 0 0 0
## [3,] 0 1 0 0 0 0 0 0 0 0 0 0
## [4,] 0 0 0 1 0 0 0 0 0 0 0 0
## [5,] 0 0 0 0 0 0 0 0 0 0 0 0
## [6,] 0 0 0 1 0 0 0 0 0 0 0 0
## [,87] [,88] [,89] [,90] [,91] [,92] [,93] [,94] [,95] [,96] [,97] [,98]
## [1,] 0 0 0 0 0 0 0 0 0 0 0 0
## [2,] 0 0 0 0 0 0 0 0 0 0 0 0
## [3,] 0 0 0 0 0 0 0 0 0 0 0 0
## [4,] 0 0 0 0 0 0 0 0 0 0 0 0
## [5,] 0 0 0 0 0 0 0 0 0 0 0 0
## [6,] 0 0 0 0 0 0 0 0 0 0 0 0
## [,99] [,100] [,101] [,102] [,103] [,104] [,105] [,106] [,107] [,108]
## [1,] 0 0 0 0 0 0 0 0 0 0
## [2,] 0 0 0 0 0 0 0 0 0 0
## [3,] 0 0 0 0 0 0 0 0 0 0
## [4,] 0 0 0 0 0 0 0 0 0 0
## [5,] 0 0 0 0 0 0 0 0 0 0
## [6,] 0 0 0 0 0 0 0 0 0 0
## [,109] [,110] [,111] [,112] [,113] [,114] [,115] [,116] [,117]
## [1,] 0 0 0 0 0 0 0 0 0
## [2,] 0 0 0 0 0 0 0 0 0
## [3,] 0 0 0 0 0 0 0 0 0
## [4,] 0 0 0 0 0 0 0 0 0
## [5,] 0 0 0 0 0 0 0 0 0
## [6,] 0 0 0 0 0 0 0 0 0
The next steps are as follows: - Determine the embedding size, which is 10. Embedding size defines the dimensionality in which we map the categorical variables.(note: best practice formula is min(50, (levels in category)/2) - Setup model sequential with keras. Note that the first layer is the embedding layer with the size of 116 clubs plus 1 (for the unknowns)
embedding_size <- 50 # You can adjust the embedding size as needed
model_club <- keras_model_sequential()
model_club %>%
layer_embedding(input_dim = ncol(club_ohe_train), output_dim = embedding_size, input_length = 1, name = "embedding") %>%
layer_flatten()
model_club %>% compile(
loss = 'mse', # Choose an appropriate loss function for your task
optimizer = optimizer_sgd(learning_rate = 0.001),
metrics = list(mean_absolute_error = 'mean_absolute_error')
)
history_club <- model_club %>% fit(
x = club_ohe_train,
y = train_y,
epochs = 10,
batch_size = 20,
verbose = 2
)
## Epoch 1/10
## 157/157 - 0s - loss: 1.0005 - mean_absolute_error: 0.7040 - 252ms/epoch - 2ms/step
## Epoch 2/10
## 157/157 - 0s - loss: 1.0005 - mean_absolute_error: 0.7040 - 156ms/epoch - 992us/step
## Epoch 3/10
## 157/157 - 0s - loss: 1.0004 - mean_absolute_error: 0.7040 - 152ms/epoch - 971us/step
## Epoch 4/10
## 157/157 - 0s - loss: 1.0004 - mean_absolute_error: 0.7040 - 153ms/epoch - 975us/step
## Epoch 5/10
## 157/157 - 0s - loss: 1.0004 - mean_absolute_error: 0.7040 - 152ms/epoch - 967us/step
## Epoch 6/10
## 157/157 - 0s - loss: 1.0004 - mean_absolute_error: 0.7040 - 148ms/epoch - 944us/step
## Epoch 7/10
## 157/157 - 0s - loss: 1.0004 - mean_absolute_error: 0.7040 - 151ms/epoch - 961us/step
## Epoch 8/10
## 157/157 - 0s - loss: 1.0004 - mean_absolute_error: 0.7039 - 151ms/epoch - 962us/step
## Epoch 9/10
## 157/157 - 0s - loss: 1.0004 - mean_absolute_error: 0.7040 - 149ms/epoch - 947us/step
## Epoch 10/10
## 157/157 - 0s - loss: 1.0004 - mean_absolute_error: 0.7039 - 150ms/epoch - 956us/step
View the first few rows of the embeddings
layer <- get_layer(model_club, "embedding")
embeddings <- data.frame(layer$get_weights()[[1]])
head(embeddings)
## X1 X2 X3 X4 X5 X6
## 1 0.02875300 0.035134811 0.034433164 -0.003375545 -3.069345e-02 -0.02455088
## 2 0.01967478 -0.022206273 -0.016899563 0.018332046 4.827867e-02 0.01367901
## 3 0.04993686 -0.005711056 0.003341865 -0.030904282 -3.841583e-02 0.03251224
## 4 0.03046078 0.039875511 -0.038159609 0.001170255 -3.611344e-02 -0.01407187
## 5 -0.03293520 -0.016121279 0.046199966 0.003860198 -5.329773e-05 -0.01229806
## 6 -0.01728304 0.022101831 -0.023904299 0.033570658 -1.756282e-02 0.01531914
## X7 X8 X9 X10 X11 X12
## 1 -0.03821071 -0.021619556 0.0293216035 -0.014332391 0.03981850 0.02976664
## 2 -0.03853114 0.018403962 -0.0073061557 -0.034128938 0.01486544 0.03413233
## 3 0.03402864 -0.041764487 0.0279657729 0.016743694 -0.00621115 -0.02805263
## 4 0.03790670 0.018186036 0.0004508719 0.013890792 -0.03735243 0.03179023
## 5 0.04875975 -0.009747136 0.0344652645 0.022150826 -0.02643251 0.04805371
## 6 0.03969887 0.003853668 -0.0401910432 -0.009877313 -0.02586526 -0.03670522
## X13 X14 X15 X16 X17 X18
## 1 0.004260068 0.0025901140 -0.015003792 -0.02651241 0.04402053 -0.016252633
## 2 0.032566570 -0.0475305133 0.008825460 0.01218583 0.04807141 0.006764989
## 3 0.001628041 -0.0048393719 0.016315427 -0.02344579 -0.03021523 -0.028632833
## 4 0.034037601 0.0132137649 -0.008629572 -0.01106187 -0.01470318 -0.018513180
## 5 0.016742241 -0.0273922328 0.030237619 0.01193310 0.02547555 0.042310681
## 6 0.017502796 0.0005592108 0.047098782 0.03879628 0.03282156 0.044813301
## X19 X20 X21 X22 X23 X24
## 1 -0.01156808 -0.001333925 -0.04139784 -0.022329846 -0.029929968 0.02363647
## 2 0.03151739 0.033392485 -0.04597090 -0.047692388 -0.048430998 -0.04028874
## 3 0.03155135 0.017119553 0.03447566 0.024171498 -0.002674986 0.02074330
## 4 0.02269641 -0.029212201 -0.03239945 0.021105502 0.006753862 0.00282836
## 5 0.04169590 -0.045613613 -0.01582726 -0.006201755 0.017317485 -0.01373263
## 6 -0.02114627 0.037179139 -0.04423095 0.026478264 0.041760039 -0.02713074
## X25 X26 X27 X28 X29 X30
## 1 -0.0150756501 -0.009772272 0.04483617 0.03199528 -0.02341798 -0.0171471015
## 2 -0.0484469868 -0.014161320 0.01453342 -0.01164273 -0.03623715 -0.0320378989
## 3 -0.0007223599 0.004542459 -0.03875600 -0.03802960 0.01990626 0.0183490999
## 4 0.0466536172 -0.039607942 -0.02455237 -0.03430711 0.01438863 -0.0483338349
## 5 0.0331512131 -0.004731201 0.02543804 0.01959236 0.04620615 0.0072346106
## 6 0.0182837583 0.018837046 0.01262270 -0.01583960 0.04078582 -0.0003410578
## X31 X32 X33 X34 X35 X36
## 1 -0.004591366 0.0468027890 0.03261493 -0.04608252 -0.031942606 0.0058304900
## 2 0.024257185 -0.0006241716 -0.04855789 -0.01630506 0.029527329 0.0002875034
## 3 0.047157656 0.0290874355 -0.01451821 0.02052131 -0.043982234 0.0437591933
## 4 -0.029411091 -0.0136831775 0.03491444 0.03263381 -0.046791077 0.0301393382
## 5 0.016339052 0.0300024264 0.01118507 0.02331772 0.009573556 -0.0170978904
## 6 -0.015969634 0.0032770857 0.01676217 0.01454014 -0.013808526 0.0179371126
## X37 X38 X39 X40 X41 X42
## 1 0.023741983 -0.009336509 0.011379672 0.014329080 -0.022016343 0.019194778
## 2 0.037169758 0.003320597 -0.002742765 0.032914717 -0.032460526 0.006748153
## 3 0.046936978 0.020659063 0.022882316 0.010912180 -0.009330727 0.039998863
## 4 -0.027484966 -0.018144954 0.034046594 0.020311188 0.046760570 0.027523760
## 5 0.005985092 -0.035055637 -0.047816362 -0.005460285 0.048673917 0.033202637
## 6 -0.004728593 0.025162946 -0.005029868 -0.008824993 -0.029998267 -0.040857397
## X43 X44 X45 X46 X47 X48
## 1 0.042652361 0.017718030 -0.045460057 -0.013327010 -0.008109900 0.01517765
## 2 -0.016448006 0.018455902 -0.009740257 -0.030557571 0.024409473 -0.03607062
## 3 0.003282227 0.024329308 0.033591632 -0.008091711 -0.029224766 0.01016330
## 4 0.026841071 -0.040703654 -0.022932161 0.027635206 -0.019096924 -0.03524349
## 5 0.016062919 -0.031814300 -0.043857124 -0.009587608 0.037774179 0.02874125
## 6 0.020239498 -0.003462147 -0.015521146 -0.002646495 0.002920367 0.00390885
## X49 X50
## 1 0.01286508 0.009502525
## 2 0.03657885 0.010331911
## 3 -0.02919959 -0.008041847
## 4 0.03004240 0.046013083
## 5 0.01123013 -0.036329746
## 6 0.02359575 -0.049990237
After training the model, we can extract individual layers.
Note that the embedding layer defines where in the 3D feature space the network has placed each categories of the Club variable. We can visualize this in ggplot.
embeddings$name <- c("unknown", levels(data_train_norm$Club))
ggplot(embeddings, aes(X1, X2, color = name)) +
geom_point() +
geom_text(aes(label = name), hjust = 0, vjust = 0, size = 3) +
theme_bw() +
xlab("Embedding Dimension 1") +
ylab("Embedding Dimension 2") +
guides(color = FALSE) # Turn off legends for color
## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
We can see from the choice of colors that the network has ‘grouped’ the categories of the clubs according to their geography i.e. most of the German clubs are in green, the Spanish clubs are in red etc–although not all is grouped according to geographies. The closer a certain Club is to one another, the closer is their similarities. e.g. Real Madrid, PSG and Barcelona are more similar than Everton, Juventus and Milan.
Now we merge back the embeddings to the original data frame
#merge oribinal data frame with embeddings
dff_train <- merge(data_train_norm, embeddings, by.x="Club", "name")
dff_test<- merge(data_test_norm, embeddings, by.x="Club", "name")
#select only the embeddings for training
embeddings_train<- dff_train %>%
select(-Club, -Wage, -Nation, -Position, -Age_Category)
embeddings_test<- dff_test %>%
select(-Club, -Wage, -Nation, -Position, -Age_Category)
#final check on dimensions
dim(embeddings_train)
## [1] 3125 53
dim(embeddings_test)
## [1] 782 53
# Create a new sequential model for training with embeddings
model_with_embeddings <- keras_model_sequential()
# Add an input layer for embeddings
model_with_embeddings %>%
layer_dense(units = 128, activation = "relu", input_shape = ncol(embeddings_train)) %>%
layer_dense(units = 28, activation = "tanh") %>%
layer_dense(units = 1, activation= "relu")
# Compile the model
model_with_embeddings %>% compile(
loss = 'mse', # Use appropriate loss function for your regression problem
optimizer = optimizer_sgd(learning_rate = 0.001),
metrics = list(mean_squared_error = 'mean_squared_error')
)
# Train the model using your data (including embeddings)
history_club <- model_with_embeddings %>% fit(
x = as.matrix(embeddings_train),
y = train_y,
epochs = 10,
validation_data = list(as.matrix(embeddings_test), test_y),
batch_size = 50, verbose=2
)
## Epoch 1/10
## 63/63 - 0s - loss: 1.0323 - mean_squared_error: 1.0323 - val_loss: 1.0053 - val_mean_squared_error: 1.0053 - 253ms/epoch - 4ms/step
## Epoch 2/10
## 63/63 - 0s - loss: 1.0108 - mean_squared_error: 1.0108 - val_loss: 1.0003 - val_mean_squared_error: 1.0003 - 56ms/epoch - 891us/step
## Epoch 3/10
## 63/63 - 0s - loss: 1.0032 - mean_squared_error: 1.0032 - val_loss: 0.9984 - val_mean_squared_error: 0.9984 - 52ms/epoch - 821us/step
## Epoch 4/10
## 63/63 - 0s - loss: 1.0001 - mean_squared_error: 1.0001 - val_loss: 0.9974 - val_mean_squared_error: 0.9974 - 51ms/epoch - 807us/step
## Epoch 5/10
## 63/63 - 0s - loss: 0.9990 - mean_squared_error: 0.9990 - val_loss: 0.9971 - val_mean_squared_error: 0.9971 - 52ms/epoch - 828us/step
## Epoch 6/10
## 63/63 - 0s - loss: 0.9985 - mean_squared_error: 0.9985 - val_loss: 0.9971 - val_mean_squared_error: 0.9971 - 51ms/epoch - 808us/step
## Epoch 7/10
## 63/63 - 0s - loss: 0.9983 - mean_squared_error: 0.9983 - val_loss: 0.9970 - val_mean_squared_error: 0.9970 - 51ms/epoch - 804us/step
## Epoch 8/10
## 63/63 - 0s - loss: 0.9982 - mean_squared_error: 0.9982 - val_loss: 0.9970 - val_mean_squared_error: 0.9970 - 51ms/epoch - 809us/step
## Epoch 9/10
## 63/63 - 0s - loss: 0.9981 - mean_squared_error: 0.9981 - val_loss: 0.9970 - val_mean_squared_error: 0.9970 - 50ms/epoch - 801us/step
## Epoch 10/10
## 63/63 - 0s - loss: 0.9980 - mean_squared_error: 0.9980 - val_loss: 0.9970 - val_mean_squared_error: 0.9970 - 50ms/epoch - 790us/step
plot(history_club)
ols_club<- lm(Wage~Club+Age+Caps+Apps, data_train_norm)
summary(ols_club)
##
## Call:
## lm(formula = Wage ~ Club + Age + Caps + Apps, data = data_train_norm)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.7482 -0.3131 0.0040 0.2088 3.2199
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.333466 0.127981 -2.606 0.009217 **
## ClubA. Bilbao 0.299850 0.192015 1.562 0.118488
## ClubA. Madrid 1.227395 0.175246 7.004 3.06e-12 ***
## ClubAC Ajaccio -0.321767 0.185454 -1.735 0.082840 .
## ClubAJ Auxerre -0.183480 0.187290 -0.980 0.327333
## ClubAlmeria -0.067650 0.179072 -0.378 0.705621
## ClubAngers SCO -0.015873 0.189533 -0.084 0.933260
## ClubArouca -0.148410 0.194829 -0.762 0.446272
## ClubArsenal 0.918837 0.167821 5.475 4.73e-08 ***
## ClubAS Monaco 0.696947 0.194602 3.581 0.000347 ***
## ClubAston Villa 0.749482 0.165704 4.523 6.33e-06 ***
## ClubAtalanta 0.304990 0.184750 1.651 0.098880 .
## ClubAtletico Pamplona -0.259524 0.201123 -1.290 0.197020
## ClubBarcelona 1.988964 0.195490 10.174 < 2e-16 ***
## ClubBayer 04 0.494880 0.174820 2.831 0.004674 **
## ClubBenfica 0.193026 0.169771 1.137 0.255638
## ClubBologna 0.134209 0.171892 0.781 0.434996
## ClubBorussia Dortmund 1.366702 0.194712 7.019 2.75e-12 ***
## ClubBorussia M'gladbach 0.509616 0.182689 2.790 0.005312 **
## ClubBournemouth 0.487559 0.169591 2.875 0.004070 **
## ClubBrentford 0.358436 0.168551 2.127 0.033537 *
## ClubBrest -0.067261 0.182801 -0.368 0.712938
## ClubBRG 0.019248 0.157981 0.122 0.903035
## ClubBrianza 0.033678 0.182709 0.184 0.853768
## ClubBrighton 0.458487 0.174525 2.627 0.008656 **
## ClubBVT -0.059024 0.179761 -0.328 0.742670
## ClubCadiz -0.142069 0.173117 -0.821 0.411909
## ClubCapitoline 0.790581 0.170903 4.626 3.89e-06 ***
## ClubCasa Pia -0.172895 0.177826 -0.972 0.330995
## ClubChaves -0.159225 0.209528 -0.760 0.447362
## ClubChelsea 1.526877 0.164999 9.254 < 2e-16 ***
## ClubClermont -0.047247 0.218946 -0.216 0.829165
## ClubCremonese -0.036981 0.189266 -0.195 0.845100
## ClubCrystal Palace 0.537598 0.169587 3.170 0.001539 **
## ClubEintracht Frankfurt 0.129550 0.194697 0.665 0.505852
## ClubElche -0.116160 0.180816 -0.642 0.520648
## ClubEmpoli 0.124569 0.174601 0.713 0.475622
## ClubEspanyol -0.103077 0.180803 -0.570 0.568649
## ClubEstoril Praia 0.007785 0.161893 0.048 0.961651
## ClubEverton 0.685651 0.170770 4.015 6.09e-05 ***
## ClubFamalicao 0.059923 0.165177 0.363 0.716793
## ClubFC Augsburg 0.061739 0.173172 0.357 0.721479
## ClubFC Bayern 2.234717 0.196425 11.377 < 2e-16 ***
## ClubFC Lorient 0.010275 0.197778 0.052 0.958570
## ClubFC Nantes -0.061870 0.184747 -0.335 0.737731
## ClubFC Porto 0.062928 0.177591 0.354 0.723107
## ClubFiorentina 0.322779 0.175973 1.834 0.066716 .
## ClubFulham 0.522133 0.179204 2.914 0.003599 **
## ClubGetafe 0.065649 0.182696 0.359 0.719370
## ClubGil Vicente -0.005827 0.166832 -0.035 0.972139
## ClubGirona -0.145731 0.186852 -0.780 0.435494
## ClubHertha Berlin 0.130317 0.177476 0.734 0.462837
## ClubHoffenheim 0.293366 0.177563 1.652 0.098602 .
## ClubInter 0.764277 0.176380 4.333 1.52e-05 ***
## ClubJuventus 1.379453 0.185342 7.443 1.28e-13 ***
## ClubLazio 0.566907 0.180839 3.135 0.001736 **
## ClubLeeds 0.575659 0.171947 3.348 0.000824 ***
## ClubLeicester 0.819626 0.164960 4.969 7.12e-07 ***
## ClubLiverpool 1.260734 0.164146 7.681 2.13e-14 ***
## ClubLOSC 0.010935 0.197607 0.055 0.955874
## ClubMainz 05 0.182796 0.174556 1.047 0.295090
## ClubMallorca -0.162769 0.179056 -0.909 0.363403
## ClubMan City 1.350584 0.185059 7.298 3.72e-13 ***
## ClubMan UFC 1.195299 0.164138 7.282 4.17e-13 ***
## ClubMilan 0.678609 0.169755 3.998 6.55e-05 ***
## ClubMontpellier -0.040407 0.194695 -0.208 0.835603
## ClubMRT 0.016451 0.156700 0.105 0.916395
## ClubNewcastle 0.809253 0.165632 4.886 1.08e-06 ***
## ClubNottm Forest 0.477735 0.165615 2.885 0.003947 **
## ClubOGC Nice 0.414227 0.191859 2.159 0.030928 *
## ClubOL 0.844987 0.213451 3.959 7.71e-05 ***
## ClubOM 0.481139 0.195070 2.466 0.013700 *
## ClubP. Ferreira -0.106482 0.185152 -0.575 0.565264
## ClubParthenope 0.602258 0.182884 3.293 0.001002 **
## ClubPSG 1.966354 0.199501 9.856 < 2e-16 ***
## ClubPTM -0.011670 0.168024 -0.069 0.944631
## ClubR. Madrid 2.679730 0.195590 13.701 < 2e-16 ***
## ClubRB Leipzig 0.795070 0.185027 4.297 1.79e-05 ***
## ClubRC Lens 0.040158 0.194731 0.206 0.836632
## ClubReal Hispalis 0.011829 0.187095 0.063 0.949593
## ClubReal San Sebastian 0.220806 0.180889 1.221 0.222307
## ClubReims 0.015216 0.197825 0.077 0.938697
## ClubRennes 0.198036 0.194643 1.017 0.309029
## ClubRio Ave -0.062034 0.171034 -0.363 0.716856
## ClubSalento 0.132188 0.169699 0.779 0.436068
## ClubSalernitana -0.158363 0.179212 -0.884 0.376948
## ClubSampdoria 0.081534 0.175921 0.463 0.643063
## ClubSanta Clara -0.095435 0.181093 -0.527 0.598236
## ClubSassuolo 0.218045 0.175956 1.239 0.215369
## ClubSC Freiburg -0.163771 0.177434 -0.923 0.356083
## ClubSchalke 04 0.041590 0.175960 0.236 0.813171
## ClubSevilla 0.392792 0.182871 2.148 0.031800 *
## ClubSouthampton 0.593551 0.164715 3.604 0.000319 ***
## ClubSpezia 0.136795 0.177523 0.771 0.441018
## ClubSporting CP 0.154626 0.166652 0.928 0.353567
## ClubStrasbourg -0.008990 0.204890 -0.044 0.965006
## ClubSV Werder 0.162659 0.200954 0.809 0.418332
## ClubTorino 0.217907 0.172291 1.265 0.206055
## ClubTottenham 0.952963 0.166954 5.708 1.26e-08 ***
## ClubToulouse FC -0.010348 0.191900 -0.054 0.956999
## ClubTroyes 0.013356 0.184930 0.072 0.942429
## ClubUdinese 0.138888 0.184771 0.752 0.452305
## ClubUnion Berlin -0.038510 0.174461 -0.221 0.825313
## ClubValencia 0.063364 0.173259 0.366 0.714601
## ClubValladolid -0.222512 0.191843 -1.160 0.246195
## ClubVallecano -0.212993 0.186947 -1.139 0.254660
## ClubVerona -0.033641 0.177505 -0.190 0.849697
## ClubVfB Stuttgart 0.068998 0.179261 0.385 0.700336
## ClubVfL Bochum -0.177947 0.174521 -1.020 0.307987
## ClubVfL Wolfsburg 0.658873 0.182828 3.604 0.000319 ***
## ClubVigo 0.017901 0.173204 0.103 0.917689
## ClubVillarreal 0.037650 0.197743 0.190 0.849010
## ClubVIZ 0.018308 0.158163 0.116 0.907857
## ClubVTSC -0.106439 0.179185 -0.594 0.552547
## ClubWest Ham 0.926710 0.174574 5.308 1.19e-07 ***
## ClubWolves 0.511822 0.163216 3.136 0.001730 **
## Age -0.145848 0.032176 -4.533 6.05e-06 ***
## Caps 0.226401 0.014687 15.415 < 2e-16 ***
## Apps 0.463157 0.033692 13.747 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6392 on 3006 degrees of freedom
## Multiple R-squared: 0.6069, Adjusted R-squared: 0.5915
## F-statistic: 39.33 on 118 and 3006 DF, p-value: < 2.2e-16
pred_num <- predict(object =model, x = test_x)
## 25/25 - 0s - 48ms/epoch - 2ms/step
pred_cat <- predict(object = model_cat, x=position_ohe_test)
## 25/25 - 0s - 34ms/epoch - 1ms/step
pred_club <- predict(object = model_with_embeddings, x=as.matrix(embeddings_test))
## 25/25 - 0s - 34ms/epoch - 1ms/step
Recall that we have scaled the data to achieve convergence faster. To calculate MAE or MSE, we need to revert the resulting prediction back to its original units
#Fetch the mean and standard deviation information
mean_wage <- mean(data_test$Wage)
std_wage <- sd(data_test$Wage)
# Define a function that can revert back scaled numbers
revert_from_scale <- function(x) {
wage_pred <- x * std_wage + mean_wage
wage_pred <- as.numeric(wage_pred)
return(wage_pred)
}
#Save the prediction in a dataframe
pred_num_revert <- do.call(rbind, lapply(pred_num, revert_from_scale))
pred_cat_revert <- do.call(rbind, lapply(pred_cat, revert_from_scale))
pred_club_revert <- do.call(rbind, lapply(pred_club, revert_from_scale))
#final check on dimensions
dim(pred_num_revert)
## [1] 782 1
dim(pred_cat_revert)
## [1] 782 1
dim(pred_club_revert)
## [1] 782 1
length(data_test$Wage)
## [1] 782
calculate_mae <- function(predicted, actual) {
if (length(predicted) != length(actual)) {
stop("Input vectors must have the same length")
}
abs_diff <- abs(predicted - actual)
mae <- mean(abs_diff)
return(mae)
}
#Apply the MAE function
mae_numerical <- calculate_mae(as.matrix(pred_num_revert), data_test$Wage)
mae_positions <- calculate_mae(as.matrix(pred_cat_revert), data_test$Wage)
mae_club <- calculate_mae(as.matrix(pred_club_revert), data_test$Wage)
#Value of MAEs
round(mae_numerical,0)
## [1] 923236
round(mae_positions,0)
## [1] 998195
round(mae_club,0)
## [1] 1493547
#predict
predicted_num_ols <- predict(ols_num, newdata = data_test_norm %>% select(-Wage))
predicted_cat_ols <- predict(ols_cat, newdata = data_test_norm %>% select(-Wage))
predicted_club_ols <- predict(ols_club, newdata = data_test_norm %>% select(-Wage))
# Revert scaling back
pred_num_ols<- do.call(rbind, lapply(predicted_num_ols, revert_from_scale))
pred_cat_ols<- do.call(rbind, lapply(predicted_cat_ols, revert_from_scale))
pred_club_ols<- do.call(rbind, lapply(predicted_club_ols, revert_from_scale))
pred_all_ols<- do.call(rbind, lapply(predicted_num_ols, revert_from_scale))
#calculate MAE
mae_num <- mean(abs(pred_num_ols - data_test$Wage))
mae_cat <- mean(abs(pred_cat_ols - data_test$Wage))
mae_club_ols <- mean(abs(pred_club_ols - data_test$Wage))
#Value of MAE
round(mae_num,0)
## [1] 1054230
round(mae_cat,0)
## [1] 1050488
round(mae_club_ols,0)
## [1] 889619