# Load libraries
library(dplyr)
library(tidyr)
library(ggplot2)
library(caret)
library(leaps)
library(tidyverse)
library(psych)
library(rpart)
library(kableExtra)
library(C50)
library(tm)
library(qdap)
library(keras)

Problem 2: Problem2—Predicting Baseball players’ salaries

Given the hitters dataset attached with this assignment spec, the goal of this problem is to predict the Salary for baseball players based on other variables in the dataset. The dataset is a collection of Major League Baseball data from the 1986 and 1987 seasons. You can find more information about the dataset here: https://rdrr.io/cran/ISLR/man/Hitters.html

1. Download the dataset hitters.csv and explore the overall structure of the dataset using the str() function. Get a summary statistics of each variable. Answer the following questions:

  • How many observations do you have in the data?
  • How many categorical and numeric variables you have in your data?
  • Is there any missing value?
  • Draw the histogram of salary. Interpret what you see in the histogram.
# Read the data
df <- read.csv('/Users/subhalaxmirout/CSC 532 - ML/hitters-1.csv', header = T, sep = ",", na.strings = "?", strip.white=TRUE)

str(df)
## 'data.frame':    322 obs. of  20 variables:
##  $ AtBat    : int  293 315 479 496 321 594 185 298 323 401 ...
##  $ Hits     : int  66 81 130 141 87 169 37 73 81 92 ...
##  $ HmRun    : int  1 7 18 20 10 4 1 0 6 17 ...
##  $ Runs     : int  30 24 66 65 39 74 23 24 26 49 ...
##  $ RBI      : int  29 38 72 78 42 51 8 24 32 66 ...
##  $ Walks    : int  14 39 76 37 30 35 21 7 8 65 ...
##  $ Years    : int  1 14 3 11 2 11 2 3 2 13 ...
##  $ CAtBat   : int  293 3449 1624 5628 396 4408 214 509 341 5206 ...
##  $ CHits    : int  66 835 457 1575 101 1133 42 108 86 1332 ...
##  $ CHmRun   : int  1 69 63 225 12 19 1 0 6 253 ...
##  $ CRuns    : int  30 321 224 828 48 501 30 41 32 784 ...
##  $ CRBI     : int  29 414 266 838 46 336 9 37 34 890 ...
##  $ CWalks   : int  14 375 263 354 33 194 24 12 8 866 ...
##  $ League   : chr  "A" "N" "A" "N" ...
##  $ Division : chr  "E" "W" "W" "E" ...
##  $ PutOuts  : int  446 632 880 200 805 282 76 121 143 0 ...
##  $ Assists  : int  33 43 82 11 40 421 127 283 290 0 ...
##  $ Errors   : int  20 10 14 3 4 25 7 9 19 0 ...
##  $ Salary   : chr  "NA" "475" "480" "500" ...
##  $ NewLeague: chr  "A" "N" "A" "N" ...
summary(df)
##      AtBat            Hits         HmRun            Runs       
##  Min.   : 16.0   Min.   :  1   Min.   : 0.00   Min.   :  0.00  
##  1st Qu.:255.2   1st Qu.: 64   1st Qu.: 4.00   1st Qu.: 30.25  
##  Median :379.5   Median : 96   Median : 8.00   Median : 48.00  
##  Mean   :380.9   Mean   :101   Mean   :10.77   Mean   : 50.91  
##  3rd Qu.:512.0   3rd Qu.:137   3rd Qu.:16.00   3rd Qu.: 69.00  
##  Max.   :687.0   Max.   :238   Max.   :40.00   Max.   :130.00  
##       RBI             Walks            Years            CAtBat       
##  Min.   :  0.00   Min.   :  0.00   Min.   : 1.000   Min.   :   19.0  
##  1st Qu.: 28.00   1st Qu.: 22.00   1st Qu.: 4.000   1st Qu.:  816.8  
##  Median : 44.00   Median : 35.00   Median : 6.000   Median : 1928.0  
##  Mean   : 48.03   Mean   : 38.74   Mean   : 7.444   Mean   : 2648.7  
##  3rd Qu.: 64.75   3rd Qu.: 53.00   3rd Qu.:11.000   3rd Qu.: 3924.2  
##  Max.   :121.00   Max.   :105.00   Max.   :24.000   Max.   :14053.0  
##      CHits            CHmRun           CRuns             CRBI        
##  Min.   :   4.0   Min.   :  0.00   Min.   :   1.0   Min.   :   0.00  
##  1st Qu.: 209.0   1st Qu.: 14.00   1st Qu.: 100.2   1st Qu.:  88.75  
##  Median : 508.0   Median : 37.50   Median : 247.0   Median : 220.50  
##  Mean   : 717.6   Mean   : 69.49   Mean   : 358.8   Mean   : 330.12  
##  3rd Qu.:1059.2   3rd Qu.: 90.00   3rd Qu.: 526.2   3rd Qu.: 426.25  
##  Max.   :4256.0   Max.   :548.00   Max.   :2165.0   Max.   :1659.00  
##      CWalks           League            Division            PutOuts      
##  Min.   :   0.00   Length:322         Length:322         Min.   :   0.0  
##  1st Qu.:  67.25   Class :character   Class :character   1st Qu.: 109.2  
##  Median : 170.50   Mode  :character   Mode  :character   Median : 212.0  
##  Mean   : 260.24                                         Mean   : 288.9  
##  3rd Qu.: 339.25                                         3rd Qu.: 325.0  
##  Max.   :1566.00                                         Max.   :1378.0  
##     Assists          Errors         Salary           NewLeague        
##  Min.   :  0.0   Min.   : 0.00   Length:322         Length:322        
##  1st Qu.:  7.0   1st Qu.: 3.00   Class :character   Class :character  
##  Median : 39.5   Median : 6.00   Mode  :character   Mode  :character  
##  Mean   :106.9   Mean   : 8.04                                        
##  3rd Qu.:166.0   3rd Qu.:11.00                                        
##  Max.   :492.0   Max.   :32.00

How many observations do you have in the data?

The dataset consist of 322 observations and 20 variables.

How many categorical and numeric variables you have in your data?

categorical_variable <- sapply(df, is.character)
numerical_variables <- sapply(df, is.numeric)
count_categorical <- sum(categorical_variable)
count_numerical <- sum(numerical_variables)
cat("categorical variables: ",count_categorical,"\n")
## categorical variables:  4
cat("numerical variables: ", count_numerical)
## numerical variables:  16

Is there any missing value?

colSums(is.na(df))
##     AtBat      Hits     HmRun      Runs       RBI     Walks     Years    CAtBat 
##         0         0         0         0         0         0         0         0 
##     CHits    CHmRun     CRuns      CRBI    CWalks    League  Division   PutOuts 
##         0         0         0         0         0         0         0         0 
##   Assists    Errors    Salary NewLeague 
##         0         0         0         0

No missing value in the dataset.

Draw the histogram of salary. Interpret what you see in the histogram.

#Replace NA as 0
df$Salary[is.na(df$Salary)] <- 0
df$Salary <- as.integer(df$Salary)
# Draw a histogram
ggplot(df, aes(Salary)) +
  geom_histogram(fill = "blue", alpha = 0.5, bins = 30) +
  labs(x = "Salary(in thousands$)", y = "Frequency") +
  ggtitle("Histogram of Salary") +
  theme(plot.title = element_text(hjust = 0.5))

The salary looks completely right skewed. There is an outlier at 2500k.

2. Remove the observation for which Salary value is missing

df_new <- df
df_new<-df_new[!(df_new$Salary == 0),]

3. (2 pt) Which predictors have most correlation with Salary? Use scattered plot, side-by-side box plots, t-test and correlation matrix to answer this question.

numeric_values <- df_new %>% 
  dplyr::select_if(is.numeric)
numeric_values <- numeric_values[complete.cases(numeric_values),] %>% 
  data.frame()
train_cor <- cor(numeric_values)
corrplot::corrplot.mixed(train_cor, tl.col = 'black', tl.pos = 'lt', number.cex = 0.5)

pairs.panels(df_new[c("Salary", "AtBat", "Hits", "HmRun","Runs" )])

pairs.panels(df_new[c("Salary", "RBI","Walks", "Years")])

pairs.panels(df_new[c("Salary", "CAtBat","CHits", "CHmRun", "CRuns")])

pairs.panels(df_new[c("Salary", "CRBI", "CWalks","PutOuts", "Assists")])

Salary having more positive co-relation with “CAtBat”,“CHits”, “CHmRun”, and “CRuns”

Salary vs League

# Create the boxplot
boxPlot_league = ggplot(df_new, aes(x = League, y = Salary, color = League)) + geom_boxplot(outlier.colour = "blue",outlier.size = 2) + theme_classic()
boxPlot_league

# t-test between Private and Apps
t.test(Salary~League,alternative="two.sided", data=df_new)
## 
##  Welch Two Sample t-test
## 
## data:  Salary by League
## t = 0.23195, df = 260.26, p-value = 0.8168
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -96.63432 122.44030
## sample estimates:
## mean in group A mean in group N 
##        541.9353        529.0323

Here p-value is greater than 0.05, that is, we can conclude that there is no statistically significant difference between the means of the numeric variable for each level of the categorical variable. The Salary is not associated with League.

Salary vs Division

# Create the boxplot
boxPlot_div = ggplot(df_new, aes(x = Division, y = Salary, color = Division)) + geom_boxplot(outlier.colour = "blue",outlier.size = 2) + theme_classic()
boxPlot_div

# t-test between Private and Apps
t.test(Salary~Division,alternative="two.sided", data=df_new)
## 
##  Welch Two Sample t-test
## 
## data:  Salary by Division
## t = 3.1444, df = 218.46, p-value = 0.001895
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##   64.69926 282.01554
## sample estimates:
## mean in group E mean in group W 
##        624.1783        450.8209

Here p-value is less than 0.05, that is, we can conclude that there is a statistically significant difference between the means of the numeric variable for each level of the categorical variable. The Salary is associated with Division

Salary vs NewLeague

# Create the boxplot
boxPlot_nleague = ggplot(df_new, aes(x = NewLeague, y = Salary, color = NewLeague)) + geom_boxplot(outlier.colour = "blue",outlier.size = 2) + theme_classic()
boxPlot_nleague

# t-test between Private and Apps
t.test(Salary~NewLeague,alternative="two.sided", data=df_new)
## 
##  Welch Two Sample t-test
## 
## data:  Salary by NewLeague
## t = 0.046341, df = 258.07, p-value = 0.9631
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -107.1549  112.3197
## sample estimates:
## mean in group A mean in group N 
##        537.0496        534.4672

Here p-value is greater than 0.05, that is, we can conclude that there is no statistically significant difference between the means of the numeric variable for each level of the categorical variable. The Salary is not associated with NewLeague.

4. Use set.seed(1) to set the random seed so I can reproduce your results.

set.seed(1)

5. Use Caret’s “createDataPartition” method as follows to partition the dataset into hitters_train, and hitters_test (use 90% for training and 10% for testing)

df_new <- na.omit(df_new)
split_1 = createDataPartition(df_new$Salary, p=0.9, list=FALSE)
df_train = df_new[split_1,]
df_test = df_new[-split_1,]

dim(df_train)
## [1] 239  20
dim(df_test)
## [1] 24 20

The first line creates a random 90%-10% split of data such that the distribution of the target variable hitters$salary is preserved in each split. The list = FALSE option avoids returning the data as a list. Instead, inTrain is a vector of indices used to get the training and test data.

6. (1pt) Neural networks do not accept categorical variables and we must encode the categorical variables before training the network. All the categorical variables in this dataset are binary ( i.e., have two levels) so you can encode them by simply using iflese function to convert each to a numeric variable with two values 0 and 1.

df_train$League=ifelse(df_train$League=="N",1,2)
df_train$Division=ifelse(df_train$Division=="W",1,2)
df_train$NewLeague=ifelse(df_train$NewLeague=="N",1,2)

df_test$League=ifelse(df_test$League=="N",1,2)
df_test$Division=ifelse(df_test$Division=="W",1,2)
df_test$NewLeague=ifelse(df_test$NewLeague=="N",1,2)

7. (1pt)Replace the salary column with log(salary) where log is the logarithm function. This will be the attribute we want to predict.

  • 1- Salary variable is right-skewed. A skewed target variable can make a machine learning model biased. For instance, in this case lower salaries are more frequent in the training data compared to the higher salaries. Therefore, a machine learning model trained on this data is less likely to successfully predict higher salaries. When we take the log of a right-skewed distribution, it makes the distribution more symmetrical.

  • 2- The range of salary is very large causing the gradients of the loss function to also be large. Multiplying a chain of large gradients during backpropagation can result in numeric overflow and you might see a NAN value for loss function after a few epochs of training. This is called exploding gradient problem. By predicting the salary in the log scale we can avoid the exploding gradients problem for this dataset.

df_train$Salary <- log(df_train$Salary)
#df_test$Salary <- log(df_test$Salary)

hist(df_train$Salary)

8. (1pt) Set.seed(1) and further divide the hitters_train data into 90% training and 10% validation using Caret’s “CreateDataPartition” function.

set.seed(1)
split_2 = createDataPartition(df_train$Salary, p=0.9, list=FALSE)
df_train = df_train[split_2,]
df_val = df_train[-split_2,]

dim(df_train)
## [1] 216  20
dim(df_val)
## [1] 20 20

9.

## 'data.frame':    263 obs. of  20 variables:
##  $ AtBat    : int  315 479 496 321 594 185 298 323 401 574 ...
##  $ Hits     : int  81 130 141 87 169 37 73 81 92 159 ...
##  $ HmRun    : int  7 18 20 10 4 1 0 6 17 21 ...
##  $ Runs     : int  24 66 65 39 74 23 24 26 49 107 ...
##  $ RBI      : int  38 72 78 42 51 8 24 32 66 75 ...
##  $ Walks    : int  39 76 37 30 35 21 7 8 65 59 ...
##  $ Years    : int  14 3 11 2 11 2 3 2 13 10 ...
##  $ CAtBat   : int  3449 1624 5628 396 4408 214 509 341 5206 4631 ...
##  $ CHits    : int  835 457 1575 101 1133 42 108 86 1332 1300 ...
##  $ CHmRun   : int  69 63 225 12 19 1 0 6 253 90 ...
##  $ CRuns    : int  321 224 828 48 501 30 41 32 784 702 ...
##  $ CRBI     : int  414 266 838 46 336 9 37 34 890 504 ...
##  $ CWalks   : int  375 263 354 33 194 24 12 8 866 488 ...
##  $ League   : chr  "N" "A" "N" "N" ...
##  $ Division : chr  "W" "W" "E" "E" ...
##  $ PutOuts  : int  632 880 200 805 282 76 121 143 0 238 ...
##  $ Assists  : int  43 82 11 40 421 127 283 290 0 445 ...
##  $ Errors   : int  10 14 3 4 25 7 9 19 0 22 ...
##  $ Salary   : int  475 480 500 91 750 70 100 75 1100 517 ...
##  $ NewLeague: chr  "N" "A" "N" "N" ...
##  - attr(*, "na.action")= 'omit' Named int [1:59] 1 16 19 23 31 33 37 39 40 42 ...
##   ..- attr(*, "names")= chr [1:59] "NA" "NA.1" "NA.2" "NA.3" ...
##     AtBat      Hits     HmRun      Runs       RBI     Walks     Years    CAtBat 
##         0         0         0         0         0         0         0         0 
##     CHits    CHmRun     CRuns      CRBI    CWalks    League  Division   PutOuts 
##         0         0         0         0         0         0         0         0 
##   Assists    Errors    Salary NewLeague 
##         0         0         0         0

slide - 105 - multi class classification