# Load libraries
library(dplyr)
library(tidyr)
library(ggplot2)
library(caret)
library(leaps)
library(tidyverse)
library(psych)
library(rpart)
library(kableExtra)
library(C50)
library(tm)
library(qdap)
library(keras)Given the hitters dataset attached with this assignment spec, the goal of this problem is to predict the Salary for baseball players based on other variables in the dataset. The dataset is a collection of Major League Baseball data from the 1986 and 1987 seasons. You can find more information about the dataset here: https://rdrr.io/cran/ISLR/man/Hitters.html
1. Download the dataset hitters.csv and explore the overall structure of the dataset using the str() function. Get a summary statistics of each variable. Answer the following questions:
# Read the data
df <- read.csv('/Users/subhalaxmirout/CSC 532 - ML/hitters-1.csv', header = T, sep = ",", na.strings = "?", strip.white=TRUE)
str(df)## 'data.frame': 322 obs. of 20 variables:
## $ AtBat : int 293 315 479 496 321 594 185 298 323 401 ...
## $ Hits : int 66 81 130 141 87 169 37 73 81 92 ...
## $ HmRun : int 1 7 18 20 10 4 1 0 6 17 ...
## $ Runs : int 30 24 66 65 39 74 23 24 26 49 ...
## $ RBI : int 29 38 72 78 42 51 8 24 32 66 ...
## $ Walks : int 14 39 76 37 30 35 21 7 8 65 ...
## $ Years : int 1 14 3 11 2 11 2 3 2 13 ...
## $ CAtBat : int 293 3449 1624 5628 396 4408 214 509 341 5206 ...
## $ CHits : int 66 835 457 1575 101 1133 42 108 86 1332 ...
## $ CHmRun : int 1 69 63 225 12 19 1 0 6 253 ...
## $ CRuns : int 30 321 224 828 48 501 30 41 32 784 ...
## $ CRBI : int 29 414 266 838 46 336 9 37 34 890 ...
## $ CWalks : int 14 375 263 354 33 194 24 12 8 866 ...
## $ League : chr "A" "N" "A" "N" ...
## $ Division : chr "E" "W" "W" "E" ...
## $ PutOuts : int 446 632 880 200 805 282 76 121 143 0 ...
## $ Assists : int 33 43 82 11 40 421 127 283 290 0 ...
## $ Errors : int 20 10 14 3 4 25 7 9 19 0 ...
## $ Salary : chr "NA" "475" "480" "500" ...
## $ NewLeague: chr "A" "N" "A" "N" ...
## AtBat Hits HmRun Runs
## Min. : 16.0 Min. : 1 Min. : 0.00 Min. : 0.00
## 1st Qu.:255.2 1st Qu.: 64 1st Qu.: 4.00 1st Qu.: 30.25
## Median :379.5 Median : 96 Median : 8.00 Median : 48.00
## Mean :380.9 Mean :101 Mean :10.77 Mean : 50.91
## 3rd Qu.:512.0 3rd Qu.:137 3rd Qu.:16.00 3rd Qu.: 69.00
## Max. :687.0 Max. :238 Max. :40.00 Max. :130.00
## RBI Walks Years CAtBat
## Min. : 0.00 Min. : 0.00 Min. : 1.000 Min. : 19.0
## 1st Qu.: 28.00 1st Qu.: 22.00 1st Qu.: 4.000 1st Qu.: 816.8
## Median : 44.00 Median : 35.00 Median : 6.000 Median : 1928.0
## Mean : 48.03 Mean : 38.74 Mean : 7.444 Mean : 2648.7
## 3rd Qu.: 64.75 3rd Qu.: 53.00 3rd Qu.:11.000 3rd Qu.: 3924.2
## Max. :121.00 Max. :105.00 Max. :24.000 Max. :14053.0
## CHits CHmRun CRuns CRBI
## Min. : 4.0 Min. : 0.00 Min. : 1.0 Min. : 0.00
## 1st Qu.: 209.0 1st Qu.: 14.00 1st Qu.: 100.2 1st Qu.: 88.75
## Median : 508.0 Median : 37.50 Median : 247.0 Median : 220.50
## Mean : 717.6 Mean : 69.49 Mean : 358.8 Mean : 330.12
## 3rd Qu.:1059.2 3rd Qu.: 90.00 3rd Qu.: 526.2 3rd Qu.: 426.25
## Max. :4256.0 Max. :548.00 Max. :2165.0 Max. :1659.00
## CWalks League Division PutOuts
## Min. : 0.00 Length:322 Length:322 Min. : 0.0
## 1st Qu.: 67.25 Class :character Class :character 1st Qu.: 109.2
## Median : 170.50 Mode :character Mode :character Median : 212.0
## Mean : 260.24 Mean : 288.9
## 3rd Qu.: 339.25 3rd Qu.: 325.0
## Max. :1566.00 Max. :1378.0
## Assists Errors Salary NewLeague
## Min. : 0.0 Min. : 0.00 Length:322 Length:322
## 1st Qu.: 7.0 1st Qu.: 3.00 Class :character Class :character
## Median : 39.5 Median : 6.00 Mode :character Mode :character
## Mean :106.9 Mean : 8.04
## 3rd Qu.:166.0 3rd Qu.:11.00
## Max. :492.0 Max. :32.00
How many observations do you have in the data?
The dataset consist of 322 observations and 20 variables.
How many categorical and numeric variables you have in your data?
categorical_variable <- sapply(df, is.character)
numerical_variables <- sapply(df, is.numeric)
count_categorical <- sum(categorical_variable)
count_numerical <- sum(numerical_variables)
cat("categorical variables: ",count_categorical,"\n")## categorical variables: 4
## numerical variables: 16
Is there any missing value?
## AtBat Hits HmRun Runs RBI Walks Years CAtBat
## 0 0 0 0 0 0 0 0
## CHits CHmRun CRuns CRBI CWalks League Division PutOuts
## 0 0 0 0 0 0 0 0
## Assists Errors Salary NewLeague
## 0 0 0 0
No missing value in the dataset.
Draw the histogram of salary. Interpret what you see in the histogram.
#Replace NA as 0
df$Salary[is.na(df$Salary)] <- 0
df$Salary <- as.integer(df$Salary)
# Draw a histogram
ggplot(df, aes(Salary)) +
geom_histogram(fill = "blue", alpha = 0.5, bins = 30) +
labs(x = "Salary(in thousands$)", y = "Frequency") +
ggtitle("Histogram of Salary") +
theme(plot.title = element_text(hjust = 0.5))The salary looks completely right skewed. There is an outlier at 2500k.
2. Remove the observation for which Salary value is missing
3. (2 pt) Which predictors have most correlation with Salary? Use scattered plot, side-by-side box plots, t-test and correlation matrix to answer this question.
numeric_values <- df_new %>%
dplyr::select_if(is.numeric)
numeric_values <- numeric_values[complete.cases(numeric_values),] %>%
data.frame()
train_cor <- cor(numeric_values)
corrplot::corrplot.mixed(train_cor, tl.col = 'black', tl.pos = 'lt', number.cex = 0.5) Salary having more positive co-relation with “CAtBat”,“CHits”, “CHmRun”, and “CRuns”
Salary vs League
# Create the boxplot
boxPlot_league = ggplot(df_new, aes(x = League, y = Salary, color = League)) + geom_boxplot(outlier.colour = "blue",outlier.size = 2) + theme_classic()
boxPlot_league##
## Welch Two Sample t-test
##
## data: Salary by League
## t = 0.23195, df = 260.26, p-value = 0.8168
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -96.63432 122.44030
## sample estimates:
## mean in group A mean in group N
## 541.9353 529.0323
Here p-value is greater than 0.05, that is, we can conclude that there is no statistically significant difference between the means of the numeric variable for each level of the categorical variable. The Salary is not associated with League.
Salary vs Division
# Create the boxplot
boxPlot_div = ggplot(df_new, aes(x = Division, y = Salary, color = Division)) + geom_boxplot(outlier.colour = "blue",outlier.size = 2) + theme_classic()
boxPlot_div##
## Welch Two Sample t-test
##
## data: Salary by Division
## t = 3.1444, df = 218.46, p-value = 0.001895
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 64.69926 282.01554
## sample estimates:
## mean in group E mean in group W
## 624.1783 450.8209
Here p-value is less than 0.05, that is, we can conclude that there is a statistically significant difference between the means of the numeric variable for each level of the categorical variable. The Salary is associated with Division
Salary vs NewLeague
# Create the boxplot
boxPlot_nleague = ggplot(df_new, aes(x = NewLeague, y = Salary, color = NewLeague)) + geom_boxplot(outlier.colour = "blue",outlier.size = 2) + theme_classic()
boxPlot_nleague##
## Welch Two Sample t-test
##
## data: Salary by NewLeague
## t = 0.046341, df = 258.07, p-value = 0.9631
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -107.1549 112.3197
## sample estimates:
## mean in group A mean in group N
## 537.0496 534.4672
Here p-value is greater than 0.05, that is, we can conclude that there is no statistically significant difference between the means of the numeric variable for each level of the categorical variable. The Salary is not associated with NewLeague.
4. Use set.seed(1) to set the random seed so I can reproduce your results.
5. Use Caret’s “createDataPartition” method as follows to partition the dataset into hitters_train, and hitters_test (use 90% for training and 10% for testing)
df_new <- na.omit(df_new)
split_1 = createDataPartition(df_new$Salary, p=0.9, list=FALSE)
df_train = df_new[split_1,]
df_test = df_new[-split_1,]
dim(df_train)## [1] 239 20
## [1] 24 20
The first line creates a random 90%-10% split of data such that the distribution of the target variable hitters$salary is preserved in each split. The list = FALSE option avoids returning the data as a list. Instead, inTrain is a vector of indices used to get the training and test data.
6. (1pt) Neural networks do not accept categorical variables and we must encode the categorical variables before training the network. All the categorical variables in this dataset are binary ( i.e., have two levels) so you can encode them by simply using iflese function to convert each to a numeric variable with two values 0 and 1.
df_train$League=ifelse(df_train$League=="N",1,2)
df_train$Division=ifelse(df_train$Division=="W",1,2)
df_train$NewLeague=ifelse(df_train$NewLeague=="N",1,2)
df_test$League=ifelse(df_test$League=="N",1,2)
df_test$Division=ifelse(df_test$Division=="W",1,2)
df_test$NewLeague=ifelse(df_test$NewLeague=="N",1,2)7. (1pt)Replace the salary column with log(salary) where log is the logarithm function. This will be the attribute we want to predict.
1- Salary variable is right-skewed. A skewed target variable can make a machine learning model biased. For instance, in this case lower salaries are more frequent in the training data compared to the higher salaries. Therefore, a machine learning model trained on this data is less likely to successfully predict higher salaries. When we take the log of a right-skewed distribution, it makes the distribution more symmetrical.
2- The range of salary is very large causing the gradients of the loss function to also be large. Multiplying a chain of large gradients during backpropagation can result in numeric overflow and you might see a NAN value for loss function after a few epochs of training. This is called exploding gradient problem. By predicting the salary in the log scale we can avoid the exploding gradients problem for this dataset.
df_train$Salary <- log(df_train$Salary)
#df_test$Salary <- log(df_test$Salary)
hist(df_train$Salary)8. (1pt) Set.seed(1) and further divide the hitters_train data into 90% training and 10% validation using Caret’s “CreateDataPartition” function.
set.seed(1)
split_2 = createDataPartition(df_train$Salary, p=0.9, list=FALSE)
df_train = df_train[split_2,]
df_val = df_train[-split_2,]
dim(df_train)## [1] 216 20
## [1] 20 20
9.
## 'data.frame': 263 obs. of 20 variables:
## $ AtBat : int 315 479 496 321 594 185 298 323 401 574 ...
## $ Hits : int 81 130 141 87 169 37 73 81 92 159 ...
## $ HmRun : int 7 18 20 10 4 1 0 6 17 21 ...
## $ Runs : int 24 66 65 39 74 23 24 26 49 107 ...
## $ RBI : int 38 72 78 42 51 8 24 32 66 75 ...
## $ Walks : int 39 76 37 30 35 21 7 8 65 59 ...
## $ Years : int 14 3 11 2 11 2 3 2 13 10 ...
## $ CAtBat : int 3449 1624 5628 396 4408 214 509 341 5206 4631 ...
## $ CHits : int 835 457 1575 101 1133 42 108 86 1332 1300 ...
## $ CHmRun : int 69 63 225 12 19 1 0 6 253 90 ...
## $ CRuns : int 321 224 828 48 501 30 41 32 784 702 ...
## $ CRBI : int 414 266 838 46 336 9 37 34 890 504 ...
## $ CWalks : int 375 263 354 33 194 24 12 8 866 488 ...
## $ League : chr "N" "A" "N" "N" ...
## $ Division : chr "W" "W" "E" "E" ...
## $ PutOuts : int 632 880 200 805 282 76 121 143 0 238 ...
## $ Assists : int 43 82 11 40 421 127 283 290 0 445 ...
## $ Errors : int 10 14 3 4 25 7 9 19 0 22 ...
## $ Salary : int 475 480 500 91 750 70 100 75 1100 517 ...
## $ NewLeague: chr "N" "A" "N" "N" ...
## - attr(*, "na.action")= 'omit' Named int [1:59] 1 16 19 23 31 33 37 39 40 42 ...
## ..- attr(*, "names")= chr [1:59] "NA" "NA.1" "NA.2" "NA.3" ...
## AtBat Hits HmRun Runs RBI Walks Years CAtBat
## 0 0 0 0 0 0 0 0
## CHits CHmRun CRuns CRBI CWalks League Division PutOuts
## 0 0 0 0 0 0 0 0
## Assists Errors Salary NewLeague
## 0 0 0 0
slide - 105 - multi class classification