# Import all the relevant libraries
library(tm)
library(gmodels)
library(Matrix)
library(qdap)
library(keras)
library(tensorflow)
library(readr)
library(tfruns)
library(ggplot2)
library(tidyr)
library(dplyr)
library(corrplot)
library(caret)
library(neuralnet)
library(GGally)
library(NLP)
Download the dataset hitters.csv and explore the overall structure of the dataset using the str() function. Get summary statistics for each variable. Answer the following questions:
How many observations do you have in the data?
How many categorical and numeric variables do you have in your data?
Are there any missing values?
Draw the histogram of salary. Interpret what you see in the histogram.
# Read the Hitters data from the CSV file in the working directory
hitters <- read.csv(file = "hitters-1.csv")
# Show the compact structure: dimensions, column names and column types
str(object = hitters)
## 'data.frame': 322 obs. of 20 variables:
## $ AtBat : int 293 315 479 496 321 594 185 298 323 401 ...
## $ Hits : int 66 81 130 141 87 169 37 73 81 92 ...
## $ HmRun : int 1 7 18 20 10 4 1 0 6 17 ...
## $ Runs : int 30 24 66 65 39 74 23 24 26 49 ...
## $ RBI : int 29 38 72 78 42 51 8 24 32 66 ...
## $ Walks : int 14 39 76 37 30 35 21 7 8 65 ...
## $ Years : int 1 14 3 11 2 11 2 3 2 13 ...
## $ CAtBat : int 293 3449 1624 5628 396 4408 214 509 341 5206 ...
## $ CHits : int 66 835 457 1575 101 1133 42 108 86 1332 ...
## $ CHmRun : int 1 69 63 225 12 19 1 0 6 253 ...
## $ CRuns : int 30 321 224 828 48 501 30 41 32 784 ...
## $ CRBI : int 29 414 266 838 46 336 9 37 34 890 ...
## $ CWalks : int 14 375 263 354 33 194 24 12 8 866 ...
## $ League : chr "A" "N" "A" "N" ...
## $ Division : chr "E" "W" "W" "E" ...
## $ PutOuts : int 446 632 880 200 805 282 76 121 143 0 ...
## $ Assists : int 33 43 82 11 40 421 127 283 290 0 ...
## $ Errors : int 20 10 14 3 4 25 7 9 19 0 ...
## $ Salary : num NA 475 480 500 91.5 750 70 100 75 1100 ...
## $ NewLeague: chr "A" "N" "A" "N" ...
# Per-column summary: min/quartiles/mean/max for numeric columns,
# length/class/mode for character columns, and NA counts where present
summary(object = hitters)
## AtBat Hits HmRun Runs
## Min. : 16.0 Min. : 1 Min. : 0.00 Min. : 0.00
## 1st Qu.:255.2 1st Qu.: 64 1st Qu.: 4.00 1st Qu.: 30.25
## Median :379.5 Median : 96 Median : 8.00 Median : 48.00
## Mean :380.9 Mean :101 Mean :10.77 Mean : 50.91
## 3rd Qu.:512.0 3rd Qu.:137 3rd Qu.:16.00 3rd Qu.: 69.00
## Max. :687.0 Max. :238 Max. :40.00 Max. :130.00
##
## RBI Walks Years CAtBat
## Min. : 0.00 Min. : 0.00 Min. : 1.000 Min. : 19.0
## 1st Qu.: 28.00 1st Qu.: 22.00 1st Qu.: 4.000 1st Qu.: 816.8
## Median : 44.00 Median : 35.00 Median : 6.000 Median : 1928.0
## Mean : 48.03 Mean : 38.74 Mean : 7.444 Mean : 2648.7
## 3rd Qu.: 64.75 3rd Qu.: 53.00 3rd Qu.:11.000 3rd Qu.: 3924.2
## Max. :121.00 Max. :105.00 Max. :24.000 Max. :14053.0
##
## CHits CHmRun CRuns CRBI
## Min. : 4.0 Min. : 0.00 Min. : 1.0 Min. : 0.00
## 1st Qu.: 209.0 1st Qu.: 14.00 1st Qu.: 100.2 1st Qu.: 88.75
## Median : 508.0 Median : 37.50 Median : 247.0 Median : 220.50
## Mean : 717.6 Mean : 69.49 Mean : 358.8 Mean : 330.12
## 3rd Qu.:1059.2 3rd Qu.: 90.00 3rd Qu.: 526.2 3rd Qu.: 426.25
## Max. :4256.0 Max. :548.00 Max. :2165.0 Max. :1659.00
##
## CWalks League Division PutOuts
## Min. : 0.00 Length:322 Length:322 Min. : 0.0
## 1st Qu.: 67.25 Class :character Class :character 1st Qu.: 109.2
## Median : 170.50 Mode :character Mode :character Median : 212.0
## Mean : 260.24 Mean : 288.9
## 3rd Qu.: 339.25 3rd Qu.: 325.0
## Max. :1566.00 Max. :1378.0
##
## Assists Errors Salary NewLeague
## Min. : 0.0 Min. : 0.00 Min. : 67.5 Length:322
## 1st Qu.: 7.0 1st Qu.: 3.00 1st Qu.: 190.0 Class :character
## Median : 39.5 Median : 6.00 Median : 425.0 Mode :character
## Mean :106.9 Mean : 8.04 Mean : 535.9
## 3rd Qu.:166.0 3rd Qu.:11.00 3rd Qu.: 750.0
## Max. :492.0 Max. :32.00 Max. :2460.0
## NA's :59
1.1 Number of observations
# Count the observations (rows) in the data
dim(hitters)[1L]
## [1] 322
The dataset consists of 322 observations and 20 variables.
1.2 Types of Variables
Numeric Variables
# Names of the numeric (integer or double) columns of `hitters`.
# vapply() always returns a logical vector here, whereas the return type of
# sapply() depends on its input (e.g. a list for a zero-column data frame).
is_numeric_col <- vapply(hitters, is.numeric, logical(1))
cont_vars <- names(hitters)[is_numeric_col]
# Print the names as a one-column data frame
as.data.frame(cont_vars)
## cont_vars
## 1 AtBat
## 2 Hits
## 3 HmRun
## 4 Runs
## 5 RBI
## 6 Walks
## 7 Years
## 8 CAtBat
## 9 CHits
## 10 CHmRun
## 11 CRuns
## 12 CRBI
## 13 CWalks
## 14 PutOuts
## 15 Assists
## 16 Errors
## 17 Salary
Categorical Variable:
# Names of the character (categorical) columns of `hitters`.
# vapply() always returns a logical vector here, whereas the return type of
# sapply() depends on its input (e.g. a list for a zero-column data frame).
is_character_col <- vapply(hitters, is.character, logical(1))
cat_vars <- names(hitters)[is_character_col]
# Print the names as a one-column data frame
as.data.frame(cat_vars)
## cat_vars
## 1 League
## 2 Division
## 3 NewLeague
1.3 Find missing values, if there are any
# Count the missing values in each column and show the counts as a
# one-column data frame. The printed column header is the deparsed
# expression itself, so the call is deliberately left as a single expression.
as.data.frame(colSums(is.na(hitters)))
## colSums(is.na(hitters))
## AtBat 0
## Hits 0
## HmRun 0
## Runs 0
## RBI 0
## Walks 0
## Years 0
## CAtBat 0
## CHits 0
## CHmRun 0
## CRuns 0
## CRBI 0
## CWalks 0
## League 0
## Division 0
## PutOuts 0
## Assists 0
## Errors 0
## Salary 59
## NewLeague 0
Only the Salary column has missing values (59 of them).
1.4 Plot the distribution of Salary using histogram
# Histogram of Salary in 20 bins. Salary still contains missing values at
# this point (they are removed further down), so na.rm = TRUE drops them
# explicitly instead of relying on ggplot2 removing them with a warning.
ggplot(hitters, aes(x = Salary)) +
  geom_histogram(
    bins = 20, fill = "steelblue", color = "white", na.rm = TRUE
  ) +
  labs(title = "Histogram of Salary", x = "Salary", y = "Frequency")
The histogram of Salary shows that the distribution of salaries is right-skewed, with a long tail to the right. Most of the salaries are concentrated in the lower range, with very few players earning higher salaries. This suggests that there is a significant disparity in salaries among baseball players, with only a small number of elite players earning very high salaries, while the majority of players earn much lower salaries.
# Drop every row that has a missing value (NA) in any column
hitters <- na.omit(object = hitters)
# Re-count the NAs per column to confirm that none remain
colSums(x = is.na(hitters))
## AtBat Hits HmRun Runs RBI Walks Years CAtBat
## 0 0 0 0 0 0 0 0
## CHits CHmRun CRuns CRBI CWalks League Division PutOuts
## 0 0 0 0 0 0 0 0
## Assists Errors Salary NewLeague
## 0 0 0 0
i) Scatter plot
# Salary first, followed by the sixteen numeric predictors
cols <- c(
  "Salary",
  "AtBat", "Hits", "HmRun", "Runs", "RBI", "Walks", "Years",
  "CAtBat", "CHits", "CHmRun", "CRuns", "CRBI", "CWalks",
  "PutOuts", "Assists", "Errors"
)
# Pairwise plot matrix of the selected columns
ggpairs(data = hitters[, cols])
ii) side-by-side Box plots
# Subset the data to the categorical predictors plus the response (Salary)
predictors <- c("League", "Division", "NewLeague")
data <- hitters[c(predictors, "Salary")]
# Create side-by-side box plots of Salary, one panel per predictor.
# par() returns the previous settings, so the one-row layout is undone
# afterwards and later plots are not squeezed into the same grid.
old_par <- par(mfrow = c(1, length(predictors)))
# Iterating over the names directly also copes with an empty `predictors`,
# where 1:length(predictors) would have produced the indices 1 and 0.
for (predictor in predictors) {
  # reformulate() builds `Salary ~ <predictor>` from the column name,
  # which avoids eval(parse(text = ...)).
  boxplot(reformulate(predictor, response = "Salary"), data = data,
          main = predictor, xlab = predictor, ylab = "Salary")
}
par(old_par)
iii) t-test:
# Names of all numeric columns (base R replacement for the superseded
# dplyr::select_if()). Salary itself is still part of this vector.
continuous_variables <- names(hitters)[
  vapply(hitters, is.numeric, logical(1))
]
# Run a Welch two-sample t-test of each numeric predictor against Salary.
# Salary is skipped: testing a column against itself always gives t = 0 and
# p = 1, which carries no information.
# NOTE(review): t.test(x, y) compares the *means* of the two columns, which
# largely reflects their different scales; it does not measure association.
# cor.test(x, y) would test association -- confirm which one is intended.
for (var in setdiff(continuous_variables, "Salary")) {
  print(paste0("T-test for association between Salary and ", var, ":"))
  print(t.test(hitters[[var]], hitters$Salary))
  print("_____________________________________________________________________")
}
## [1] "T-test for association between Salary and AtBat:"
##
## Welch Two Sample t-test
##
## data: hitters[[var]] and hitters$Salary
## t = -4.5206, df = 317.24, p-value = 8.72e-06
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -189.85666 -74.70993
## sample estimates:
## mean of x mean of y
## 403.6426 535.9259
##
## [1] "_____________________________________________________________________"
## [1] "T-test for association between Salary and Hits:"
##
## Welch Two Sample t-test
##
## data: hitters[[var]] and hitters$Salary
## t = -15.313, df = 267.24, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -483.139 -373.055
## sample estimates:
## mean of x mean of y
## 107.8289 535.9259
##
## [1] "_____________________________________________________________________"
## [1] "T-test for association between Salary and HmRun:"
##
## Welch Two Sample t-test
##
## data: hitters[[var]] and hitters$Salary
## t = -18.845, df = 262.2, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -579.0899 -469.5223
## sample estimates:
## mean of x mean of y
## 11.61977 535.92588
##
## [1] "_____________________________________________________________________"
## [1] "T-test for association between Salary and Runs:"
##
## Welch Two Sample t-test
##
## data: hitters[[var]] and hitters$Salary
## t = -17.27, df = 263.68, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -536.0404 -426.3209
## sample estimates:
## mean of x mean of y
## 54.74525 535.92588
##
## [1] "_____________________________________________________________________"
## [1] "T-test for association between Salary and RBI:"
##
## Welch Two Sample t-test
##
## data: hitters[[var]] and hitters$Salary
## t = -17.387, df = 263.72, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -539.3013 -429.5771
## sample estimates:
## mean of x mean of y
## 51.48669 535.92588
##
## [1] "_____________________________________________________________________"
## [1] "T-test for association between Salary and Walks:"
##
## Welch Two Sample t-test
##
## data: hitters[[var]] and hitters$Salary
## t = -17.767, df = 263.21, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -549.6477 -439.9759
## sample estimates:
## mean of x mean of y
## 41.11407 535.92588
##
## [1] "_____________________________________________________________________"
## [1] "T-test for association between Salary and Years:"
##
## Welch Two Sample t-test
##
## data: hitters[[var]] and hitters$Salary
## t = -19.002, df = 262.06, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -583.3908 -473.8374
## sample estimates:
## mean of x mean of y
## 7.311787 535.925882
##
## [1] "_____________________________________________________________________"
## [1] "T-test for association between Salary and CAtBat:"
##
## Welch Two Sample t-test
##
## data: hitters[[var]] and hitters$Salary
## t = 14.763, df = 282.36, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 1838.730 2404.506
## sample estimates:
## mean of x mean of y
## 2657.5437 535.9259
##
## [1] "_____________________________________________________________________"
## [1] "T-test for association between Salary and CHits:"
##
## Welch Two Sample t-test
##
## data: hitters[[var]] and hitters$Salary
## t = 3.8249, df = 467.57, p-value = 0.0001486
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 90.56889 281.95197
## sample estimates:
## mean of x mean of y
## 722.1863 535.9259
##
## [1] "_____________________________________________________________________"
## [1] "T-test for association between Salary and CHmRun:"
##
## Welch Two Sample t-test
##
## data: hitters[[var]] and hitters$Salary
## t = -16.505, df = 279.38, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -522.3458 -411.0269
## sample estimates:
## mean of x mean of y
## 69.23954 535.92588
##
## [1] "_____________________________________________________________________"
## [1] "T-test for association between Salary and CRuns:"
##
## Welch Two Sample t-test
##
## data: hitters[[var]] and hitters$Salary
## t = -5.0626, df = 480.86, p-value = 5.901e-07
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -242.5126 -106.8981
## sample estimates:
## mean of x mean of y
## 361.2205 535.9259
##
## [1] "_____________________________________________________________________"
## [1] "T-test for association between Salary and CRBI:"
##
## Welch Two Sample t-test
##
## data: hitters[[var]] and hitters$Salary
## t = -6.0045, df = 475.01, p-value = 3.817e-09
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -272.7598 -138.2555
## sample estimates:
## mean of x mean of y
## 330.4183 535.9259
##
## [1] "_____________________________________________________________________"
## [1] "T-test for association between Salary and CWalks:"
##
## Welch Two Sample t-test
##
## data: hitters[[var]] and hitters$Salary
## t = -8.5523, df = 422.67, p-value = 2.237e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -339.0150 -212.3045
## sample estimates:
## mean of x mean of y
## 260.2662 535.9259
##
## [1] "_____________________________________________________________________"
## [1] "T-test for association between Salary and PutOuts:"
##
## Welch Two Sample t-test
##
## data: hitters[[var]] and hitters$Salary
## t = -7.4903, df = 437.72, p-value = 3.827e-13
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -309.5574 -180.8723
## sample estimates:
## mean of x mean of y
## 290.7110 535.9259
##
## [1] "_____________________________________________________________________"
## [1] "T-test for association between Salary and Assists:"
##
## Welch Two Sample t-test
##
## data: hitters[[var]] and hitters$Salary
## t = -14.277, df = 315.62, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -474.6566 -359.6742
## sample estimates:
## mean of x mean of y
## 118.7605 535.9259
##
## [1] "_____________________________________________________________________"
## [1] "T-test for association between Salary and Errors:"
##
## Welch Two Sample t-test
##
## data: hitters[[var]] and hitters$Salary
## t = -18.955, df = 262.11, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -582.1122 -472.5533
## sample estimates:
## mean of x mean of y
## 8.593156 535.925882
##
## [1] "_____________________________________________________________________"
## [1] "T-test for association between Salary and Salary:"
##
## Welch Two Sample t-test
##
## data: hitters[[var]] and hitters$Salary
## t = 0, df = 524, p-value = 1
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -77.28235 77.28235
## sample estimates:
## mean of x mean of y
## 535.9259 535.9259
##
## [1] "_____________________________________________________________________"
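Based on the t-test results, the variables singled out as showing a very strong association with salary are Hits, HmRun, Runs, RBI, Walks, Years, CAtBat, and CHmRun. The p-values for all these variables are less than 0.05 (as they are for every variable except Salary itself). Note, however, that these are Welch two-sample t-tests comparing the mean of each variable with the mean of Salary, so a small p-value only indicates that the two means differ; it does not by itself show that players with high and low values of a variable earn significantly different salaries. The correlation matrix below is a more direct measure of association with Salary.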
iv) Correlation Matrix
# Flag the numeric columns; vapply() pins the result to a logical vector,
# whereas the return type of sapply() depends on its input.
num_cols <- vapply(hitters, is.numeric, logical(1))
# drop = FALSE keeps a data frame even if only one column is numeric
hitters_num <- hitters[, num_cols, drop = FALSE]
# Correlation (Pearson, the cor() default) between every pair of numeric columns
cor_mat <- cor(hitters_num)
# Plot the upper triangle of the correlation matrix as a colour heatmap
corrplot(cor_mat, method = "color", type = "upper", tl.col = "black")
# Print the full matrix
cor_mat
## AtBat Hits HmRun Runs RBI Walks
## AtBat 1.0000000 0.96396913 0.555102154 0.89982910 0.79601539 0.6244481
## Hits 0.9639691 1.00000000 0.530627358 0.91063014 0.78847819 0.5873105
## HmRun 0.5551022 0.53062736 1.000000000 0.63107588 0.84910743 0.4404537
## Runs 0.8998291 0.91063014 0.631075883 1.00000000 0.77869235 0.6970151
## RBI 0.7960154 0.78847819 0.849107434 0.77869235 1.00000000 0.5695048
## Walks 0.6244481 0.58731051 0.440453717 0.69701510 0.56950476 1.0000000
## Years 0.0127255 0.01859809 0.113488420 -0.01197495 0.12966795 0.1347927
## CAtBat 0.2071663 0.20667761 0.217463613 0.17181080 0.27812591 0.2694500
## CHits 0.2253415 0.23560577 0.217495691 0.19132697 0.29213714 0.2707951
## CHmRun 0.2124215 0.18936425 0.492525845 0.22970104 0.44218969 0.3495822
## CRuns 0.2372778 0.23889610 0.258346846 0.23783121 0.30722616 0.3329766
## CRBI 0.2213932 0.21938423 0.349858379 0.20233548 0.38777657 0.3126968
## CWalks 0.1329257 0.12297073 0.227183183 0.16370021 0.23361884 0.4291399
## PutOuts 0.3096075 0.29968754 0.250931497 0.27115986 0.31206456 0.2808555
## Assists 0.3421174 0.30397495 -0.161601753 0.17925786 0.06290174 0.1025226
## Errors 0.3255770 0.27987618 -0.009743082 0.19260879 0.15015469 0.0819372
## Salary 0.3947709 0.43867474 0.343028078 0.41985856 0.44945709 0.4438673
## Years CAtBat CHits CHmRun CRuns
## AtBat 0.01272550 0.207166254 0.22534146 0.21242155 0.23727777
## Hits 0.01859809 0.206677608 0.23560577 0.18936425 0.23889610
## HmRun 0.11348842 0.217463613 0.21749569 0.49252584 0.25834685
## Runs -0.01197495 0.171810798 0.19132697 0.22970104 0.23783121
## RBI 0.12966795 0.278125914 0.29213714 0.44218969 0.30722616
## Walks 0.13479270 0.269449974 0.27079505 0.34958216 0.33297657
## Years 1.00000000 0.915680692 0.89784449 0.72237071 0.87664855
## CAtBat 0.91568069 1.000000000 0.99505681 0.80167609 0.98274694
## CHits 0.89784449 0.995056810 1.00000000 0.78665204 0.98454184
## CHmRun 0.72237071 0.801676089 0.78665204 1.00000000 0.82562483
## CRuns 0.87664855 0.982746941 0.98454184 0.82562483 1.00000000
## CRBI 0.86380936 0.950730141 0.94679739 0.92790264 0.94567701
## CWalks 0.83752373 0.906711655 0.89071842 0.81087827 0.92776846
## PutOuts -0.02001921 0.053392514 0.06734799 0.09382223 0.05908718
## Assists -0.08511772 -0.007897271 -0.01314420 -0.18888646 -0.03889509
## Errors -0.15651196 -0.070477521 -0.06803583 -0.16536941 -0.09408054
## Salary 0.40065699 0.526135310 0.54890956 0.52493056 0.56267771
## CRBI CWalks PutOuts Assists Errors
## AtBat 0.22139318 0.13292568 0.30960746 0.342117377 0.325576978
## Hits 0.21938423 0.12297073 0.29968754 0.303974950 0.279876183
## HmRun 0.34985838 0.22718318 0.25093150 -0.161601753 -0.009743082
## Runs 0.20233548 0.16370021 0.27115986 0.179257859 0.192608787
## RBI 0.38777657 0.23361884 0.31206456 0.062901737 0.150154692
## Walks 0.31269680 0.42913990 0.28085548 0.102522559 0.081937197
## Years 0.86380936 0.83752373 -0.02001921 -0.085117725 -0.156511957
## CAtBat 0.95073014 0.90671165 0.05339251 -0.007897271 -0.070477521
## CHits 0.94679739 0.89071842 0.06734799 -0.013144204 -0.068035829
## CHmRun 0.92790264 0.81087827 0.09382223 -0.188886464 -0.165369407
## CRuns 0.94567701 0.92776846 0.05908718 -0.038895093 -0.094080542
## CRBI 1.00000000 0.88913701 0.09537515 -0.096558877 -0.115316131
## CWalks 0.88913701 1.00000000 0.05816016 -0.066243445 -0.129935875
## PutOuts 0.09537515 0.05816016 1.00000000 -0.043390143 0.075305857
## Assists -0.09655888 -0.06624345 -0.04339014 1.000000000 0.703504693
## Errors -0.11531613 -0.12993587 0.07530586 0.703504693 1.000000000
## Salary 0.56696569 0.48982204 0.30048036 0.025436136 -0.005400702
## Salary
## AtBat 0.394770945
## Hits 0.438674738
## HmRun 0.343028078
## Runs 0.419858559
## RBI 0.449457088
## Walks 0.443867260
## Years 0.400656994
## CAtBat 0.526135310
## CHits 0.548909559
## CHmRun 0.524930560
## CRuns 0.562677711
## CRBI 0.566965686
## CWalks 0.489822036
## PutOuts 0.300480356
## Assists 0.025436136
## Errors -0.005400702
## Salary 1.000000000
Variables that show strong association with salary, according to the correlation matrix, are:
Salary is most positively correlated with “CAtBat”, “CHits”, “CHmRun”, “CRuns”, and “CRBI” (all between roughly 0.52 and 0.57), while Errors has a very slight negative correlation with Salary (about -0.005).
Based on these statistical tests, we are going to carry forward all the variables except AtBat, PutOuts, Assists, and Errors.
# Columns carried forward into hitters_final: twelve numeric predictors,
# the three categorical predictors and the response (Salary)
final_cols <- c(
  "Hits", "Runs", "RBI", "Walks", "Years", "CHits", "CRuns", "CRBI",
  "CWalks", "HmRun", "CAtBat", "CHmRun", "League", "Division", "NewLeague",
  "Salary"
)
hitters_final <- hitters[, final_cols]
# Print the resulting data frame
hitters_final
## Hits Runs RBI Walks Years CHits CRuns CRBI CWalks HmRun CAtBat CHmRun
## 2 81 24 38 39 14 835 321 414 375 7 3449 69
## 3 130 66 72 76 3 457 224 266 263 18 1624 63
## 4 141 65 78 37 11 1575 828 838 354 20 5628 225
## 5 87 39 42 30 2 101 48 46 33 10 396 12
## 6 169 74 51 35 11 1133 501 336 194 4 4408 19
## 7 37 23 8 21 2 42 30 9 24 1 214 1
## 8 73 24 24 7 3 108 41 37 12 0 509 0
## 9 81 26 32 8 2 86 32 34 8 6 341 6
## 10 92 49 66 65 13 1332 784 890 866 17 5206 253
## 11 159 107 75 59 10 1300 702 504 488 21 4631 90
## 12 53 31 26 27 9 467 192 186 161 4 1876 15
## 13 113 48 61 47 4 392 205 204 203 13 1512 41
## 14 60 30 11 22 6 510 309 103 207 0 1941 4
## 15 43 29 27 30 13 825 376 290 238 7 3231 36
## 17 158 89 75 73 15 2273 1045 993 732 20 8068 177
## 18 46 24 8 15 5 102 65 23 39 2 479 5
## 20 32 16 22 14 8 180 67 82 56 8 727 24
## 21 92 72 48 65 1 92 72 48 65 16 413 16
## 22 109 55 43 62 1 109 55 43 62 3 426 3
## 24 116 60 62 74 6 489 242 251 240 16 1924 67
## 25 168 73 102 40 18 2464 1008 1072 402 18 8424 164
## 26 163 92 51 70 6 747 442 198 317 4 2695 17
## 27 73 32 18 22 7 491 291 108 180 4 1931 13
## 28 129 50 56 40 10 604 246 327 166 10 2331 61
## 29 152 92 37 81 5 633 349 182 308 6 2308 32
## 30 137 90 95 90 14 1382 763 734 784 20 5201 166
## 32 108 55 36 22 3 149 80 46 31 6 591 8
## 34 168 83 80 56 5 452 219 208 136 17 1646 44
## 35 49 23 25 12 7 308 126 132 66 6 1309 27
## 36 106 38 60 30 14 1906 859 803 571 10 6207 146
## 38 60 24 25 15 2 78 34 29 18 5 350 5
## 41 41 15 21 33 2 50 20 29 45 1 232 4
## 44 46 32 19 9 4 160 86 76 32 7 694 32
## 46 68 28 26 22 6 236 108 117 118 2 999 21
## 47 132 57 49 33 3 273 113 121 80 7 932 24
## 48 57 34 32 9 5 192 117 107 51 8 756 32
## 50 146 71 70 84 6 715 352 342 289 13 2648 77
## 51 101 42 63 22 17 1767 1003 977 619 14 6521 281
## 52 53 30 29 23 2 59 32 32 27 2 226 2
## 54 168 80 72 39 9 1307 634 563 319 19 4478 113
## 55 101 45 53 39 12 1429 747 666 526 12 5150 166
## 56 102 49 85 20 6 231 99 138 64 18 950 29
## 57 58 28 25 35 4 333 164 179 194 9 1335 49
## 59 78 32 41 12 12 968 409 321 170 6 3742 35
## 60 177 98 81 70 6 927 529 472 313 25 3210 133
## 61 113 58 69 16 1 113 58 69 16 24 416 24
## 62 44 21 23 15 16 1634 698 661 777 6 6631 98
## 63 56 27 15 11 4 270 116 64 57 0 1115 1
## 64 53 31 15 22 4 210 118 69 114 1 926 9
## 66 139 93 94 62 17 1982 1141 1179 727 31 7546 315
## 68 53 29 22 21 3 120 57 40 39 5 514 8
## 69 142 67 86 45 4 205 99 103 78 20 815 22
## 71 81 42 30 26 17 2198 950 909 690 3 8247 100
## 73 131 69 96 52 14 1397 712 815 548 26 5347 221
## 74 122 78 85 91 18 1947 1175 1152 1380 29 7761 347
## 75 137 86 97 97 15 1785 1082 949 989 26 6661 291
## 76 119 57 46 13 9 1046 461 301 112 6 3651 32
## 77 97 55 29 39 4 353 196 110 117 4 1258 16
## 79 103 59 47 39 6 555 285 274 186 15 2174 80
## 80 96 37 29 23 4 290 123 108 55 4 1064 11
## 82 70 49 35 43 15 1661 1019 608 820 7 6311 154
## 83 238 117 113 53 5 737 349 401 171 31 2223 93
## 85 163 89 83 75 11 1388 813 822 617 29 5017 266
## 86 83 50 39 56 9 948 575 528 635 9 3828 145
## 87 174 89 116 56 14 2024 978 1093 495 31 6727 247
## 88 82 44 45 47 2 113 61 70 63 16 428 25
## 89 41 21 29 22 16 1338 746 805 875 12 5409 181
## 90 114 67 57 48 4 298 160 123 122 13 1350 28
## 91 83 39 46 16 5 405 156 159 76 13 1457 28
## 92 123 76 93 72 4 471 292 343 267 27 1810 108
## 93 78 35 35 32 1 78 35 35 32 7 317 7
## 94 138 76 96 61 3 164 87 110 71 25 592 28
## 96 119 54 58 36 12 594 287 294 227 13 2133 41
## 97 148 90 104 77 14 2083 1135 1234 791 24 7287 305
## 99 115 97 71 68 3 184 156 119 99 27 711 45
## 100 110 70 47 36 7 544 335 174 258 15 2130 38
## 101 151 61 84 78 10 1679 884 1015 709 17 5624 275
## 103 49 41 23 18 8 336 166 122 106 2 1350 7
## 108 154 76 84 43 14 1583 743 693 300 22 6100 131
## 109 198 101 108 41 5 610 297 319 117 31 2129 92
## 110 51 19 18 11 1 51 19 18 11 4 215 4
## 111 128 70 73 80 14 2095 1072 1050 695 16 6675 209
## 112 76 33 52 37 5 351 195 219 214 16 1506 71
## 113 125 81 105 62 13 1646 847 999 680 24 6063 271
## 114 152 91 101 64 3 260 148 173 95 31 985 53
## 116 171 91 108 52 6 728 361 401 224 34 2862 107
## 117 118 63 54 30 4 187 102 80 50 12 793 14
## 118 77 45 47 26 16 1910 915 1067 546 14 6840 259
## 119 94 42 36 66 9 866 429 365 410 4 3573 59
## 120 85 30 44 20 8 568 216 208 93 3 2140 16
## 121 96 49 46 60 15 1972 1070 955 921 21 6986 231
## 122 77 36 55 41 20 2172 1172 1267 1057 16 8716 384
## 123 139 93 58 69 5 369 247 126 198 5 1469 12
## 124 84 62 33 47 5 376 284 141 219 11 1516 42
## 125 126 42 44 35 11 1578 703 519 256 2 5562 44
## 127 78 37 51 29 5 453 211 280 138 13 1649 73
## 128 120 54 51 31 8 900 444 419 240 5 3118 92
## 129 158 70 84 42 5 636 265 316 134 15 2358 58
## 130 169 72 88 38 7 1077 492 589 263 21 3754 140
## 131 104 50 58 25 7 822 313 377 187 14 2954 55
## 132 54 30 39 31 5 299 145 154 128 10 1185 40
## 133 70 22 37 18 18 2081 935 1088 643 7 7186 190
## 134 99 46 24 29 4 129 72 31 48 1 618 1
## 135 39 18 30 15 9 151 80 97 61 5 639 16
## 136 40 23 11 18 3 125 58 37 47 4 524 7
## 137 170 107 108 69 6 634 371 376 238 40 2325 128
## 138 103 48 36 40 15 1193 581 421 325 6 4338 70
## 140 103 65 32 71 2 103 67 32 71 2 440 2
## 141 144 85 117 65 2 173 101 130 69 33 696 38
## 142 200 108 121 32 4 404 210 222 68 29 1447 57
## 143 55 34 23 45 12 1213 702 705 625 9 4405 194
## 144 133 48 72 55 17 2147 980 1032 854 10 7472 153
## 146 132 61 74 41 6 671 273 383 226 21 2641 97
## 147 39 18 31 22 14 543 304 268 298 8 2128 56
## 148 183 80 74 32 5 715 330 326 158 10 2482 27
## 149 136 58 38 26 11 1066 450 367 241 7 3871 40
## 150 70 32 51 28 15 1130 544 462 551 13 4040 97
## 152 41 26 21 19 2 68 45 39 35 4 288 9
## 153 86 33 38 45 1 86 33 38 45 4 278 4
## 154 95 48 42 20 10 808 379 428 221 6 2964 81
## 155 147 58 88 47 10 730 302 351 174 23 2744 97
## 156 102 56 34 34 5 167 89 48 54 3 670 4
## 157 94 37 32 26 13 1330 616 522 436 5 4618 57
## 160 163 83 107 32 3 377 181 227 82 27 1437 65
## 162 174 67 78 58 6 880 366 337 218 7 3053 32
## 163 39 13 9 16 3 44 18 10 18 2 196 2
## 164 200 98 110 62 13 2163 1104 1289 564 20 7127 351
## 165 66 31 26 32 14 979 518 324 382 5 3910 33
## 166 76 35 60 25 3 151 68 94 55 13 630 24
## 167 157 90 78 26 4 541 310 226 91 16 2020 52
## 168 92 54 49 18 6 325 188 135 63 11 1354 30
## 169 73 23 37 16 4 108 38 46 28 5 450 6
## 171 91 41 42 57 13 1397 578 579 644 12 5590 83
## 173 101 46 43 61 3 218 96 72 91 3 948 6
## 175 47 20 28 18 11 890 419 382 304 5 3327 74
## 176 184 83 79 38 5 462 219 195 82 20 1689 40
## 177 58 34 23 22 1 58 34 23 22 6 181 6
## 178 118 84 86 68 8 750 433 420 309 28 2723 126
## 179 150 69 58 35 14 1839 983 707 600 21 6126 121
## 180 171 94 83 94 13 1840 969 900 917 13 6090 128
## 181 147 85 91 71 6 815 405 474 319 29 2816 117
## 182 74 34 29 22 10 1062 505 456 283 4 3919 85
## 183 161 89 96 66 4 470 233 260 155 26 1789 65
## 184 91 51 43 33 2 94 51 44 33 12 342 12
## 185 159 72 79 53 9 880 363 477 295 12 3082 83
## 186 136 62 48 83 10 970 408 303 414 5 3423 20
## 187 85 69 64 88 7 214 150 156 187 24 911 64
## 188 223 119 96 34 3 587 262 201 91 31 1928 35
## 189 64 31 26 30 1 64 31 26 30 0 279 0
## 190 127 66 65 67 7 844 436 458 377 20 3006 116
## 191 127 77 45 58 2 187 117 64 88 8 667 9
## 192 70 33 37 27 12 1222 557 483 307 8 4479 94
## 193 141 77 47 37 15 1240 615 430 340 11 4291 84
## 194 52 26 28 21 6 191 113 119 87 9 805 30
## 195 149 89 86 64 7 928 513 471 351 21 3558 102
## 196 84 53 62 38 10 1123 577 700 334 22 4273 212
## 197 128 67 94 52 13 1552 740 840 452 28 5829 210
## 199 92 42 60 21 3 185 88 112 50 18 682 36
## 201 157 95 73 63 10 1320 724 522 576 20 4704 93
## 202 54 27 25 33 1 54 27 25 33 2 216 2
## 203 179 94 60 65 5 476 216 163 166 4 1696 12
## 205 131 77 55 34 7 549 300 263 153 19 2051 62
## 206 56 22 36 19 2 58 24 37 19 12 216 12
## 207 93 47 30 30 2 230 121 69 68 8 969 14
## 208 148 64 78 49 13 1000 445 491 301 14 3400 113
## 210 131 68 77 33 6 398 210 203 136 18 1416 47
## 212 65 30 36 27 9 698 315 325 189 8 2815 55
## 213 54 25 14 12 1 54 25 14 12 3 209 3
## 214 71 18 30 36 3 76 20 36 45 3 344 3
## 215 77 47 53 27 6 516 247 288 161 19 1928 90
## 216 120 71 71 54 3 259 150 167 114 28 1085 54
## 217 60 28 33 18 3 170 80 75 36 8 638 17
## 218 1 0 0 0 2 9 6 7 4 0 41 2
## 219 94 36 26 62 7 519 181 199 288 5 1968 26
## 220 43 26 35 39 3 116 59 55 78 10 498 14
## 221 75 38 23 26 3 160 71 33 44 6 580 8
## 222 167 89 49 57 4 232 132 83 79 8 822 19
## 223 110 61 45 32 7 834 451 249 168 9 3015 40
## 224 76 34 37 15 4 408 198 120 113 7 1644 16
## 225 93 43 42 49 5 323 181 177 157 9 1258 54
## 227 137 58 47 12 2 271 129 80 24 2 1038 3
## 228 152 105 49 65 2 249 168 91 101 18 978 36
## 230 144 67 54 79 9 1169 583 374 528 0 4739 13
## 231 80 45 48 63 7 359 176 202 175 15 1493 61
## 232 163 88 50 77 4 470 245 167 174 12 1556 38
## 233 83 43 41 30 14 1543 751 714 535 9 5885 104
## 234 135 82 88 55 1 135 82 88 55 30 540 30
## 235 123 62 55 40 9 1203 676 390 364 9 4139 79
## 237 52 15 25 30 24 4256 2165 1314 1566 0 14053 160
## 238 56 41 19 21 5 329 166 125 105 6 1257 24
## 239 154 61 48 29 6 566 250 252 178 6 1966 29
## 240 72 33 31 26 5 82 41 32 26 0 354 0
## 241 77 35 29 33 12 1358 630 435 403 5 4933 48
## 242 96 50 45 39 5 344 178 192 136 7 1394 43
## 243 56 22 18 15 12 665 266 304 198 4 2796 43
## 244 70 42 36 44 16 1845 965 1128 990 13 7058 312
## 245 108 75 86 72 3 142 102 109 102 33 652 44
## 246 68 42 29 45 18 939 438 380 466 13 3949 78
## 248 110 45 49 46 9 658 249 322 274 9 2331 50
## 249 160 130 74 89 8 1182 862 417 708 28 4071 103
## 250 101 65 58 92 20 2510 1509 1659 1342 18 9528 548
## 252 82 42 60 35 5 408 238 299 157 21 1770 115
## 253 145 51 76 40 11 1102 410 497 284 11 3967 67
## 256 76 35 39 13 6 234 102 96 80 5 912 12
## 257 52 31 27 17 12 1323 643 445 459 8 5134 56
## 258 90 50 45 43 10 614 295 273 269 11 2288 43
## 259 135 52 44 52 9 895 377 284 296 8 3368 39
## 260 68 32 22 24 1 68 32 22 24 2 312 2
## 261 119 57 33 21 7 882 365 280 165 8 3358 36
## 262 27 8 10 5 4 49 16 13 14 3 239 3
## 263 68 42 42 61 6 238 128 104 172 5 961 16
## 264 178 68 76 46 6 902 494 345 242 14 3146 74
## 265 86 38 28 36 4 267 94 71 76 1 1089 3
## 266 57 32 25 18 3 170 98 54 62 8 653 17
## 267 101 50 55 22 1 101 50 55 22 16 382 16
## 268 113 59 57 68 12 1369 713 660 735 20 5348 155
## 269 149 73 47 42 1 149 73 47 42 7 549 7
## 270 63 25 33 16 10 667 315 259 204 3 2682 38
## 272 163 82 46 62 13 2019 1043 827 535 9 7037 153
## 273 117 54 88 43 6 412 204 276 155 29 1750 100
## 274 66 20 28 13 3 80 27 31 15 5 290 5
## 275 140 73 77 60 4 185 93 106 86 16 730 22
## 276 112 54 54 35 2 160 76 75 49 18 680 24
## 277 145 66 68 21 2 210 106 86 40 17 831 21
## 278 159 82 50 47 6 426 218 149 163 3 1619 11
## 279 142 58 81 23 18 2583 1138 1299 478 21 8759 271
## 280 96 44 36 65 4 148 68 56 99 0 711 1
## 281 103 53 33 52 2 123 63 39 58 8 507 8
## 282 122 67 45 51 4 403 211 146 155 1 1716 12
## 283 210 91 56 59 6 872 420 230 274 6 3070 19
## 285 169 88 73 53 8 841 450 342 373 17 3181 61
## 286 76 42 25 20 8 657 324 300 179 3 2658 48
## 287 152 69 75 53 6 686 369 384 321 23 2765 133
## 288 213 91 65 27 4 448 196 137 89 10 1518 15
## 289 103 48 28 54 8 493 207 162 198 3 1897 9
## 290 70 26 23 30 4 220 83 82 86 1 888 9
## 291 211 107 59 52 5 770 352 230 193 14 2364 27
## 292 68 26 30 29 7 339 135 163 128 8 1337 32
## 294 141 48 61 73 8 874 421 349 359 2 3162 16
## 295 120 53 44 21 4 227 106 80 52 17 927 22
## 296 4 2 3 1 1 4 2 3 1 1 19 1
## 297 43 24 17 20 7 219 105 99 71 2 854 12
## 298 47 21 29 24 6 256 129 139 106 10 1136 42
## 300 61 17 22 3 17 1145 488 491 244 4 4061 83
## 301 147 56 52 53 7 821 307 340 174 10 2872 63
## 302 138 56 59 34 3 357 149 161 87 1 1399 7
## 304 113 76 52 76 5 397 226 149 191 5 1546 17
## 305 42 17 14 15 10 1150 579 363 406 3 4086 57
## 307 32 14 25 12 19 2402 1048 1348 819 4 8396 242
## 308 69 35 31 32 4 355 180 148 158 4 1359 31
## 309 112 50 71 44 7 771 338 406 239 18 3031 110
## 310 139 94 29 60 2 309 201 69 110 0 1236 1
## 311 186 107 98 74 6 753 399 366 286 19 2728 69
## 312 81 37 44 37 7 566 279 257 246 5 2268 41
## 313 124 67 27 36 7 506 272 125 194 1 1775 6
## 314 207 107 71 105 5 978 474 322 417 8 2778 32
## 315 117 66 41 34 1 117 66 41 34 11 408 11
## 316 172 82 100 57 1 172 82 100 57 22 593 22
## 318 127 65 48 37 5 806 379 311 138 7 2703 32
## 319 136 76 50 94 12 1511 897 451 875 5 5511 39
## 320 126 61 43 52 6 433 217 93 146 3 1700 7
## 321 144 85 60 78 8 857 470 420 332 9 3198 97
## 322 170 77 44 31 11 1457 775 357 249 9 4908 30
## League Division NewLeague Salary
## 2 N W N 475.000
## 3 A W A 480.000
## 4 N E N 500.000
## 5 N E N 91.500
## 6 A W A 750.000
## 7 N E A 70.000
## 8 A W A 100.000
## 9 N W N 75.000
## 10 A E A 1100.000
## 11 A E A 517.143
## 12 N W N 512.500
## 13 N E N 550.000
## 14 A E A 700.000
## 15 N E N 240.000
## 17 N W N 775.000
## 18 A W A 175.000
## 20 N W N 135.000
## 21 N E N 100.000
## 22 A W N 115.000
## 24 N W N 600.000
## 25 A E A 776.667
## 26 A E A 765.000
## 27 N E N 708.333
## 28 N W N 750.000
## 29 N W N 625.000
## 30 A W A 900.000
## 32 N W N 110.000
## 34 A E A 612.500
## 35 A W A 300.000
## 36 N W N 850.000
## 38 N W N 90.000
## 41 A E A 67.500
## 44 A E A 180.000
## 46 A E A 305.000
## 47 N W N 215.000
## 48 A E A 247.500
## 50 N W N 815.000
## 51 A W A 875.000
## 52 N E N 70.000
## 54 A W A 1200.000
## 55 A E A 675.000
## 56 N W N 415.000
## 57 N W N 340.000
## 59 N W N 416.667
## 60 A E A 1350.000
## 61 A E A 90.000
## 62 N E N 275.000
## 63 A W A 230.000
## 64 N W N 225.000
## 66 A E A 950.000
## 68 A W A 75.000
## 69 A E A 105.000
## 71 N W N 320.000
## 73 A W A 850.000
## 74 A E A 535.000
## 75 A E A 933.333
## 76 A E N 850.000
## 77 N W A 210.000
## 79 A W A 325.000
## 80 A W A 275.000
## 82 N E N 450.000
## 83 A E A 1975.000
## 85 N W N 1900.000
## 86 A W A 600.000
## 87 N W N 1041.667
## 88 A E A 110.000
## 89 A W A 260.000
## 90 A W A 475.000
## 91 A W A 431.500
## 92 N E N 1220.000
## 93 A E A 70.000
## 94 A W A 145.000
## 96 N W N 595.000
## 97 A E A 1861.460
## 99 N W N 300.000
## 100 N W N 490.000
## 101 A E A 2460.000
## 103 A E A 375.000
## 108 A W A 750.000
## 109 A E A 1175.000
## 110 A E A 70.000
## 111 A W A 1500.000
## 112 N W A 385.000
## 113 N E N 1925.571
## 114 N W N 215.000
## 116 A W A 900.000
## 117 A W A 155.000
## 118 A W A 700.000
## 119 N W N 535.000
## 120 A E A 362.500
## 121 N E N 733.333
## 122 N W N 200.000
## 123 A W A 400.000
## 124 N E A 400.000
## 125 N W N 737.500
## 127 A W A 500.000
## 128 A W A 600.000
## 129 N E N 662.500
## 130 A W A 950.000
## 131 N E N 750.000
## 132 N E N 297.500
## 133 A W A 325.000
## 134 A W A 87.500
## 135 N W N 175.000
## 136 N E N 90.000
## 137 A E A 1237.500
## 138 A E A 430.000
## 140 A W N 100.000
## 141 A W A 165.000
## 142 A E A 250.000
## 143 N E N 1300.000
## 144 N W N 773.333
## 146 N E N 1008.333
## 147 A E A 275.000
## 148 A E A 775.000
## 149 A E A 850.000
## 150 A E A 365.000
## 152 A W A 95.000
## 153 N W N 110.000
## 154 N W N 100.000
## 155 N E N 277.500
## 156 A W A 80.000
## 157 N E N 600.000
## 160 A W A 200.000
## 162 N E N 657.000
## 163 A W N 75.000
## 164 A E A 2412.500
## 165 N W A 250.000
## 166 N E N 155.000
## 167 N E N 640.000
## 168 A E A 300.000
## 169 A W A 110.000
## 171 A W N 825.000
## 173 N W N 195.000
## 175 N W N 450.000
## 176 N W N 630.000
## 177 N W N 86.500
## 178 A E A 1300.000
## 179 A E N 1000.000
## 180 N E N 1800.000
## 181 A W A 1310.000
## 182 N W N 737.500
## 183 N W N 625.000
## 184 N E N 125.000
## 185 N E N 1043.333
## 186 N W N 725.000
## 187 A W A 300.000
## 188 A W A 365.000
## 189 N W N 75.000
## 190 N E N 1183.333
## 191 N E N 202.500
## 192 A E A 225.000
## 193 A E A 525.000
## 194 N W N 265.000
## 195 A E A 787.500
## 196 A E N 800.000
## 197 A W A 587.500
## 199 A E A 145.000
## 201 A E A 420.000
## 202 N W N 75.000
## 203 A E A 575.000
## 205 A W A 780.000
## 206 N E N 90.000
## 207 N W N 150.000
## 208 A E N 700.000
## 210 A E A 550.000
## 212 N E A 650.000
## 213 A W A 68.000
## 214 N E N 100.000
## 215 N W N 670.000
## 216 A E A 175.000
## 217 A W A 137.000
## 218 N E N 2127.333
## 219 N W N 875.000
## 220 A W A 120.000
## 221 N E N 140.000
## 222 N E N 210.000
## 223 N E N 800.000
## 224 N W N 240.000
## 225 A E A 350.000
## 227 A W A 175.000
## 228 A W A 200.000
## 230 N E N 1940.000
## 231 N W N 700.000
## 232 A W A 750.000
## 233 N W N 450.000
## 234 A W A 172.000
## 235 A E A 1260.000
## 237 N W N 750.000
## 238 A E A 190.000
## 239 A E A 580.000
## 240 N E N 130.000
## 241 A W A 450.000
## 242 A W A 300.000
## 243 A E A 250.000
## 244 N E A 1050.000
## 245 A E A 215.000
## 246 A E A 400.000
## 248 A E A 560.000
## 249 A E A 1670.000
## 250 A W A 487.500
## 252 A W A 425.000
## 253 N E A 500.000
## 256 A E A 250.000
## 257 A E A 400.000
## 258 A E A 450.000
## 259 N W N 750.000
## 260 A E A 70.000
## 261 N W N 875.000
## 262 N E N 190.000
## 263 N E N 191.000
## 264 N E N 740.000
## 265 N E N 250.000
## 266 N E N 140.000
## 267 A W A 97.500
## 268 A W A 740.000
## 269 N W N 140.000
## 270 A W A 341.667
## 272 A E A 1000.000
## 273 A W A 100.000
## 274 A W A 90.000
## 275 N E N 200.000
## 276 A W A 135.000
## 277 N E N 155.000
## 278 A W A 475.000
## 279 N W N 1450.000
## 280 N E N 150.000
## 281 A W A 105.000
## 282 A W A 350.000
## 283 N W N 90.000
## 285 A E A 530.000
## 286 A E A 341.667
## 287 A W A 940.000
## 288 A E A 350.000
## 289 N W N 326.667
## 290 N E N 250.000
## 291 N W N 740.000
## 292 N W A 425.000
## 294 N E N 925.000
## 295 A W A 185.000
## 296 N W A 920.000
## 297 N E N 286.667
## 298 A W A 245.000
## 300 A W A 235.000
## 301 N E N 1150.000
## 302 N E N 160.000
## 304 A W A 425.000
## 305 N W N 900.000
## 307 N W N 500.000
## 308 N E N 277.500
## 309 N E N 750.000
## 310 N E N 160.000
## 311 N E N 1300.000
## 312 N E N 525.000
## 313 N E N 550.000
## 314 A E A 1600.000
## 315 N W N 120.000
## 316 A W A 165.000
## 318 N E N 700.000
## 319 A E A 875.000
## 320 A W A 385.000
## 321 A E A 960.000
## 322 A W A 1000.000
set.seed(1)
Use caret’s “createDataPartition” to split the data into hitters_train and hitters_test (use 90% for training and 10% for testing):
inTrain = createDataPartition(hitters$Salary, p=0.9, list=FALSE)
hitters_train = hitters[inTrain,]
hitters_test = hitters[-inTrain,]
# Create data partition
# The row indices are drawn from the Salary column of hitters_final itself
# (p = 0.9 requests roughly 90% of the rows), so inTrain is guaranteed to
# refer to rows of the same data frame it is used to subset below.
inTrain <- createDataPartition(hitters_final$Salary, p = 0.9, list = FALSE)

# Create training and test sets: rows listed in inTrain go to training,
# every remaining row goes to the test set.
hitters_train <- hitters_final[inTrain, ]
hitters_test <- hitters_final[-inTrain, ]
# Encoding binary categorical variables
# Recode the two-level columns as 0/1 indicators, using the same mapping for
# the training and the test data:
#   League, NewLeague: "A" -> 0, otherwise 1
#   Division:          "E" -> 0, otherwise 1

# Training data
hitters_train[["League"]] <- ifelse(hitters_train[["League"]] == "A", 0, 1)
hitters_train[["NewLeague"]] <- ifelse(hitters_train[["NewLeague"]] == "A", 0, 1)
hitters_train[["Division"]] <- ifelse(hitters_train[["Division"]] == "E", 0, 1)

# Test data
hitters_test[["League"]] <- ifelse(hitters_test[["League"]] == "A", 0, 1)
hitters_test[["NewLeague"]] <- ifelse(hitters_test[["NewLeague"]] == "A", 0, 1)
hitters_test[["Division"]] <- ifelse(hitters_test[["Division"]] == "E", 0, 1)
# Replace Salary column with log(Salary) in hitters_train
# Overwrite Salary with its natural logarithm in both data sets, so the
# target is on the log scale from here on.
hitters_train[["Salary"]] <- log(hitters_train[["Salary"]])
# Replace Salary column with log(Salary) in hitters_test
hitters_test[["Salary"]] <- log(hitters_test[["Salary"]])

# Re-seed the random number generator before the next random split.
set.seed(1)
# Hold out part of the training data for validation: p = 0.9 keeps roughly
# 90% of hitters_train in `train`; the remaining rows become `validation`.
# Note that inTrain is overwritten here and now indexes hitters_train.
inTrain <- createDataPartition(
  y = hitters_train[["Salary"]],
  p = 0.9,
  list = FALSE
)
validation <- hitters_train[-inTrain, ]
train <- hitters_train[inTrain, ]
# Select the numeric variables to scale
# Logical mask over the columns of hitters_train: TRUE for numeric columns
# that should be standardised. The target (Salary) and the three 0/1 encoded
# indicator columns are excluded by name.
# vapply() is used instead of sapply() so the result is always a logical
# vector, whatever the input looks like.
numeric_cols <- vapply(hitters_train, is.numeric, logical(1)) &
  !(names(hitters_train) %in% c("Salary", "Division", "League", "NewLeague"))
# Scale the numeric variables in the training data
# Get column means and standard deviations from the training rows only; the
# same values are then used to scale the training, validation and test data,
# so no statistics are estimated from validation or test rows.
# drop = FALSE keeps a data frame even if only one column is selected.
means <- colMeans(train[, numeric_cols, drop = FALSE])
sds <- vapply(train[, numeric_cols, drop = FALSE], sd, numeric(1))

# Scale the numeric variables in the training data, then append the columns
# that were left unscaled (Salary and the 0/1 indicator columns).
scaled_train <- as.data.frame(
  scale(train[, numeric_cols, drop = FALSE], center = means, scale = sds)
)
scaled_train <- cbind(scaled_train, train[, !numeric_cols, drop = FALSE])

# Scale validation and test data using means and sds from training data
scaled_valid <- as.data.frame(
  scale(validation[, numeric_cols, drop = FALSE], center = means, scale = sds)
)
scaled_valid <- cbind(scaled_valid, validation[, !numeric_cols, drop = FALSE])

scaled_test <- as.data.frame(
  scale(hitters_test[, numeric_cols, drop = FALSE], center = means, scale = sds)
)
scaled_test <- cbind(scaled_test, hitters_test[, !numeric_cols, drop = FALSE])

# Convert the training and test data frames to matrices.
scaled_train <- as.matrix(scaled_train)
scaled_test <- as.matrix(scaled_test)
scaled_valid <- as.matrix(scaled_valid)
Print the returned value from tuning_run to see the metrics for each run. Which run (which hyper-parameter combination) gave the best mean squared error on the validation data?
Print the learning curve for your best model. Does your best model still overfit?
Does your validation_loss stop decreasing after several epochs? If so, at roughly which epoch does your validation_loss stop decreasing?
# Split each scaled matrix into predictors (every column except Salary) and
# target (the Salary column). drop = FALSE keeps every result a matrix, so
# each target is a one-column matrix.
X_train <- scaled_train[, colnames(scaled_train) != "Salary", drop = FALSE]
y_train <- scaled_train[, "Salary", drop = FALSE]

X_test <- scaled_test[, colnames(scaled_test) != "Salary", drop = FALSE]
y_test <- scaled_test[, "Salary", drop = FALSE]

X_val <- scaled_valid[, colnames(scaled_valid) != "Salary", drop = FALSE]
y_val <- scaled_valid[, "Salary", drop = FALSE]

# Hyper-parameter search: run the training script "problem2_nn.R" over
# combinations of the flag values below. `sample = 0.02` requests a random
# subset of the combinations rather than the full grid.
runs <- tuning_run(
  "problem2_nn.R",
  flags = list(
    nodes = c(32, 64, 128),
    learning_rate = c(0.01, 0.001, 0.0001),
    batch_size = c(50, 100, 200, 500),
    epochs = c(30, 50, 100),
    activation = c("relu", "sigmoid", "tanh")
  ),
  sample = 0.02
)
##
## > FLAGS <- flags(flag_numeric("nodes", 64), flag_numeric("batch_size",
## + 32), flag_string("activation", "relu"), flag_numeric("learning_rate",
## + .... [TRUNCATED]
##
## > model = keras_model_sequential()
##
## > model %>% layer_dense(units = FLAGS$nodes, input_shape = dim(X_train)[2]) %>%
## + layer_dense(units = 1)
##
## > model %>% compile(optimizer = optimizer_adam(learning_rate = FLAGS$learning_rate),
## + loss = "mse", metrics = list("mse"))
##
## > model %>% fit(X_train, y_train, epochs = FLAGS$epochs,
## + batch_size = FLAGS$batch_size, validation_data = list(X_val,
## + y_val))
##
## > FLAGS <- flags(flag_numeric("nodes", 64), flag_numeric("batch_size",
## + 32), flag_string("activation", "relu"), flag_numeric("learning_rate",
## + .... [TRUNCATED]
##
## > model = keras_model_sequential()
##
## > model %>% layer_dense(units = FLAGS$nodes, input_shape = dim(X_train)[2]) %>%
## + layer_dense(units = 1)
##
## > model %>% compile(optimizer = optimizer_adam(learning_rate = FLAGS$learning_rate),
## + loss = "mse", metrics = list("mse"))
##
## > model %>% fit(X_train, y_train, epochs = FLAGS$epochs,
## + batch_size = FLAGS$batch_size, validation_data = list(X_val,
## + y_val))
##
## > FLAGS <- flags(flag_numeric("nodes", 64), flag_numeric("batch_size",
## + 32), flag_string("activation", "relu"), flag_numeric("learning_rate",
## + .... [TRUNCATED]
##
## > model = keras_model_sequential()
##
## > model %>% layer_dense(units = FLAGS$nodes, input_shape = dim(X_train)[2]) %>%
## + layer_dense(units = 1)
##
## > model %>% compile(optimizer = optimizer_adam(learning_rate = FLAGS$learning_rate),
## + loss = "mse", metrics = list("mse"))
##
## > model %>% fit(X_train, y_train, epochs = FLAGS$epochs,
## + batch_size = FLAGS$batch_size, validation_data = list(X_val,
## + y_val))
##
## > FLAGS <- flags(flag_numeric("nodes", 64), flag_numeric("batch_size",
## + 32), flag_string("activation", "relu"), flag_numeric("learning_rate",
## + .... [TRUNCATED]
##
## > model = keras_model_sequential()
##
## > model %>% layer_dense(units = FLAGS$nodes, input_shape = dim(X_train)[2]) %>%
## + layer_dense(units = 1)
##
## > model %>% compile(optimizer = optimizer_adam(learning_rate = FLAGS$learning_rate),
## + loss = "mse", metrics = list("mse"))
##
## > model %>% fit(X_train, y_train, epochs = FLAGS$epochs,
## + batch_size = FLAGS$batch_size, validation_data = list(X_val,
## + y_val))
##
## > FLAGS <- flags(flag_numeric("nodes", 64), flag_numeric("batch_size",
## + 32), flag_string("activation", "relu"), flag_numeric("learning_rate",
## + .... [TRUNCATED]
##
## > model = keras_model_sequential()
##
## > model %>% layer_dense(units = FLAGS$nodes, input_shape = dim(X_train)[2]) %>%
## + layer_dense(units = 1)
##
## > model %>% compile(optimizer = optimizer_adam(learning_rate = FLAGS$learning_rate),
## + loss = "mse", metrics = list("mse"))
##
## > model %>% fit(X_train, y_train, epochs = FLAGS$epochs,
## + batch_size = FLAGS$batch_size, validation_data = list(X_val,
## + y_val))
##
## > FLAGS <- flags(flag_numeric("nodes", 64), flag_numeric("batch_size",
## + 32), flag_string("activation", "relu"), flag_numeric("learning_rate",
## + .... [TRUNCATED]
##
## > model = keras_model_sequential()
##
## > model %>% layer_dense(units = FLAGS$nodes, input_shape = dim(X_train)[2]) %>%
## + layer_dense(units = 1)
##
## > model %>% compile(optimizer = optimizer_adam(learning_rate = FLAGS$learning_rate),
## + loss = "mse", metrics = list("mse"))
##
## > model %>% fit(X_train, y_train, epochs = FLAGS$epochs,
## + batch_size = FLAGS$batch_size, validation_data = list(X_val,
## + y_val))
##
## > FLAGS <- flags(flag_numeric("nodes", 64), flag_numeric("batch_size",
## + 32), flag_string("activation", "relu"), flag_numeric("learning_rate",
## + .... [TRUNCATED]
##
## > model = keras_model_sequential()
##
## > model %>% layer_dense(units = FLAGS$nodes, input_shape = dim(X_train)[2]) %>%
## + layer_dense(units = 1)
##
## > model %>% compile(optimizer = optimizer_adam(learning_rate = FLAGS$learning_rate),
## + loss = "mse", metrics = list("mse"))
##
## > model %>% fit(X_train, y_train, epochs = FLAGS$epochs,
## + batch_size = FLAGS$batch_size, validation_data = list(X_val,
## + y_val))
runs## Data frame: 7 x 25
## run_dir metric_loss metric_mse metric_val_loss
## 1 runs/2023-03-27T01-00-38Z 0.4165 0.4165 0.4393
## 2 runs/2023-03-27T01-00-36Z 6.5761 6.5761 6.9082
## 3 runs/2023-03-27T01-00-33Z 3.7079 3.7079 4.6020
## 4 runs/2023-03-27T01-00-31Z 31.1762 31.1762 29.9040
## 5 runs/2023-03-27T01-00-26Z 0.5776 0.5776 0.3117
## 6 runs/2023-03-27T01-00-24Z 19.4368 19.4368 18.9947
## 7 runs/2023-03-27T01-00-12Z 33.0509 33.0509 34.0548
## metric_val_mse
## 1 0.4393
## 2 6.9082
## 3 4.6020
## 4 29.9040
## 5 0.3117
## 6 18.9947
## 7 34.0548
## # ... with 20 more columns:
## # flag_nodes, flag_batch_size, flag_activation, flag_learning_rate,
## # flag_epochs, epochs, epochs_completed, metrics, model, loss_function,
## # optimizer, learning_rate, script, start, end, completed, output,
## # source_code, context, type
# View the run with the lowest validation MSE. Indexing with [1] would only
# show the first row of `runs`, which is not necessarily the best run.
view_run(runs$run_dir[which.min(runs$metric_val_mse)])
Yes, the validation loss decreases continuously, and after about the 90th epoch the loss becomes flat, that is, it stops decreasing.
Best Model:
nodes: 128
batch size: 50
activation function: sigmoid
learning rate: 0.0001
epochs: 50
Kindly look at the P2BestModel file.
# Retrain the best model once again
# Architecture: one hidden dense layer (128 units, sigmoid activation)
# followed by a single-unit output layer for regression.
# NOTE(review): the write-up above lists learning rate 0.0001 and 50 epochs
# for the best model, while this block uses learning rate 0.001 and 100
# epochs -- confirm which setting is intended.
best_model <- keras_model_sequential()
best_model %>%
  layer_dense(
    units = 128,
    activation = "sigmoid",
    input_shape = dim(X_train)[2]
  ) %>%
  layer_dense(units = 1)

# Mean squared error is used both as the loss and as the reported metric.
best_model %>% compile(
  optimizer = optimizer_adam(learning_rate = 0.001),
  loss = "mse",
  metrics = list("mse")
)

# Fit on the training matrices; no validation data is passed here.
history <- best_model %>% fit(
  X_train, y_train,
  epochs = 100,
  batch_size = 50
)

# Plot the learning curve recorded during fitting.
plot(history)
# Make predictions on the test set using the best model
# Predict on the test predictors and flatten the result to a plain vector.
y_pred <- as.vector(predict(best_model, X_test))

# Reverse the log transformation
y_test_orig <- exp(y_test)
y_pred_orig <- exp(y_pred)

# Compute the RMSE in the original scale
rmse <- sqrt(mean((y_test_orig - y_pred_orig)^2))
cat(paste("The RMSE value of Neural Net model is: ", rmse))
## The RMSE value of Neural Net model is: 368.745935194109
# Baseline: ordinary linear regression of Salary on all other columns of the
# scaled training data.
linear_model <- lm(formula = Salary ~ ., data = as.data.frame(scaled_train))

# Predict on the scaled test data, then undo the log transformation.
y_pred_lm <- predict(linear_model, newdata = as.data.frame(scaled_test))
y_pred_lm_orig <- exp(y_pred_lm)

# RMSE of the linear model, compared against the back-transformed test
# target y_test_orig.
rmse_lm <- sqrt(mean((y_test_orig - y_pred_lm_orig)^2))
cat(paste("The RMSE value of linear model is: ", rmse_lm))
## The RMSE value of linear model is: 457.745470302265
The root mean square error (RMSE) is a measure of the difference between the predicted and actual values in a regression analysis. A lower RMSE value indicates better performance of the model.
In this case, the RMSE value of the neural net model is 368.7459352, while that of the linear model is 457.7454703. This means that the neural net model performs better than the linear model as it has a lower RMSE value.