Krister Martinez
6
library(ISLR)
str(Hitters)
'data.frame': 322 obs. of 20 variables:
$ AtBat : int 293 315 479 496 321 594 185 298 323 401 ...
$ Hits : int 66 81 130 141 87 169 37 73 81 92 ...
$ HmRun : int 1 7 18 20 10 4 1 0 6 17 ...
$ Runs : int 30 24 66 65 39 74 23 24 26 49 ...
$ RBI : int 29 38 72 78 42 51 8 24 32 66 ...
$ Walks : int 14 39 76 37 30 35 21 7 8 65 ...
$ Years : int 1 14 3 11 2 11 2 3 2 13 ...
$ CAtBat : int 293 3449 1624 5628 396 4408 214 509 341 5206 ...
$ CHits : int 66 835 457 1575 101 1133 42 108 86 1332 ...
$ CHmRun : int 1 69 63 225 12 19 1 0 6 253 ...
$ CRuns : int 30 321 224 828 48 501 30 41 32 784 ...
$ CRBI : int 29 414 266 838 46 336 9 37 34 890 ...
$ CWalks : int 14 375 263 354 33 194 24 12 8 866 ...
$ League : Factor w/ 2 levels "A","N": 1 2 1 2 2 1 2 1 2 1 ...
$ Division : Factor w/ 2 levels "E","W": 1 2 2 1 1 2 1 2 2 1 ...
$ PutOuts : int 446 632 880 200 805 282 76 121 143 0 ...
$ Assists : int 33 43 82 11 40 421 127 283 290 0 ...
$ Errors : int 20 10 14 3 4 25 7 9 19 0 ...
$ Salary : num NA 475 480 500 91.5 750 70 100 75 1100 ...
$ NewLeague: Factor w/ 2 levels "A","N": 1 2 1 2 2 1 1 1 2 1 ...
Hitters_Fixed = na.omit(Hitters )
str(Hitters_Fixed)
'data.frame': 263 obs. of 20 variables:
$ AtBat : int 315 479 496 321 594 185 298 323 401 574 ...
$ Hits : int 81 130 141 87 169 37 73 81 92 159 ...
$ HmRun : int 7 18 20 10 4 1 0 6 17 21 ...
$ Runs : int 24 66 65 39 74 23 24 26 49 107 ...
$ RBI : int 38 72 78 42 51 8 24 32 66 75 ...
$ Walks : int 39 76 37 30 35 21 7 8 65 59 ...
$ Years : int 14 3 11 2 11 2 3 2 13 10 ...
$ CAtBat : int 3449 1624 5628 396 4408 214 509 341 5206 4631 ...
$ CHits : int 835 457 1575 101 1133 42 108 86 1332 1300 ...
$ CHmRun : int 69 63 225 12 19 1 0 6 253 90 ...
$ CRuns : int 321 224 828 48 501 30 41 32 784 702 ...
$ CRBI : int 414 266 838 46 336 9 37 34 890 504 ...
$ CWalks : int 375 263 354 33 194 24 12 8 866 488 ...
$ League : Factor w/ 2 levels "A","N": 2 1 2 2 1 2 1 2 1 1 ...
$ Division : Factor w/ 2 levels "E","W": 2 2 1 1 2 1 2 2 1 1 ...
$ PutOuts : int 632 880 200 805 282 76 121 143 0 238 ...
$ Assists : int 43 82 11 40 421 127 283 290 0 445 ...
$ Errors : int 10 14 3 4 25 7 9 19 0 22 ...
$ Salary : num 475 480 500 91.5 750 ...
$ NewLeague: Factor w/ 2 levels "A","N": 2 1 2 2 1 1 1 2 1 1 ...
- attr(*, "na.action")= 'omit' Named int [1:59] 1 16 19 23 31 33 37 39 40 42 ...
..- attr(*, "names")= chr [1:59] "-Andy Allanson" "-Billy Beane" "-Bruce Bochte" "-Bob Boone" ...
head(Hitters_Fixed)
Apply the Best Subset Selection method to predict “Salary” (i.e., Salary is the outcome variable) based on all the predictors with the exemption of the three categorical predictors: League, Division, and NewLeague. Thus, do NOT include these three predictors when you apply the method. You must apply the Best Subset Selection method based on adjusted R squared
library(leaps)
Preparing the data by excluding the categorical variables
Hitters_Fixed <- Hitters_Fixed[, !(names(Hitters_Fixed) %in% c("League", "Division", "NewLeague"))]
Running regsubset from leaps
subset_fit <- regsubsets(Salary ~ ., data=Hitters_Fixed, nvmax=ncol(Hitters_Fixed)-1, method="exhaustive")
summary(subset_fit)
Subset selection object
Call: regsubsets.formula(Salary ~ ., data = Hitters_Fixed, nvmax = ncol(Hitters_Fixed) -
1, method = "exhaustive")
16 Variables (and intercept)
Forced in Forced out
AtBat FALSE FALSE
Hits FALSE FALSE
HmRun FALSE FALSE
Runs FALSE FALSE
RBI FALSE FALSE
Walks FALSE FALSE
Years FALSE FALSE
CAtBat FALSE FALSE
CHits FALSE FALSE
CHmRun FALSE FALSE
CRuns FALSE FALSE
CRBI FALSE FALSE
CWalks FALSE FALSE
PutOuts FALSE FALSE
Assists FALSE FALSE
Errors FALSE FALSE
1 subsets of each size up to 16
Selection Algorithm: exhaustive
AtBat Hits HmRun Runs RBI Walks Years CAtBat CHits CHmRun CRuns CRBI CWalks PutOuts Assists Errors
1 ( 1 ) " " " " " " " " " " " " " " " " " " " " " " "*" " " " " " " " "
2 ( 1 ) " " "*" " " " " " " " " " " " " " " " " " " "*" " " " " " " " "
3 ( 1 ) " " "*" " " " " " " " " " " " " " " " " " " "*" " " "*" " " " "
4 ( 1 ) "*" "*" " " " " " " " " " " " " " " " " " " "*" " " "*" " " " "
5 ( 1 ) "*" "*" " " " " " " "*" " " " " " " " " " " "*" " " "*" " " " "
6 ( 1 ) "*" "*" " " " " " " "*" " " " " " " " " "*" " " "*" "*" " " " "
7 ( 1 ) "*" "*" " " " " " " "*" " " " " " " "*" "*" " " "*" "*" " " " "
8 ( 1 ) "*" "*" " " " " " " "*" " " "*" " " " " "*" "*" "*" "*" " " " "
9 ( 1 ) "*" "*" " " " " " " "*" " " "*" " " " " "*" "*" "*" "*" "*" " "
10 ( 1 ) "*" "*" " " "*" " " "*" " " "*" " " " " "*" "*" "*" "*" "*" " "
11 ( 1 ) "*" "*" " " "*" " " "*" " " "*" " " " " "*" "*" "*" "*" "*" "*"
12 ( 1 ) "*" "*" "*" "*" " " "*" " " "*" " " " " "*" "*" "*" "*" "*" "*"
13 ( 1 ) "*" "*" "*" "*" " " "*" " " "*" "*" " " "*" "*" "*" "*" "*" "*"
14 ( 1 ) "*" "*" "*" "*" " " "*" "*" "*" " " "*" "*" "*" "*" "*" "*" "*"
15 ( 1 ) "*" "*" "*" "*" " " "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*"
16 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*"
First, we are going to use Adjusted R Squared to compare the equations
summary (subset_fit)$adjr2
[1] 0.3188503 0.4208024 0.4450753 0.4621594 0.4805047 0.4855144 0.4983282 0.5036515 0.5086408 0.5077531 0.5064485 0.5049397 0.5031603 0.5012591 0.4992625
[16] 0.4972271
Adjusted R squared can be transformed into a % (similar to R squared). Let’s do it:
adj_r2_percent_HF = summary (subset_fit)$adjr2 * 100
adj_r2_percent_HF
[1] 31.88503 42.08024 44.50753 46.21594 48.05047 48.55144 49.83282 50.36515 50.86408 50.77531 50.64485 50.49397 50.31603 50.12591 49.92625 49.72271
which.max(adj_r2_percent_HF)
[1] 9
adj_r2_percent_HF
[1] 31.88503 42.08024 44.50753 46.21594 48.05047 48.55144 49.83282 50.36515 50.86408 50.77531 50.64485 50.49397 50.31603 50.12591 49.92625 49.72271
adj_r2_percent_HF[2:length(adj_r2_percent_HF)] - adj_r2_percent_HF [1: (length(adj_r2_percent_HF)-1)]
[1] 10.19521101 2.42729256 1.70840877 1.83452551 0.50097404 1.28137897 0.53232663 0.49893771 -0.08877784 -0.13045059 -0.15088969 -0.17793131
[13] -0.19011938 -0.19966419 -0.20353608
chosen_HF_number= which (adj_r2_percent_HF[2:length(adj_r2_percent_HF)] - adj_r2_percent_HF [1: (length(adj_r2_percent_HF)-1)] < 1) [1]
chosen_HF_number
[1] 5
coef (subset_fit, 5)
(Intercept) AtBat Hits Walks CRBI PutOuts
25.2819915 -2.0349977 8.1842739 3.9059431 0.6417565 0.2645828
Write the equation that resulted from applying the Best Subset Selection method based on adjusted R squared (alternative 2).
Predicted Salary = 25.282 - 2.035 * AtBat + 8.184 * Hits + 3.906 * Walks + 0.642 * CRBI + 0.0265 * PutOuts
Does the equation that you obtained allow you to eliminate at least 50% of the variation existing in the values of Salary? Justify.
model_HF <- lm(Salary ~ AtBat + Hits + Walks + CRBI + PutOuts, data=Hitters_Fixed)
summary(model_HF)
Call:
lm(formula = Salary ~ AtBat + Hits + Walks + CRBI + PutOuts,
data = Hitters_Fixed)
Residuals:
Min 1Q Median 3Q Max
-935.75 -170.61 -29.91 119.62 2109.44
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 25.28199 62.36927 0.405 0.685550
AtBat -2.03500 0.53330 -3.816 0.000170 ***
Hits 8.18427 1.67910 4.874 1.91e-06 ***
Walks 3.90594 1.22837 3.180 0.001655 **
CRBI 0.64176 0.06549 9.799 < 2e-16 ***
PutOuts 0.26458 0.07600 3.481 0.000586 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 325.1 on 257 degrees of freedom
Multiple R-squared: 0.4904, Adjusted R-squared: 0.4805
F-statistic: 49.47 on 5 and 257 DF, p-value: < 2.2e-16
The Adjusted R-squared that I obtained from the summary of the regression model is 0.4805 which means that 48% of the variation in salary is explain by the model. so 48 is less than 50 so the answer is no it is only 48%
Apply the Best Subset Selection method again, but this time using RSE (alternative 2). Consider a reduction in RSE practically significant if it represents at least a %1 reduction. Show all your work (I need to see every code chunk you wrote and the output you got).
We continue:..
Creating a vector to store the variable p predictors
p = 1:16
Computing RSE for each equation
rse_values_HF = sqrt((summary (subset_fit)$rss)/(nrow(Hitters_Fixed)-p-1))
rse_values_HF
[1] 372.3163 343.3240 336.0530 330.8397 325.1484 323.5768 319.5219 317.8222 316.2207 316.5063 316.9254 317.4095 317.9794 318.5872 319.2242 319.8724
IF we need to make the decision about the “best” model based on RSE, we observe the values of RSE to identify when it stops decreasing. We find where the minimum value of RSE happens for this purpose
which.min(rse_values_HF)
[1] 9
percent_decrease_rse_HF= (rse_values_HF [1: (length(rse_values_HF) -1 ) ]- rse_values_HF[2:length(rse_values_HF)]) / rse_values_HF [1: (length(rse_values_HF) -1 )]
percent_decrease_rse_HF
[1] 0.0778701317 0.0211781819 0.0155134931 0.0172025098 0.0048334185 0.0125315306 0.0053196761 0.0050387768 -0.0009029828 -0.0013241756
[11] -0.0015274450 -0.0017954551 -0.0019114602 -0.0019996834 -0.0020303020
which (percent_decrease_rse_HF < 0.01)[1]
[1] 5
Here are the predictors
coef (subset_fit, 5)
(Intercept) AtBat Hits Walks CRBI PutOuts
25.2819915 -2.0349977 8.1842739 3.9059431 0.6417565 0.2645828
I obtained the same equation as the one in part 1a
After you apply the method, if you get an equation different from the one you obtained in 1a, write this new equation. Otherwise, you answer this: “I obtained the same equation as the one in part 1a”
For the equation you obtained in 1 a), assume that Assumption 2 regarding the validity of the linear regression analysis is satisfied. Run an analysis to check for the validity of all other assumptions. Show all your work AND comment on your findings.
plot(predict (model_HF), residuals (model_HF))
# abline() allows us to draw a horizontal line at y=0 (because 0 is the mean of the residuals)
abline(h=0)
Interpretation: In this case, we can clearly see that the residuals show a nonlinear pattern. The data points seem to follow a parabolic shape; therefore, a quadratic model might be more appropriate than a linear model ( assumption 1 is not satisfied). We had already observed this nonlinear pattern when we did the original scatter plot. However, this graph shows the non-linear pattern more clearly.
When assumption 1 is not satisfied, it is really hard to evaluate assumption 4. So, NO need to assess assumption 4 in this case.
To check assumption 3 (whether the residuals follow a normal distribution), we are going to conduct a hypothesis test: the Shapiro test to check for normality
Ho: The residuals follow a Normal distribution Ha: The residuals do NOT follow a Normal distribution
shapiro.test(residuals (model_HF))
Shapiro-Wilk normality test
data: residuals(model_HF)
W = 0.90272, p-value = 5.266e-12
PV= 5.266e-12, which is a lot smaller than alpha (0.05). Thus, we reject Ho and support Ha. The data is giving us evidence that the residuals do NOT follow a Normal distribution. Assumption 3 is not satisfied either!
Apply Linear Regression to predict “Salary” based on these predictors: Hits, League, and Division. Show all your work.
Hitters_Fixed <- na.omit(Hitters)
model_2_HF = lm(Salary~Hits + League + Division , data = Hitters_Fixed)
summary(model_2_HF)
Call:
lm(formula = Salary ~ Hits + League + Division, data = Hitters_Fixed)
Residuals:
Min 1Q Median 3Q Max
-844.52 -264.61 -59.75 169.95 1958.07
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 118.1472 78.3232 1.508 0.13266
Hits 4.3350 0.5574 7.778 1.75e-13 ***
LeagueN 46.7805 50.1133 0.933 0.35143
DivisionW -140.7468 49.6199 -2.837 0.00492 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 400.8 on 259 degrees of freedom
Multiple R-squared: 0.2196, Adjusted R-squared: 0.2105
F-statistic: 24.29 on 3 and 259 DF, p-value: 6.985e-14
str(Hitters_Fixed)
'data.frame': 263 obs. of 20 variables:
$ AtBat : int 315 479 496 321 594 185 298 323 401 574 ...
$ Hits : int 81 130 141 87 169 37 73 81 92 159 ...
$ HmRun : int 7 18 20 10 4 1 0 6 17 21 ...
$ Runs : int 24 66 65 39 74 23 24 26 49 107 ...
$ RBI : int 38 72 78 42 51 8 24 32 66 75 ...
$ Walks : int 39 76 37 30 35 21 7 8 65 59 ...
$ Years : int 14 3 11 2 11 2 3 2 13 10 ...
$ CAtBat : int 3449 1624 5628 396 4408 214 509 341 5206 4631 ...
$ CHits : int 835 457 1575 101 1133 42 108 86 1332 1300 ...
$ CHmRun : int 69 63 225 12 19 1 0 6 253 90 ...
$ CRuns : int 321 224 828 48 501 30 41 32 784 702 ...
$ CRBI : int 414 266 838 46 336 9 37 34 890 504 ...
$ CWalks : int 375 263 354 33 194 24 12 8 866 488 ...
$ League : Factor w/ 2 levels "A","N": 2 1 2 2 1 2 1 2 1 1 ...
$ Division : Factor w/ 2 levels "E","W": 2 2 1 1 2 1 2 2 1 1 ...
$ PutOuts : int 632 880 200 805 282 76 121 143 0 238 ...
$ Assists : int 43 82 11 40 421 127 283 290 0 445 ...
$ Errors : int 10 14 3 4 25 7 9 19 0 22 ...
$ Salary : num 475 480 500 91.5 750 ...
$ NewLeague: Factor w/ 2 levels "A","N": 2 1 2 2 1 1 1 2 1 1 ...
- attr(*, "na.action")= 'omit' Named int [1:59] 1 16 19 23 31 33 37 39 40 42 ...
..- attr(*, "names")= chr [1:59] "-Andy Allanson" "-Billy Beane" "-Bruce Bochte" "-Bob Boone" ...
Predicted Salary = 118.1472 + 4.3350 * Hits - 140.7468 * Division
#a) Based on your results, write the equation that you consider should be used to predict players’ salaries.
Predicted Salary = 118.1472 + 4.3350 * Hits - 140.7468 * Division
#b) Use this equation to predict the Salary for a player who connected 120 hits and played in the National League in the West division in 1986. Show your work.
#levels(Hitters_Fixed$Division)
contrasts(Hitters_Fixed$Division)
W
E 0
W 1
pred_Salary_2b = 118.1472 + 4.3350 * 120 - 140.7468 * 1
pred_Salary_2b
[1] 497.6004
The predicted salary is $497.60 for a player in the West Division with 120 hits.
#c) Does this equation improve when you add the “NewLeague” predictor? Justify your answer and show all your work.
model_2c_HF = lm(Salary~Hits + League + Division+NewLeague , data = Hitters_Fixed)
summary(model_2c_HF)
Call:
lm(formula = Salary ~ Hits + League + Division + NewLeague, data = Hitters_Fixed)
Residuals:
Min 1Q Median 3Q Max
-843.75 -265.76 -59.39 171.08 1961.07
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 118.317 78.469 1.508 0.13283
Hits 4.345 0.560 7.760 1.99e-13 ***
LeagueN 67.617 99.084 0.682 0.49559
DivisionW -140.672 49.711 -2.830 0.00502 **
NewLeagueN -24.012 98.445 -0.244 0.80749
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 401.6 on 258 degrees of freedom
Multiple R-squared: 0.2197, Adjusted R-squared: 0.2076
F-statistic: 18.17 on 4 and 258 DF, p-value: 3.683e-13
The p value .80749 for NewLeague is greater than alpha, therefore, there is NO statistically significant difference. In this case, we can determine NewLeague predictor does not improve the equation.
# Load necessary library
library(ggplot2)
# Histogram of Salary
ggplot(Hitters_Fixed, aes(x = Salary)) +
geom_histogram(binwidth = 500, fill = "blue", color = "black") +
ggtitle("Distribution of Salaries") +
xlab("Salary") +
ylab("Frequency")
# Scatterplot of Salary vs Hits
ggplot(Hitters_Fixed, aes(x = Hits, y = Salary)) +
geom_point(aes(color = Division), alpha = 0.7) + # Color points by Division to add another dimension of analysis
geom_smooth(method = "lm", se = FALSE, color = "red") +
ggtitle("Relationship between Hits and Salary") +
xlab("Hits") +
ylab("Salary") +
theme_minimal()