The accuracy of logistic regression was 75%, the accuracy of LDA was 87%, the accuracy of naive Bayes was 83%, and KNN had an accuracy of 84%. This means that LDA was the most accurate approach to predicting whether or not a district was above the median crime level.

4.16

library(tidyverse)
library(ISLR2)

#created a variable that says whether the crime rate for this neighborhood is above the median for the data set

Boston$abv.med <- ifelse(Boston$crim > median(Boston$crim), 1, 0)


#LOGISTIC REGRESSION
#setting seed
set.seed(1)
#creating a training index using 80% of the 506 observations in the Boston data (about 405 rows)
train.sample <- sample(1:nrow(Boston), nrow(Boston)*.8)
#assigning my training data to the sample I just created
train <- Boston[train.sample,]
#assigning my test data to be all the data not in training data
test <- Boston[-train.sample,]
#fitting a regression for the training data
fit1 <- glm(abv.med~ zn+indus+chas+nox+rm+age+dis+rad+tax+medv, family=binomial, data=train)


#predicting testing data for my first regression
test.pred1 <- predict(fit1, test, type='response')
#changing my predictions to be either above or below the median
pred.word <- ifelse(test.pred1 > .5, 'above', 'below')
#changing my actual data to be words
test.word <- ifelse(test$abv.med == 1, 'above', 'below')


table1 <- table(pred.word, test.word)

logistic.test.accuracy1 <- mean(pred.word == test.word)


#LINEAR DISCRIMINANT ANALYSIS
set.seed(1)
library(MASS)

lda.1 <- lda(abv.med~ zn+indus+chas+nox+rm+age+dis+rad+tax+medv, data=train)
lda.1
## Call:
## lda(abv.med ~ zn + indus + chas + nox + rm + age + dis + rad + 
##     tax + medv, data = train)
## 
## Prior probabilities of groups:
##   0   1 
## 0.5 0.5 
## 
## Group means:
##          zn     indus       chas       nox       rm      age      dis       rad
## 0 21.631188  7.165396 0.05940594 0.4704243 6.432842 51.33861 5.036025  4.148515
## 1  1.108911 15.358267 0.08910891 0.6356634 6.172574 85.71931 2.532918 15.099010
##        tax     medv
## 0 309.4109 25.28960
## 1 513.8812 20.43119
## 
## Coefficients of linear discriminants:
##                LD1
## zn    -0.008128609
## indus  0.020261639
## chas  -0.078649361
## nox    7.974579455
## rm     0.131062113
## age    0.012573800
## dis    0.085898482
## rad    0.094130945
## tax   -0.001517165
## medv   0.021404207
lda.pred <- predict(lda.1, test)
lda.class <- lda.pred$class

lda.word <- ifelse(lda.class == 1, 'above', 'below')
table(lda.word, test.word)
##         test.word
## lda.word above below
##    above    40     1
##    below    11    50
lda.test.acc <- mean(lda.word ==test.word)

#NAIVE BAYES
set.seed(1)
library(e1071)
nb.1 <- naiveBayes(abv.med~ zn+indus+nox+rm+age+dis+rad+tax+medv, data=train)

nb.pred <- predict(nb.1, test)
nb.word <- ifelse(nb.pred == 1, "above", "below")
nb.table <- table(nb.pred, test.word)
nb.table
##        test.word
## nb.pred above below
##       0    15    45
##       1    36     6
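#optional sanity check (sketch): overall naive Bayes test accuracy from the predicted classes
nb.acc <- mean(nb.word == test.word)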
#KNN
library(class)

set.seed(1)
sam.knn <- sample(1:nrow(Boston), nrow(Boston)*.5)

#dropping the response abv.med (and crim, which it is derived from) by name
drop.cols <- names(Boston) %in% c("crim", "abv.med")
knn.train <- as.matrix(Boston[sam.knn, !drop.cols])
knn.test <- as.matrix(Boston[-sam.knn, !drop.cols])
knn.target <- Boston$abv.med[sam.knn]
knn.test.target <- Boston$abv.med[-sam.knn]

knn <- knn(knn.train, knn.test, cl=knn.target, k=10)

mean(knn == knn.test.target)
## [1] 0.8537549

5.3

  1. In k-fold cross-validation you pick a value of k, which is the number of groups (folds) the data set is split into. For example, if k=10 and your data set has 100 observations, you randomly create 10 folds of 10 observations each. Each fold is then used once as the test set while the remaining k-1 folds serve as the training set, a test MSE is computed for each of the k fits, and the k estimates are averaged to see how well the model does on average (a short coded sketch follows this list).

  2. The validation set approach can lead to overestimation of the test error rate because the model is trained on only about half of the observations in the data set, whereas k-fold cross-validation trains on (k-1)/k of the observations in each fit.

  3. k-fold cross-validation with k < n has a computational advantage over LOOCV because the model is fit only k times rather than n times. LOOCV gives nearly unbiased estimates of the test error because each fit uses n-1 observations, but its estimates are highly correlated across fits and therefore have higher variance. So LOOCV is better from the perspective of limiting bias, but it can be a computational headache and tends to have higher variance than k-fold cross-validation.
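
To make the procedure concrete, here is a minimal sketch of 10-fold cross-validation coded by hand; the model (medv ~ lstat on the Boston data) is just an illustrative choice and not part of this problem.

library(ISLR2)
set.seed(1)
k <- 10
#randomly assign each observation to one of k folds
folds <- sample(rep(1:k, length.out = nrow(Boston)))
cv.mse <- rep(NA, k)
for (i in 1:k) {
  #train on the k-1 folds not equal to i, then predict the held-out fold
  fit.cv <- lm(medv ~ lstat, data = Boston[folds != i, ])
  pred.cv <- predict(fit.cv, Boston[folds == i, ])
  cv.mse[i] <- mean((Boston$medv[folds == i] - pred.cv)^2)
}
#the k-fold CV estimate of the test MSE is the average over the folds
mean(cv.mse)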

5.5

  1. All of my accuracies across the three splits were almost identical. The largest difference was .0026, which is extremely small. After splitting the data multiple times, the test accuracy stays at roughly the same value.

  2. Adding a dummy variable for student did not seem to help the accuracy of the model. The accuracy actually went down slightly, but in the bigger picture the two models are likely no different from each other.

library(ISLR2)
set.seed(1)

train.sam <- sample(1:nrow(Default), nrow(Default)*.5)

Default$binary <- ifelse(Default$default == 'No', 0, 1)
#i
train2 <- Default[train.sam,]

test2 <- Default[-train.sam,]
#ii
fit2 <- glm(binary~income+balance, data=train2, family=binomial)

pred2 <- predict(fit2, test2, type='response')

pred2.bin<- ifelse(pred2 >.5, 1, 0)
acc2 <- mean(test2$binary == pred2.bin)
#iii
test.error2 <- mean((test2$binary - pred2.bin)^2)
acc2
## [1] 0.9746
table(test2$binary, pred2.bin)
##    pred2.bin
##        0    1
##   0 4824   19
##   1  108   49
#PART C

set.seed(2)

train.sam2 <- sample(1:nrow(Default), nrow(Default)*.5)

Default$binary <- ifelse(Default$default == 'No', 0, 1)
train3 <- Default[train.sam2,]

test3 <- Default[-train.sam2,]

fit3 <- glm(binary~income+balance, data=train3, family=binomial)

pred3 <- predict(fit3, test3, type='response')

pred3.bin<- ifelse(pred3 >.5, 1, 0)
acc3 <- mean(test3$binary == pred3.bin)
acc3
## [1] 0.9762
test.error3 <- mean((test3$binary - pred3.bin)^2)
test.error3
## [1] 0.0238
table(test3$binary, pred3.bin)
##    pred3.bin
##        0    1
##   0 4819   18
##   1  101   62
set.seed(3)

train.sam3 <- sample(1:nrow(Default), nrow(Default)*.5)

train4 <- Default[train.sam3,]

test4 <- Default[-train.sam3,]

fit4 <- glm(binary~income+balance, data=train4, family=binomial)

pred4 <- predict(fit4, test4, type='response')

pred4.bin<- ifelse(pred4 >.5, 1, 0)
acc4 <- mean(test4$binary == pred4.bin)

acc4
## [1] 0.9736
table(test4$binary, pred4.bin)
##    pred4.bin
##        0    1
##   0 4822   23
##   1  109   46
#PART D
#creating a binary for student
Default$stbin <- ifelse(Default$student == "Yes", 1, 0)

train.sam4 <- sample(1:nrow(Default), nrow(Default)*.5)

train5 <- Default[train.sam4,]
test5 <- Default[-train.sam4,]


fit5 <- glm(binary~income+balance+stbin, data=train5, family=binomial)

pred5 <- predict(fit5, test5, type='response')

pred5.bin <- ifelse(pred5 > .5, 1, 0)
acc5 <- mean(test5$binary == pred5.bin)

5.9

  1. Our estimate of the mean, uhat, was 22.5328.

  2. The standard error of the mean of medv is .4088611, which measures how much the sample mean would vary from one sample to the next. Adding and subtracting two standard errors from the mean gives an approximate 95% confidence interval for the mean. Since the standard error is much smaller than the mean of 22.53, the estimate appears fairly precise.

  3. The bootstrap gives a standard error of roughly .41, and the formula in part (b) gave a standard error of .4089, so the two estimates are within a few thousandths of one another.

  4. The t.test() function gave an interval of 21.729 to 23.336, while the bootstrap two-standard-error interval was roughly 21.7 to 23.4, so the two are very close.

  5. Our estimated median was 21.2.

  6. The standard error I obtained from the bootstrap for the median was roughly .377.

  7. The tenth percentile of medv is 12.75.

  8. The standard error is roughly .49, which is small relative to the estimate of 12.75, so a confidence interval built from it would be fairly narrow and would likely stay above 10 at typical confidence levels.

library(ISLR2)
library(boot)
#a
uhat <- mean(Boston$medv)

#b 
ste.uhat <- (sd(Boston$medv))/sqrt(nrow(Boston))

#c
set.seed(1)
ste.fn <- function(data, index) {
  u <- mean(data[index])
  return(u)

}

ste.fn(Boston$medv, 1:506)
## [1] 22.53281
bootstrap <- boot(Boston$medv,ste.fn,1000)
bootstrap
## 
## ORDINARY NONPARAMETRIC BOOTSTRAP
## 
## 
## Call:
## boot(data = Boston$medv, statistic = ste.fn, R = 1000)
## 
## 
## Bootstrap Statistics :
##     original      bias    std. error
## t1* 22.53281 0.007650791   0.4106622
#d
CI <- round(c(mean(bootstrap$t) - 2*sd(bootstrap$t),  mean(bootstrap$t) + 2*sd(bootstrap$t)),3)

t.test(Boston$medv)
## 
##  One Sample t-test
## 
## data:  Boston$medv
## t = 55.111, df = 505, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  21.72953 23.33608
## sample estimates:
## mean of x 
##  22.53281
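#optional sketch: a percentile bootstrap confidence interval from the same boot
#object, as an alternative to the two-standard-error interval computed above
boot.ci(bootstrap, type = "perc")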
#e

med.u <- median(Boston$medv)

#f
med.fn <- function(data, index){
  return(median(data[index]))
}

bootstrap2 <- boot(Boston$medv, med.fn, 1000)
bootstrap2
## 
## ORDINARY NONPARAMETRIC BOOTSTRAP
## 
## 
## Call:
## boot(data = Boston$medv, statistic = med.fn, R = 1000)
## 
## 
## Bootstrap Statistics :
##     original  bias    std. error
## t1*     21.2 -0.0386   0.3770241
#g
u.1 <- quantile(Boston$medv, .1)

#h

u.1fn <- function(data, index){
  return(quantile(data[index], c(.1)))
}

bootstrap3 <- boot(Boston$medv, u.1fn, 1000)
bootstrap3
## 
## ORDINARY NONPARAMETRIC BOOTSTRAP
## 
## 
## Call:
## boot(data = Boston$medv, statistic = u.1fn, R = 1000)
## 
## 
## Bootstrap Statistics :
##     original  bias    std. error
## t1*    12.75  0.0186   0.4925766

6.2

  1. iii is correct. The lasso behaves like ridge regression in that, as lambda increases, the model becomes less flexible, which decreases variance and increases bias. So, as long as the increase in bias does not overtake the decrease in variance, it has an advantage over OLS (the coefficient-path sketch after this list illustrates the shrinkage).

  2. iii is correct. As lambda increases, the ridge model becomes less flexible, which in turn decreases variance but increases bias; the graphs on page 240 of the book illustrate this well. So, as long as the increase in bias is not larger than the decrease in variance, the model will have an advantage over OLS.

  3. ii is correct. Non-linear methods are inherently more flexible than linear methods, so as long as the decrease in bias is not outweighed by the increase in variance, they will give improved prediction accuracy.
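
To make the shrinkage behavior described above concrete, here is a small optional sketch (not part of the assigned answer) that fits the lasso to the College data used in 6.9 below and plots the coefficient paths; as lambda increases, the coefficients are shrunk toward zero and the model becomes less flexible.

library(ISLR2)
library(glmnet)
x.demo <- model.matrix(Apps ~ ., data = College)[, -1]
y.demo <- College$Apps
#alpha = 1 gives the lasso; glmnet computes coefficients over a whole path of lambdas
lasso.demo <- glmnet(x.demo, y.demo, alpha = 1)
#each curve is one coefficient, plotted against log(lambda); the coefficients shrink
#toward (and eventually exactly to) zero as lambda grows
plot(lasso.demo, xvar = "lambda", label = TRUE)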

6.9

library(ISLR2)
head(College)
##                              Private Apps Accept Enroll Top10perc Top25perc
## Abilene Christian University     Yes 1660   1232    721        23        52
## Adelphi University               Yes 2186   1924    512        16        29
## Adrian College                   Yes 1428   1097    336        22        50
## Agnes Scott College              Yes  417    349    137        60        89
## Alaska Pacific University        Yes  193    146     55        16        44
## Albertson College                Yes  587    479    158        38        62
##                              F.Undergrad P.Undergrad Outstate Room.Board Books
## Abilene Christian University        2885         537     7440       3300   450
## Adelphi University                  2683        1227    12280       6450   750
## Adrian College                      1036          99    11250       3750   400
## Agnes Scott College                  510          63    12960       5450   450
## Alaska Pacific University            249         869     7560       4120   800
## Albertson College                    678          41    13500       3335   500
##                              Personal PhD Terminal S.F.Ratio perc.alumni Expend
## Abilene Christian University     2200  70       78      18.1          12   7041
## Adelphi University               1500  29       30      12.2          16  10527
## Adrian College                   1165  53       66      12.9          30   8735
## Agnes Scott College               875  92       97       7.7          37  19016
## Alaska Pacific University        1500  76       72      11.9           2  10922
## Albertson College                 675  67       73       9.4          11   9727
##                              Grad.Rate
## Abilene Christian University        60
## Adelphi University                  56
## Adrian College                      54
## Agnes Scott College                 59
## Alaska Pacific University           15
## Albertson College                   55
#a
#Creating train and test
sam.college <- sample(1:nrow(College), nrow(College)*.8)

train.col <- College[sam.college,]
test.col <- College[-sam.college,]

#b
#OLS
ols.col <- lm(Apps~., data=train.col)

pred.col <- predict(ols.col, test.col, type='response')

error.ols <- mean((pred.col-test.col$Apps)^2)
#c
#RIDGE
library(glmnet)
set.seed(1)
grid <- 10^seq(10,-2,length=100)

train.mat <- model.matrix(Apps~., data=train.col)[,-1]
test.mat <- model.matrix(Apps~., data=test.col)[,-1]

cv <- cv.glmnet(train.mat, train.col$Apps, lambda = grid,  alpha=0)

bestlambda <-cv$lambda.min

ridge.lm <- glmnet(train.mat, train.col$Apps, alpha=0)

ridge.pred <- predict(ridge.lm, newx=test.mat, s=bestlambda)

error.ridge <- mean((test.col$Apps-ridge.pred)^2)

#d
#LASSO
set.seed(1)

cv2 <- cv.glmnet(train.mat, train.col$Apps, alpha=1, lambda=grid)
bestlambda2 <- cv2$lambda.min

lasso.lm <- glmnet(train.mat, train.col$Apps, alpha=1, lambda = grid)

lasso.pred <- predict(lasso.lm, newx=test.mat, s=bestlambda2)

error.lasso <- mean((test.col$Apps-lasso.pred)^2)
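#optional check: put the three test MSEs side by side for comparison
c(OLS = error.ols, Ridge = error.ridge, Lasso = error.lasso)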

8.4

On attached note page

8.7

In the lab, we applied random forests to the Boston data using mtry = 6 and using ntree = 25 and ntree = 500. Create a plot displaying the test error resulting from random forests on this data set for a more comprehensive range of values for mtry and ntree. You can model your plot after Figure 8.10. Describe the results obtained.

library(randomForest)
set.seed(1)
bos.sam <- sample(1:nrow(Boston), nrow(Boston)*.8)

#the predictors are everything except the response medv, selected by name so the
#column position does not matter (abv.med from 4.16 is also dropped if present)
pred.cols <- setdiff(names(Boston), c("medv", "abv.med"))
x.train.bos <- Boston[bos.sam, pred.cols]
x.test.bos <- Boston[-bos.sam, pred.cols]
y.train.bos <- Boston$medv[bos.sam]
y.test.bos <- Boston$medv[-bos.sam]

#number of predictors
p <- ncol(x.train.bos)
#randomforest with mtry = p (bagging)
randomFor1 <- randomForest(x.train.bos, y.train.bos, x.test.bos, y.test.bos, mtry = p, ntree=500)
#randomforest with mtry = p/2
randomFor2 <- randomForest(x.train.bos, y.train.bos, x.test.bos, y.test.bos, mtry = p/2, ntree=500)
#randomforest with mtry = sqrt(p)
randomFor3 <- randomForest(x.train.bos, y.train.bos, x.test.bos, y.test.bos, mtry = sqrt(p), ntree=500)

plot(1:500, randomFor1$test$mse, col='yellow', type='l', xlab='Number of Trees', ylab='Test MSE',
     ylim=range(randomFor1$test$mse, randomFor2$test$mse, randomFor3$test$mse))
lines(1:500, randomFor2$test$mse, col='red')
lines(1:500, randomFor3$test$mse, col='purple')
legend('topright', legend=c('mtry = p', 'mtry = p/2', 'mtry = sqrt(p)'),
       col=c('yellow', 'red', 'purple'), lty=1)

8.9

  1. There are 9 terminal nodes and the training misclassification error rate is .1588.

  2. One terminal node corresponds to the split SpecialCH < 0.5. There are 64 observations in this node and its deviance is 51.98.

  3. The first split is at LoyalCH below roughly 0.5, and the next two internal nodes also split on LoyalCH, so I am under the impression this variable is very important for the fitted tree. PriceDiff and WeekofPurchase are the only other variables used in the internal nodes, so these are likely the next most important variables.

  4. The test error rate is .19.

  5. The plot of tree size against cross-validated deviance shows that the deviance is lowest for a tree with 6 terminal nodes.

  6. So the tree size corresponding to the lowest cross-validated classification error rate is 6 terminal nodes.

  7. The pruned tree has a training misclassification rate of .1962, while the unpruned tree's is .1588.

  8. Based on the confusion matrices, the unpruned tree has a test error rate of about .185 and the pruned tree about .178, so the two perform very similarly on the test set, with the pruned tree doing marginally better.

library(ISLR2)
library(tree)
summary(OJ)
##  Purchase WeekofPurchase     StoreID        PriceCH         PriceMM     
##  CH:653   Min.   :227.0   Min.   :1.00   Min.   :1.690   Min.   :1.690  
##  MM:417   1st Qu.:240.0   1st Qu.:2.00   1st Qu.:1.790   1st Qu.:1.990  
##           Median :257.0   Median :3.00   Median :1.860   Median :2.090  
##           Mean   :254.4   Mean   :3.96   Mean   :1.867   Mean   :2.085  
##           3rd Qu.:268.0   3rd Qu.:7.00   3rd Qu.:1.990   3rd Qu.:2.180  
##           Max.   :278.0   Max.   :7.00   Max.   :2.090   Max.   :2.290  
##      DiscCH            DiscMM         SpecialCH        SpecialMM     
##  Min.   :0.00000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.00000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :0.00000   Median :0.0000   Median :0.0000   Median :0.0000  
##  Mean   :0.05186   Mean   :0.1234   Mean   :0.1477   Mean   :0.1617  
##  3rd Qu.:0.00000   3rd Qu.:0.2300   3rd Qu.:0.0000   3rd Qu.:0.0000  
##  Max.   :0.50000   Max.   :0.8000   Max.   :1.0000   Max.   :1.0000  
##     LoyalCH          SalePriceMM     SalePriceCH      PriceDiff       Store7   
##  Min.   :0.000011   Min.   :1.190   Min.   :1.390   Min.   :-0.6700   No :714  
##  1st Qu.:0.325257   1st Qu.:1.690   1st Qu.:1.750   1st Qu.: 0.0000   Yes:356  
##  Median :0.600000   Median :2.090   Median :1.860   Median : 0.2300            
##  Mean   :0.565782   Mean   :1.962   Mean   :1.816   Mean   : 0.1465            
##  3rd Qu.:0.850873   3rd Qu.:2.130   3rd Qu.:1.890   3rd Qu.: 0.3200            
##  Max.   :0.999947   Max.   :2.290   Max.   :2.090   Max.   : 0.6400            
##    PctDiscMM        PctDiscCH       ListPriceDiff       STORE      
##  Min.   :0.0000   Min.   :0.00000   Min.   :0.000   Min.   :0.000  
##  1st Qu.:0.0000   1st Qu.:0.00000   1st Qu.:0.140   1st Qu.:0.000  
##  Median :0.0000   Median :0.00000   Median :0.240   Median :2.000  
##  Mean   :0.0593   Mean   :0.02731   Mean   :0.218   Mean   :1.631  
##  3rd Qu.:0.1127   3rd Qu.:0.00000   3rd Qu.:0.300   3rd Qu.:3.000  
##  Max.   :0.4020   Max.   :0.25269   Max.   :0.440   Max.   :4.000
#a
set.seed(4)
oj.sam <- sample(1:nrow(OJ), 800)

train.oj <- OJ[oj.sam,]
test.oj <- OJ[-oj.sam,]

#b 
tree1 <- tree(Purchase~., data=train.oj)
summary(tree1)
## 
## Classification tree:
## tree(formula = Purchase ~ ., data = train.oj)
## Variables actually used in tree construction:
## [1] "LoyalCH"        "PriceDiff"      "WeekofPurchase"
## Number of terminal nodes:  9 
## Residual mean deviance:  0.7617 = 602.5 / 791 
## Misclassification error rate: 0.1588 = 127 / 800
#c

tree1
## node), split, n, deviance, yval, (yprob)
##       * denotes terminal node
## 
##  1) root 800 1062.000 CH ( 0.62125 0.37875 )  
##    2) LoyalCH < 0.5036 350  430.900 MM ( 0.30571 0.69429 )  
##      4) LoyalCH < 0.276142 165  140.400 MM ( 0.15152 0.84848 )  
##        8) LoyalCH < 0.0356415 52    9.883 MM ( 0.01923 0.98077 ) *
##        9) LoyalCH > 0.0356415 113  116.900 MM ( 0.21239 0.78761 ) *
##      5) LoyalCH > 0.276142 185  254.100 MM ( 0.44324 0.55676 )  
##       10) PriceDiff < 0.05 77   83.740 MM ( 0.23377 0.76623 ) *
##       11) PriceDiff > 0.05 108  146.000 CH ( 0.59259 0.40741 )  
##         22) WeekofPurchase < 247.5 40   52.930 MM ( 0.37500 0.62500 ) *
##         23) WeekofPurchase > 247.5 68   80.570 CH ( 0.72059 0.27941 ) *
##    3) LoyalCH > 0.5036 450  353.400 CH ( 0.86667 0.13333 )  
##      6) PriceDiff < -0.39 24   28.970 MM ( 0.29167 0.70833 ) *
##      7) PriceDiff > -0.39 426  278.700 CH ( 0.89906 0.10094 )  
##       14) LoyalCH < 0.764572 170  170.100 CH ( 0.80000 0.20000 )  
##         28) PriceDiff < 0.265 98  120.700 CH ( 0.69388 0.30612 ) *
##         29) PriceDiff > 0.265 72   30.900 CH ( 0.94444 0.05556 ) *
##       15) LoyalCH > 0.764572 256   77.940 CH ( 0.96484 0.03516 ) *
#d
plot(tree1)
text(tree1)

#e
tree.pred <- predict(tree1, test.oj, type='class')
table(tree.pred, test.oj$Purchase)
##          
## tree.pred  CH  MM
##        CH 137  31
##        MM  19  83
test.error.oj <- (19+31)/270
test.error.oj
## [1] 0.1851852
#f
oj.cv<- cv.tree(tree1)
oj.cv
## $size
## [1] 9 8 7 6 5 4 3 2 1
## 
## $dev
## [1]  726.8698  729.0749  737.0488  721.5666  743.3588  764.6417  779.0622
## [8]  784.3296 1063.4077
## 
## $k
## [1]      -Inf  12.50359  13.60942  18.51157  24.33767  30.64452  36.50611
## [8]  45.70812 277.17120
## 
## $method
## [1] "deviance"
## 
## attr(,"class")
## [1] "prune"         "tree.sequence"
#g

plot(oj.cv$size, oj.cv$dev, type='b')
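#optional check: the tree size with the lowest cross-validated deviance
oj.cv$size[which.min(oj.cv$dev)]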

#i

prune <- prune.tree(tree1, best=5)
summary(prune)
## 
## Classification tree:
## snip.tree(tree = tree1, nodes = c(4L, 14L, 5L))
## Variables actually used in tree construction:
## [1] "LoyalCH"   "PriceDiff"
## Number of terminal nodes:  5 
## Residual mean deviance:  0.8446 = 671.5 / 795 
## Misclassification error rate: 0.1962 = 157 / 800
plot(prune)
text(prune)

summary(prune)
## 
## Classification tree:
## snip.tree(tree = tree1, nodes = c(4L, 14L, 5L))
## Variables actually used in tree construction:
## [1] "LoyalCH"   "PriceDiff"
## Number of terminal nodes:  5 
## Residual mean deviance:  0.8446 = 671.5 / 795 
## Misclassification error rate: 0.1962 = 157 / 800
summary(tree1)
## 
## Classification tree:
## tree(formula = Purchase ~ ., data = train.oj)
## Variables actually used in tree construction:
## [1] "LoyalCH"        "PriceDiff"      "WeekofPurchase"
## Number of terminal nodes:  9 
## Residual mean deviance:  0.7617 = 602.5 / 791 
## Misclassification error rate: 0.1588 = 127 / 800
#k 

pred.prune <- predict(prune, test.oj, type='class')
table(pred.prune, test.oj$Purchase)
##           
## pred.prune  CH  MM
##         CH 125  17
##         MM  31  97
prune.test.error <- (17+31)/270
prune.test.error
## [1] 0.1777778

8.10

For complete transparency, I used an online source to help guide my approach to this problem.

  1. The test error for boosting is significantly lower than my test errors for linear regression and ridge regression. My boosting test error was about .000048, while the linear model's error was 1.4286 and the ridge error was .125.

  2. Using the summary() function, I believe CAtBat and CWalks are the most important variables in this data set.

  3. The test error rate for bagging is about .00026, which is not lower than the boosting error.

#a
library(ISLR2)
library(gbm)
Hitters <- na.omit(Hitters)
Hitters$Log.Salary <- log(Hitters$Salary)
#dropping the untransformed Salary so it is not used as a predictor of its own log
Hitters$Salary <- NULL
#b

train.first <- 1:200
train.hitters <- Hitters[train.first,]
test.hitters <- Hitters[-train.first,]

#c
set.seed(1)
sequence1 = seq(-50, 0, 0.1)
lambda = 10^sequence1
train.err.hitters = rep(NA, length(lambda))
for (i in 1:length(lambda)) {
    boost.hitters = gbm(Log.Salary ~ ., data = train.hitters, distribution = "gaussian", n.trees = 1000, shrinkage = lambda[i])
    pred.train.hitt = predict(boost.hitters, train.hitters, n.trees = 1000)
    train.err.hitters[i] = mean((pred.train.hitt - train.hitters$Log.Salary)^2)
}
plot(lambda, train.err.hitters, type = "b")
min(train.err.hitters)
## [1] 1.831462e-05
#d 
test.err.hitters = rep(NA, length(lambda))
for (i in 1:length(lambda)) {
    boost.hitters = gbm(Log.Salary ~ ., data = train.hitters, distribution = "gaussian", n.trees = 1000, shrinkage = lambda[i])
    pred.test.hitt = predict(boost.hitters, test.hitters, n.trees = 1000)
    test.err.hitters[i] = mean((pred.test.hitt - test.hitters$Log.Salary)^2)
}
plot(lambda, test.err.hitters, type = "b")

min(test.err.hitters)
## [1] 4.818151e-05
#e
#Linear Regression

fit.lin <- lm(Log.Salary~., data=train.hitters)
pred.lin <- predict(fit.lin, test.hitters)
lin.error <- mean((pred.lin-test.hitters$Log.Salary)^2)

#Ridge Regression
library(glmnet)
set.seed(1)
grid2 <- 10^seq(10,-2,length=100)

train.mat.hit <- model.matrix(Log.Salary~., data=train.hitters)[,-1]
test.mat.hit <- model.matrix(Log.Salary~., data=test.hitters)[,-1]

cv.hit <- cv.glmnet(train.mat.hit, train.hitters$Log.Salary, lambda = grid2,  alpha=0)

bestlambda.hit <-cv.hit$lambda.min

ridge.hitters <- glmnet(train.mat.hit, train.hitters$Log.Salary, alpha=0)

ridge.pred.hit <- predict(ridge.hitters, newx=test.mat.hit, s=bestlambda.hit)

error.ridge.hit <- mean((test.hitters$Log.Salary-ridge.pred.hit)^2)


lin.error
## [1] 1.428653
error.ridge.hit
## [1] 0.1250386
min(test.err.hitters)
## [1] 4.818151e-05
#f

boost.hitters5 = gbm(Log.Salary ~ ., data = train.hitters, distribution = "gaussian", n.trees = 1000, shrinkage = 1)

summary(boost.hitters5)

##                 var     rel.inf
## Salary       Salary 82.52115854
## PutOuts     PutOuts  2.99797625
## CRBI           CRBI  1.86997520
## CWalks       CWalks  1.79695916
## Assists     Assists  1.55546329
## CHmRun       CHmRun  1.44822937
## Runs           Runs  1.03854732
## HmRun         HmRun  0.88668249
## RBI             RBI  0.86037802
## Walks         Walks  0.85860019
## CRuns         CRuns  0.75103531
## AtBat         AtBat  0.70048810
## Hits           Hits  0.66170670
## CHits         CHits  0.58410577
## Errors       Errors  0.53694500
## CAtBat       CAtBat  0.46878339
## Years         Years  0.31468036
## Division   Division  0.08273306
## League       League  0.05478492
## NewLeague NewLeague  0.01076756
#g
library(randomForest)
hitters.bag <- randomForest(Log.Salary~., data=train.hitters, mtry=19, ntree=500)

y.bag <- predict(hitters.bag, newdata=test.hitters)
mean((y.bag - test.hitters$Log.Salary)^2)
## [1] 0.0002582842