The accuracy of logistic regression was 75%, the accuracy of LDA was 87%, the accuracy of naive Bayes was 83%, and KNN had an accuracy of 84%. This means that LDA was the most accurate approach for predicting whether or not a district was above the median crime level.
library(tidyverse)
library(ISLR2)
#created a variable that says whether the crime rate for this neighborhood is above the median for the data set
Boston$abv.med <- ifelse(Boston$crim > median(Boston$crim), 1, 0)
#LOGISTIC REGRESSION
#setting seed
set.seed(1)
#creating a training index using 80% of the 506 observations in the Boston data
train.sample <- sample(1:nrow(Boston), nrow(Boston)*.8)
#assigning my training data to the sample I just created
train <- Boston[train.sample,]
#assigning my test data to be all the data not in training data
test <- Boston[-train.sample,]
#fitting a regression for the training data
fit1 <- glm(abv.med~ zn+indus+chas+nox+rm+age+dis+rad+tax+medv, family=binomial, data=train)
#predicting testing data for my first regression
test.pred1 <- predict(fit1, test, type='response')
#classifying a prediction as 'above' when the predicted probability of being above the median exceeds .5
pred.word <- ifelse(test.pred1 > .5, 'above', 'below')
#changing my actual data to be words
test.word <- ifelse(test$abv.med == 1, 'above', 'below')
table1 <- table(pred.word, test.word)
logistic.test.accuracy1 <- mean(pred.word == test.word)
#LINEAR DISCRIMINANT ANALYSIS
set.seed(1)
library(MASS)
lda.1 <- lda(abv.med~ zn+indus+chas+nox+rm+age+dis+rad+tax+medv, data=train)
lda.1
## Call:
## lda(abv.med ~ zn + indus + chas + nox + rm + age + dis + rad +
## tax + medv, data = train)
##
## Prior probabilities of groups:
## 0 1
## 0.5 0.5
##
## Group means:
## zn indus chas nox rm age dis rad
## 0 21.631188 7.165396 0.05940594 0.4704243 6.432842 51.33861 5.036025 4.148515
## 1 1.108911 15.358267 0.08910891 0.6356634 6.172574 85.71931 2.532918 15.099010
## tax medv
## 0 309.4109 25.28960
## 1 513.8812 20.43119
##
## Coefficients of linear discriminants:
## LD1
## zn -0.008128609
## indus 0.020261639
## chas -0.078649361
## nox 7.974579455
## rm 0.131062113
## age 0.012573800
## dis 0.085898482
## rad 0.094130945
## tax -0.001517165
## medv 0.021404207
lda.pred <- predict(lda.1, test)
lda.class <- lda.pred$class
lda.word <- ifelse(lda.class == 1, 'above', 'below')
table(lda.word, test.word)
## test.word
## lda.word above below
## above 40 1
## below 11 50
lda.test.acc <- mean(lda.word ==test.word)
#NAIVE BAYES
set.seed(1)
library(e1071)
nb.1 <- naiveBayes(abv.med~ zn+indus+nox+rm+age+dis+rad+tax+medv, data=train)
nb.pred <- predict(nb.1, test)
nb.word <- ifelse(nb.pred == 1, "above", "below")
nb.table <- table(nb.pred, test.word)
nb.table
## test.word
## nb.pred above below
## 0 15 45
## 1 36 6
#KNN
library(class)
set.seed(1)
sam.knn <- sample(1:nrow(Boston), nrow(Boston)*.5)
#building predictor matrices: drop abv.med (the response) and crim, which defines it
knn.vars <- setdiff(names(Boston), c("crim", "abv.med"))
knn.train <- as.matrix(Boston[sam.knn, knn.vars])
knn.test <- as.matrix(Boston[-sam.knn, knn.vars])
knn.target <- Boston$abv.med[sam.knn]
knn.test.target <- Boston$abv.med[-sam.knn]
knn <- knn(knn.train, knn.test, cl=knn.target, k=10)
mean(knn == knn.test.target)
## [1] 0.8537549
In k-fold cross-validation you pick a value k, which is the number of groups (folds) the data set is split into. For example, if k = 10 and your data set has 100 observations, you randomly divide the observations into 10 folds of 10. You then fit the model k times, each time holding out one fold as the test set and training on the remaining k - 1 folds. Finally, you compute the test MSE on each held-out fold and average the k estimates to see how well the model does on average.
The validation set approach can lead to an overestimate of the test error rate because the model is trained on only half of the observations, whereas k-fold cross-validation trains on k - 1 of the k folds for each fit, so each model sees most of the data.
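As a concrete sketch of the k-fold procedure just described, here is a 10-fold estimate for the income + balance logistic model on Default using boot::cv.glm(); the cost function is an illustrative choice for turning predicted probabilities into a misclassification rate, not something specified in the assignment.
library(ISLR2)
library(boot)
Default$binary <- ifelse(Default$default == 'No', 0, 1)
#fit once on the full data; cv.glm() handles the 10-fold splitting, refitting, and averaging
cv.fit <- glm(binary ~ income + balance, data = Default, family = binomial)
#cost() converts each fold's predicted probabilities into a misclassification rate
cost <- function(actual, prob) mean(abs(actual - prob) > 0.5)
cv.err <- cv.glm(Default, cv.fit, cost = cost, K = 10)
cv.err$delta[1]  #average of the 10 held-out error estimates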
All of my accuracies across the three different splits were nearly identical. The largest difference was .0026, which is extremely small, so after splitting the data multiple times the accuracy stays at roughly the same level.
Adding a dummy variable for student did not seem to help the accuracy of the model. The accuracy actually went down slightly, but in the bigger picture the two models are likely no different from each other.
library(ISLR2)
set.seed(1)
train.sam <- sample(1:nrow(Default), nrow(Default)*.5)
Default$binary <- ifelse(Default$default == 'No', 0, 1)
#i
train2 <- Default[train.sam,]
test2 <- Default[-train.sam,]
#ii
fit2 <- glm(binary~income+balance, data=train2, family=binomial)
pred2 <- predict(fit2, test2, type='response')
pred2.bin<- ifelse(pred2 >.5, 1, 0)
acc2 <- mean(test2$binary == pred2.bin)
#iii
test.error2 <- mean((test2$binary - pred2.bin)^2)
acc2
## [1] 0.9746
table(test2$binary, pred2.bin)
## pred2.bin
## 0 1
## 0 4824 19
## 1 108 49
#PART C
set.seed(2)
train.sam2 <- sample(1:nrow(Default), nrow(Default)*.5)
Default$binary <- ifelse(Default$default == 'No', 0, 1)
train3 <- Default[train.sam2,]
test3 <- Default[-train.sam2,]
fit3 <- glm(binary~income+balance, data=train3, family=binomial)
pred3 <- predict(fit3, test3, type='response')
pred3.bin<- ifelse(pred3 >.5, 1, 0)
acc3 <- mean(test3$binary == pred3.bin)
acc3
## [1] 0.9762
test.error3 <- mean((test3$binary - pred3.bin)^2)
test.error3
## [1] 0.0238
table(test3$binary, pred3.bin)
## pred3.bin
## 0 1
## 0 4819 18
## 1 101 62
set.seed(3)
train.sam3 <- sample(1:nrow(Default), nrow(Default)*.5)
train4 <- Default[train.sam3,]
test4 <- Default[-train.sam3,]
fit4 <- glm(binary~income+balance, data=train4, family=binomial)
pred4 <- predict(fit4, test4, type='response')
pred4.bin<- ifelse(pred4 >.5, 1, 0)
acc4 <- mean(test4$binary == pred4.bin)
acc4
## [1] 0.9736
table(test4$binary, pred4.bin)
## pred4.bin
## 0 1
## 0 4822 23
## 1 109 46
#PART D
#creating a binary for student
Default$stbin <- ifelse(Default$student == "Yes", 1, 0)
train.sam4 <- sample(1:nrow(Default), nrow(Default)*.5)
train5 <- Default[train.sam4,]
test5 <- Default[-train.sam4,]
fit5 <- glm(binary~income+balance+stbin, data=train5, family=binomial)
pred5 <- predict(fit5, test5, type='response')
pred5.bin <- ifelse(pred5 > .5, 1, 0)
acc5 <- mean(test5$binary == pred5.bin)
Our estimate of the population mean of medv (uhat) was 22.5328.
The standard error of the mean of medv is .4088611, which measures how much the sample mean would typically vary from sample to sample. Adding and subtracting two standard errors from the mean gives an approximate 95% confidence interval for the mean. Since the standard error is much smaller than the mean of 22.53, the estimate looks reliable.
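Plugging the reported numbers into that two-standard-error rule gives roughly the same interval that the t-test produces below:
22.5328 + c(-2, 2) * 0.4089  #roughly 21.7 to 23.4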
The bootstrap gives a standard error of .4106622, while the formula-based estimate above gave .4088611, so the two are within about .002 of one another.
The t.test() function gave an interval of 21.729 to 23.336, while our bootstrap interval was roughly 21.7 to 23.4, so the two are very close.
Our estimated median of medv was 21.2.
The standard error I received from the bootstrap was .377.
The tenth percentile of medv is 12.75.
The standard error is .49. Since the standard error is much smaller than our estimate, a confidence interval around 12.75 would remain well above 10 at any reasonable confidence level.
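Applying the same two-standard-error rule to the tenth-percentile estimate:
12.75 + c(-2, 2) * 0.49  #roughly 11.8 to 13.7, comfortably above 10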
library(ISLR2)
library(boot)
#a
uhat <- mean(Boston$medv)
#b
ste.uhat <- (sd(Boston$medv))/sqrt(nrow(Boston))
#c
set.seed(1)
ste.fn <- function(data, index) {
u <- mean(data[index])
return(u)
}
ste.fn(Boston$medv, 1:506)
## [1] 22.53281
bootstrap <- boot(Boston$medv,ste.fn,1000)
bootstrap
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = Boston$medv, statistic = ste.fn, R = 1000)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* 22.53281 0.007650791 0.4106622
#d
CI <- round(c(mean(bootstrap$t) - 2*sd(bootstrap$t), mean(bootstrap$t) + 2*sd(bootstrap$t)),3)
t.test(Boston$medv)
##
## One Sample t-test
##
## data: Boston$medv
## t = 55.111, df = 505, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## 21.72953 23.33608
## sample estimates:
## mean of x
## 22.53281
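As an optional cross-check on the hand-built interval above, boot.ci() computes a percentile interval directly from the stored bootstrap replicates:
#percentile bootstrap confidence interval from the 1000 replicates in the bootstrap object
boot.ci(bootstrap, type = "perc")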
#e
med.u <- median(Boston$medv)
#f
med.fn <- function(data, index){
return(median(data[index]))
}
bootstrap2 <- boot(Boston$medv, med.fn, 1000)
bootstrap2
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = Boston$medv, statistic = med.fn, R = 1000)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* 21.2 -0.0386 0.3770241
#g
u.1 <- quantile(Boston$medv, .1)
#h
u.1fn <- function(data, index){
return(quantile(data[index], c(.1)))
}
bootstrap3 <- boot(Boston$medv, u.1fn, 1000)
bootstrap3
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = Boston$medv, statistic = u.1fn, R = 1000)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* 12.75 0.0186 0.4925766
iii is correct. The lasso behaves like the ridge method in that as lambda increases the model becomes less flexible, which decreases variance and increases bias. So, as long as the increase in bias does not overtake the decrease in variance, it has an advantage over OLS.
iii is correct. As lambda increases the model becomes less flexible, so the first part of the statement holds. Less flexibility in turn leads to decreased variance but increased bias; the graphs on page 240 of the book illustrate this well. So, as long as the increase in bias is not larger than the decrease in variance, the model will have an advantage over OLS.
ii is correct. Non-linear methods are inherently more flexible than linear methods, so as long as the decrease in bias is not outweighed by the increase in variance, they will give better prediction accuracy than least squares.
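To make the flexibility point above concrete, the lasso coefficient path can be plotted directly; this is just an illustrative sketch reusing glmnet and the College data from the next part, not something the exercise asks for.
library(ISLR2)
library(glmnet)
x.all <- model.matrix(Apps ~ ., data = College)[, -1]
y.all <- College$Apps
#fit the lasso over its default lambda path
lasso.path <- glmnet(x.all, y.all, alpha = 1)
#each curve is one coefficient; as lambda grows they shrink to exactly zero (a less flexible fit)
plot(lasso.path, xvar = "lambda", label = TRUE)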
library(ISLR2)
head(College)
## Private Apps Accept Enroll Top10perc Top25perc
## Abilene Christian University Yes 1660 1232 721 23 52
## Adelphi University Yes 2186 1924 512 16 29
## Adrian College Yes 1428 1097 336 22 50
## Agnes Scott College Yes 417 349 137 60 89
## Alaska Pacific University Yes 193 146 55 16 44
## Albertson College Yes 587 479 158 38 62
## F.Undergrad P.Undergrad Outstate Room.Board Books
## Abilene Christian University 2885 537 7440 3300 450
## Adelphi University 2683 1227 12280 6450 750
## Adrian College 1036 99 11250 3750 400
## Agnes Scott College 510 63 12960 5450 450
## Alaska Pacific University 249 869 7560 4120 800
## Albertson College 678 41 13500 3335 500
## Personal PhD Terminal S.F.Ratio perc.alumni Expend
## Abilene Christian University 2200 70 78 18.1 12 7041
## Adelphi University 1500 29 30 12.2 16 10527
## Adrian College 1165 53 66 12.9 30 8735
## Agnes Scott College 875 92 97 7.7 37 19016
## Alaska Pacific University 1500 76 72 11.9 2 10922
## Albertson College 675 67 73 9.4 11 9727
## Grad.Rate
## Abilene Christian University 60
## Adelphi University 56
## Adrian College 54
## Agnes Scott College 59
## Alaska Pacific University 15
## Albertson College 55
#a
#Creating train and test
sam.college <- sample(1:nrow(College), nrow(College)*.8)
train.col <- College[sam.college,]
test.col <- College[-sam.college,]
#b
#OLS
ols.col <- lm(Apps~., data=train.col)
pred.col <- predict(ols.col, test.col, type='response')
error.ols <- mean((pred.col-test.col$Apps)^2)
#c
#RIDGE
library(glmnet)
set.seed(1)
grid <- 10^seq(10,-2,length=100)
train.mat <- model.matrix(Apps~., data=train.col)[,-1]
test.mat <- model.matrix(Apps~., data=test.col)[,-1]
cv <- cv.glmnet(train.mat, train.col$Apps, lambda = grid, alpha=0)
bestlambda <-cv$lambda.min
ridge.lm <- glmnet(train.mat, train.col$Apps, alpha=0)
ridge.pred <- predict(ridge.lm, newx=test.mat, s=bestlambda)
error.ridge <- mean((test.col$Apps-ridge.pred)^2)
#d
#LASSO
set.seed(1)
cv2 <- cv.glmnet(train.mat, train.col$Apps, alpha=1, lambda=grid)
bestlambda2 <- cv2$lambda.min
lasso.lm <- glmnet(train.mat, train.col$Apps, alpha=1, lambda = grid)
lasso.pred <- predict(lasso.lm, newx=test.mat, s=bestlambda2)
error.lasso <- mean((test.col$Apps-lasso.pred)^2)
On attached note page
In the lab, we applied random forests to the Boston data using mtry = 6 and using ntree = 25 and ntree = 500. Create a plot displaying the test error resulting from random forests on this data set for a more comprehensive range of values for mtry and ntree. You can model your plot after Figure 8.10. Describe the results obtained.
library(randomForest)
set.seed(1)
bos.sam <- sample(1:nrow(Boston), nrow(Boston)*.8)
#predictors: every column except the response medv and the abv.med indicator created earlier
bos.vars <- setdiff(names(Boston), c("medv", "abv.med"))
x.train.bos <- Boston[bos.sam, bos.vars]
x.test.bos <- Boston[-bos.sam, bos.vars]
y.train.bos <- Boston$medv[bos.sam]
y.test.bos <- Boston$medv[-bos.sam]
p <- length(bos.vars)
#random forest with mtry = p (bagging)
randomFor1 <- randomForest(x.train.bos, y.train.bos, x.test.bos, y.test.bos, mtry = p, ntree = 500)
#random forest with mtry = p/2
randomFor2 <- randomForest(x.train.bos, y.train.bos, x.test.bos, y.test.bos, mtry = p/2, ntree = 500)
#random forest with mtry = sqrt(p)
randomFor3 <- randomForest(x.train.bos, y.train.bos, x.test.bos, y.test.bos, mtry = sqrt(p), ntree = 500)
plot(1:500, randomFor1$test$mse, type='l', col='yellow', xlab='Number of Trees', ylab='Test MSE',
     ylim=range(randomFor1$test$mse, randomFor2$test$mse, randomFor3$test$mse))
lines(1:500, randomFor2$test$mse, col='red')
lines(1:500, randomFor3$test$mse, col='purple')
legend('topright', legend=c('m = p', 'm = p/2', 'm = sqrt(p)'), col=c('yellow', 'red', 'purple'), lty=1)
There are 9 terminal nodes and the training misclassification error rate is .1588.
One terminal node, node 8), is reached when LoyalCH < 0.0356. It contains 52 observations with a deviance of 9.883, and about 98% of the purchases in it are MM, so the tree predicts MM for any observation that falls in this node.
The first split is on LoyalCH at about .5, and several of the internal nodes below it also split on LoyalCH, so I am under the impression this variable is very important for the classification results. PriceDiff and WeekofPurchase are the only other variables used in the tree, so these are likely the next most important.
The test error rate is about .185.
The optimal tree size is 6 terminal nodes because its cross-validated deviance is the lowest among the sizes considered.
The tree size with the lowest cross-validated error is 6 terminal nodes (deviance 721.57).
For the pruned tree I got a training misclassification rate of .1962, and the unpruned tree's was .1588.
Computed from the confusion matrices, the unpruned tree has a test error rate of about .185 and the pruned tree about .178, so the two perform very similarly on the test data, with the pruned tree slightly lower.
library(ISLR2)
library(tree)
summary(OJ)
## Purchase WeekofPurchase StoreID PriceCH PriceMM
## CH:653 Min. :227.0 Min. :1.00 Min. :1.690 Min. :1.690
## MM:417 1st Qu.:240.0 1st Qu.:2.00 1st Qu.:1.790 1st Qu.:1.990
## Median :257.0 Median :3.00 Median :1.860 Median :2.090
## Mean :254.4 Mean :3.96 Mean :1.867 Mean :2.085
## 3rd Qu.:268.0 3rd Qu.:7.00 3rd Qu.:1.990 3rd Qu.:2.180
## Max. :278.0 Max. :7.00 Max. :2.090 Max. :2.290
## DiscCH DiscMM SpecialCH SpecialMM
## Min. :0.00000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.00000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.00000 Median :0.0000 Median :0.0000 Median :0.0000
## Mean :0.05186 Mean :0.1234 Mean :0.1477 Mean :0.1617
## 3rd Qu.:0.00000 3rd Qu.:0.2300 3rd Qu.:0.0000 3rd Qu.:0.0000
## Max. :0.50000 Max. :0.8000 Max. :1.0000 Max. :1.0000
## LoyalCH SalePriceMM SalePriceCH PriceDiff Store7
## Min. :0.000011 Min. :1.190 Min. :1.390 Min. :-0.6700 No :714
## 1st Qu.:0.325257 1st Qu.:1.690 1st Qu.:1.750 1st Qu.: 0.0000 Yes:356
## Median :0.600000 Median :2.090 Median :1.860 Median : 0.2300
## Mean :0.565782 Mean :1.962 Mean :1.816 Mean : 0.1465
## 3rd Qu.:0.850873 3rd Qu.:2.130 3rd Qu.:1.890 3rd Qu.: 0.3200
## Max. :0.999947 Max. :2.290 Max. :2.090 Max. : 0.6400
## PctDiscMM PctDiscCH ListPriceDiff STORE
## Min. :0.0000 Min. :0.00000 Min. :0.000 Min. :0.000
## 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:0.140 1st Qu.:0.000
## Median :0.0000 Median :0.00000 Median :0.240 Median :2.000
## Mean :0.0593 Mean :0.02731 Mean :0.218 Mean :1.631
## 3rd Qu.:0.1127 3rd Qu.:0.00000 3rd Qu.:0.300 3rd Qu.:3.000
## Max. :0.4020 Max. :0.25269 Max. :0.440 Max. :4.000
#a
set.seed(4)
oj.sam <- sample(1:nrow(OJ), 800)
train.oj <- OJ[oj.sam,]
test.oj <- OJ[-oj.sam,]
#b
tree1 <- tree(Purchase~., data=train.oj)
summary(tree1)
##
## Classification tree:
## tree(formula = Purchase ~ ., data = train.oj)
## Variables actually used in tree construction:
## [1] "LoyalCH" "PriceDiff" "WeekofPurchase"
## Number of terminal nodes: 9
## Residual mean deviance: 0.7617 = 602.5 / 791
## Misclassification error rate: 0.1588 = 127 / 800
#c
tree1
## node), split, n, deviance, yval, (yprob)
## * denotes terminal node
##
## 1) root 800 1062.000 CH ( 0.62125 0.37875 )
## 2) LoyalCH < 0.5036 350 430.900 MM ( 0.30571 0.69429 )
## 4) LoyalCH < 0.276142 165 140.400 MM ( 0.15152 0.84848 )
## 8) LoyalCH < 0.0356415 52 9.883 MM ( 0.01923 0.98077 ) *
## 9) LoyalCH > 0.0356415 113 116.900 MM ( 0.21239 0.78761 ) *
## 5) LoyalCH > 0.276142 185 254.100 MM ( 0.44324 0.55676 )
## 10) PriceDiff < 0.05 77 83.740 MM ( 0.23377 0.76623 ) *
## 11) PriceDiff > 0.05 108 146.000 CH ( 0.59259 0.40741 )
## 22) WeekofPurchase < 247.5 40 52.930 MM ( 0.37500 0.62500 ) *
## 23) WeekofPurchase > 247.5 68 80.570 CH ( 0.72059 0.27941 ) *
## 3) LoyalCH > 0.5036 450 353.400 CH ( 0.86667 0.13333 )
## 6) PriceDiff < -0.39 24 28.970 MM ( 0.29167 0.70833 ) *
## 7) PriceDiff > -0.39 426 278.700 CH ( 0.89906 0.10094 )
## 14) LoyalCH < 0.764572 170 170.100 CH ( 0.80000 0.20000 )
## 28) PriceDiff < 0.265 98 120.700 CH ( 0.69388 0.30612 ) *
## 29) PriceDiff > 0.265 72 30.900 CH ( 0.94444 0.05556 ) *
## 15) LoyalCH > 0.764572 256 77.940 CH ( 0.96484 0.03516 ) *
#d
plot(tree1)
text(tree1)
#e
tree.pred <- predict(tree1, test.oj, type='class')
table(tree.pred, test.oj$Purchase)
##
## tree.pred CH MM
## CH 137 31
## MM 19 83
test.error.oj <- (19+31)/270
test.error.oj
## [1] 0.1851852
#f
oj.cv<- cv.tree(tree1)
oj.cv
## $size
## [1] 9 8 7 6 5 4 3 2 1
##
## $dev
## [1] 726.8698 729.0749 737.0488 721.5666 743.3588 764.6417 779.0622
## [8] 784.3296 1063.4077
##
## $k
## [1] -Inf 12.50359 13.60942 18.51157 24.33767 30.64452 36.50611
## [8] 45.70812 277.17120
##
## $method
## [1] "deviance"
##
## attr(,"class")
## [1] "prune" "tree.sequence"
#g
plot(oj.cv$size, oj.cv$dev, type='b')
#i
prune <- prune.tree(tree1, best=5)
summary(prune)
##
## Classification tree:
## snip.tree(tree = tree1, nodes = c(4L, 14L, 5L))
## Variables actually used in tree construction:
## [1] "LoyalCH" "PriceDiff"
## Number of terminal nodes: 5
## Residual mean deviance: 0.8446 = 671.5 / 795
## Misclassification error rate: 0.1962 = 157 / 800
plot(prune)
text(prune)
summary(prune)
##
## Classification tree:
## snip.tree(tree = tree1, nodes = c(4L, 14L, 5L))
## Variables actually used in tree construction:
## [1] "LoyalCH" "PriceDiff"
## Number of terminal nodes: 5
## Residual mean deviance: 0.8446 = 671.5 / 795
## Misclassification error rate: 0.1962 = 157 / 800
summary(tree1)
##
## Classification tree:
## tree(formula = Purchase ~ ., data = train.oj)
## Variables actually used in tree construction:
## [1] "LoyalCH" "PriceDiff" "WeekofPurchase"
## Number of terminal nodes: 9
## Residual mean deviance: 0.7617 = 602.5 / 791
## Misclassification error rate: 0.1588 = 127 / 800
#k
pred.prune <- predict(prune, test.oj, type='class')
table(pred.prune, test.oj$Purchase)
##
## pred.prune CH MM
## CH 125 17
## MM 31 97
prune.test.error <- (17+31)/270
prune.test.error
## [1] 0.1777778
For complete transparency I used an online source to help guide me with where to go on this problem.
The test error for boosting is far lower than my test errors for linear regression and ridge regression: boosting gave about .000048, while the linear model's error was 1.4286 and ridge regression's was .125.
Based on the summary() output, the untransformed Salary column (which was left in as a predictor) dominates the relative influence, and PutOuts, CRBI, and CWalks are the next most important variables in this data set.
The test MSE for bagging is about .00026, which is not lower than the boosting error.
#a
library(ISLR2)
library(gbm)
Hitters <- na.omit(Hitters)
Hitters$Log.Salary <- log(Hitters$Salary)
#b
train.first <- 1:200
train.hitters <- Hitters[train.first,]
test.hitters <- Hitters[-train.first,]
#c
set.seed(1)
sequence1 = seq(-50, 0, 0.1)
lambda = 10^sequence1
train.err.hitters = rep(NA, length(lambda))
for (i in 1:length(lambda)) {
boost.hitters = gbm(Log.Salary ~ ., data = train.hitters, distribution = "gaussian", n.trees = 1000, shrinkage = lambda[i])
pred.train.hitt = predict(boost.hitters, train.hitters, n.trees = 1000)
train.err.hitters[i] = mean((pred.train.hitt - train.hitters$Log.Salary)^2)
}
plot(lambda, train.err.hitters, type = "b")
min(train.err.hitters)
## [1] 1.831462e-05
#d
#test-set MSE for the same grid of shrinkage values
test.err.hitters = rep(NA, length(lambda))
for (i in 1:length(lambda)) {
boost.hitters = gbm(Log.Salary ~ ., data = train.hitters, distribution = "gaussian", n.trees = 1000, shrinkage = lambda[i])
pred.test.hitt = predict(boost.hitters, test.hitters, n.trees = 1000)
test.err.hitters[i] = mean((pred.test.hitt - test.hitters$Log.Salary)^2)
}
plot(lambda, test.err.hitters, type = "b")
min(test.err.hitters)
## [1] 4.818151e-05
#e
#Linear Regression
fit.lin <- lm(Log.Salary~., data=train.hitters)
pred.lin <- predict(fit.lin, test.hitters)
lin.error <- mean((pred.lin-test.hitters$Log.Salary)^2)
#Ridge regression
library(glmnet)
set.seed(1)
grid2 <- 10^seq(10,-2,length=100)
train.mat.hit <- model.matrix(Log.Salary~., data=train.hitters)[,-1]
test.mat.hit <- model.matrix(Log.Salary~., data=test.hitters)[,-1]
cv.hit <- cv.glmnet(train.mat.hit, train.hitters$Log.Salary, lambda = grid2, alpha=0)
bestlambda.hit <-cv.hit$lambda.min
ridge.hitters <- glmnet(train.mat.hit, train.hitters$Log.Salary, alpha=0)
ridge.pred.hit <- predict(ridge.hitters, newx=test.mat.hit, s=bestlambda.hit)
error.ridge.hit <- mean((test.hitters$Log.Salary-ridge.pred.hit)^2)
lin.error
## [1] 1.428653
error.ridge.hit
## [1] 0.1250386
min(test.err.hitters)
## [1] 4.818151e-05
#f
boost.hitters5 = gbm(Log.Salary ~ ., data = train.hitters, distribution = "gaussian", n.trees = 1000, shrinkage = 1)
summary(boost.hitters5)
## var rel.inf
## Salary Salary 82.52115854
## PutOuts PutOuts 2.99797625
## CRBI CRBI 1.86997520
## CWalks CWalks 1.79695916
## Assists Assists 1.55546329
## CHmRun CHmRun 1.44822937
## Runs Runs 1.03854732
## HmRun HmRun 0.88668249
## RBI RBI 0.86037802
## Walks Walks 0.85860019
## CRuns CRuns 0.75103531
## AtBat AtBat 0.70048810
## Hits Hits 0.66170670
## CHits CHits 0.58410577
## Errors Errors 0.53694500
## CAtBat CAtBat 0.46878339
## Years Years 0.31468036
## Division Division 0.08273306
## League League 0.05478492
## NewLeague NewLeague 0.01076756
#g
library(randomForest)
hitters.bag <- randomForest(Log.Salary~., data=train.hitters, mtry=19, ntree=500)
y.bag <- predict(hitters.bag, newdata=test.hitters)
mean((y.bag - test.hitters$Log.Salary)^2)
## [1] 0.0002582842