library(ISLR)
## Warning: package 'ISLR' was built under R version 3.3.3
# Prepare the Hitters data for modelling salary on the log scale.
hitters1 <- Hitters

# Drop League, Division and NewLeague by name (positions 14, 15 and 20 in the
# original code) so the selection does not depend on column order.
dropped_columns <- c("League", "Division", "NewLeague")
hitters2 <- hitters1[, setdiff(names(hitters1), dropped_columns)]

# Keep only rows with no missing values in the remaining columns.
hitters3 <- hitters2[complete.cases(hitters2), ]

# Create the response directly; no NA placeholder column is needed first.
hitters3$logSalary <- log(hitters3$Salary)

# Remove the raw Salary column so it cannot be used to predict logSalary.
hitters4 <- hitters3[, setdiff(names(hitters3), "Salary")]
library(caret)
## Warning: package 'caret' was built under R version 3.3.3
## Loading required package: lattice
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.3.3
# Reproducible split of hitters4: createDataPartition() is asked for p = 0.7
# of the rows (by logSalary) for training; the rest form the test set.
set.seed(12345)
data_partition <- createDataPartition(
  hitters4[["logSalary"]],
  p = .7,
  list = FALSE
)
training <- hitters4[data_partition, ]
testing <- hitters4[-data_partition, ]
The test-set R-squared of the linear regression is 0.4518.
# Ordinary least squares of logSalary on every other column of the training set.
LM1 <- lm(
  formula = logSalary ~ .,
  data = training
)
# Print the coefficient table and fit statistics.
summary(LM1)
##
## Call:
## lm(formula = logSalary ~ ., data = training)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.27147 -0.44874 0.01607 0.40936 2.79909
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.677e+00 1.959e-01 23.872 <2e-16 ***
## AtBat -3.931e-03 1.523e-03 -2.582 0.0107 *
## Hits 1.445e-02 5.813e-03 2.485 0.0139 *
## HmRun 9.406e-03 1.480e-02 0.636 0.5258
## Runs -3.868e-03 6.987e-03 -0.554 0.5805
## RBI 3.493e-03 6.463e-03 0.540 0.5896
## Walks 1.066e-02 4.492e-03 2.373 0.0188 *
## Years 4.177e-02 3.128e-02 1.335 0.1836
## CAtBat -2.807e-05 3.299e-04 -0.085 0.9323
## CHits 1.097e-03 1.745e-03 0.629 0.5302
## CHmRun 5.071e-04 3.912e-03 0.130 0.8970
## CRuns 8.409e-04 1.764e-03 0.477 0.6343
## CRBI -1.265e-03 1.822e-03 -0.694 0.4886
## CWalks -9.993e-04 8.293e-04 -1.205 0.2299
## PutOuts 4.154e-04 1.765e-04 2.355 0.0197 *
## Assists 1.052e-03 5.131e-04 2.051 0.0418 *
## Errors -1.647e-02 1.017e-02 -1.619 0.1074
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6188 on 168 degrees of freedom
## Multiple R-squared: 0.5713, Adjusted R-squared: 0.5305
## F-statistic: 13.99 on 16 and 168 DF, p-value: < 2.2e-16
# Predict the held-out rows with LM1 and report the squared correlation
# between predictions and observed logSalary.
predicted <- predict(object = LM1, newdata = testing)
RSquared_LM1Testing <- cor(x = predicted, y = testing[["logSalary"]])^2
RSquared_LM1Testing
## [1] 0.4518025
The test-set R-squared of the ridge regression is 0.4812.
library(ridge)
# Ridge regression via caret: 10-fold cross-validation, with predictors
# centered and scaled before fitting.
set.seed(12345)
RR1 <- train(
  logSalary ~ .,
  data = training,
  method = "ridge",
  trControl = trainControl(method = "cv", number = 10),
  preProcess = c("center", "scale")
)

# Squared correlation between test-set predictions and observed logSalary.
predicted_RR1 <- predict(object = RR1, newdata = testing)
RSquared_RR1 <- cor(x = predicted_RR1, y = testing[["logSalary"]])^2
RSquared_RR1
## [1] 0.4812084
The test-set R-squared of the lasso is 0.4686.
# Lasso via caret: 10-fold cross-validation, with predictors centered and
# scaled before fitting.
set.seed(12345)
LASSO1 <- train(
  logSalary ~ .,
  data = training,
  method = "lasso",
  trControl = trainControl(method = "cv", number = 10),
  preProcess = c("center", "scale")
)

# Squared correlation between test-set predictions and observed logSalary.
predicted_LASSO1 <- predict(object = LASSO1, newdata = testing)
RSquared_LASSO1 <- cor(x = predicted_LASSO1, y = testing[["logSalary"]])^2
RSquared_LASSO1
## [1] 0.4686052
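The variables that were removed (coefficients shrunk to exactly zero by the lasso) are Runs, CAtBat and CRBI. All other predictors (AtBat, Hits, HmRun, RBI, Walks, Years, CHits, CHmRun, CRuns, CWalks, PutOuts, Assists and Errors) keep non-zero coefficients.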
# Coefficients of the final lasso model at the cross-validated fraction.
# Call the generic predict() and let S3 dispatch pick the method for the
# fitted object instead of naming predict.enet() directly.
# The model was trained with preProcess = c("center", "scale"), so these
# coefficients refer to the centered and scaled predictors.
predict(
  LASSO1$finalModel,
  type = "coefficients",
  s = LASSO1$bestTune$fraction,
  mode = "fraction"
)$coefficients
## AtBat Hits HmRun Runs RBI Walks
## -0.26718587 0.40050874 0.04954137 0.00000000 0.03072973 0.12541714
## Years CAtBat CHits CHmRun CRuns CRBI
## 0.13511795 0.00000000 0.45915555 -0.09788284 0.04494156 0.00000000
## CWalks PutOuts Assists Errors
## -0.07672909 0.11115853 0.09988299 -0.09130279
library(AppliedPredictiveModeling)
## Warning: package 'AppliedPredictiveModeling' was built under R version
## 3.3.3
# Load the permeability data. The separate data(fingerprints) call was removed:
# it only produced a "data set 'fingerprints' not found" warning, while the
# `fingerprints` object used below is already available after this call.
data(permeability)
permeability <- data.frame(permeability)
fingerprints <- data.frame(fingerprints)

# Response first, then the fingerprint predictors.
df1 <- cbind(permeability, fingerprints)

# Drop near-zero-variance columns. Guard the empty case: negating an empty
# index vector would select zero columns instead of keeping them all.
nzv_cols <- nearZeroVar(df1)
df2 <- if (length(nzv_cols) > 0) df1[, -nzv_cols] else df1
df <- data.frame(df2)

# NOTE(review): the response is the FIRST column and is still called
# `permeability`; this statement relabels column 389, which is one of the
# fingerprint predictors, not the response. It is kept only so that code
# reading `df$Permeability` keeps running; do not treat that column as the
# outcome.
names(df)[389] <- "Permeability"
library(caret)
# Reproducible 70/30 split of the permeability data.
# Stratify on the response column `permeability` (the outcome used in the
# model formulas), not on `Permeability`, which names(df)[389] assigned to a
# different column.
set.seed(12345)
data_partition_permeability <- createDataPartition(
  df$permeability,
  p = .7,
  list = FALSE
)
training_permeability <- df[data_partition_permeability, ]
testing_permeability <- df[-data_partition_permeability, ]
library(caret)
# Linear regression of permeability on all other columns, with predictors
# centered and scaled.
LM2 <- train(
  permeability ~ .,
  data = training_permeability,
  method = "lm",
  preProcess = c("center", "scale")
)
predicted_LM2 <- predict(LM2, newdata = testing_permeability)

# Score against the observed outcome `permeability`, the variable the model
# was fitted to. The original compared predictions with `Permeability`, which
# is the relabelled column 389 rather than the response.
RSquared_LM2 <- cor(predicted_LM2, testing_permeability$permeability)^2
RSquared_LM2
## NOTE: re-run to refresh this output. The value previously shown here
## (0.001620515) was computed against testing_permeability$Permeability,
## which is not the response.
The R-squared got smaller.
# Linear regression of permeability on principal components of the centered
# and scaled predictors.
LM2pca <- train(
  permeability ~ .,
  data = training_permeability,
  method = "lm",
  preProcess = c("center", "scale", "pca")
)
predicted_LM2pca <- predict(LM2pca, newdata = testing_permeability)

# Score against the observed outcome `permeability`, the variable the model
# was fitted to. The original compared predictions with `Permeability`, which
# is the relabelled column 389 rather than the response.
RSquared_LM2pca <- cor(predicted_LM2pca, testing_permeability$permeability)^2
RSquared_LM2pca
## NOTE: re-run to refresh this output. The value previously shown here
## (9.370273e-05) was computed against testing_permeability$Permeability,
## which is not the response.
library(data.table)
library(curl)
# Training set for the 14-cancer data: transpose the downloaded tables and
# append the class labels as the last column.
cancer1 <- transpose(fread("https://statweb.stanford.edu/~tibs/ElemStatLearn/datasets/14cancer.xtrain"))
cancer2 <- transpose(fread("https://statweb.stanford.edu/~tibs/ElemStatLearn/datasets/14cancer.ytrain"))
c_tra <- cbind(cancer1, cancer2)

# The label column is whatever column cbind() put last; address it by
# position relative to the table width instead of the literal 16064.
names(c_tra)[ncol(c_tra)] <- "response"
c_tra$response <- as.factor(c_tra$response)
c_tra <- as.data.frame(c_tra)
# Test set for the 14-cancer data: transpose the downloaded tables and append
# the class labels as the last column.
cancer3 <- transpose(fread("https://statweb.stanford.edu/~tibs/ElemStatLearn/datasets/14cancer.xtest"))
cancer4 <- transpose(fread("https://statweb.stanford.edu/~tibs/ElemStatLearn/datasets/14cancer.ytest"))
c_test <- cbind(cancer3, cancer4)

# The label column is whatever column cbind() put last; address it by
# position relative to the table width instead of the literal 16064.
names(c_test)[ncol(c_test)] <- "response"
c_test$response <- as.factor(c_test$response)
c_test <- as.data.frame(c_test)
The test-set accuracy of the LDA classifier is 0.5.
# Linear discriminant analysis on the cancer training set, resampled with
# 10-fold cross-validation.
set.seed(12345)
LDA3 <- train(
  response ~ .,
  data = c_tra,
  method = "lda",
  trControl = trainControl(method = "cv", number = 10)
)

# Classify the test set and tabulate predictions against the true labels.
predicted_LDA3 <- predict(object = LDA3, newdata = c_test)
confusionMatrix(data = predicted_LDA3, reference = c_test[["response"]])
## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 2 3 4 5 6 7 8 9 10 11 12 13 14
## 1 1 0 0 1 0 2 0 0 0 0 2 1 1 0
## 2 0 2 1 0 0 0 0 0 0 0 0 0 0 0
## 3 1 0 2 0 1 0 1 0 0 1 0 0 0 0
## 4 0 0 0 3 0 0 0 0 0 0 0 0 0 0
## 5 0 1 0 0 4 0 0 0 2 0 0 0 0 0
## 6 1 0 0 0 0 1 0 0 0 0 0 2 0 0
## 7 0 1 0 0 0 0 1 0 0 0 0 0 0 0
## 8 1 0 1 0 0 0 0 2 0 2 0 0 0 0
## 9 0 0 0 0 0 0 0 0 4 0 0 0 0 0
## 10 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 11 0 0 0 0 0 0 0 0 0 0 1 0 0 0
## 12 0 1 0 0 1 0 0 0 0 0 0 0 0 0
## 13 0 1 0 0 0 0 0 0 0 0 0 1 2 0
## 14 0 0 0 0 0 0 0 0 0 0 0 0 0 4
##
## Overall Statistics
##
## Accuracy : 0.5
## 95% CI : (0.3608, 0.6392)
## No Information Rate : 0.1111
## P-Value [Acc > NIR] : 1.581e-12
##
## Kappa : 0.4594
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 1 Class: 2 Class: 3 Class: 4 Class: 5 Class: 6
## Sensitivity 0.25000 0.33333 0.50000 0.75000 0.66667 0.33333
## Specificity 0.86000 0.97917 0.92000 1.00000 0.93750 0.94118
## Pos Pred Value 0.12500 0.66667 0.33333 1.00000 0.57143 0.25000
## Neg Pred Value 0.93478 0.92157 0.95833 0.98039 0.95745 0.96000
## Prevalence 0.07407 0.11111 0.07407 0.07407 0.11111 0.05556
## Detection Rate 0.01852 0.03704 0.03704 0.05556 0.07407 0.01852
## Detection Prevalence 0.14815 0.05556 0.11111 0.05556 0.12963 0.07407
## Balanced Accuracy 0.55500 0.65625 0.71000 0.87500 0.80208 0.63725
## Class: 7 Class: 8 Class: 9 Class: 10 Class: 11
## Sensitivity 0.50000 1.00000 0.66667 0.00000 0.33333
## Specificity 0.98077 0.92308 1.00000 1.00000 1.00000
## Pos Pred Value 0.50000 0.33333 1.00000 NaN 1.00000
## Neg Pred Value 0.98077 1.00000 0.96000 0.94444 0.96226
## Prevalence 0.03704 0.03704 0.11111 0.05556 0.05556
## Detection Rate 0.01852 0.03704 0.07407 0.00000 0.01852
## Detection Prevalence 0.03704 0.11111 0.07407 0.00000 0.01852
## Balanced Accuracy 0.74038 0.96154 0.83333 0.50000 0.66667
## Class: 12 Class: 13 Class: 14
## Sensitivity 0.00000 0.66667 1.00000
## Specificity 0.96000 0.96078 1.00000
## Pos Pred Value 0.00000 0.50000 1.00000
## Neg Pred Value 0.92308 0.98000 1.00000
## Prevalence 0.07407 0.05556 0.07407
## Detection Rate 0.00000 0.03704 0.07407
## Detection Prevalence 0.03704 0.07407 0.07407
## Balanced Accuracy 0.48000 0.81373 1.00000
The test-set accuracy of the k-nearest-neighbors classifier is 0.3889.
# k-nearest neighbours on the cancer training set, resampled with 10-fold
# cross-validation.
set.seed(12345)
KNN3 <- train(
  response ~ .,
  data = c_tra,
  method = "knn",
  trControl = trainControl(method = "cv", number = 10)
)

# Classify the test set and tabulate predictions against the true labels.
predicted_KNN3 <- predict(object = KNN3, newdata = c_test)
confusionMatrix(data = predicted_KNN3, reference = c_test[["response"]])
## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 2 3 4 5 6 7 8 9 10 11 12 13 14
## 1 1 1 0 1 0 0 0 1 0 0 1 0 0 0
## 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 3 0 0 1 0 0 0 0 0 0 0 0 0 0 0
## 4 0 1 0 2 0 0 0 1 0 1 0 2 0 0
## 5 0 2 0 0 4 0 0 0 1 0 0 0 0 0
## 6 0 0 0 0 0 1 0 0 0 0 0 0 0 0
## 7 0 1 0 0 0 0 0 0 0 0 0 1 0 0
## 8 1 1 0 1 0 0 1 0 0 2 1 1 1 0
## 9 0 0 0 0 0 0 0 0 5 0 0 0 0 0
## 10 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 11 1 0 0 0 1 1 1 0 0 0 1 0 0 0
## 12 1 0 0 0 1 0 0 0 0 0 0 0 0 0
## 13 0 0 3 0 0 1 0 0 0 0 0 0 2 0
## 14 0 0 0 0 0 0 0 0 0 0 0 0 0 4
##
## Overall Statistics
##
## Accuracy : 0.3889
## 95% CI : (0.2592, 0.5312)
## No Information Rate : 0.1111
## P-Value [Acc > NIR] : 1.196e-07
##
## Kappa : 0.3424
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 1 Class: 2 Class: 3 Class: 4 Class: 5 Class: 6
## Sensitivity 0.25000 0.0000 0.25000 0.50000 0.66667 0.33333
## Specificity 0.92000 1.0000 1.00000 0.90000 0.93750 1.00000
## Pos Pred Value 0.20000 NaN 1.00000 0.28571 0.57143 1.00000
## Neg Pred Value 0.93878 0.8889 0.94340 0.95745 0.95745 0.96226
## Prevalence 0.07407 0.1111 0.07407 0.07407 0.11111 0.05556
## Detection Rate 0.01852 0.0000 0.01852 0.03704 0.07407 0.01852
## Detection Prevalence 0.09259 0.0000 0.01852 0.12963 0.12963 0.01852
## Balanced Accuracy 0.58500 0.5000 0.62500 0.70000 0.80208 0.66667
## Class: 7 Class: 8 Class: 9 Class: 10 Class: 11
## Sensitivity 0.00000 0.00000 0.83333 0.00000 0.33333
## Specificity 0.96154 0.82692 1.00000 1.00000 0.92157
## Pos Pred Value 0.00000 0.00000 1.00000 NaN 0.20000
## Neg Pred Value 0.96154 0.95556 0.97959 0.94444 0.95918
## Prevalence 0.03704 0.03704 0.11111 0.05556 0.05556
## Detection Rate 0.00000 0.00000 0.09259 0.00000 0.01852
## Detection Prevalence 0.03704 0.16667 0.09259 0.00000 0.09259
## Balanced Accuracy 0.48077 0.41346 0.91667 0.50000 0.62745
## Class: 12 Class: 13 Class: 14
## Sensitivity 0.00000 0.66667 1.00000
## Specificity 0.96000 0.92157 1.00000
## Pos Pred Value 0.00000 0.33333 1.00000
## Neg Pred Value 0.92308 0.97917 1.00000
## Prevalence 0.07407 0.05556 0.07407
## Detection Rate 0.00000 0.03704 0.07407
## Detection Prevalence 0.03704 0.11111 0.07407
## Balanced Accuracy 0.48000 0.79412 1.00000
The accuracy in problem 3b (0.50) is neither extremely high nor extremely low, and the accuracy in problem 3c (0.39) is relatively low. Both are well above the no-information rate of 0.11, but even though the classifier in 3b is more effective than the one in 3c, neither is a very reliable classifier.
library(caret)
library(kernlab)
# Load the spam data and make a reproducible split on the class label `type`:
# createDataPartition() is asked for p = 0.7 of the rows for training.
data(spam)
set.seed(12345)
data_partition <- createDataPartition(
  spam[["type"]],
  p = .7,
  list = FALSE
)
training_spam <- spam[data_partition, ]
testing_spam <- spam[-data_partition, ]
library(caret)
# Naive Bayes on the raw spam predictors, resampled with 10-fold
# cross-validation.
NB5 <- train(
  type ~ .,
  data = training_spam,
  method = "nb",
  trControl = trainControl(method = "cv", number = 10)
)

# Classify the test set and tabulate predictions against the true labels.
predicted_NB5 <- predict(object = NB5, newdata = testing_spam)
confusionMatrix(data = predicted_NB5, reference = testing_spam[["type"]])
## Confusion Matrix and Statistics
##
## Reference
## Prediction nonspam spam
## nonspam 476 31
## spam 360 512
##
## Accuracy : 0.7165
## 95% CI : (0.6919, 0.7401)
## No Information Rate : 0.6062
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.4631
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.5694
## Specificity : 0.9429
## Pos Pred Value : 0.9389
## Neg Pred Value : 0.5872
## Prevalence : 0.6062
## Detection Rate : 0.3452
## Detection Prevalence : 0.3677
## Balanced Accuracy : 0.7561
##
## 'Positive' Class : nonspam
##
library(caret)
# Naive Bayes on the spam predictors after centering, scaling and PCA,
# resampled with 10-fold cross-validation.
NB5pca <- train(
  type ~ .,
  data = training_spam,
  method = "nb",
  trControl = trainControl(method = "cv", number = 10),
  preProcess = c("center", "scale", "pca")
)

# Classify the test set and tabulate predictions against the true labels.
predicted_NB5pca <- predict(object = NB5pca, newdata = testing_spam)
confusionMatrix(data = predicted_NB5pca, reference = testing_spam[["type"]])
## Confusion Matrix and Statistics
##
## Reference
## Prediction nonspam spam
## nonspam 734 62
## spam 102 481
##
## Accuracy : 0.8811
## 95% CI : (0.8628, 0.8977)
## No Information Rate : 0.6062
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.7541
## Mcnemar's Test P-Value : 0.002324
##
## Sensitivity : 0.8780
## Specificity : 0.8858
## Pos Pred Value : 0.9221
## Neg Pred Value : 0.8250
## Prevalence : 0.6062
## Detection Rate : 0.5323
## Detection Prevalence : 0.5772
## Balanced Accuracy : 0.8819
##
## 'Positive' Class : nonspam
##
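The sensitivity (0.878 vs. 0.569) and accuracy (0.881 vs. 0.717) are both higher for the naive Bayes classifier fit with centering, scaling and PCA than for the naive Bayes classifier fit on the raw data, although specificity is somewhat lower (0.886 vs. 0.943). Thus, preprocessing the data this way gives more accurate results overall on the test set. Because PCA was applied together with centering and scaling, the improvement cannot be attributed to centering and scaling alone.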