Problem 1

library(ISLR)

## Warning: package 'ISLR' was built under R version 3.3.3

# Prepare the Hitters data for a regression on log(Salary).
hitters1 <- Hitters
# Drop the three categorical columns by name rather than by position
# (14, 15, 20), so the selection does not depend on column order.
hitters2 <- hitters1[
  ,
  !(names(hitters1) %in% c("League", "Division", "NewLeague")),
  drop = FALSE
]
# Keep only rows with no missing values.
hitters3 <- hitters2[complete.cases(hitters2), ]
# Assigning the column creates it, so no NA placeholder is needed first.
hitters3$logSalary <- log(hitters3$Salary)
# Remove raw Salary (by name, not position 17) so it cannot be used as a
# predictor of its own logarithm.
hitters4 <- hitters3[, names(hitters3) != "Salary", drop = FALSE]
library(caret)

## Warning: package 'caret' was built under R version 3.3.3

## Loading required package: lattice

## Loading required package: ggplot2

## Warning: package 'ggplot2' was built under R version 3.3.3

# Reproducible 70/30 train/test split of the Hitters data on logSalary.
set.seed(12345)
data_partition <- createDataPartition(
  y = hitters4[["logSalary"]],
  p = 0.7,
  list = FALSE
)
# Rows selected by the partition train the models; the rest are held out.
training <- hitters4[data_partition, ]
testing <- hitters4[-data_partition, ]

Problem 1 a

The test-set R-squared is 0.4518 (the training-set R-squared reported by `summary` is 0.5713).

# Ordinary least squares fit of logSalary on every other column of the
# training set.
LM1 <- lm(formula = logSalary ~ ., data = training)
# Print the coefficient table and training-set fit statistics.
summary(object = LM1)

## 
## Call:
## lm(formula = logSalary ~ ., data = training)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.27147 -0.44874  0.01607  0.40936  2.79909 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  4.677e+00  1.959e-01  23.872   <2e-16 ***
## AtBat       -3.931e-03  1.523e-03  -2.582   0.0107 *  
## Hits         1.445e-02  5.813e-03   2.485   0.0139 *  
## HmRun        9.406e-03  1.480e-02   0.636   0.5258    
## Runs        -3.868e-03  6.987e-03  -0.554   0.5805    
## RBI          3.493e-03  6.463e-03   0.540   0.5896    
## Walks        1.066e-02  4.492e-03   2.373   0.0188 *  
## Years        4.177e-02  3.128e-02   1.335   0.1836    
## CAtBat      -2.807e-05  3.299e-04  -0.085   0.9323    
## CHits        1.097e-03  1.745e-03   0.629   0.5302    
## CHmRun       5.071e-04  3.912e-03   0.130   0.8970    
## CRuns        8.409e-04  1.764e-03   0.477   0.6343    
## CRBI        -1.265e-03  1.822e-03  -0.694   0.4886    
## CWalks      -9.993e-04  8.293e-04  -1.205   0.2299    
## PutOuts      4.154e-04  1.765e-04   2.355   0.0197 *  
## Assists      1.052e-03  5.131e-04   2.051   0.0418 *  
## Errors      -1.647e-02  1.017e-02  -1.619   0.1074    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6188 on 168 degrees of freedom
## Multiple R-squared:  0.5713, Adjusted R-squared:  0.5305 
## F-statistic: 13.99 on 16 and 168 DF,  p-value: < 2.2e-16

# Test-set R-squared for LM1: squared correlation between the predictions
# and the observed logSalary values.
predicted <- predict(object = LM1, newdata = testing)
RSquared_LM1Testing <- cor(x = predicted, y = testing[["logSalary"]])^2
RSquared_LM1Testing

## [1] 0.4518025

Problem 1 b

The R-Squared is .48121

library(ridge)
# Ridge regression on centered and scaled predictors, tuned with 10-fold
# cross-validation.
set.seed(12345)
RR1 <- train(
  logSalary ~ .,
  data = training,
  method = "ridge",
  preProcess = c("center", "scale"),
  trControl = trainControl(method = "cv", number = 10)
)
# Test-set R-squared: squared correlation of predictions with logSalary.
predicted_RR1 <- predict(RR1, newdata = testing)
RSquared_RR1 <- cor(x = predicted_RR1, y = testing[["logSalary"]])^2
RSquared_RR1

## [1] 0.4812084

Problem 1 c

The R-squared is .4686

# Lasso on centered and scaled predictors, tuned with 10-fold
# cross-validation.
set.seed(12345)
LASSO1 <- train(
  logSalary ~ .,
  data = training,
  method = "lasso",
  preProcess = c("center", "scale"),
  trControl = trainControl(method = "cv", number = 10)
)
# Test-set R-squared: squared correlation of predictions with logSalary.
predicted_LASSO1 <- predict(LASSO1, newdata = testing)
RSquared_LASSO1 <- cor(x = predicted_LASSO1, y = testing[["logSalary"]])^2
RSquared_LASSO1

## [1] 0.4686052

Problem 1 d

The variables that were removed (coefficients shrunk to exactly zero) are Runs, CAtBat and CRBI. All other predictors keep non-zero coefficients.

# Coefficients of the final lasso model at the cross-validated fraction.
# Use the `predict()` generic so S3 dispatch picks the method from the
# model's class, instead of hard-coding the method name `predict.enet`.
predict(
  LASSO1$finalModel,
  type = "coefficients",
  s = LASSO1$bestTune$fraction,
  mode = "fraction"
)$coefficients

##       AtBat        Hits       HmRun        Runs         RBI       Walks 
## -0.26718587  0.40050874  0.04954137  0.00000000  0.03072973  0.12541714 
##       Years      CAtBat       CHits      CHmRun       CRuns        CRBI 
##  0.13511795  0.00000000  0.45915555 -0.09788284  0.04494156  0.00000000 
##      CWalks     PutOuts     Assists      Errors 
## -0.07672909  0.11115853  0.09988299 -0.09130279

Problem 2 a

library(AppliedPredictiveModeling)

## Warning: package 'AppliedPredictiveModeling' was built under R version
## 3.3.3

# Loading `permeability` also makes `fingerprints` available (it is used
# below), so the separate `data(fingerprints)` call, which only produced a
# "data set 'fingerprints' not found" warning, has been dropped.
data(permeability)

permeability <- data.frame(permeability)
fingerprints <- data.frame(fingerprints)
# Response first, followed by the fingerprint predictors.
df1 <- cbind(permeability, fingerprints)
# Remove near-zero-variance columns. Guard the empty case: if nothing is
# flagged, `df1[, -integer(0)]` would select zero columns instead of all.
nzv_cols <- nearZeroVar(df1)
df2 <- if (length(nzv_cols) > 0) df1[, -nzv_cols] else df1
df <- data.frame(df2)
# NOTE(review): `cbind()` above puts the response in column 1 under the name
# `permeability`, so position 389 is a fingerprint column and this rename
# does not label the response. Kept unchanged so any code that refers to
# `df$Permeability` still resolves; use `df$permeability` for the response.
names(df)[389] <- "Permeability"

Problem 2 b

library(caret)
# Reproducible 70/30 train/test split driven by the response.
# `cbind(permeability, fingerprints)` stores the response in the column
# named `permeability`; `Permeability` (capital P) is the column at position
# 389, a fingerprint, so the split must be based on `permeability`.
set.seed(12345)
data_partition_permeability <- createDataPartition(
  df$permeability,
  p = 0.7,
  list = FALSE
)
training_permeability <- df[data_partition_permeability, ]
testing_permeability <- df[-data_partition_permeability, ]

Problem 2 c

library(caret)
# Linear regression of the response on all other columns, with predictors
# centered and scaled.
LM2 <- train(
  permeability ~ .,
  data = training_permeability,
  method = "lm",
  preProcess = c("center", "scale")
)
predicted_LM2 <- predict(LM2, newdata = testing_permeability)
# Score against `permeability`, the column the model was trained to predict.
# `Permeability` (capital P) is a renamed fingerprint column, so correlating
# predictions with it does not measure fit to the response; any R-squared
# obtained that way should be regenerated.
RSquared_LM2 <- cor(predicted_LM2, testing_permeability$permeability)^2
RSquared_LM2

## [1] 0.001620515

Problem 2 d

The R-squared got smaller.

# Same linear regression, but with predictors centered, scaled and then
# replaced by their principal components.
LM2pca <- train(
  permeability ~ .,
  data = training_permeability,
  method = "lm",
  preProcess = c("center", "scale", "pca")
)
predicted_LM2pca <- predict(LM2pca, newdata = testing_permeability)
# Score against `permeability`, the column the model was trained to predict.
# `Permeability` (capital P) is a renamed fingerprint column, so correlating
# predictions with it does not measure fit to the response; any R-squared
# obtained that way should be regenerated.
RSquared_LM2pca <- cor(predicted_LM2pca, testing_permeability$permeability)^2
RSquared_LM2pca

## [1] 9.370273e-05

Problem 3 a

library(data.table)
library(curl)
# Training data: the feature file and the label file are both transposed
# after reading, then bound column-wise so the labels end up last.
cancer1 <- transpose(fread(
  "https://statweb.stanford.edu/~tibs/ElemStatLearn/datasets/14cancer.xtrain"
))
cancer2 <- transpose(fread(
  "https://statweb.stanford.edu/~tibs/ElemStatLearn/datasets/14cancer.ytrain"
))
c_tra <- cbind(cancer1, cancer2)
# The labels are the last column after cbind(); locate it with ncol()
# instead of the hard-coded position 16064.
names(c_tra)[ncol(c_tra)] <- "response"
c_tra$response <- as.factor(c_tra$response)
c_tra <- as.data.frame(c_tra)

# Test data: same layout as the training data.
cancer3 <- transpose(fread(
  "https://statweb.stanford.edu/~tibs/ElemStatLearn/datasets/14cancer.xtest"
))
cancer4 <- transpose(fread(
  "https://statweb.stanford.edu/~tibs/ElemStatLearn/datasets/14cancer.ytest"
))
c_test <- cbind(cancer3, cancer4)
names(c_test)[ncol(c_test)] <- "response"
# Reuse the training levels so predicted and reference factors share one
# level set even if a class is absent from the test labels.
c_test$response <- factor(c_test$response, levels = levels(c_tra$response))
c_test <- as.data.frame(c_test)

Problem 3 b

The accuracy is .5

# Linear discriminant analysis with 10-fold cross-validation, evaluated on
# the held-out test samples.
set.seed(12345)
LDA3 <- train(
  response ~ .,
  data = c_tra,
  method = "lda",
  trControl = trainControl(method = "cv", number = 10)
)
predicted_LDA3 <- predict(LDA3, newdata = c_test)
confusionMatrix(data = predicted_LDA3, reference = c_test$response)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction 1 2 3 4 5 6 7 8 9 10 11 12 13 14
##         1  1 0 0 1 0 2 0 0 0  0  2  1  1  0
##         2  0 2 1 0 0 0 0 0 0  0  0  0  0  0
##         3  1 0 2 0 1 0 1 0 0  1  0  0  0  0
##         4  0 0 0 3 0 0 0 0 0  0  0  0  0  0
##         5  0 1 0 0 4 0 0 0 2  0  0  0  0  0
##         6  1 0 0 0 0 1 0 0 0  0  0  2  0  0
##         7  0 1 0 0 0 0 1 0 0  0  0  0  0  0
##         8  1 0 1 0 0 0 0 2 0  2  0  0  0  0
##         9  0 0 0 0 0 0 0 0 4  0  0  0  0  0
##         10 0 0 0 0 0 0 0 0 0  0  0  0  0  0
##         11 0 0 0 0 0 0 0 0 0  0  1  0  0  0
##         12 0 1 0 0 1 0 0 0 0  0  0  0  0  0
##         13 0 1 0 0 0 0 0 0 0  0  0  1  2  0
##         14 0 0 0 0 0 0 0 0 0  0  0  0  0  4
## 
## Overall Statistics
##                                           
##                Accuracy : 0.5             
##                  95% CI : (0.3608, 0.6392)
##     No Information Rate : 0.1111          
##     P-Value [Acc > NIR] : 1.581e-12       
##                                           
##                   Kappa : 0.4594          
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: 1 Class: 2 Class: 3 Class: 4 Class: 5 Class: 6
## Sensitivity           0.25000  0.33333  0.50000  0.75000  0.66667  0.33333
## Specificity           0.86000  0.97917  0.92000  1.00000  0.93750  0.94118
## Pos Pred Value        0.12500  0.66667  0.33333  1.00000  0.57143  0.25000
## Neg Pred Value        0.93478  0.92157  0.95833  0.98039  0.95745  0.96000
## Prevalence            0.07407  0.11111  0.07407  0.07407  0.11111  0.05556
## Detection Rate        0.01852  0.03704  0.03704  0.05556  0.07407  0.01852
## Detection Prevalence  0.14815  0.05556  0.11111  0.05556  0.12963  0.07407
## Balanced Accuracy     0.55500  0.65625  0.71000  0.87500  0.80208  0.63725
##                      Class: 7 Class: 8 Class: 9 Class: 10 Class: 11
## Sensitivity           0.50000  1.00000  0.66667   0.00000   0.33333
## Specificity           0.98077  0.92308  1.00000   1.00000   1.00000
## Pos Pred Value        0.50000  0.33333  1.00000       NaN   1.00000
## Neg Pred Value        0.98077  1.00000  0.96000   0.94444   0.96226
## Prevalence            0.03704  0.03704  0.11111   0.05556   0.05556
## Detection Rate        0.01852  0.03704  0.07407   0.00000   0.01852
## Detection Prevalence  0.03704  0.11111  0.07407   0.00000   0.01852
## Balanced Accuracy     0.74038  0.96154  0.83333   0.50000   0.66667
##                      Class: 12 Class: 13 Class: 14
## Sensitivity            0.00000   0.66667   1.00000
## Specificity            0.96000   0.96078   1.00000
## Pos Pred Value         0.00000   0.50000   1.00000
## Neg Pred Value         0.92308   0.98000   1.00000
## Prevalence             0.07407   0.05556   0.07407
## Detection Rate         0.00000   0.03704   0.07407
## Detection Prevalence   0.03704   0.07407   0.07407
## Balanced Accuracy      0.48000   0.81373   1.00000

Problem 3 c

The accuracy is .3889

# k-nearest-neighbours classifier tuned with 10-fold cross-validation,
# evaluated on the held-out test samples.
set.seed(12345)
KNN3 <- train(
  response ~ .,
  data = c_tra,
  method = "knn",
  trControl = trainControl(method = "cv", number = 10)
)
predicted_KNN3 <- predict(KNN3, newdata = c_test)
confusionMatrix(data = predicted_KNN3, reference = c_test$response)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction 1 2 3 4 5 6 7 8 9 10 11 12 13 14
##         1  1 1 0 1 0 0 0 1 0  0  1  0  0  0
##         2  0 0 0 0 0 0 0 0 0  0  0  0  0  0
##         3  0 0 1 0 0 0 0 0 0  0  0  0  0  0
##         4  0 1 0 2 0 0 0 1 0  1  0  2  0  0
##         5  0 2 0 0 4 0 0 0 1  0  0  0  0  0
##         6  0 0 0 0 0 1 0 0 0  0  0  0  0  0
##         7  0 1 0 0 0 0 0 0 0  0  0  1  0  0
##         8  1 1 0 1 0 0 1 0 0  2  1  1  1  0
##         9  0 0 0 0 0 0 0 0 5  0  0  0  0  0
##         10 0 0 0 0 0 0 0 0 0  0  0  0  0  0
##         11 1 0 0 0 1 1 1 0 0  0  1  0  0  0
##         12 1 0 0 0 1 0 0 0 0  0  0  0  0  0
##         13 0 0 3 0 0 1 0 0 0  0  0  0  2  0
##         14 0 0 0 0 0 0 0 0 0  0  0  0  0  4
## 
## Overall Statistics
##                                           
##                Accuracy : 0.3889          
##                  95% CI : (0.2592, 0.5312)
##     No Information Rate : 0.1111          
##     P-Value [Acc > NIR] : 1.196e-07       
##                                           
##                   Kappa : 0.3424          
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: 1 Class: 2 Class: 3 Class: 4 Class: 5 Class: 6
## Sensitivity           0.25000   0.0000  0.25000  0.50000  0.66667  0.33333
## Specificity           0.92000   1.0000  1.00000  0.90000  0.93750  1.00000
## Pos Pred Value        0.20000      NaN  1.00000  0.28571  0.57143  1.00000
## Neg Pred Value        0.93878   0.8889  0.94340  0.95745  0.95745  0.96226
## Prevalence            0.07407   0.1111  0.07407  0.07407  0.11111  0.05556
## Detection Rate        0.01852   0.0000  0.01852  0.03704  0.07407  0.01852
## Detection Prevalence  0.09259   0.0000  0.01852  0.12963  0.12963  0.01852
## Balanced Accuracy     0.58500   0.5000  0.62500  0.70000  0.80208  0.66667
##                      Class: 7 Class: 8 Class: 9 Class: 10 Class: 11
## Sensitivity           0.00000  0.00000  0.83333   0.00000   0.33333
## Specificity           0.96154  0.82692  1.00000   1.00000   0.92157
## Pos Pred Value        0.00000  0.00000  1.00000       NaN   0.20000
## Neg Pred Value        0.96154  0.95556  0.97959   0.94444   0.95918
## Prevalence            0.03704  0.03704  0.11111   0.05556   0.05556
## Detection Rate        0.00000  0.00000  0.09259   0.00000   0.01852
## Detection Prevalence  0.03704  0.16667  0.09259   0.00000   0.09259
## Balanced Accuracy     0.48077  0.41346  0.91667   0.50000   0.62745
##                      Class: 12 Class: 13 Class: 14
## Sensitivity            0.00000   0.66667   1.00000
## Specificity            0.96000   0.92157   1.00000
## Pos Pred Value         0.00000   0.33333   1.00000
## Neg Pred Value         0.92308   0.97917   1.00000
## Prevalence             0.07407   0.05556   0.07407
## Detection Rate         0.00000   0.03704   0.07407
## Detection Prevalence   0.03704   0.11111   0.07407
## Balanced Accuracy      0.48000   0.79412   1.00000

Problem 3 d

The accuracy in Problem 3 b (0.50) is moderate, while the accuracy in Problem 3 c (0.39) is relatively low. Although the LDA classifier in 3 b is more effective than the KNN classifier in 3 c, neither is a very reliable classifier.

Problem 5 a

library(caret)
library(kernlab)
# Load the spam data and make a reproducible 70/30 train/test split on the
# class label `type`.
data(spam)
set.seed(12345)
data_partition <- createDataPartition(
  y = spam[["type"]],
  p = 0.7,
  list = FALSE
)
training_spam <- spam[data_partition, ]
testing_spam <- spam[-data_partition, ]

Problem 5 b

library(caret)
# Naive Bayes classifier on the raw predictors, tuned with 10-fold
# cross-validation and evaluated on the held-out emails.
NB5 <- train(
  type ~ .,
  data = training_spam,
  method = "nb",
  trControl = trainControl(method = "cv", number = 10)
)
predicted_NB5 <- predict(NB5, newdata = testing_spam)
confusionMatrix(data = predicted_NB5, reference = testing_spam$type)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction nonspam spam
##    nonspam     476   31
##    spam        360  512
##                                           
##                Accuracy : 0.7165          
##                  95% CI : (0.6919, 0.7401)
##     No Information Rate : 0.6062          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.4631          
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.5694          
##             Specificity : 0.9429          
##          Pos Pred Value : 0.9389          
##          Neg Pred Value : 0.5872          
##              Prevalence : 0.6062          
##          Detection Rate : 0.3452          
##    Detection Prevalence : 0.3677          
##       Balanced Accuracy : 0.7561          
##                                           
##        'Positive' Class : nonspam         
##

Problem 5 c

library(caret)
# Naive Bayes classifier on centered, scaled and PCA-transformed predictors,
# tuned with 10-fold cross-validation and evaluated on the held-out emails.
NB5pca <- train(
  type ~ .,
  data = training_spam,
  method = "nb",
  preProcess = c("center", "scale", "pca"),
  trControl = trainControl(method = "cv", number = 10)
)
predicted_NB5pca <- predict(NB5pca, newdata = testing_spam)
confusionMatrix(data = predicted_NB5pca, reference = testing_spam$type)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction nonspam spam
##    nonspam     734   62
##    spam        102  481
##                                           
##                Accuracy : 0.8811          
##                  95% CI : (0.8628, 0.8977)
##     No Information Rate : 0.6062          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.7541          
##  Mcnemar's Test P-Value : 0.002324        
##                                           
##             Sensitivity : 0.8780          
##             Specificity : 0.8858          
##          Pos Pred Value : 0.9221          
##          Neg Pred Value : 0.8250          
##              Prevalence : 0.6062          
##          Detection Rate : 0.5323          
##    Detection Prevalence : 0.5772          
##       Balanced Accuracy : 0.8819          
##                                           
##        'Positive' Class : nonspam         
##

Problem 5 d

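The sensitivity and accuracy are both higher for the naive Bayes classifier fit on centered, scaled and PCA-transformed data (accuracy 0.8811, sensitivity 0.8780) than for the naive Bayes classifier fit on the raw data (accuracy 0.7165, sensitivity 0.5694). This suggests that this preprocessing leaves you with more accurate results. A likely reason is that PCA produces uncorrelated components, which better match the independence assumption of naive Bayes.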

Project 4

Ben Geiger

March 21, 2017