Problem 1 A

# Load the cancer data set and drop its first column before modelling.
# stringsAsFactors = TRUE is spelled out because read.csv() stopped converting
# strings to factors by default in R 4.0.0, while the train() and
# confusionMatrix() calls further down treat Diagnosis as a class label.
cancer <- read.csv("U:/Private/Data 315/cancer.txt", stringsAsFactors = TRUE)
cancer1 <- cancer
# NOTE(review): column 1 is presumably a row/patient identifier -- confirm.
cancer2 <- cancer1[, -1]

Problem 1 B

library(caret)

## Warning: package 'caret' was built under R version 3.3.2

## Loading required package: lattice

## Loading required package: ggplot2

## Warning: package 'ggplot2' was built under R version 3.3.2

# Fix the RNG seed, then send 70% of the rows (sampled on Diagnosis) to the
# training set and the remaining rows to the testing set.
set.seed(12345)
data_partition <- createDataPartition(
  y = cancer$Diagnosis,
  p = 0.7,
  list = FALSE
)
training <- cancer2[data_partition, ]
testing <- cancer2[-data_partition, ]

Problem 1 C

# Tune a k-nearest-neighbours model of Diagnosis on all other columns:
# predictors are centered and scaled, and k = 1..20 is compared using
# 10-fold cross-validation.
kNN1 <- train(
  Diagnosis ~ .,
  data = training,
  method = "knn",
  preProcess = c("center", "scale"),
  tuneGrid = data.frame(.k = 1:20),
  trControl = trainControl(method = "cv", number = 10),
  maximize = TRUE
)

Problem 1 D

Optimal value of k is 8

# Print the cross-validated confusion matrix of the fitted kNN1 model.
confusionMatrix(kNN1)

## Cross-Validated (10 fold) Confusion Matrix 
## 
## (entries are percentual average cell counts across resamples)
##  
##           Reference
## Prediction    B    M
##          B 62.2  2.8
##          M  0.5 34.6
##                             
##  Accuracy (average) : 0.9674

# Plot the fitted kNN1 train object (one resampled result per candidate k).
plot(kNN1)

Problem 1 E

# Classify the held-out rows with kNN1 and compare the predictions with the
# recorded Diagnosis labels.
predictedkNN1 <- predict(kNN1, testing)
confusionMatrix(data = predictedkNN1, reference = testing[["Diagnosis"]])

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   B   M
##          B 105   5
##          M   2  58
##                                          
##                Accuracy : 0.9588         
##                  95% CI : (0.917, 0.9833)
##     No Information Rate : 0.6294         
##     P-Value [Acc > NIR] : <2e-16         
##                                          
##                   Kappa : 0.9109         
##  Mcnemar's Test P-Value : 0.4497         
##                                          
##             Sensitivity : 0.9813         
##             Specificity : 0.9206         
##          Pos Pred Value : 0.9545         
##          Neg Pred Value : 0.9667         
##              Prevalence : 0.6294         
##          Detection Rate : 0.6176         
##    Detection Prevalence : 0.6471         
##       Balanced Accuracy : 0.9510         
##                                          
##        'Positive' Class : B              
##

Problem 1 F

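Taking malignant (M) as the positive result, the probability of a type 1 error (a benign tumor classified as malignant) is 2/107 = .019 and the probability of a type 2 error (a malignant tumor classified as benign) is 5/63 = .079. Because caret reports B as the 'Positive' class, these correspond to 1 - Sensitivity and 1 - Specificity in the output above. When the model predicts benign, the probability that the tumor is actually malignant is 5/110 = .045 (1 - Pos Pred Value); when the model predicts malignant, the probability that the tumor is actually benign is 2/60 = .033 (1 - Neg Pred Value). We expect these probabilities to be rather low, so these results are not surprising.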

Problem 2 A

library(MASS)

## Warning: package 'MASS' was built under R version 3.3.2

library(caret)
# Fix the RNG seed, then send 70% of the Boston rows (sampled on medv) to the
# training set and the remaining rows to the testing set.
set.seed(12345)
data_partition1 <- createDataPartition(
  y = Boston[["medv"]],
  p = 0.7,
  list = FALSE
)
training1 <- Boston[data_partition1, ]
testing1 <- Boston[-data_partition1, ]

Problem 2 B

The Rsquared is .489145

# kNN regression of medv on all other Boston columns with no preprocessing;
# the candidate k is chosen by 10-fold cross-validated R-squared.
kNN2 <- train(
  medv ~ .,
  data = training1,
  method = "knn",
  metric = "Rsquared",
  maximize = TRUE,
  trControl = trainControl(method = "cv", number = 10)
)
# Test-set R-squared: squared correlation of predicted and observed medv.
predicted_kNN2 <- predict(kNN2, testing1)
RSquared_kNN2 <- cor(predicted_kNN2, testing1[["medv"]])^2
RSquared_kNN2

## [1] 0.489145

Problem 2 C

The Rsquared is .784935

# Same kNN regression of medv, but with the predictors centered and scaled
# before distances are computed; k is chosen by 10-fold cross-validated
# R-squared.
kNN3 <- train(
  medv ~ .,
  data = training1,
  method = "knn",
  metric = "Rsquared",
  maximize = TRUE,
  preProcess = c("center", "scale"),
  trControl = trainControl(method = "cv", number = 10)
)
# Test-set R-squared: squared correlation of predicted and observed medv.
predicted_kNN3 <- predict(kNN3, testing1)
RSquared_kNN3 <- cor(predicted_kNN3, testing1[["medv"]])^2
RSquared_kNN3

## [1] 0.784935

# Plot the fitted kNN3 train object (one resampled result per candidate k).
plot(kNN3)

Problem 2 D

After centering and scaling the predictors (kNN2 vs. kNN3), the test-set R^2 went up by about .3 (from .489 to .785), which means the predictions correlate much more closely with the observed values once the predictors are centered and scaled.

Problem 2 E

The optimal value of k is 5

Problem 3 A

There are 106 NA’s.

library(AppliedPredictiveModeling)

## Warning: package 'AppliedPredictiveModeling' was built under R version
## 3.3.2

# Load the ChemicalManufacturingProcess data set and count how many of its
# cells are missing (NA).
data(ChemicalManufacturingProcess)
sum(is.na(ChemicalManufacturingProcess))

## [1] 106

Problem 3 B

library(DMwR)

## Warning: package 'DMwR' was built under R version 3.3.2

## Loading required package: grid

# Fill in the missing values by k-nearest-neighbour imputation with k = 5.
# DMwR is already attached by the library(DMwR) call above, so the extra
# require(DMwR) was redundant and has been dropped.
# NOTE(review): imputation runs on the full data set (Yield included) before
# the train/test split, so held-out rows can influence imputed training
# values -- confirm this is acceptable for the assignment.
ChemicalManufacturingProcess1 <- knnImputation(
  ChemicalManufacturingProcess,
  k = 5
)

Problem 3 C

library(caret)
# Fix the RNG seed, then send 70% of the imputed rows (sampled on Yield) to
# the training set and the remaining rows to the testing set.
set.seed(12345)
data_partition2 <- createDataPartition(
  y = ChemicalManufacturingProcess1[["Yield"]],
  p = 0.7,
  list = FALSE
)
training2 <- ChemicalManufacturingProcess1[data_partition2, ]
testing2 <- ChemicalManufacturingProcess1[-data_partition2, ]

Problem 3 D

The Rsquared is .4344609

# kNN regression of Yield on all other columns: predictors are centered and
# scaled, and k = 1..20 is compared using 10-fold cross-validation.
# NOTE(review): the warnings recorded for this call report
# BiologicalMaterial07 as having zero variance in some resamples; consider
# preProcess = c("zv", "center", "scale") -- confirm against caret's
# preProcess documentation before changing, since results may shift.
kNN4 <- train(
  Yield ~ .,
  data = training2,
  method = "knn",
  preProcess = c("center", "scale"),
  tuneGrid = data.frame(.k = 1:20),
  trControl = trainControl(method = "cv", number = 10)
)

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: BiologicalMaterial07

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: BiologicalMaterial07

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: BiologicalMaterial07

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: BiologicalMaterial07

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: BiologicalMaterial07

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: BiologicalMaterial07

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: BiologicalMaterial07

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: BiologicalMaterial07

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: BiologicalMaterial07

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: BiologicalMaterial07

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: BiologicalMaterial07

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: BiologicalMaterial07

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: BiologicalMaterial07

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: BiologicalMaterial07

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: BiologicalMaterial07

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: BiologicalMaterial07

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: BiologicalMaterial07

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: BiologicalMaterial07

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: BiologicalMaterial07

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: BiologicalMaterial07

# Test-set R-squared for kNN4: squared correlation of predicted and observed
# Yield on the held-out rows.
predicted_kNN4 <- predict(kNN4, testing2)
RSquared_kNN4 <- cor(predicted_kNN4, testing2[["Yield"]])^2
RSquared_kNN4

## [1] 0.4344609

# Plot the fitted kNN4 train object (one resampled result per candidate k).
plot(kNN4)

Problem 3 E

The optimal value of k is 4

Project 3

Ben Geiger

February 23, 2017