#Q1
library(AppliedPredictiveModeling)
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
# Load the Alzheimer's data set; `diagnosis` and `predictors` are used below.
data(AlzheimerDisease)
adData <- data.frame(diagnosis, predictors)
# createDataPartition() is given only the outcome vector. With list = FALSE it
# returns the sampled row indices as a matrix instead of a list, so the result
# can be used directly to index rows.
trainIndex <- createDataPartition(y = diagnosis, p = 0.50, list = FALSE)
# Sampled rows form the training set; all remaining rows form the test set.
training <- adData[trainIndex, ]
testing <- adData[-trainIndex, ]
#Q2
library(AppliedPredictiveModeling)
data(concrete)
library(caret)
# Fix the RNG seed so the random split below is reproducible.
set.seed(1000)
# Sample three quarters of the rows for training, partitioning on the
# compressive strength outcome; [[1]] extracts the index vector from the list.
inTrain <- createDataPartition(y = mixtures$CompressiveStrength, p = 3 / 4)[[1]]
training <- mixtures[inTrain, ]
testing <- mixtures[-inTrain, ]
##
# Distribution of the raw Superplasticizer values in the training set.
hist(training$Superplasticizer, breaks = 20)

# The same variable after a log(x + 1) transform, for comparison.
hist(log(training$Superplasticizer + 1), breaks = 20)

# --> A large number of observations share exactly the same value. Taking
# log(Superplasticizer + 1) maps identical values to identical values, so the
# spike remains and the transformed distribution is still not symmetric.
#Q3
library(caret)
library(AppliedPredictiveModeling)
# Fix the RNG seed so the random split below is reproducible.
set.seed(3433)
data(AlzheimerDisease)
adData <- data.frame(diagnosis, predictors)
# Sample three quarters of the rows for training, partitioning on diagnosis;
# [[1]] extracts the index vector from the returned list.
inTrain <- createDataPartition(y = adData$diagnosis, p = 3 / 4)[[1]]
training <- adData[inTrain, ]
testing <- adData[-inTrain, ]
# Locate the columns whose names begin with "IL" (upper or lower case).
IL_col_idx <- grep(pattern = "^[Ii][Ll].*", x = names(training))
# Center, scale and run PCA on those columns; thresh = 0.8 keeps as many
# components as are needed to capture 80% of the variance.
preObj <- preProcess(
  training[, IL_col_idx],
  method = c("center", "scale", "pca"),
  thresh = 0.8
)
# Print the summary, which reports how many components were retained.
preObj
##
## Call:
## preProcess.default(x = training[, IL_col_idx], method =
## c("center", "scale", "pca"), thresh = 0.8)
##
## Created from 251 samples and 12 variables
## Pre-processing: centered, scaled, principal component signal extraction
##
## PCA needed 7 components to capture 80 percent of the variance
# List the names of the components stored in the fitted preProcess object.
names(preObj)
## [1] "call" "dim" "bc" "yj" "et"
## [6] "mean" "std" "ranges" "rotation" "method"
## [11] "thresh" "pcaComp" "numComp" "ica" "k"
## [16] "knnSummary" "bagImp" "median" "data"
#Q4
library(caret)
library(AppliedPredictiveModeling)
# Fix the RNG seed so the random split below is reproducible.
set.seed(3433)
data(AlzheimerDisease)
adData <- data.frame(diagnosis, predictors)
# Sample three quarters of the rows for training, partitioning on diagnosis;
# [[1]] extracts the index vector from the returned list.
inTrain <- createDataPartition(y = adData$diagnosis, p = 3 / 4)[[1]]
training <- adData[inTrain, ]
testing <- adData[-inTrain, ]
####No PCA Model
## extract new training and testing sets
# Positions of the training columns whose names begin with "IL" (any case).
IL_col_idx <- grep(pattern = "^[Ii][Ll].*", x = names(training))
suppressMessages(library(dplyr))
# Select columns by name: the IL predictors followed by the outcome column.
new_training <- training[, c(colnames(training)[IL_col_idx], "diagnosis")]
# Show which columns were kept.
names(new_training)
## [1] "IL_11" "IL_13" "IL_16" "IL_17E"
## [5] "IL_1alpha" "IL_3" "IL_4" "IL_5"
## [9] "IL_6" "IL_6_Receptor" "IL_7" "IL_8"
## [13] "diagnosis"
#
# Positions of the testing columns whose names begin with "IL" (any case).
IL_col_idx <- grep(pattern = "^[Ii][Ll].*", x = names(testing))
suppressMessages(library(dplyr))
# Select columns by name: the IL predictors followed by the outcome column.
new_testing <- testing[, c(colnames(testing)[IL_col_idx], "diagnosis")]
# Show which columns were kept.
names(new_testing)
## [1] "IL_11" "IL_13" "IL_16" "IL_17E"
## [5] "IL_1alpha" "IL_3" "IL_4" "IL_5"
## [9] "IL_6" "IL_6_Receptor" "IL_7" "IL_8"
## [13] "diagnosis"
# Fit a logistic regression (glm) on the untransformed IL predictors.
non_pca_model <- train(diagnosis ~ ., data = new_training, method = "glm")
# Predict on the held-out set. The outcome column is dropped by name rather
# than by a hard-coded position, so the code survives a change in the number
# of IL columns.
non_pca_predictors <- new_testing[, setdiff(names(new_testing), "diagnosis")]
non_pca_prediction <- predict(non_pca_model, non_pca_predictors)
# confusionMatrix() expects the predictions as `data` and the true classes as
# `reference`; passing them the other way round transposes the table and
# swaps sensitivity/specificity and the predictive values.
non_pca_result <- confusionMatrix(
  data = non_pca_prediction,
  reference = new_testing$diagnosis
)
non_pca_result
# NOTE: the console output pasted after this call was recorded with the two
# confusionMatrix() arguments swapped (truth passed as `data`). Its table is
# the transpose of what the corrected call prints; accuracy is unaffected, but
# the per-class statistics differ. Re-run to refresh the recorded output.
## Confusion Matrix and Statistics
##
## Reference
## Prediction Impaired Control
## Impaired 2 20
## Control 9 51
##
## Accuracy : 0.6463
## 95% CI : (0.533, 0.7488)
## No Information Rate : 0.8659
## P-Value [Acc > NIR] : 1.00000
##
## Kappa : -0.0702
## Mcnemar's Test P-Value : 0.06332
##
## Sensitivity : 0.18182
## Specificity : 0.71831
## Pos Pred Value : 0.09091
## Neg Pred Value : 0.85000
## Prevalence : 0.13415
## Detection Rate : 0.02439
## Detection Prevalence : 0.26829
## Balanced Accuracy : 0.45006
##
## 'Positive' Class : Impaired
##
####PCA Model
## extract new training and testing sets
# Positions of the columns whose names begin with "IL" (any case).
IL_col_idx <- grep("^[Ii][Ll].*", names(training))
suppressMessages(library(dplyr))
# IL predictors plus the outcome, for both the training and the test rows.
pc_training_obj <- training[, c(names(training)[IL_col_idx], "diagnosis")]
pc_testing_obj <- testing[, c(names(testing)[IL_col_idx], "diagnosis")]
# Address the predictors by name instead of assuming the outcome is column 13.
pc_predictor_cols <- setdiff(names(pc_training_obj), "diagnosis")
# Estimate centering, scaling and the PCA rotation from the training
# predictors only; thresh = 0.8 keeps the components explaining 80% of the
# variance. The outcome is excluded because it is what we want to predict.
pc_training <- preProcess(
  pc_training_obj[, pc_predictor_cols],
  method = c("center", "scale", "pca"),
  thresh = 0.8
)
# Project the training predictors onto the retained components.
pc_training_tred <- predict(pc_training, pc_training_obj[, pc_predictor_cols])
# Apply the SAME training-derived transformation to the test predictors. The
# PCA must not be re-estimated on the test data, otherwise information from
# the test set leaks into the model and the accuracy estimate is optimistic.
pc_testing_tred <- predict(pc_training, pc_testing_obj[, pc_predictor_cols])
# Fit a logistic regression on the principal components. The x/y interface is
# used so the outcome is passed explicitly, instead of a formula whose
# response (`pc_training_obj$dia`) relied on `$` partial matching and was not
# a column of `data`.
modelFit2 <- train(
  x = pc_training_tred,
  y = pc_training_obj$diagnosis,
  method = "glm"
)
# Predict the test-set classes from the projected test predictors.
pc_prediction <- predict(modelFit2, pc_testing_tred)
# confusionMatrix() expects the predictions as `data` and the true classes as
# `reference`; passing them the other way round transposes the table and
# swaps sensitivity/specificity and the predictive values.
pc_result <- confusionMatrix(
  data = pc_prediction,
  reference = pc_testing_obj$diagnosis
)
pc_result
# NOTE: the console output pasted after this call was recorded with the two
# confusionMatrix() arguments swapped (truth passed as `data`). Its table is
# the transpose of what the corrected call prints; accuracy is unaffected, but
# the per-class statistics differ. Re-run to refresh the recorded output.
## Confusion Matrix and Statistics
##
## Reference
## Prediction Impaired Control
## Impaired 3 19
## Control 4 56
##
## Accuracy : 0.7195
## 95% CI : (0.6094, 0.8132)
## No Information Rate : 0.9146
## P-Value [Acc > NIR] : 1.000000
##
## Kappa : 0.0889
## Mcnemar's Test P-Value : 0.003509
##
## Sensitivity : 0.42857
## Specificity : 0.74667
## Pos Pred Value : 0.13636
## Neg Pred Value : 0.93333
## Prevalence : 0.08537
## Detection Rate : 0.03659
## Detection Prevalence : 0.26829
## Balanced Accuracy : 0.58762
##
## 'Positive' Class : Impaired
##