#Q1
library(AppliedPredictiveModeling)
library(caret)

## Loading required package: lattice
## Loading required package: ggplot2

# Load the Alzheimer's data (supplies `diagnosis` and `predictors`) and bind
# the outcome and the predictors into a single data frame.
data(AlzheimerDisease)
adData <- data.frame(diagnosis, predictors)

# createDataPartition() only takes the outcome vector `y`; it returns the row
# indices selected for training (here 50% of the rows), not the rows
# themselves. list = FALSE asks for those indices as a matrix instead of a list.
trainIndex <- createDataPartition(y = diagnosis, p = 0.50, list = FALSE)

# Rows named by the index form the training set; every other row is held out.
training <- adData[trainIndex, ]
testing <- adData[-trainIndex, ]

# Q2 ----
library(AppliedPredictiveModeling)
data(concrete)
library(caret)

# Fixed seed so the 75% / 25% split below is reproducible.
set.seed(1000)
inTrain <- createDataPartition(
  y = mixtures$CompressiveStrength,
  p = 3 / 4
)[[1]]
training <- mixtures[inTrain, ]
testing <- mixtures[-inTrain, ]

# Superplasticizer on its original scale.
hist(training$Superplasticizer, breaks = 20)

# Superplasticizer after a log(x + 1) transform.
hist(log(training$Superplasticizer + 1), breaks = 20)

# --> A large number of observations share exactly the same value. Taking
#     log(Superplasticizer + 1) maps identical values to identical values, so
#     that spike is still there and the distribution is still not symmetric.
# Q3 ----
library(caret)
library(AppliedPredictiveModeling)
set.seed(3433)
data(AlzheimerDisease)
adData <- data.frame(diagnosis, predictors)
inTrain <- createDataPartition(y = adData$diagnosis, p = 3 / 4)[[1]]
training <- adData[inTrain, ]
testing <- adData[-inTrain, ]

# Positions of the columns whose names start with "IL" (either case).
IL_col_idx <- grep("^[Ii][Ll].*", names(training))

# Center, scale and run PCA on the IL columns only; thresh = 0.8 keeps as many
# components as are needed to capture 80% of the variance.
preObj <- preProcess(
  training[, IL_col_idx],
  method = c("center", "scale", "pca"),
  thresh = 0.8
)
preObj

## 
## Call:
## preProcess.default(x = training[, IL_col_idx], method =
##  c("center", "scale", "pca"), thresh = 0.8)
## 
## Created from 251 samples and 12 variables
## Pre-processing: centered, scaled, principal component signal extraction 
## 
## PCA needed 7 components to capture 80 percent of the variance

# List the components stored on the preProcess object.
names(preObj)

##  [1] "call"       "dim"        "bc"         "yj"         "et"        
##  [6] "mean"       "std"        "ranges"     "rotation"   "method"    
## [11] "thresh"     "pcaComp"    "numComp"    "ica"        "k"         
## [16] "knnSummary" "bagImp"     "median"     "data"

# Q4 ----
library(caret)
library(AppliedPredictiveModeling)
set.seed(3433)
data(AlzheimerDisease)
adData <- data.frame(diagnosis, predictors)
inTrain <- createDataPartition(y = adData$diagnosis, p = 3 / 4)[[1]]
training <- adData[inTrain, ]
testing <- adData[-inTrain, ]

#### Model without PCA
## Build training and testing sets holding only the IL columns and the outcome.
IL_col_idx <- grep("^[Ii][Ll].*", names(training))
suppressMessages(library(dplyr))
# Selecting by name: the IL column names followed by the outcome column.
new_training <- training[c(names(training)[IL_col_idx], "diagnosis")]
names(new_training)

##  [1] "IL_11"         "IL_13"         "IL_16"         "IL_17E"       
##  [5] "IL_1alpha"     "IL_3"          "IL_4"          "IL_5"         
##  [9] "IL_6"          "IL_6_Receptor" "IL_7"          "IL_8"         
## [13] "diagnosis"

# Same selection applied to the held-out rows.
IL_col_idx <- grep("^[Ii][Ll].*", names(testing))
suppressMessages(library(dplyr))
new_testing <- testing[c(names(testing)[IL_col_idx], "diagnosis")]
names(new_testing)

##  [1] "IL_11"         "IL_13"         "IL_16"         "IL_17E"       
##  [5] "IL_1alpha"     "IL_3"          "IL_4"          "IL_5"         
##  [9] "IL_6"          "IL_6_Receptor" "IL_7"          "IL_8"         
## [13] "diagnosis"

# Fit a logistic regression (glm) on the untransformed IL predictors.
non_pca_model <- train(diagnosis ~ ., data = new_training, method = "glm")

# Predict on the held-out set. The outcome column is dropped by name rather
# than by position so the code keeps working if the column order changes.
non_pca_pred <- predict(
  non_pca_model,
  newdata = new_testing[, names(new_testing) != "diagnosis", drop = FALSE]
)

# confusionMatrix() expects the predictions as `data` and the true labels as
# `reference`. Passing them the other way round transposes the table and swaps
# sensitivity/PPV, specificity/NPV and prevalence/detection prevalence.
non_pca_result <- confusionMatrix(
  data = non_pca_pred,
  reference = new_testing$diagnosis
)
non_pca_result

# NOTE: the console output recorded below was captured with the two arguments
# swapped (its rows are the actual labels, its columns the predictions).
# Accuracy and Kappa are unaffected by the swap; the per-class statistics and
# the no-information rate are not - re-run to refresh them.

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Impaired Control
##   Impaired        2      20
##   Control         9      51
##                                          
##                Accuracy : 0.6463         
##                  95% CI : (0.533, 0.7488)
##     No Information Rate : 0.8659         
##     P-Value [Acc > NIR] : 1.00000        
##                                          
##                   Kappa : -0.0702        
##  Mcnemar's Test P-Value : 0.06332        
##                                          
##             Sensitivity : 0.18182        
##             Specificity : 0.71831        
##          Pos Pred Value : 0.09091        
##          Neg Pred Value : 0.85000        
##              Prevalence : 0.13415        
##          Detection Rate : 0.02439        
##    Detection Prevalence : 0.26829        
##       Balanced Accuracy : 0.45006        
##                                          
##        'Positive' Class : Impaired       
##

#### PCA model
## Build training and testing sets holding only the IL columns and the outcome.
IL_col_idx <- grep("^[Ii][Ll].*", names(training))
suppressMessages(library(dplyr))
pc_training_obj <- training[, c(names(training)[IL_col_idx], "diagnosis")]
pc_testing_obj <- testing[, c(names(testing)[IL_col_idx], "diagnosis")]

# Predictor columns are everything except the outcome; selecting them by name
# avoids depending on the outcome sitting in column 13.
pc_predictor_names <- setdiff(names(pc_training_obj), "diagnosis")

# Learn the centering, scaling and PCA rotation from the training predictors
# only, keeping enough components to capture 80% of the variance.
pc_training <- preProcess(
  pc_training_obj[, pc_predictor_names, drop = FALSE],
  method = c("center", "scale", "pca"),
  thresh = 0.8
)

# Project both sets with the transformation learned on the training data. The
# PCA must not be re-estimated on the test set: that would leak information
# from the held-out rows into the preprocessing step.
pc_training_tred <- predict(
  pc_training,
  pc_training_obj[, pc_predictor_names, drop = FALSE]
)
pc_testing_tred <- predict(
  pc_training,
  pc_testing_obj[, pc_predictor_names, drop = FALSE]
)

# Fit the glm on the principal components. The x/y interface is used instead of
# a formula such as `pc_training_obj$dia ~ .`, which relied on `$` partial
# matching of "diagnosis" and on a response living outside `data`.
modelFit2 <- train(
  x = pc_training_tred,
  y = pc_training_obj$diagnosis,
  method = "glm"
)

# Predict on the projected test set and compare with the true labels.
pc_prediction <- predict(modelFit2, pc_testing_tred)

# confusionMatrix() expects the predictions as `data` and the true labels as
# `reference`; the reverse order transposes the table and swaps the per-class
# statistics.
pc_result <- confusionMatrix(
  data = pc_prediction,
  reference = pc_testing_obj$diagnosis
)
pc_result

# NOTE: the console output recorded below was captured with the two arguments
# swapped (its rows are the actual labels, its columns the predictions).
# Accuracy and Kappa are unaffected by the swap; the per-class statistics and
# the no-information rate are not - re-run to refresh them.

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Impaired Control
##   Impaired        3      19
##   Control         4      56
##                                           
##                Accuracy : 0.7195          
##                  95% CI : (0.6094, 0.8132)
##     No Information Rate : 0.9146          
##     P-Value [Acc > NIR] : 1.000000        
##                                           
##                   Kappa : 0.0889          
##  Mcnemar's Test P-Value : 0.003509        
##                                           
##             Sensitivity : 0.42857         
##             Specificity : 0.74667         
##          Pos Pred Value : 0.13636         
##          Neg Pred Value : 0.93333         
##              Prevalence : 0.08537         
##          Detection Rate : 0.03659         
##    Detection Prevalence : 0.26829         
##       Balanced Accuracy : 0.58762         
##                                           
##        'Positive' Class : Impaired        
##