Q1

p=0.50

Pay attention to the signs inside the [] when subsetting.

library(AppliedPredictiveModeling)
data(AlzheimerDisease)
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
library(ggplot2)

adData = data.frame(diagnosis,predictors)
testIndex = createDataPartition(diagnosis, p = 0.50,list=FALSE)
training = adData[-testIndex,]
testing = adData[testIndex,]
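
A quick sanity check, as a sketch that assumes the adData, training, and testing objects created above: each half should hold roughly 50% of the rows.

# Proportion of rows in each half of the partition (both should be about 0.5)
c(train = nrow(training), test = nrow(testing)) / nrow(adData)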

Q2

First, just plot the data.

A pattern that depends on the row index is visible, but none of the predictors appears to explain it (I think).

library(Hmisc)
## Loading required package: survival
## Warning: package 'survival' was built under R version 3.2.5
## 
## Attaching package: 'survival'
## The following object is masked from 'package:caret':
## 
##      cluster
## Loading required package: Formula
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
## 
##      format.pval, round.POSIXt, trunc.POSIXt, units
library(AppliedPredictiveModeling)
data(concrete)
library(caret)
set.seed(1000)
inTrain = createDataPartition(mixtures$CompressiveStrength, p = 3/4)[[1]]
training = mixtures[ inTrain,]
testing = mixtures[-inTrain,]

str(training)
## 'data.frame':    774 obs. of  9 variables:
##  $ Cement             : num  0.2231 0.1492 0.0853 0.1705 0.1705 ...
##  $ BlastFurnaceSlag   : num  0 0.0639 0.0569 0.0426 0.0426 ...
##  $ FlyAsh             : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Water              : num  0.0669 0.1023 0.0825 0.1023 0.1023 ...
##  $ Superplasticizer   : num  0.00103 0 0 0 0 ...
##  $ CoarseAggregate    : num  0.43 0.418 0.42 0.418 0.418 ...
##  $ FineAggregate      : num  0.279 0.266 0.355 0.266 0.266 ...
##  $ Age                : int  28 270 360 365 28 28 28 90 270 90 ...
##  $ CompressiveStrength: num  80 40.3 44.3 43.7 36.5 ...
# CompressiveStrength vs. row index, coloured by quintiles (cut2, g = 5) of each predictor
qplot(seq_along(training$CompressiveStrength), training$CompressiveStrength,
      colour = cut2(training$Cement, g=5))

qplot(seq_along(training$CompressiveStrength), training$CompressiveStrength,
      colour = cut2(training$BlastFurnaceSlag, g=5))

qplot(seq_along(training$CompressiveStrength), training$CompressiveStrength,
      colour = cut2(training$FlyAsh, g=5))

qplot(seq_along(training$CompressiveStrength), training$CompressiveStrength,
      colour = cut2(training$Water, g=5))

qplot(seq_along(training$CompressiveStrength), training$CompressiveStrength,
      colour = cut2(training$Superplasticizer, g=5))

qplot(seq_along(training$CompressiveStrength), training$CompressiveStrength,
      colour = cut2(training$FineAggregate, g=5))

qplot(seq_along(training$CompressiveStrength), training$CompressiveStrength,
      colour = cut2(training$Age, g=5))
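
Note that CoarseAggregate was not plotted above. A more compact way to produce the same index plot for every predictor is to loop over the columns; this is only a sketch, assuming the training data frame, ggplot2, and Hmisc::cut2 loaded above.

# CompressiveStrength vs. row index, coloured by quintiles of each predictor in turn
for (p in setdiff(names(training), "CompressiveStrength")) {
  print(qplot(seq_along(training$CompressiveStrength),
              training$CompressiveStrength,
              colour = cut2(training[[p]], g = 5)) + ggtitle(p))
}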

Q3

With this many zeros (that is, many identical values), a log transform is of little use; log(x + 1) just maps all of them to the same value again, so the distribution stays skewed.

library(AppliedPredictiveModeling)
data(concrete)
library(caret)
set.seed(1000)
inTrain = createDataPartition(mixtures$CompressiveStrength, p = 3/4)[[1]]
training = mixtures[ inTrain,]
testing = mixtures[-inTrain,]

head(training$Superplasticizer)
## [1] 0.001032844 0.000000000 0.000000000 0.000000000 0.000000000 0.000000000
qplot(training$Superplasticizer, geom = "histogram")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

qplot(log(training$Superplasticizer), geom = "histogram")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 288 rows containing non-finite values (stat_bin).

qplot(log(training$Superplasticizer + 1), geom = "histogram")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
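
The warning above comes from the zeros, which log() maps to -Inf. Counting them makes the point; a sketch assuming the training frame from the chunk above.

# Number of training rows with Superplasticizer exactly zero;
# log(x + 1) maps every one of them to 0, so they remain a single identical spike
sum(training$Superplasticizer == 0)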

Q4

library(caret)
library(AppliedPredictiveModeling)
set.seed(3433)
data(AlzheimerDisease)
adData = data.frame(diagnosis,predictors)
inTrain = createDataPartition(adData$diagnosis, p = 3/4)[[1]]
training = adData[ inTrain,]
testing = adData[-inTrain,]

# from here
# Column indices of all predictors whose names begin with "IL"
IL.index <- grep(x = variable.names(adData), pattern = "\\<IL")
head(adData[IL.index])
##      IL_11    IL_13    IL_16   IL_17E IL_1alpha      IL_3     IL_4
## 1 5.121987 1.282549 4.192081 5.731246 -6.571283 -3.244194 2.484907
## 2 4.936704 1.269463 2.876338 6.705891 -8.047190 -3.912023 2.397895
## 3 4.665910 1.274133 2.616102 4.149327 -8.180721 -4.645992 1.824549
## 4 6.223931 1.307549 2.441056 4.695848 -7.600902 -4.268698 1.481605
## 5 7.070709 1.309980 4.736472 4.204987 -6.943657 -2.995732 2.708050
## 6 6.103215 1.282549 2.671032 3.637051 -8.180721 -3.863233 1.208960
##         IL_5        IL_6 IL_6_Receptor     IL_7     IL_8
## 1  1.0986123  0.26936976    0.64279595 4.805045 1.711325
## 2  0.6931472  0.09622438    0.43115645 3.705506 1.675557
## 3 -0.2484614  0.18568645    0.09668586 1.005622 1.691393
## 4  0.7884574 -0.37116408    0.57519641 2.336211 1.719944
## 5  1.1631508 -0.07204658    0.09668586 4.287562 1.764298
## 6 -0.4004776  0.18568645   -0.51727788 2.776394 1.708270
# PCA with preProcess, keeping enough components to explain 90% of the variance
result = preProcess(training[, IL.index], thresh = 0.9, method = "pca")
result$numComp
## [1] 9
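
The same count can be cross-checked with prcomp on the standardized IL predictors; a sketch assuming training and IL.index from above.

# Cumulative proportion of variance explained by the principal components;
# the first component at which this reaches 0.90 should agree with result$numComp
pca <- prcomp(training[, IL.index], center = TRUE, scale. = TRUE)
cumsum(pca$sdev^2) / sum(pca$sdev^2)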

Q5

library(caret)
library(AppliedPredictiveModeling)
set.seed(3433)
data(AlzheimerDisease)
adData = data.frame(diagnosis,predictors)
inTrain = createDataPartition(adData$diagnosis, p = 3/4)[[1]]
training = adData[ inTrain,]
testing = adData[-inTrain,]

# from here

#Non PCA
IL.index <- grep(x = variable.names(adData), pattern = "\\<IL")
trainingIL <- training[, c(IL.index, 1)]    # column 1 is diagnosis, so include it along with the IL predictors
testingIL <- testing[, c(IL.index, 1)]

fit_non_PCA <- train(diagnosis ~ ., data = trainingIL, method = "glm")
confusionMatrix(testingIL$diagnosis, predict(fit_non_PCA, testingIL))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Impaired Control
##   Impaired        2      20
##   Control         9      51
##                                          
##                Accuracy : 0.6463         
##                  95% CI : (0.533, 0.7488)
##     No Information Rate : 0.8659         
##     P-Value [Acc > NIR] : 1.00000        
##                                          
##                   Kappa : -0.0702        
##  Mcnemar's Test P-Value : 0.06332        
##                                          
##             Sensitivity : 0.18182        
##             Specificity : 0.71831        
##          Pos Pred Value : 0.09091        
##          Neg Pred Value : 0.85000        
##              Prevalence : 0.13415        
##          Detection Rate : 0.02439        
##    Detection Prevalence : 0.26829        
##       Balanced Accuracy : 0.45006        
##                                          
##        'Positive' Class : Impaired       
## 
#PCA
# Run preProcess on the training set
preProc <- preProcess(training[, c(IL.index)], thresh = 0.8, method = "pca")

# Apply predict() to both the training and test sets
train_pca <- predict(preProc, training[, IL.index])
testing_pca <- predict(preProc, testing[, IL.index])

# Build the model on the training-set principal components
# (the outcome trainingIL$diagnosis in the formula is resolved from the calling environment)
fit_PCA <- train(trainingIL$diagnosis ~ ., data = train_pca, method = "glm")

# Evaluate it against testingIL$diagnosis, but be careful not to mix up the arguments to predict().
# Passing testingIL instead of testing_pca raises an error that object 'PC1' is not found.
confusionMatrix(testingIL$diagnosis, predict(fit_PCA, testing_pca))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Impaired Control
##   Impaired        3      19
##   Control         4      56
##                                           
##                Accuracy : 0.7195          
##                  95% CI : (0.6094, 0.8132)
##     No Information Rate : 0.9146          
##     P-Value [Acc > NIR] : 1.000000        
##                                           
##                   Kappa : 0.0889          
##  Mcnemar's Test P-Value : 0.003509        
##                                           
##             Sensitivity : 0.42857         
##             Specificity : 0.74667         
##          Pos Pred Value : 0.13636         
##          Neg Pred Value : 0.93333         
##              Prevalence : 0.08537         
##          Detection Rate : 0.03659         
##    Detection Prevalence : 0.26829         
##       Balanced Accuracy : 0.58762         
##                                           
##        'Positive' Class : Impaired        
##
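
To read the two test-set accuracies side by side, here is a sketch assuming the fits and data frames above.

# Extract the test-set accuracy from each confusion matrix
acc_non_pca <- confusionMatrix(testingIL$diagnosis,
                               predict(fit_non_PCA, testingIL))$overall["Accuracy"]
acc_pca <- confusionMatrix(testingIL$diagnosis,
                           predict(fit_PCA, testing_pca))$overall["Accuracy"]
round(c(nonPCA = unname(acc_non_pca), PCA = unname(acc_pca)), 4)
# Per the output above, PCA (0.72) outperforms non-PCA (0.65)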