p=0.50
Pay attention to the sign inside [ ]: createDataPartition returns the row indices of one partition, so adData[testIndex,] is that partition (the test set here) and adData[-testIndex,] is its complement (the training set).
library(AppliedPredictiveModeling)
data(AlzheimerDisease)
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
library(ggplot2)
adData = data.frame(diagnosis,predictors)
testIndex = createDataPartition(diagnosis, p = 0.50,list=FALSE)
training = adData[-testIndex,]
testing = adData[testIndex,]
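A quick sanity check (a minimal sketch; the set.seed(1) call is only an assumption added so the check is reproducible) confirms that the two index expressions give non-overlapping halves of roughly equal size:
library(caret)
library(AppliedPredictiveModeling)
data(AlzheimerDisease)
adData = data.frame(diagnosis, predictors)
set.seed(1)  # arbitrary seed, assumed here only so the check is reproducible
testIndex = createDataPartition(diagnosis, p = 0.50, list = FALSE)
nrow(adData[-testIndex,])   # rows in the training half (the complement of testIndex)
nrow(adData[testIndex,])    # rows in the test half
length(intersect(seq_len(nrow(adData))[-testIndex], testIndex))   # expect 0: no overlap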
For now, just plot the data (a looped version of these plots is sketched after the individual calls below).
An index-dependent pattern is visible, but it probably doesn't mean anything (I think).
library(Hmisc)
## Loading required package: survival
## Warning: package 'survival' was built under R version 3.2.5
##
## Attaching package: 'survival'
## The following object is masked from 'package:caret':
##
## cluster
## Loading required package: Formula
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
##
## format.pval, round.POSIXt, trunc.POSIXt, units
library(AppliedPredictiveModeling)
data(concrete)
library(caret)
set.seed(1000)
inTrain = createDataPartition(mixtures$CompressiveStrength, p = 3/4)[[1]]
training = mixtures[ inTrain,]
testing = mixtures[-inTrain,]
str(training)
## 'data.frame': 774 obs. of 9 variables:
## $ Cement : num 0.2231 0.1492 0.0853 0.1705 0.1705 ...
## $ BlastFurnaceSlag : num 0 0.0639 0.0569 0.0426 0.0426 ...
## $ FlyAsh : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Water : num 0.0669 0.1023 0.0825 0.1023 0.1023 ...
## $ Superplasticizer : num 0.00103 0 0 0 0 ...
## $ CoarseAggregate : num 0.43 0.418 0.42 0.418 0.418 ...
## $ FineAggregate : num 0.279 0.266 0.355 0.266 0.266 ...
## $ Age : int 28 270 360 365 28 28 28 90 270 90 ...
## $ CompressiveStrength: num 80 40.3 44.3 43.7 36.5 ...
qplot(seq_along(training$CompressiveStrength), training$CompressiveStrength,
colour = cut2(training$Cement, g=5))
qplot(seq_along(training$CompressiveStrength), training$CompressiveStrength,
colour = cut2(training$BlastFurnaceSlag, g=5))
qplot(seq_along(training$CompressiveStrength), training$CompressiveStrength,
colour = cut2(training$FlyAsh, g=5))
qplot(seq_along(training$CompressiveStrength), training$CompressiveStrength,
colour = cut2(training$Water, g=5))
qplot(seq_along(training$CompressiveStrength), training$CompressiveStrength,
colour = cut2(training$Superplasticizer, g=5))
qplot(seq_along(training$CompressiveStrength), training$CompressiveStrength,
colour = cut2(training$FineAggregate, g=5))
qplot(seq_along(training$CompressiveStrength), training$CompressiveStrength,
colour = cut2(training$Age, g=5))
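The per-variable calls above can be collapsed into a loop (a minimal sketch; it also covers CoarseAggregate, which is not plotted individually above):
library(ggplot2)
library(Hmisc)
for (v in setdiff(names(training), "CompressiveStrength")) {
  # Colour the index plot of the outcome by quintiles of each predictor in turn.
  print(qplot(seq_along(training$CompressiveStrength), training$CompressiveStrength,
              colour = cut2(training[[v]], g = 5), main = v))
}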
When a variable is mostly zeros (i.e., many identical values), a log transform adds little: log(0) is -Inf, and log(x + 1) just moves the same spike to zero.
library(AppliedPredictiveModeling)
data(concrete)
library(caret)
set.seed(1000)
inTrain = createDataPartition(mixtures$CompressiveStrength, p = 3/4)[[1]]
training = mixtures[ inTrain,]
testing = mixtures[-inTrain,]
head(training$Superplasticizer)
## [1] 0.001032844 0.000000000 0.000000000 0.000000000 0.000000000 0.000000000
qplot(training$Superplasticizer, geom = "histogram")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
qplot(log(training$Superplasticizer), geom = "histogram")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 288 rows containing non-finite values (stat_bin).
qplot(log(training$Superplasticizer + 1), geom = "histogram")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
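The histograms make the point: most values of Superplasticizer are exactly zero, so log() turns those rows into -Inf (hence the "Removed 288 rows" warning), and log(x + 1) simply moves the same spike to zero. Counting the zeros directly (a minimal sketch) makes this explicit:
sum(training$Superplasticizer == 0)     # number of exact zeros in the training set
mean(training$Superplasticizer == 0)    # fraction of the training set that is zero
table(is.infinite(log(training$Superplasticizer)))   # those zeros become -Inf after log()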
library(caret)
library(AppliedPredictiveModeling)
set.seed(3433)
data(AlzheimerDisease)
adData = data.frame(diagnosis,predictors)
inTrain = createDataPartition(adData$diagnosis, p = 3/4)[[1]]
training = adData[ inTrain,]
testing = adData[-inTrain,]
# from here
# "\\<" is a word-boundary anchor, so this picks out the columns whose names begin with "IL"
IL.index <- grep(x = variable.names(adData), pattern = "\\<IL")
head(adData[IL.index])
## IL_11 IL_13 IL_16 IL_17E IL_1alpha IL_3 IL_4
## 1 5.121987 1.282549 4.192081 5.731246 -6.571283 -3.244194 2.484907
## 2 4.936704 1.269463 2.876338 6.705891 -8.047190 -3.912023 2.397895
## 3 4.665910 1.274133 2.616102 4.149327 -8.180721 -4.645992 1.824549
## 4 6.223931 1.307549 2.441056 4.695848 -7.600902 -4.268698 1.481605
## 5 7.070709 1.309980 4.736472 4.204987 -6.943657 -2.995732 2.708050
## 6 6.103215 1.282549 2.671032 3.637051 -8.180721 -3.863233 1.208960
## IL_5 IL_6 IL_6_Receptor IL_7 IL_8
## 1 1.0986123 0.26936976 0.64279595 4.805045 1.711325
## 2 0.6931472 0.09622438 0.43115645 3.705506 1.675557
## 3 -0.2484614 0.18568645 0.09668586 1.005622 1.691393
## 4 0.7884574 -0.37116408 0.57519641 2.336211 1.719944
## 5 1.1631508 -0.07204658 0.09668586 4.287562 1.764298
## 6 -0.4004776 0.18568645 -0.51727788 2.776394 1.708270
result = preProcess(training[, IL.index], thresh = 0.9, method = "pca")
result$numComp
## [1] 9
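preProcess reports that 9 principal components are needed to capture 90% of the variance in the IL predictors. The same count can be cross-checked from the cumulative variance of a standardized PCA (a minimal sketch; it assumes preProcess centers and scales the data before PCA, which matches its documented behaviour when method = "pca" is requested):
pr = prcomp(training[, IL.index], center = TRUE, scale. = TRUE)
cum_var = cumsum(pr$sdev^2) / sum(pr$sdev^2)   # cumulative proportion of variance explained
which(cum_var >= 0.9)[1]                       # first number of components reaching the 0.9 threshold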
library(caret)
library(AppliedPredictiveModeling)
set.seed(3433)
data(AlzheimerDisease)
adData = data.frame(diagnosis,predictors)
inTrain = createDataPartition(adData$diagnosis, p = 3/4)[[1]]
training = adData[ inTrain,]
testing = adData[-inTrain,]
# from here
# Non-PCA model: use the IL predictors directly
IL.index <- grep(x = variable.names(adData), pattern = "\\<IL")
trainingIL <- training[, c(IL.index, 1)] # column 1 is diagnosis, so include it alongside the IL columns
testingIL <- testing[, c(IL.index, 1)]
fit_non_PCA <- train(diagnosis ~ ., data = trainingIL, method = "glm")
confusionMatrix(testingIL$diagnosis, predict(fit_non_PCA, testingIL))
## Confusion Matrix and Statistics
##
## Reference
## Prediction Impaired Control
## Impaired 2 20
## Control 9 51
##
## Accuracy : 0.6463
## 95% CI : (0.533, 0.7488)
## No Information Rate : 0.8659
## P-Value [Acc > NIR] : 1.00000
##
## Kappa : -0.0702
## Mcnemar's Test P-Value : 0.06332
##
## Sensitivity : 0.18182
## Specificity : 0.71831
## Pos Pred Value : 0.09091
## Neg Pred Value : 0.85000
## Prevalence : 0.13415
## Detection Rate : 0.02439
## Detection Prevalence : 0.26829
## Balanced Accuracy : 0.45006
##
## 'Positive' Class : Impaired
##
# PCA model
# preProcess on the training set only,
preProc <- preProcess(training[, c(IL.index)], thresh = 0.8, method = "pca")
# then apply that same preProcess object to both the training and test sets with predict().
train_pca <- predict(preProc, training[, IL.index])
testing_pca <- predict(preProc, testing[, IL.index])
# Build the model on the training-set principal components.
# (The left-hand side trainingIL$diagnosis is found in the calling environment, since data = train_pca holds only the PCs.)
fit_PCA <- train(trainingIL$diagnosis ~ ., data = train_pca, method = "glm")
# Evaluate against testingIL$diagnosis, but be careful not to mix up the arguments to predict():
# passing testingIL instead of testing_pca fails with an error that object 'PC1' is not found.
confusionMatrix(testingIL$diagnosis, predict(fit_PCA, testing_pca))
## Confusion Matrix and Statistics
##
## Reference
## Prediction Impaired Control
## Impaired 3 19
## Control 4 56
##
## Accuracy : 0.7195
## 95% CI : (0.6094, 0.8132)
## No Information Rate : 0.9146
## P-Value [Acc > NIR] : 1.000000
##
## Kappa : 0.0889
## Mcnemar's Test P-Value : 0.003509
##
## Sensitivity : 0.42857
## Specificity : 0.74667
## Pos Pred Value : 0.13636
## Neg Pred Value : 0.93333
## Prevalence : 0.08537
## Detection Rate : 0.03659
## Detection Prevalence : 0.26829
## Balanced Accuracy : 0.58762
##
## 'Positive' Class : Impaired
##
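To compare the two models without reading through the full printouts, the overall accuracies can be pulled straight out of the confusionMatrix objects (a minimal sketch, reusing the fits above):
acc_non_pca = confusionMatrix(testingIL$diagnosis, predict(fit_non_PCA, testingIL))$overall["Accuracy"]
acc_pca = confusionMatrix(testingIL$diagnosis, predict(fit_PCA, testing_pca))$overall["Accuracy"]
c(nonPCA = acc_non_pca, PCA = acc_pca)   # about 0.65 without PCA vs about 0.72 with PCA in the runs above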