最後は、描画した決定木を指でなぞって予測結果を読み取ろう!
# ---- Exercise: classification tree on the cell segmentation data ----
library(AppliedPredictiveModeling)
data(segmentationOriginal)
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
library(rattle)
## Rattle is a free graphical interface for data mining with R.
## Version 4.1.0 Copyright (c) 2006-2015 Togaware Pty Ltd.
## Type 'rattle()' to analyse your data from many angles.
library(rpart.plot)
## Loading required package: rpart

# Fix the RNG state before any model fitting.
set.seed(125)

# The Case column labels each row "Train" or "Test"; split the data on it.
train_data <- segmentationOriginal[
  segmentationOriginal[["Case"]] == "Train",
]
test_data <- segmentationOriginal[
  segmentationOriginal[["Case"]] == "Test",
]

# Fit an rpart tree for Class on every other column, then draw the final tree.
fit_1 <- train(
  Class ~ .,
  data = train_data,
  method = "rpart"
)
fancyRpartPlot(fit_1[["finalModel"]])
# Results read off the plotted tree (presumably one per query case, in order):
# PS, WS, PS, not possible to predict
skip
Area は本来 1〜9 の整数ラベルだが、予測値が float(連続値)になっていることに注意する。
# ---- Exercise: tree model on the olive oil data ----
library(pgmm)
# data(olive)  # alternative source (presumably the copy shipped with pgmm)

# Download the archive in binary mode so the zip is not altered in transfer.
data_url <- "https://d396qusza40orc.cloudfront.net/predmachlearn/data/olive_data.zip"
download.file(data_url, destfile = "temp.zip", mode = "wb")

# unzip() returns the paths of the files it extracted. Load olive.rda from
# that location instead of assuming it ended up in the home directory.
extracted_files <- unzip("temp.zip")
olive_rda <- extracted_files[basename(extracted_files) == "olive.rda"]
stopifnot(length(olive_rda) == 1L)
load(olive_rda) # expected to define `olive`, which the code below uses

# Drop the first column before modelling.
olive <- olive[, -1]

# Area is left as-is (not converted to a factor) when fitting the tree.
fit_3 <- train(Area ~ ., data = olive, method = "rpart")
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info =
## trainInfo, : There were missing values in resampled performance measures.
fancyRpartPlot(fit_3$finalModel)

# Predict for one synthetic row that holds the mean of every column.
newdata <- as.data.frame(t(colMeans(olive)))
predict(fit_3, newdata = newdata)
## 1
## 2.783282

# The prediction above is fractional, while Area only takes these codes:
unique(olive$Area)
## [1] 1 2 3 4 5 6 9 7 8
# ---- Exercise: logistic regression on the South African heart data ----
library(ElemStatLearn)
data(SAheart)

# Draw half of the row indices (without replacement) as the training set;
# the remaining rows form the test set.
set.seed(8484)
train <- sample(
  seq_len(nrow(SAheart)),
  size = nrow(SAheart) / 2,
  replace = FALSE
)
trainSA <- SAheart[train, ]
testSA <- SAheart[-train, ]
str(trainSA)
## 'data.frame': 231 obs. of 10 variables:
## $ sbp : int 176 174 174 166 130 128 112 106 126 144 ...
## $ tobacco : num 5.76 0 3.5 4.1 0.05 0.04 0.41 1.08 0 6.75 ...
## $ ldl : num 4.89 8.46 5.26 4 2.44 8.22 1.88 4.37 5.29 5.45 ...
## $ adiposity: num 26.1 35.1 22 34.3 28.2 ...
## $ famhist : Factor w/ 2 levels "Absent","Present": 2 2 2 2 2 1 1 1 1 1 ...
## $ typea : int 46 35 36 32 67 65 39 67 25 53 ...
## $ obesity : num 27.3 25.3 22 29.5 30.9 ...
## $ alcohol : num 19.44 0 8.33 8.23 40.32 ...
## $ age : int 57 61 59 53 34 24 27 28 45 43 ...
## $ chd : int 0 1 1 0 0 0 0 1 0 1 ...

# Binomial GLM for chd on six predictors. chd stays an integer 0/1 column,
# which is what triggers the caret warning recorded below.
set.seed(13234)
fit_4 <- train(
  chd ~ age + alcohol + obesity + tobacco + typea + ldl,
  data = trainSA,
  method = "glm",
  family = "binomial"
)
## Warning in train.default(x, y, weights = w, ...): You are trying to do
## regression and your outcome only has two possible values Are you trying to
## do classification? If so, use a 2 level factor as your outcome column.
# Misclassification rate.
#
# values:     observed outcomes (0/1).
# prediction: numeric predictions; anything above 0.5 counts as class 1.
# Returns the fraction of observations whose thresholded prediction differs
# from the observed value.
missClass <- function(values, prediction) {
  predicted_class <- as.numeric(prediction > 0.5)
  n_wrong <- sum(predicted_class != values)
  n_wrong / length(values)
}
# Misclassification rate on the held-out half ...
missClass(
  values = testSA$chd,
  prediction = predict(fit_4, newdata = testSA)
)
## [1] 0.3116883

# ... and on the half the model was fitted to.
missClass(
  values = trainSA$chd,
  prediction = predict(fit_4, newdata = trainSA)
)
## [1] 0.2727273
randomForest 関数を直接使う。train(method = "rf") ではうまくいかないようだ。
# ---- Exercise: random forest variable importance on the vowel data ----
library(ElemStatLearn)
data(vowel.train)
data(vowel.test)

# Make the outcome a factor before fitting.
vowel.train$y <- as.factor(vowel.train$y)
set.seed(33833)
library(randomForest)
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin

# Call randomForest() directly rather than going through train().
fit_5 <- randomForest(y ~ ., data = vowel.train)
varImp(fit_5)
## Overall
## x.1 89.12864
## x.2 91.24009
## x.3 33.08111
## x.4 34.24433
## x.5 50.25539
## x.6 43.33148
## x.7 31.88132
## x.8 42.92470
## x.9 33.37031
## x.10 29.59956

# Rank predictors from most to least important. varImp() returns a data
# frame, so order its Overall column rather than the data frame itself.
order(varImp(fit_5)$Overall, decreasing = TRUE)
## [1] 2 1 5 6 8 4 9 3 7 10