Caretめも

Q1

最後は指でなぞって行こう！

library(AppliedPredictiveModeling)
data(segmentationOriginal)
library(caret)

## Loading required package: lattice

## Loading required package: ggplot2

library(rattle)

## Rattle は、R のデータマイニングに使用する、無償のグラフィカルインターフェースです。
## バージョン 4.1.0 Copyright (c) 2006-2015 Togaware Pty Ltd.
## 'rattle()' と入力して、データを多角的に分析します。

library(rpart.plot)

## Loading required package: rpart

# Fix the RNG state before fitting.
set.seed(125)

# Partition the rows by the Case column into training and test sets.
train_data <- segmentationOriginal[
  with(segmentationOriginal, Case == "Train"),
]
test_data <- segmentationOriginal[
  with(segmentationOriginal, Case == "Test"),
]

# Fit an rpart model through caret, using every other column as a predictor
# of Class, then draw the final tree.
fit_1 <- train(
  Class ~ .,
  data = train_data,
  method = "rpart"
)
fancyRpartPlot(fit_1[["finalModel"]])

#PS, WS, PS, Not possible to predict

Q2

skip

Q3

Area がfloat に変化したことに注意する。

library(pgmm)
# The pgmm copy of the data is left commented out; the archive below is used.
#data(olive)
data_url <- "https://d396qusza40orc.cloudfront.net/predmachlearn/data/olive_data.zip"
# The archive is binary, so request a binary transfer explicitly.
download.file(data_url, destfile = "temp.zip", mode = "wb")

# unzip() returns the paths of the extracted files (not the data itself), so
# keep them under their own name and locate the .rda file among them instead
# of assuming it ends up in the home directory.
extracted_files <- unzip("temp.zip")
olive_rda <- extracted_files[basename(extracted_files) == "olive.rda"]
if (length(olive_rda) != 1L) {
  stop(
    "Expected exactly one olive.rda in temp.zip, found ", length(olive_rda),
    call. = FALSE
  )
}

# load() restores the saved objects; the file is expected to define `olive`.
load(olive_rda)
# Drop the first column before modelling.
olive <- olive[, -1]

# Fit an rpart model through caret with Area as the outcome and all remaining
# columns of olive as predictors.
fit_3 <- train(
  Area ~ .,
  data = olive,
  method = "rpart"
)

## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info =
## trainInfo, : There were missing values in resampled performance measures.

fancyRpartPlot(fit_3[["finalModel"]])

# Build a one-row data frame holding the mean of every column of olive and
# predict Area for that single observation.
newdata <- as.data.frame(t(colMeans(olive)))
predict(object = fit_3, newdata = newdata)

##        1 
## 2.783282

# List the distinct Area values actually present in the data.
unique(olive[["Area"]])

## [1] 1 2 3 4 5 6 9 7 8

Q4

library(ElemStatLearn)
data(SAheart)
set.seed(8484)

# Draw half of the row indices at random, without replacement.
# Note: this index vector is named `train`, the same name as caret's train()
# function; calls of the form train(...) still resolve to the function.
train <- sample(
  seq_len(nrow(SAheart)),
  size = nrow(SAheart) / 2,
  replace = FALSE
)
# Sampled rows form the training set; all remaining rows form the test set.
trainSA <- SAheart[train, ]
testSA <- SAheart[-train, ]

# Inspect the column types of the training half.
str(trainSA)

## 'data.frame':    231 obs. of  10 variables:
##  $ sbp      : int  176 174 174 166 130 128 112 106 126 144 ...
##  $ tobacco  : num  5.76 0 3.5 4.1 0.05 0.04 0.41 1.08 0 6.75 ...
##  $ ldl      : num  4.89 8.46 5.26 4 2.44 8.22 1.88 4.37 5.29 5.45 ...
##  $ adiposity: num  26.1 35.1 22 34.3 28.2 ...
##  $ famhist  : Factor w/ 2 levels "Absent","Present": 2 2 2 2 2 1 1 1 1 1 ...
##  $ typea    : int  46 35 36 32 67 65 39 67 25 53 ...
##  $ obesity  : num  27.3 25.3 22 29.5 30.9 ...
##  $ alcohol  : num  19.44 0 8.33 8.23 40.32 ...
##  $ age      : int  57 61 59 53 34 24 27 28 45 43 ...
##  $ chd      : int  0 1 1 0 0 0 0 1 0 1 ...

# Fix the RNG state before fitting.
set.seed(13234)

# Binomial GLM fitted through caret. chd is left as an integer 0/1 column, so
# the predictions are numeric and get thresholded at 0.5 by missClass().
fit_4 <- train(
  chd ~ age + alcohol + obesity + tobacco + typea + ldl,
  data = trainSA,
  method = "glm",
  family = "binomial"
)

## Warning in train.default(x, y, weights = w, ...): You are trying to do
## regression and your outcome only has two possible values Are you trying to
## do classification? If so, use a 2 level factor as your outcome column.

# Misclassification rate for 0/1 outcomes.
#   values:     observed labels, compared against the 0/1 predicted classes
#   prediction: numeric scores; a score strictly above 0.5 counts as class 1
# Returns the number of disagreements divided by length(values).
missClass <- function(values, prediction) {
  predicted_class <- (prediction > 0.5) * 1
  n_wrong <- sum(predicted_class != values)
  n_wrong / length(values)
}

# Misclassification rate on the held-out half.
missClass(
  values = testSA$chd,
  prediction = predict(fit_4, newdata = testSA)
)

## [1] 0.3116883

# Misclassification rate on the training half.
missClass(
  values = trainSA$chd,
  prediction = predict(fit_4, newdata = trainSA)
)

## [1] 0.2727273

Q5

randomForest 関数を直接使う。train(method = "rf") ではダメっぽい。

library(ElemStatLearn)
data(vowel.train)
data(vowel.test)

# Turn the outcome of the training set into a factor.
# Only vowel.train is converted here; vowel.test is loaded but left as is.
vowel.train[["y"]] <- as.factor(vowel.train[["y"]])

# Fix the RNG state, then attach randomForest for the fit that follows.
set.seed(33833)
library(randomForest)

## randomForest 4.6-12

## Type rfNews() to see new features/changes/bug fixes.

## 
## Attaching package: 'randomForest'

##  以下のオブジェクトは 'package:ggplot2' からマスクされています: 
## 
##      margin

# Fit a random forest on the training vowels with y as the outcome and all
# other columns as predictors, then print the variable importance table.
fit_5 <- randomForest(
  y ~ .,
  data = vowel.train
)
varImp(object = fit_5)

##       Overall
## x.1  89.12864
## x.2  91.24009
## x.3  33.08111
## x.4  34.24433
## x.5  50.25539
## x.6  43.33148
## x.7  31.88132
## x.8  42.92470
## x.9  33.37031
## x.10 29.59956

# Rank the predictors from most to least important.
# varImp() returns a one-column data frame (column "Overall"); order that
# column explicitly rather than passing the whole data frame to order(), and
# use TRUE instead of the reassignable shorthand T.
order(varImp(fit_5)$Overall, decreasing = TRUE)

##  [1]  2  1  5  6  8  4  9  3  7 10

Caretめも

Tadashi Horie

2016年6月19日

Q1

Q2

Q3

Q4

Q5