Practical machine learning

Q1

library(caret); library(pgmm); library(rpart); library(rpart.plot); library(ElemStatLearn); library(AppliedPredictiveModeling);library(randomForest)

## randomForest 4.6-12

## Type rfNews() to see new features/changes/bug fixes.

## 
## Attaching package: 'randomForest'

## The following object is masked from 'package:ggplot2':
## 
##     margin

# Q1: fit a CART model on the cell-segmentation data and plot the fitted tree.
data("segmentationOriginal")
# Seed before the random split as well: createDataPartition() draws random
# rows, so without a seed the partition (and therefore the tree) differs on
# every run.
set.seed(125)
inTrain <- createDataPartition(
  y = segmentationOriginal$Class,
  p = 0.6,
  list = FALSE
)
training <- segmentationOriginal[inTrain, ]
testing <- segmentationOriginal[-inTrain, ]
# Re-seed so the resampling inside train() starts from seed 125, as before.
set.seed(125)
modelFit <- train(Class ~ ., data = training, method = "rpart")
# Scenarios to trace by hand through the plotted tree; they are not passed to
# predict(). The column is spelled TotalIntenCh2 to match the data set
# (the previous spelling TotalIntench2 matched no column).
a <- data.frame(TotalIntenCh2 = 23000, FiberWidthCh1 = 10, PerimStatusCh1 = 2)
b <- data.frame(TotalIntenCh2 = 50000, FiberWidthCh1 = 10, VarIntenCh4 = 100)
c <- data.frame(TotalIntenCh2 = 57000, FiberWidthCh1 = 8, VarIntenCh4 = 100)
d <- data.frame(FiberWidthCh1 = 8, VarIntenCh4 = 100, PerimStatusCh1 = 2)
library(rpart.plot)
# Draw the rpart tree selected by train().
rpart.plot(modelFit$finalModel)

Q3

# Q3: load the olive data set and preview its first six rows.
data(olive)
head(olive, n = 6L)

##   Region Area Palmitic Palmitoleic Stearic Oleic Linoleic Linolenic
## 1      1    1     1075          75     226  7823      672        36
## 2      1    1     1088          73     224  7709      781        31
## 3      1    1      911          54     246  8113      549        31
## 4      1    1      966          57     240  7952      619        50
## 5      1    1     1051          67     259  7771      672        50
## 6      1    1      911          49     268  7924      678        51
##   Arachidic Eicosenoic
## 1        60         29
## 2        61         29
## 3        63         29
## 4        78         35
## 5        80         46
## 6        70         44

# Fit a CART model predicting Area from every other column of olive.
modelFit <- train(Area ~ ., data = olive, method = "rpart")

## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info =
## trainInfo, : There were missing values in resampled performance measures.

# Predict for a single synthetic row that holds the mean of each column.
newdata <- as.data.frame(
  t(colMeans(olive))
)
predict(modelFit$finalModel, newdata = newdata)

##        1 
## 2.783282

Q4

# Q4: split the SAheart data in half and fit a logistic regression on one half.
library(ElemStatLearn)
data(SAheart)
set.seed(8484)
# Draw half of the row indices, without replacement, for the training half.
# The index vector shares its name with caret's train(); the later call still
# resolves to the function because R skips non-function objects in calls.
train <- sample(
  seq_len(nrow(SAheart)),
  size = nrow(SAheart) / 2,
  replace = FALSE
)
trainSA <- SAheart[train, ]
testSA <- SAheart[-train, ]
set.seed(13234)

# Model chd from six predictors with a binomial glm.
modelFit <- train(
  chd ~ age + alcohol + obesity + tobacco + typea + ldl,
  data = trainSA,
  method = "glm",
  family = "binomial"
)

## Warning in train.default(x, y, weights = w, ...): You are trying to do
## regression and your outcome only has two possible values Are you trying to
## do classification? If so, use a 2 level factor as your outcome column.

# Share of observations whose thresholded prediction differs from the truth.
# `values` holds the observed outcomes; `prediction` holds scores that count
# as class 1 when strictly greater than 0.5 and as class 0 otherwise.
missClass <- function(values, prediction) {
  predicted_class <- as.numeric(prediction > 0.5)
  n_wrong <- sum(predicted_class != values)
  n_wrong / length(values)
}
# Predict on the probability scale. predict() on a glm fit returns the linear
# predictor (log-odds) by default, which must not be compared with the 0.5
# cut-off used by missClass(); type = "response" yields probabilities.
trainSA.P <- predict(modelFit$finalModel, newdata = trainSA, type = "response")
testSA.p <- predict(modelFit$finalModel, newdata = testSA, type = "response")
# Misclassification rate on the training half.
missClass(trainSA$chd, trainSA.P)

## NOTE(review): the value previously recorded here (0.2597403) was computed
## from log-odds predictions; re-run to refresh this output.

# Misclassification rate on the held-out half.
missClass(testSA$chd, testSA.p)

## NOTE(review): the value previously recorded here (0.2943723) was computed
## from log-odds predictions; re-run to refresh this output.

Q5

# Q5: rank the vowel predictors by random-forest variable importance.
library(ElemStatLearn)
data(vowel.train)
data(vowel.test)
# Convert the outcome to a factor in both sets before fitting.
vowel.train$y <- as.factor(vowel.train$y)
vowel.test$y <- as.factor(vowel.test$y)
set.seed(33833)
modelFit <- randomForest(y ~ ., data = vowel.train)
re <- varImp(modelFit)
# Order the importance column itself instead of handing the whole data frame
# to order(), which relies on xtfrm() for data frames and warns on recent R.
# Assumes `re` has a single importance column - TODO confirm.
# The indices come out in ascending importance (least important first).
order(re[[1L]])

##  [1] 10  7  3  9  4  8  6  5  1  2

Practical machine learning - quiz3

Q1

Q3

Q4

Q5