# Author: Rongbin Ye
# Date: Nov 26, 2019
# JHU_Practical Machine Learning_Quiz3
# Clear out the environment
rm(list = ls())
# Question 1
# Load Library
library(AppliedPredictiveModeling)
data(segmentationOriginal)
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
library(pgmm)
library(rpart)
library(ElemStatLearn)
library(rattle)
## Rattle: A free graphical interface for data science with R.
## Version 5.2.0 Copyright (c) 2006-2018 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
# Data Partition
# Note: with p = 0.60 the partition index holds ~60% of the rows; the complement
# (~40% of the 2019 cells, n = 807 below) is what ends up in the training set here.
df_seg <- data.frame(segmentationOriginal)
testIndex = createDataPartition(df_seg$Case, p = 0.60, list=FALSE)
training = df_seg[-testIndex,]
testing = df_seg[testIndex,]
# train model
set.seed(125)
Modfit <- train(Class ~ ., method ="rpart", data = training)
print(Modfit$finalModel)
## n= 807
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 807 284 PS (0.6480793 0.3519207)
## 2) TotalIntenCh2< 52419 430 44 PS (0.8976744 0.1023256) *
## 3) TotalIntenCh2>=52419 377 137 WS (0.3633952 0.6366048)
## 6) FiberWidthCh1< 10.6729 120 45 PS (0.6250000 0.3750000) *
## 7) FiberWidthCh1>=10.6729 257 62 WS (0.2412451 0.7587549) *
# Viz
# (Per the quiz instructions, the seed was set to 125 and a CART model was fit with the
# rpart method using all predictors and default caret settings. The outcome is the factor
# Class, with levels "PS" = poorly segmented and "WS" = well segmented.)
plot(Modfit$finalModel, uniform = TRUE, main = "Classification Tree")
text(Modfit$finalModel, use.n = TRUE, all = TRUE, cex = 0.8)

# Visualization
suppressMessages(library(rattle))
library(rpart.plot)
fancyRpartPlot(Modfit$finalModel)
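# A small sketch (not part of the graded answer) that restates the fitted tree's decision
# rules in plain R, using the split values printed above (TotalIntenCh2 = 52419,
# FiberWidthCh1 = 10.6729). The function name and example calls are illustrative only.
classify_cell <- function(TotalIntenCh2, FiberWidthCh1) {
  if (is.na(TotalIntenCh2)) return(NA)        # split variable missing: no prediction here
  if (TotalIntenCh2 < 52419) return("PS")     # node 2: terminal, majority PS
  if (is.na(FiberWidthCh1)) return(NA)
  if (FiberWidthCh1 < 10.6729) return("PS")   # node 6: terminal, majority PS
  return("WS")                                # node 7: terminal, majority WS
}
# e.g. classify_cell(23000, 10) returns "PS", while classify_cell(NA, 8) returns NA,
# mirroring the "not possible to predict" situation in the quiz.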

## Answer: PS, WS, PS, and not possible to predict (the fourth case lacks the
## variables used by the tree's splits).
# Question 2:
# If K is small in K-fold cross-validation, is the bias in the estimate of out-of-sample (test set)
# accuracy smaller or bigger? If K is small, is the variance in the estimate of out-of-sample (test set)
# accuracy smaller or bigger? Is K large or small in leave-one-out cross-validation?
## With a small K, each model is trained on a smaller share of the data, so the bias of the
## out-of-sample accuracy estimate is larger; the variance of the estimate is smaller.
## In leave-one-out cross-validation K is large (K equals the number of observations).
## (Initially failed to read the question carefully.)
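# A quick illustration (not part of the graded answer) of where K enters in caret,
# reusing the Question 1 training data; the object names below are arbitrary.
ctrl_small_k <- trainControl(method = "cv", number = 3)   # small K: larger bias, smaller variance
ctrl_loocv   <- trainControl(method = "LOOCV")            # leave-one-out: K = number of rows
# mod_cv <- train(Class ~ ., method = "rpart", data = training, trControl = ctrl_small_k)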
# Question 3:
## Loading Data & Library
library(caret)
library(pgmm)
data(olive)
olive = olive[,-1]
df_o = as.data.frame(olive)
df_o$Area = as.factor(df_o$Area)
newdata = as.data.frame(t(colMeans(olive)))
# *After factorization (Area converted to a factor)*
## Partition
intrain <- createDataPartition(olive$Area, p = 0.7, list = FALSE)
training <- df_o[intrain,]
test<- df_o[-intrain,]
## Train Model : Area
Modfit <- train(Area ~ ., method = "rpart", data = training)
predict(Modfit, newdata = newdata)
## [1] 7
## Levels: 1 2 3 4 5 6 7 8 9
# * Original (Area left numeric)
intrain <- createDataPartition(olive$Area, p = 0.7, list = FALSE)
training <- olive[intrain,]
test<- olive[-intrain,]
Modfit2 <- train(Area ~ ., method = "rpart", data = training)
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info =
## trainInfo, : There were missing values in resampled performance measures.
predict(Modfit2, newdata = newdata)
## 1
## 2.801762
## The answer options look strange, but based on the data and the method the answer should be
## the last one: the numeric prediction above is odd because Area should be a qualitative
## (factor) variable. Left numeric, rpart fits a regression tree, so predict() returns the
## mean Area of the leaf into which newdata falls.
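# A quick check (not required by the quiz): printing the final model should show a
# regression tree whose terminal-node values are leaf means of Area, which is exactly
# the kind of number returned above.
print(Modfit2$finalModel)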
# Question 4
## Loading Data & libraries
library(ElemStatLearn)
data(SAheart)
set.seed(8484)
train = sample(1:dim(SAheart)[1],size=dim(SAheart)[1]/2,replace=F)
trainSA = SAheart[train,]
testSA = SAheart[-train,]
# Defining the function: misclassification rate -- threshold the predicted probabilities
# at 0.5 and compare the resulting 0/1 labels to the true values.
missClass = function(values, prediction){sum(((prediction > 0.5) * 1) != values) / length(values)}
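# A minimal sanity check of missClass on made-up numbers (not quiz data): truth 0,1,1,0 and
# predicted probabilities 0.2, 0.9, 0.4, 0.6 threshold to 0,1,0,1, so 2 of 4 are wrong.
missClass(c(0, 1, 1, 0), c(0.2, 0.9, 0.4, 0.6))
## Expected: 0.5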
## Model Creation
set.seed(13234)
Modglm <- train(chd ~ age + alcohol + obesity + tobacco + typea + ldl, data = trainSA, method = 'glm', family = "binomial")
## Warning in train.default(x, y, weights = w, ...): You are trying to do
## regression and your outcome only has two possible values Are you trying to
## do classification? If so, use a 2 level factor as your outcome column.
missClass(trainSA$chd, predict(Modglm, newdata = trainSA))
## [1] 0.2727273
missClass(testSA$chd, predict(Modglm, newdata = testSA))
## [1] 0.3116883
## Therefore the misclassification rates are about 0.27 on the training set and 0.31 on the test set.
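# Optional variant (not part of the graded answer): the warning above suggests using a
# 2-level factor outcome. A sketch of that, with the illustrative object names trainSA_f
# and Modglm_f; the misclassification rate should be close to the value reported above.
trainSA_f <- trainSA
trainSA_f$chd <- factor(trainSA_f$chd, levels = c(0, 1))
Modglm_f <- train(chd ~ age + alcohol + obesity + tobacco + typea + ldl,
                  data = trainSA_f, method = "glm", family = "binomial")
mean(predict(Modglm_f, newdata = trainSA_f) != trainSA_f$chd)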
# Question 5
## Loading Data
library(ElemStatLearn)
library(caret)
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:rattle':
##
## importance
## The following object is masked from 'package:ggplot2':
##
## margin
data(vowel.train)
data(vowel.test)
## Factorization
vowel.test$y <- as.factor(vowel.test$y)
vowel.train$y <- as.factor(vowel.train$y)
## Set
set.seed(33833)
## Create a model
modRf <- randomForest(y ~. , data = vowel.train)
a <- varImp(modRf)
a <- data.frame(a)
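# Not required by the quiz, but sorting the importance scores makes the ranking easier to
# read (the "Overall" column name is what varImp() returns for this randomForest fit).
a[order(a$Overall, decreasing = TRUE), , drop = FALSE]
# randomForest::varImpPlot(modRf) would plot the same ranking.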
## Although the exact importance scores vary slightly between runs, the general ordering
## points to the answer: x.2, x.1, x.5, x.6, x.8, x.4, x.9, x.3, x.7, x.10
## (i.e. 2, 1, 5, 6, 8, 4, 9, 3, 7, 10 rather than 1, 2, 5, 6, 8, 4, 9, 3, 7, 10).