# Author: Rongbin Ye
# Date: Nov 26, 2019
# JHU_Practical Machine Learning_Quiz3
# Clear out the environment
rm(list = ls())
# Question 1
# Load Library
library(AppliedPredictiveModeling)
data(segmentationOriginal)
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
library(pgmm)
library(rpart)
library(ElemStatLearn)
library(rattle)
## Rattle: A free graphical interface for data science with R.
## Version 5.2.0 Copyright (c) 2006-2018 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
# Data Partition
# Note: with p = 0.60 the partition index holds ~60% of the rows; the complement
# (~40% of the 2019 cells, n = 807 below) is what ends up in the training set here.
df_seg <- data.frame(segmentationOriginal)
testIndex = createDataPartition(df_seg$Case, p = 0.60, list=FALSE)
training = df_seg[-testIndex,]
testing = df_seg[testIndex,]
# train model
set.seed(125)
Modfit <- train(Class ~ ., method ="rpart", data = training)
print(Modfit$finalModel)
## n= 807
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 807 284 PS (0.6480793 0.3519207)
## 2) TotalIntenCh2< 52419 430 44 PS (0.8976744 0.1023256) *
## 3) TotalIntenCh2>=52419 377 137 WS (0.3633952 0.6366048)
## 6) FiberWidthCh1< 10.6729 120 45 PS (0.6250000 0.3750000) *
## 7) FiberWidthCh1>=10.6729 257 62 WS (0.2412451 0.7587549) *
# Viz
# (Per the quiz instructions, the seed was set to 125 and a CART model was fit with the
# rpart method using all predictors and default caret settings. The outcome is the factor
# Class, with levels "PS" = poorly segmented and "WS" = well segmented.)
plot(Modfit$finalModel, uniform = TRUE, main = "Classification Tree")
text(Modfit$finalModel, use.n = TRUE, all = TRUE, cex = 0.8)

# Visualization
suppressMessages(library(rattle))
library(rpart.plot)
fancyRpartPlot(Modfit$finalModel)
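# A small sketch (not part of the graded answer) that restates the fitted tree's decision
# rules in plain R, using the split values printed above (TotalIntenCh2 = 52419,
# FiberWidthCh1 = 10.6729). The function name and example calls are illustrative only.
classify_cell <- function(TotalIntenCh2, FiberWidthCh1) {
  if (is.na(TotalIntenCh2)) return(NA)        # split variable missing: no prediction here
  if (TotalIntenCh2 < 52419) return("PS")     # node 2: terminal, majority PS
  if (is.na(FiberWidthCh1)) return(NA)
  if (FiberWidthCh1 < 10.6729) return("PS")   # node 6: terminal, majority PS
  return("WS")                                # node 7: terminal, majority WS
}
# e.g. classify_cell(23000, 10) returns "PS", while classify_cell(NA, 8) returns NA,
# mirroring the "not possible to predict" situation in the quiz.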

## Answer: PS, WS, PS, and not possible to predict (the fourth case lacks the
## variables used by the tree's splits).
# Question 2:
# If K is small in K-fold cross-validation, is the bias in the estimate of out-of-sample (test set)
# accuracy smaller or bigger? If K is small, is the variance in the estimate of out-of-sample (test set)
# accuracy smaller or bigger? Is K large or small in leave-one-out cross-validation?
## With a small K, each model is trained on a smaller share of the data, so the bias of the
## out-of-sample accuracy estimate is larger; the variance of the estimate is smaller.
## In leave-one-out cross-validation K is large (K equals the number of observations).
## (Initially failed to read the question carefully.)
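# A quick illustration (not part of the graded answer) of where K enters in caret,
# reusing the Question 1 training data; the object names below are arbitrary.
ctrl_small_k <- trainControl(method = "cv", number = 3)   # small K: larger bias, smaller variance
ctrl_loocv   <- trainControl(method = "LOOCV")            # leave-one-out: K = number of rows
# mod_cv <- train(Class ~ ., method = "rpart", data = training, trControl = ctrl_small_k)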
# Question 3:
## Loading Data & Library
library(caret)
library(pgmm)
data(olive)
olive = olive[,-1]
df_o = as.data.frame(olive)
df_o$Area = as.factor(df_o$Area)
newdata = as.data.frame(t(colMeans(olive)))
# *After factorization (Area converted to a factor)*
## Partition
intrain <- createDataPartition(olive$Area, p = 0.7, list = FALSE)
training <- df_o[intrain,]
test<- df_o[-intrain,]
## Train Model : Area
Modfit <- train(Area ~ ., method = "rpart", data = training)
predict(Modfit, newdata = newdata)
## [1] 7
## Levels: 1 2 3 4 5 6 7 8 9
# * Original (Area left numeric)
intrain <- createDataPartition(olive$Area, p = 0.7, list = FALSE)
training <- olive[intrain,]
test<- olive[-intrain,]
Modfit2 <- train(Area ~ ., method = "rpart", data = training)
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info =
## trainInfo, : There were missing values in resampled performance measures.
predict(Modfit2, newdata = newdata)
## 1
## 2.801762
## The answer options look strange, but based on the data and the method the answer should be
## the last one: the numeric prediction above is odd because Area should be a qualitative
## (factor) variable. Left numeric, rpart fits a regression tree, so predict() returns the
## mean Area of the leaf into which newdata falls.
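# A quick check (not required by the quiz): printing the final model should show a
# regression tree whose terminal-node values are leaf means of Area, which is exactly
# the kind of number returned above.
print(Modfit2$finalModel)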
# Question 4
## Loading Data & libraries
library(ElemStatLearn)
data(SAheart)
set.seed(8484)
train = sample(1:dim(SAheart)[1],size=dim(SAheart)[1]/2,replace=F)
trainSA = SAheart[train,]
testSA = SAheart[-train,]
# Defining the function: misclassification rate -- threshold the predicted probabilities
# at 0.5 and compare the resulting 0/1 labels to the true values.
missClass = function(values, prediction){sum(((prediction > 0.5) * 1) != values) / length(values)}
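# A minimal sanity check of missClass on made-up numbers (not quiz data): truth 0,1,1,0 and
# predicted probabilities 0.2, 0.9, 0.4, 0.6 threshold to 0,1,0,1, so 2 of 4 are wrong.
missClass(c(0, 1, 1, 0), c(0.2, 0.9, 0.4, 0.6))
## Expected: 0.5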
## Model Creation
set.seed(13234)
Modglm <- train(chd ~ age + alcohol + obesity + tobacco + typea + ldl, data = trainSA, method = 'glm', family = "binomial")
## Warning in train.default(x, y, weights = w, ...): You are trying to do
## regression and your outcome only has two possible values Are you trying to
## do classification? If so, use a 2 level factor as your outcome column.
missClass(trainSA$chd, predict(Modglm, newdata = trainSA))
## [1] 0.2727273
missClass(testSA$chd, predict(Modglm, newdata = testSA))
## [1] 0.3116883
## Therefore the misclassification rates are about 0.27 on the training set and 0.31 on the test set.
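# Optional variant (not part of the graded answer): the warning above suggests using a
# 2-level factor outcome. A sketch of that, with the illustrative object names trainSA_f
# and Modglm_f; the misclassification rate should be close to the value reported above.
trainSA_f <- trainSA
trainSA_f$chd <- factor(trainSA_f$chd, levels = c(0, 1))
Modglm_f <- train(chd ~ age + alcohol + obesity + tobacco + typea + ldl,
                  data = trainSA_f, method = "glm", family = "binomial")
mean(predict(Modglm_f, newdata = trainSA_f) != trainSA_f$chd)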
# Question 5
## Loading Data
library(ElemStatLearn)
library(caret)
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:rattle':
##
## importance
## The following object is masked from 'package:ggplot2':
##
## margin
data(vowel.train)
data(vowel.test)
## Factorization
vowel.test$y <- as.factor(vowel.test$y)
vowel.train$y <- as.factor(vowel.train$y)
## Set
set.seed(33833)
## Create a model
modRf <- randomForest(y ~. , data = vowel.train)
a <- varImp(modRf)
a <- data.frame(a)
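# Not required by the quiz, but sorting the importance scores makes the ranking easier to
# read (the "Overall" column name is what varImp() returns for this randomForest fit).
a[order(a$Overall, decreasing = TRUE), , drop = FALSE]
# randomForest::varImpPlot(modRf) would plot the same ranking.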
## Although the exact importance scores vary slightly between runs, the general ordering
## points to the answer: x.2, x.1, x.5, x.6, x.8, x.4, x.9, x.3, x.7, x.10
## (i.e. 2, 1, 5, 6, 8, 4, 9, 3, 7, 10 rather than 1, 2, 5, 6, 8, 4, 9, 3, 7, 10).