library(caret)
library(rattle)
library(AppliedPredictiveModeling)
library(pgmm)
library(tree)
library(ElemStatLearn)  # archived from CRAN; install from the CRAN archive if unavailable
library(randomForest)

Question 1: Load the cell segmentation data (segmentationOriginal) from the AppliedPredictiveModeling package. Subset the data to a training set and a testing set based on the Case variable. Set the seed to 125 and fit a CART model with the rpart method, using all predictor variables and default caret settings. What would the final model predict for cases with the following variable values?

a. TotalIntench2 = 23,000; FiberWidthCh1 = 10; PerimStatusCh1 = 2
b. TotalIntench2 = 50,000; FiberWidthCh1 = 10; VarIntenCh4 = 100
c. TotalIntench2 = 57,000; FiberWidthCh1 = 8; VarIntenCh4 = 100
d. FiberWidthCh1 = 8; VarIntenCh4 = 100; PerimStatusCh1 = 2

data(segmentationOriginal)
training <- subset(segmentationOriginal, Case == "Train")
training$Case <- NULL
testing <- subset(segmentationOriginal, Case == "Test")
testing$Case <- NULL

set.seed(125)
modFit <- train(Class ~ ., 
                method = "rpart", 
                data = training)
## Loading required package: rpart
fancyRpartPlot(modFit$finalModel)
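
The fitted splits can also be read as text rather than from the plot; a minimal sketch:

print(modFit$finalModel)  # prints the split rules and leaf classes of the fitted rpart tree

Tracing each of the four cases (a-d) down these splits gives the predicted Class.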

Question 2: Load the olive oil data set from the pgmm package. Fit a CART model with Area as the outcome and all other variables as predictors, then predict Area for a new observation built from the column means: newdata = as.data.frame(t(colMeans(olive))).

data(olive)
olive = olive[,-1]  # drop the Region column; Area is the outcome

# note: the quiz fits the model on the full olive data and predicts a single
# new observation, so no train/test split is needed here

# leaving Area numeric is deliberate; converting it to a factor would turn
# this into a classification problem (see the sketch below)
#olive$Area <- as.factor(olive$Area)

modFit <- train(Area ~ ., 
                method = "rpart", 
                data = olive)
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info =
## trainInfo, : There were missing values in resampled performance measures.

This warning arises because Area is numeric, so caret resamples a regression model, and some resamples yield performance measures (such as R-squared) that cannot be computed; it is harmless here.

newdata = as.data.frame(t(colMeans(olive)))  # one synthetic observation: the column means
pred <- predict(modFit, newdata = newdata)
pred
##        1 
## 2.783282

The prediction is a numeric leaf mean (2.783) rather than a class because Area was left numeric, so rpart fit a regression tree. Fitting the same data with the tree package grows a different tree, and therefore gives a different numeric prediction:

modTree <- tree(Area ~., data = olive)
plot(modTree)
text(modTree, cex = 0.75)

summary(modTree)
## 
## Regression tree:
## tree(formula = Area ~ ., data = olive)
## Variables actually used in tree construction:
## [1] "Eicosenoic" "Linoleic"   "Oleic"     
## Number of terminal nodes:  5 
## Residual mean deviance:  0.3431 = 194.5 / 567 
## Distribution of residuals:
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -1.8750 -0.3367  0.1250  0.0000  0.1607  2.6840
newdata = as.data.frame(t(colMeans(olive)))


pred2 <- predict(modTree, newdata = newdata)

pred2
##     1 
## 2.875
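
For comparison, converting Area to a factor turns this into a proper classification problem; a minimal sketch (the prediction then comes back as a class label instead of a leaf mean):

oliveFac <- olive
oliveFac$Area <- as.factor(oliveFac$Area)  # treat the region code as categorical
modClass <- train(Area ~ ., 
                  method = "rpart", 
                  data = oliveFac)
predict(modClass, newdata = newdata)       # returns a factor level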

Question 3: SAheart data set. Fit a logistic regression (caret method "glm" with family = "binomial") with chd as the outcome and age, alcohol, obesity, tobacco, typea and ldl as predictors, then estimate the misclassification rate on the training and test sets.

data(SAheart)
SAheart$chd <- as.factor(SAheart$chd)

set.seed(8484)
train = sample(1:dim(SAheart)[1],size=dim(SAheart)[1]/2,replace=F)
trainSA = SAheart[train,]
testSA = SAheart[-train,]


modFit <- train(chd ~ age + alcohol + obesity + tobacco + typea + ldl,
                data = trainSA,
                method = "glm",
                family = "binomial")


# chd is a factor and predict() returns class labels rather than probabilities,
# so the misclassification rate is just the fraction of mismatched labels
# (the original probability-threshold version silently miscounted on factors)
missClass = function(values, prediction){mean(values != prediction)}

testPC <- predict(modFit, testSA)
trainPC <- predict(modFit, trainSA)
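
Because chd is a factor, predict() returns hard class labels. If class probabilities are needed (e.g. to apply a custom threshold), caret can return them; a minimal sketch:

probTest <- predict(modFit, testSA, type = "prob")  # one probability column per class
head(probTest)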

# note: confusionMatrix(data, reference) expects the predictions first; passing
# the actual classes first (here and below) transposes the Prediction/Reference
# labels, though Accuracy is unaffected
confusionMatrix(testSA$chd, testPC)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 117  38
##          1  34  42
##                                           
##                Accuracy : 0.6883          
##                  95% CI : (0.6243, 0.7474)
##     No Information Rate : 0.6537          
##     P-Value [Acc > NIR] : 0.1497          
##                                           
##                   Kappa : 0.3034          
##  Mcnemar's Test P-Value : 0.7237          
##                                           
##             Sensitivity : 0.7748          
##             Specificity : 0.5250          
##          Pos Pred Value : 0.7548          
##          Neg Pred Value : 0.5526          
##              Prevalence : 0.6537          
##          Detection Rate : 0.5065          
##    Detection Prevalence : 0.6710          
##       Balanced Accuracy : 0.6499          
##                                           
##        'Positive' Class : 0               
## 
confusionMatrix(trainSA$chd, trainPC)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 124  23
##          1  40  44
##                                          
##                Accuracy : 0.7273         
##                  95% CI : (0.665, 0.7836)
##     No Information Rate : 0.71           
##     P-Value [Acc > NIR] : 0.30862        
##                                          
##                   Kappa : 0.384          
##  Mcnemar's Test P-Value : 0.04382        
##                                          
##             Sensitivity : 0.7561         
##             Specificity : 0.6567         
##          Pos Pred Value : 0.8435         
##          Neg Pred Value : 0.5238         
##              Prevalence : 0.7100         
##          Detection Rate : 0.5368         
##    Detection Prevalence : 0.6364         
##       Balanced Accuracy : 0.7064         
##                                          
##        'Positive' Class : 0              
## 
missClass(trainSA$chd, trainPC)
## [1] 0.2727273
missClass(testSA$chd, testPC)
## [1] 0.3116883
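
As a sanity check, these rates equal 1 minus the Accuracy values reported by confusionMatrix above:

1 - confusionMatrix(trainSA$chd, trainPC)$overall["Accuracy"]  # 0.2727273
1 - confusionMatrix(testSA$chd, testPC)$overall["Accuracy"]    # 0.3116883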

Question 4: vowel.train data set. Fit a random forest predicting y from the remaining variables, then use varImp to rank the variables by importance.

data(vowel.train)
data(vowel.test)

vowel.train$y <- as.factor(vowel.train$y)
vowel.test$y <- as.factor(vowel.test$y)
set.seed(33833)

rf <- randomForest(y ~., data = vowel.train)
order(varImp(rf))  # indices sorted from least to most important
##  [1] 10  7  3  9  4  8  6  5  1  2
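
To list the variables from most to least important instead, sort the importance scores in decreasing order; a minimal sketch:

imp <- varImp(rf)  # a data frame with one Overall importance score per predictor
rownames(imp)[order(imp$Overall, decreasing = TRUE)]  # most important first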