library(tidyverse)
## -- Attaching packages ------------------------------------------------------------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 2.2.1 v purrr 0.2.4
## v tibble 1.4.1 v dplyr 0.7.4
## v tidyr 0.7.2 v stringr 1.2.0
## v readr 1.1.1 v forcats 0.2.0
## -- Conflicts ---------------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(caret)
## Warning: package 'caret' was built under R version 3.4.4
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
library(rpart)
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 3.4.4
library(ggdendro)
## Warning: package 'ggdendro' was built under R version 3.4.4
# Load the cleaned abalone data. stringsAsFactors = TRUE is spelled out because
# R >= 4.0 reads strings as character by default, while the confusionMatrix()
# call later in this script needs `sex` to be a factor.
abalone_clean <- read.csv(
  "C:/LocalFiles/Documents/Freshman TSU/STAT-220/HW 8/abalone_clean.csv",
  stringsAsFactors = TRUE
)
# View() opens a data viewer, so only call it in an interactive session.
if (interactive()) {
  View(abalone_clean)
}
1) You do not have to do any pre-processing of the abalone_clean data set, so use what you had from before.
#a) Eliminate the X1 variable using
# Keep columns 2 through 10, i.e. drop the first (index) column.
abalone_tree <- abalone_clean[2:10]
#b) Make a second copy of your dataset,
# Keep only the rows whose sex is not "I". droplevels() then removes the
# now-empty "I" level from `sex`, so rpart models a two-class response instead
# of three classes with one of them empty. dplyr::filter is named explicitly
# because several attached packages mask each other's functions.
abalone_MF <- droplevels(dplyr::filter(abalone_tree, sex != "I"))
2) Make a CART tree using rpart on the entire abalone_MF dataset.
# Fit a CART model of sex on every other column of abalone_MF, then draw it.
cart.abalone_MF <- rpart(
  sex ~ .,
  data = abalone_MF,
  na.action = na.rpart
)
rpart.plot(x = cart.abalone_MF)
#a) What variables are important? What predicts each (of the 2) genders?
#It looks like height, shucked weight, and diameter are important. Females only show up after diameter and shucked weight are used.
#b) Make a visualization of your classification tree, using the rpart.plot command.
# Draw the two-sex classification tree with rpart.plot's default settings.
rpart.plot(x = cart.abalone_MF)

#c) Extra credit. Download and install one of the ggplot tree packages and make your tree with it.
#hc <- hclust(dist(abalone_MF$sex), "ave")
#ggdendrogram(hc, rotate = FALSE, size = 2)
3) Make a second CART tree, using all 3 genders from abalone_tree.
#a) Use set.seed (1234), then createDataPartition for training/test (65%/35%) sets
# Seed the RNG so the train/test split is reproducible.
# NOTE(review): the assignment text asks for set.seed(1234). The results
# printed further down were produced with 2234, so the seed is left as-is;
# switching to 1234 means re-running and refreshing every recorded output.
set.seed(2234)
# Split abalone_tree (all three sexes, first column already dropped) rather
# than abalone_clean, so the index column is never offered to rpart as a
# predictor through `sex ~ .`.
trainIndex <- createDataPartition(abalone_tree$sex, p = 0.65, list = FALSE)
abaloneTrain <- abalone_tree[trainIndex, ]
abaloneTest <- abalone_tree[-trainIndex, ]
#b) Run the analysis on the training set, similar to what you did above.
# Fit the three-sex CART model on the training rows and print its splits.
cart.abaloneTrain <- rpart(
  sex ~ .,
  data = abaloneTrain,
  na.action = na.rpart
)
print(cart.abaloneTrain)
## n= 2717
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 2717 1723 M (0.31284505 0.32131027 0.36584468)
## 2) whole.weight>=0.61725 1695 908 M (0.41946903 0.11622419 0.46430678)
## 4) whole.weight>=0.87525 1199 609 M (0.46288574 0.04503753 0.49207673)
## 8) shucked.weight< 0.87325 1137 593 M (0.47405453 0.04749340 0.47845207)
## 16) length>=0.6525 265 108 F (0.59245283 0.01886792 0.38867925) *
## 17) length< 0.6525 872 431 M (0.43807339 0.05619266 0.50573394) *
## 9) shucked.weight>=0.87325 62 16 M (0.25806452 0.00000000 0.74193548) *
## 5) whole.weight< 0.87525 496 299 M (0.31451613 0.28830645 0.39717742) *
## 3) whole.weight< 0.61725 1022 346 I (0.13600783 0.66144814 0.20254403) *
# Draw the training-set tree with rpart.plot's default settings.
rpart.plot(x = cart.abaloneTrain)

#c) How is this model different from your 2 gender model? Which variables matter?
#whole.weight, shucked.weight, and length matter for this one. This model has all three genders displayed.
#d) This time, make a tree (add colors) using prp
# Three renderings of the same training-set tree, each with different
# display options.
rpart.plot(
  cart.abaloneTrain,
  digits = 3,
  cex = 0.7,
  extra = 2,
  under = TRUE
)

# prp version with the "PuBu" box palette (cooler).
prp(
  cart.abaloneTrain,
  digits = 3,
  extra = 101,
  cex = 0.7,
  box.palette = "PuBu"
)

# Same options passed to rpart.plot instead of prp.
rpart.plot(
  cart.abaloneTrain,
  digits = 3,
  extra = 101,
  cex = 0.7,
  box.palette = "PuBu"
)

#e) Validate the model on the test data set
#i) Use predict to do internal validation on the training set.
# Predict a sex class for every row of the held-out test set, then count how
# many rows were assigned to each class.
abaloneclean.rpart <- predict(
  cart.abaloneTrain,
  newdata = abaloneTest,
  na.action = na.pass,
  type = "class"
)
summary(abaloneclean.rpart)
## F I M
## 133 561 766
#ii) Make a simple table using table
# Cross-tabulate actual sex (rows) against predicted sex (columns), keeping an
# NA row and column even when they are empty.
table(
  abaloneTest$sex,
  abaloneclean.rpart,
  useNA = "always"
)
## abaloneclean.rpart
## F I M <NA>
## F 68 89 300 0
## I 2 354 113 0
## M 63 118 353 0
## <NA> 0 0 0 0
#iii) Make a confusionMatrix
# Compare the predicted classes against the actual test-set classes.
confusionMatrix(
  data = abaloneclean.rpart,
  reference = abaloneTest$sex
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction F I M
## F 68 2 63
## I 89 354 118
## M 300 113 353
##
## Overall Statistics
##
## Accuracy : 0.5308
## 95% CI : (0.5048, 0.5567)
## No Information Rate : 0.3658
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.285
## Mcnemar's Test P-Value : < 2.2e-16
##
## Statistics by Class:
##
## Class: F Class: I Class: M
## Sensitivity 0.14880 0.7548 0.6610
## Specificity 0.93519 0.7911 0.5540
## Pos Pred Value 0.51128 0.6310 0.4608
## Neg Pred Value 0.70686 0.8721 0.7392
## Prevalence 0.31301 0.3212 0.3658
## Detection Rate 0.04658 0.2425 0.2418
## Detection Prevalence 0.09110 0.3842 0.5247
## Balanced Accuracy 0.54200 0.7730 0.6075
#iv) Was our model validated? How well did it work on the test set?
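#Our model classified about 53 percent of the test set correctly (accuracy 0.5308), so it did not do very well overall. The very small p-value (< 2.2e-16) does show that it beats the no-information rate of 0.3658. It was quite specific for F (0.94) and I (0.79), and the negative predictive values were fairly high (0.71 to 0.87), but sensitivity for F was only 0.15 because most females were predicted as males.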