library(tidyverse)
## -- Attaching packages ------------------------------------------------------------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 2.2.1     v purrr   0.2.4
## v tibble  1.4.1     v dplyr   0.7.4
## v tidyr   0.7.2     v stringr 1.2.0
## v readr   1.1.1     v forcats 0.2.0
## -- Conflicts ---------------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(caret)
## Warning: package 'caret' was built under R version 3.4.4
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
library(rpart) 
library(MASS) 
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 3.4.4
library(ggdendro)
## Warning: package 'ggdendro' was built under R version 3.4.4
# Load the pre-cleaned abalone data.
# stringsAsFactors = TRUE keeps text columns such as `sex` as factors on
# R >= 4.0, where the read.csv default changed to FALSE; the classification
# steps further down (confusionMatrix in particular) expect factor labels.
# NOTE(review): absolute, machine-specific path -- the script only runs where
# this file exists; consider a project-relative path.
abalone_clean <- read.csv(
  "C:/LocalFiles/Documents/Freshman TSU/STAT-220/HW 8/abalone_clean.csv",
  stringsAsFactors = TRUE
)
# View() opens an interactive data viewer; skip it when the script is run
# non-interactively (Rscript, knitting), where it can fail.
if (interactive()) {
  View(abalone_clean)
}

# 1) You do not have to do any pre-processing of the abalone_clean data set, so use what you had from before.

#a) Eliminate the X1 variable: keep columns 2 through 10, which drops the
#   first column (presumably the row index written out with the CSV).
abalone_tree <- abalone_clean[2:10]

#b) Make a second copy of the dataset containing only males and females.
#   dplyr::filter is named explicitly so the call does not depend on package
#   load order (stats::filter shares the name). droplevels() then removes the
#   now-empty "I" level when sex is a factor, so the two-sex model is fit as a
#   two-class problem instead of three classes with one empty.
abalone_MF <- droplevels(dplyr::filter(abalone_tree, sex != "I"))

# 2) Make a CART tree using rpart on the entire abalone_MF dataset.

# Fit a classification tree that predicts sex from every other column of
# abalone_MF, then draw it.
cart.abalone_MF <- rpart(
  formula = sex ~ .,
  data = abalone_MF,
  na.action = na.rpart
)
rpart.plot(x = cart.abalone_MF)

#a) What variables are important? What predicts each (of the 2) genders?
# It looks like height, shucked weight, and diameter are important. Females
# only show up after diameter and shucked weight are used.

#b) Make a visualization of your classification tree, using the rpart.plot command.
rpart.plot(x = cart.abalone_MF)

#c) Extra credit. Download and install one of the ggplot tree packages and make your tree with it.
# (Attempt left disabled.)
#hc <- hclust(dist(abalone_MF$sex), "ave")
#ggdendrogram(hc, rotate = FALSE, size = 2)

# 3) Make a second CART tree, using all 3 genders from abalone_tree.

#a) Use set.seed, then createDataPartition for training/test (65%/35%) sets.
# NOTE(review): the assignment asks for set.seed(1234) but this script uses
# 2234. The seed is left unchanged because the output recorded in this file
# was produced with it -- confirm which seed is wanted.
set.seed(2234)
# Partition abalone_tree (first column already removed) rather than
# abalone_clean, so the leftover index column is not offered to `sex ~ .` as
# a predictor. createDataPartition only receives the sex labels, so the rows
# selected for a given seed are the same either way.
trainIndex <- createDataPartition(abalone_tree$sex, p = 0.65, list = FALSE)
abaloneTrain <- abalone_tree[trainIndex, ]
abaloneTest <- abalone_tree[-trainIndex, ]

#b) Run the analysis on the training set, similar to what you did above.
# Classification tree for sex using all remaining columns of abaloneTrain.
cart.abaloneTrain <- rpart(
  formula = sex ~ .,
  data = abaloneTrain,
  na.action = na.rpart
)
# Show the fitted splits as text.
print(cart.abaloneTrain)
## n= 2717 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
##  1) root 2717 1723 M (0.31284505 0.32131027 0.36584468)  
##    2) whole.weight>=0.61725 1695  908 M (0.41946903 0.11622419 0.46430678)  
##      4) whole.weight>=0.87525 1199  609 M (0.46288574 0.04503753 0.49207673)  
##        8) shucked.weight< 0.87325 1137  593 M (0.47405453 0.04749340 0.47845207)  
##         16) length>=0.6525 265  108 F (0.59245283 0.01886792 0.38867925) *
##         17) length< 0.6525 872  431 M (0.43807339 0.05619266 0.50573394) *
##        9) shucked.weight>=0.87325 62   16 M (0.25806452 0.00000000 0.74193548) *
##      5) whole.weight< 0.87525 496  299 M (0.31451613 0.28830645 0.39717742) *
##    3) whole.weight< 0.61725 1022  346 I (0.13600783 0.66144814 0.20254403) *
# Default drawing of the three-sex training tree.
rpart.plot(x = cart.abaloneTrain)

#c) How is this model different from your 2 gender model? Which variables matter?
# whole.weight, shucked.weight, and length matter for this one. This model
# has all three genders displayed.

#d) This time, make a tree (add colors) using prp
# Variant 1: rpart.plot with per-node counts (extra = 2) placed under the box.
rpart.plot(
  cart.abaloneTrain,
  under = TRUE,
  extra = 2,
  digits = 3,
  cex = .7
)

# Variant 2: prp with a coloured box palette.   #cooler
prp(
  cart.abaloneTrain,
  box.palette = "PuBu",
  extra = 101,
  digits = 3,
  cex = .7
)

# Variant 3: same settings as variant 2, drawn with rpart.plot.
rpart.plot(
  cart.abaloneTrain,
  box.palette = "PuBu",
  extra = 101,
  digits = 3,
  cex = .7
)

#e) Validate the model on the test data set
#i) Use predict to validate the training-set model; the predictions here are
#   made for the held-out test rows (abaloneTest).
abaloneclean.rpart <- predict(
  cart.abaloneTrain,
  newdata = abaloneTest,
  type = "class",
  na.action = na.pass
)
# Count of predictions per class.
summary(abaloneclean.rpart)
##   F   I   M 
## 133 561 766
#ii) Make a simple table using table
# Rows are the actual sex in the test set, columns the predicted class;
# useNA = "always" adds NA margins so missing values would be visible.
table(
  abaloneTest[["sex"]],
  abaloneclean.rpart,
  useNA = "always"
)
##       abaloneclean.rpart
##          F   I   M <NA>
##   F     68  89 300    0
##   I      2 354 113    0
##   M     63 118 353    0
##   <NA>   0   0   0    0
#iii) Make a confusionMatrix
# Arguments are named so predictions and truth cannot be swapped by accident.
# The reference labels are coerced to a factor carrying the prediction's
# levels: caret's confusionMatrix requires two factors with the same levels
# and stops on a character vector (what read.csv yields by default on
# R >= 4.0). When sex is already a factor with those levels this is a no-op.
confusionMatrix(
  data = abaloneclean.rpart,
  reference = factor(abaloneTest$sex, levels = levels(abaloneclean.rpart))
)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   F   I   M
##          F  68   2  63
##          I  89 354 118
##          M 300 113 353
## 
## Overall Statistics
##                                           
##                Accuracy : 0.5308          
##                  95% CI : (0.5048, 0.5567)
##     No Information Rate : 0.3658          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.285           
##  Mcnemar's Test P-Value : < 2.2e-16       
## 
## Statistics by Class:
## 
##                      Class: F Class: I Class: M
## Sensitivity           0.14880   0.7548   0.6610
## Specificity           0.93519   0.7911   0.5540
## Pos Pred Value        0.51128   0.6310   0.4608
## Neg Pred Value        0.70686   0.8721   0.7392
## Prevalence            0.31301   0.3212   0.3658
## Detection Rate        0.04658   0.2425   0.2418
## Detection Prevalence  0.09110   0.3842   0.5247
## Balanced Accuracy     0.54200   0.7730   0.6075
#iv) Was our model validated? How well did it work on the test set?
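#Our model classified about 53 percent of the test cases correctly (accuracy 0.5308), so it did not do super well, although the very small p-value (< 2.2e-16) shows this is significantly better than the no-information rate of 0.3658.  It was pretty specific and had high negative predictive values for F and I (specificity 0.935 and 0.791, NPV 0.707 and 0.872), but sensitivity for F was only 0.149, so most females were misclassified (mostly as M).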