#Install every package this analysis depends on.
#All packages come from one CRAN mirror. The ROCR line used to point at a
#misspelled host ("cran.rstduio.com"), so ROCR could not be downloaded on a
#machine where it was not already installed.
cran_repo <- "https://cran.rstudio.com"
required_packages <- c(
  "ridittools", "Amelia", "ggvis", "lattice", "ggplot2", "e1071",
  "caret", "FactoMineR", "ROCR", "arules", "mlr"
)
#install.packages() accepts a character vector, so one call covers them all.
install.packages(required_packages, repos = cran_repo)
#Load packages into working memory:
library("ridittools")
library("Amelia")
## Loading required package: Rcpp
## ## 
## ## Amelia II: Multiple Imputation
## ## (Version 1.7.5, built: 2018-05-07)
## ## Copyright (C) 2005-2018 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##
library("ggvis")
library("lattice")
library("ggplot2")
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:ggvis':
## 
##     resolution
library("e1071")
library("caret")
library("FactoMineR")
library("ROCR")
## Loading required package: gplots
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
library("arules")
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following object is masked from 'package:ggvis':
## 
##     band
## 
## Attaching package: 'arules'
## The following objects are masked from 'package:base':
## 
##     abbreviate, write
library("mlr")
## Loading required package: ParamHelpers
## 
## Attaching package: 'mlr'
## The following object is masked from 'package:ROCR':
## 
##     performance
## The following object is masked from 'package:caret':
## 
##     train
## The following object is masked from 'package:e1071':
## 
##     impute
## The following object is masked from 'package:ridittools':
## 
##     acc

Part 1: Data Import and Transformation

#Read the mammographic-mass data set from disk and show its structure.
#NOTE(review): the path is absolute and machine-specific; it has to be
#adjusted before the script can run anywhere else.
mammMasses <- read.csv(file = "c:/users/Joshu/documents/datasets/mammMasses.csv")
str(mammMasses)
## 'data.frame':    829 obs. of  6 variables:
##  $ BI_RADS: int  5 5 4 5 5 3 4 4 4 3 ...
##  $ Age    : int  67 58 28 57 76 42 36 60 54 52 ...
##  $ Shape  : int  3 4 1 1 1 2 3 2 1 3 ...
##  $ Margin : int  5 5 1 5 4 1 1 1 1 4 ...
##  $ Density: int  3 3 3 3 3 3 2 2 3 3 ...
##  $ Class  : int  1 1 0 1 1 1 0 0 0 0 ...
#Reorder the columns so the target (Class) comes first, then Age, then the
#remaining predictors. Selecting by name gives the same order as the
#positional index c(6,2,1,3,4,5) on the layout printed above.
mammMasses <- mammMasses[
  c("Class", "Age", "BI_RADS", "Shape", "Margin", "Density")
]
str(mammMasses)
## 'data.frame':    829 obs. of  6 variables:
##  $ Class  : int  1 1 0 1 1 1 0 0 0 0 ...
##  $ Age    : int  67 58 28 57 76 42 36 60 54 52 ...
##  $ BI_RADS: int  5 5 4 5 5 3 4 4 4 3 ...
##  $ Shape  : int  3 4 1 1 1 2 3 2 1 3 ...
##  $ Margin : int  5 5 1 5 4 1 1 1 1 4 ...
##  $ Density: int  3 3 3 3 3 3 2 2 3 3 ...

Create and Transform Dot Products for Each Variable combination

#Work on a copy so mammMasses itself is left untouched.
mammMassesDot <- mammMasses

#Pull every predictor out as its own one-column data frame.
#(Column 1 is the target, Class, and is skipped.)
dot1 <- mammMassesDot["Age"]
dot2 <- mammMassesDot["BI_RADS"]
dot3 <- mammMassesDot["Shape"]
dot4 <- mammMassesDot["Margin"]
dot5 <- mammMassesDot["Density"]

#Derive one new variable per pair of predictors (10 pairs in total):
# 1*2, 1*3, 1*4, 1*5, 2*3, 2*4, 2*5, 3*4, 3*5, 4*5
#Each one is the row-by-row (element-wise) product of the two columns.

#Age paired with each of the other four predictors
dotp1 <- dot1 * dot2
dotp2 <- dot1 * dot3
dotp3 <- dot1 * dot4
dotp4 <- dot1 * dot5
#BI_RADS paired with Shape, Margin, Density
dotp5 <- dot2 * dot3
dotp6 <- dot2 * dot4
dotp7 <- dot2 * dot5
#Shape paired with Margin, Density
dotp8 <- dot3 * dot4
dotp9 <- dot3 * dot5
#Margin paired with Density
dotp10 <- dot4 * dot5

#Column bind the ten products (still raw integers at this point) into
#mammMassesDot, replacing the copy made above.
mammMassesDot <- cbind(
  dotp1, dotp2, dotp3, dotp4, dotp5,
  dotp6, dotp7, dotp8, dotp9, dotp10
)

#Name each product column after the two variables it is derived from.
names(mammMassesDot) <- c(
  "age_birads", "age_shape", "age_margin", "age_density",
  "birads_shape", "birads_margin", "birads_density",
  "shape_margin", "shape_density",
  "margin_density"
)

#Append the product columns (mammMassesDot) to the right of mammMasses.
mammMasses2 <- cbind(mammMasses, mammMassesDot)
str(mammMasses2)
## 'data.frame':    829 obs. of  16 variables:
##  $ Class         : int  1 1 0 1 1 1 0 0 0 0 ...
##  $ Age           : int  67 58 28 57 76 42 36 60 54 52 ...
##  $ BI_RADS       : int  5 5 4 5 5 3 4 4 4 3 ...
##  $ Shape         : int  3 4 1 1 1 2 3 2 1 3 ...
##  $ Margin        : int  5 5 1 5 4 1 1 1 1 4 ...
##  $ Density       : int  3 3 3 3 3 3 2 2 3 3 ...
##  $ age_birads    : int  335 290 112 285 380 126 144 240 216 156 ...
##  $ age_shape     : int  201 232 28 57 76 84 108 120 54 156 ...
##  $ age_margin    : int  335 290 28 285 304 42 36 60 54 208 ...
##  $ age_density   : int  201 174 84 171 228 126 72 120 162 156 ...
##  $ birads_shape  : int  15 20 4 5 5 6 12 8 4 9 ...
##  $ birads_margin : int  25 25 4 25 20 3 4 4 4 12 ...
##  $ birads_density: int  15 15 12 15 15 9 8 8 12 9 ...
##  $ shape_margin  : int  15 20 1 5 4 2 3 2 1 12 ...
##  $ shape_density : int  9 12 3 3 3 6 6 4 3 9 ...
##  $ margin_density: int  15 15 3 15 12 3 2 2 3 12 ...
#Discretize Age and the ten product variables.
#Each entry maps a column of mammMasses2 to the number of bins it is cut
#into; discretize() is called with its default method for every column.
bin_counts <- c(
  Age = 7,
  age_birads = 5, age_shape = 5, age_margin = 5, age_density = 5,
  birads_shape = 5, birads_margin = 5,
  birads_density = 3, shape_margin = 3, shape_density = 3,
  margin_density = 3
)
for (bin_col in names(bin_counts)) {
  mammMasses2[[bin_col]] <- discretize(
    mammMasses2[[bin_col]],
    breaks = bin_counts[[bin_col]]
  )
}

str(mammMasses2)
## 'data.frame':    829 obs. of  16 variables:
##  $ Class         : int  1 1 0 1 1 1 0 0 0 0 ...
##  $ Age           : Factor w/ 7 levels "[18,40)","[40,47)",..: 6 4 1 4 7 2 1 5 4 3 ...
##   ..- attr(*, "discretized:breaks")= num  18 40 47 54 59 65 71 96
##   ..- attr(*, "discretized:method")= chr "frequency"
##  $ BI_RADS       : int  5 5 4 5 5 3 4 4 4 3 ...
##  $ Shape         : int  3 4 1 1 1 2 3 2 1 3 ...
##  $ Margin        : int  5 5 1 5 4 1 1 1 1 4 ...
##  $ Density       : int  3 3 3 3 3 3 2 2 3 3 ...
##  $ age_birads    : Factor w/ 5 levels "[0,172)","[172,220)",..: 5 4 1 4 5 1 1 3 2 1 ...
##   ..- attr(*, "discretized:breaks")= num  0 172 220 264 320 480
##   ..- attr(*, "discretized:method")= chr "frequency"
##  $ age_shape     : Factor w/ 5 levels "[18,63)","[63,118)",..: 4 4 1 1 2 2 2 3 1 3 ...
##   ..- attr(*, "discretized:breaks")= num  18 63 118 200 260 352
##   ..- attr(*, "discretized:method")= chr "frequency"
##  $ age_margin    : Factor w/ 5 levels "[18,48)","[48,108)",..: 5 5 1 5 5 1 1 2 2 3 ...
##   ..- attr(*, "discretized:breaks")= num  18 48 108 213 269 ...
##   ..- attr(*, "discretized:method")= chr "frequency"
##  $ age_density   : Factor w/ 5 levels "[40,120)","[120,156)",..: 4 3 1 3 5 2 1 2 3 3 ...
##   ..- attr(*, "discretized:breaks")= num  40 120 156 177 204 308
##   ..- attr(*, "discretized:method")= chr "frequency"
##  $ birads_shape  : Factor w/ 5 levels "[0,4)","[4,8)",..: 3 5 2 2 2 2 3 3 2 3 ...
##   ..- attr(*, "discretized:breaks")= num  0 4 8 16 20 24
##   ..- attr(*, "discretized:method")= chr "frequency"
##  $ birads_margin : Factor w/ 5 levels "[0,4)","[4,8)",..: 5 5 2 5 5 1 2 2 2 3 ...
##   ..- attr(*, "discretized:breaks")= num  0 4 8 16 20 30
##   ..- attr(*, "discretized:method")= chr "frequency"
##  $ birads_density: Factor w/ 3 levels "[0,12)","[12,15)",..: 3 3 2 3 3 1 1 1 2 1 ...
##   ..- attr(*, "discretized:breaks")= num  0 12 15 20
##   ..- attr(*, "discretized:method")= chr "frequency"
##  $ shape_margin  : Factor w/ 3 levels "[1,2)","[2,16)",..: 2 3 1 2 2 2 2 2 1 2 ...
##   ..- attr(*, "discretized:breaks")= num  1 2 16 20
##   ..- attr(*, "discretized:method")= chr "frequency"
##  $ shape_density : Factor w/ 3 levels "[1,6)","[6,12)",..: 2 3 1 1 1 2 2 1 1 2 ...
##   ..- attr(*, "discretized:breaks")= num  1 6 12 16
##   ..- attr(*, "discretized:method")= chr "frequency"
##  $ margin_density: Factor w/ 3 levels "[1,3)","[3,12)",..: 3 3 2 3 3 2 1 1 2 3 ...
##   ..- attr(*, "discretized:breaks")= num  1 3 12 20
##   ..- attr(*, "discretized:method")= chr "frequency"
#Using ggvis, make a bar chart of the value counts for each variable.
#With this version of the dataset, the intervals of discretization are visualized in the histograms:
#Target and the five original predictors (Margin was previously left out):
mammMasses2 %>% ggvis(~Class) %>% layer_bars()
## Warning: package 'bindrcpp' was built under R version 3.4.4
mammMasses2 %>% ggvis(~Age) %>% layer_bars()
mammMasses2 %>% ggvis(~BI_RADS) %>% layer_bars()
mammMasses2 %>% ggvis(~Shape) %>% layer_bars()
mammMasses2 %>% ggvis(~Margin) %>% layer_bars()
mammMasses2 %>% ggvis(~Density) %>% layer_bars()
#The ten discretized pairwise products:
mammMasses2 %>% ggvis(~age_birads) %>% layer_bars()
mammMasses2 %>% ggvis(~age_shape) %>% layer_bars()
mammMasses2 %>% ggvis(~age_margin) %>% layer_bars()
mammMasses2 %>% ggvis(~age_density) %>% layer_bars()
mammMasses2 %>% ggvis(~birads_shape) %>% layer_bars()
mammMasses2 %>% ggvis(~birads_margin) %>% layer_bars()
mammMasses2 %>% ggvis(~birads_density) %>% layer_bars()
mammMasses2 %>% ggvis(~shape_margin) %>% layer_bars()
mammMasses2 %>% ggvis(~shape_density) %>% layer_bars()
mammMasses2 %>% ggvis(~margin_density) %>% layer_bars()
#For modeling, the discretized columns should be plain integers.
#Start from a copy of mammMasses2 and replace every binned factor with the
#integer index of its level; the result is saved as mammMassesFinal.
mammMassesFinal <- mammMasses2

binned_cols <- c(
  "Age",
  "age_birads", "age_shape", "age_margin", "age_density",
  "birads_shape", "birads_margin", "birads_density",
  "shape_margin", "shape_density",
  "margin_density"
)
mammMassesFinal[binned_cols] <- lapply(mammMassesFinal[binned_cols], as.integer)

str(mammMassesFinal)
## 'data.frame':    829 obs. of  16 variables:
##  $ Class         : int  1 1 0 1 1 1 0 0 0 0 ...
##  $ Age           : int  6 4 1 4 7 2 1 5 4 3 ...
##  $ BI_RADS       : int  5 5 4 5 5 3 4 4 4 3 ...
##  $ Shape         : int  3 4 1 1 1 2 3 2 1 3 ...
##  $ Margin        : int  5 5 1 5 4 1 1 1 1 4 ...
##  $ Density       : int  3 3 3 3 3 3 2 2 3 3 ...
##  $ age_birads    : int  5 4 1 4 5 1 1 3 2 1 ...
##  $ age_shape     : int  4 4 1 1 2 2 2 3 1 3 ...
##  $ age_margin    : int  5 5 1 5 5 1 1 2 2 3 ...
##  $ age_density   : int  4 3 1 3 5 2 1 2 3 3 ...
##  $ birads_shape  : int  3 5 2 2 2 2 3 3 2 3 ...
##  $ birads_margin : int  5 5 2 5 5 1 2 2 2 3 ...
##  $ birads_density: int  3 3 2 3 3 1 1 1 2 1 ...
##  $ shape_margin  : int  2 3 1 2 2 2 2 2 1 2 ...
##  $ shape_density : int  2 3 1 1 1 2 2 1 1 2 ...
##  $ margin_density: int  3 3 2 3 3 2 1 1 2 3 ...
#Save the final modeling table as a CSV file (relative path).
#NOTE(review): write.csv() writes row names unless row.names = FALSE is given,
#so the file presumably gains an extra leading index column; confirm that any
#later reader of this file expects it.
write.csv(mammMassesFinal, file = "mammMassesFinal.csv")

Part 2: Naive-Bayes Modeling, Analysis, and Visualization

#Testing and training datasets:
#a 20:80 partition would give 166:663 rows, but the split is disabled.
#train <- mammMassesFinal[1:166,]
#test <- mammMassesFinal[167:829,2:16]
#NOTE(review): with the split commented out, the model below is fit on every
#row of mammMassesFinal.

#Call naiveBayes() on mammMassesFinal: Class is modeled from all other columns.
NBModel <- naiveBayes(Class ~ ., data = mammMassesFinal)
NBModel
## 
## Naive Bayes Classifier for Discrete Predictors
## 
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
## 
## A-priori probabilities:
## Y
##         0         1 
## 0.5150784 0.4849216 
## 
## Conditional probabilities:
##    Age
## Y       [,1]     [,2]
##   0 3.189696 1.830643
##   1 5.037313 1.770104
## 
##    BI_RADS
## Y       [,1]      [,2]
##   0 3.983607 0.5282739
##   1 4.703980 0.6429623
## 
##    Shape
## Y       [,1]      [,2]
##   0 2.100703 1.1012901
##   1 3.502488 0.9402307
## 
##    Margin
## Y       [,1]     [,2]
##   0 1.939110 1.384367
##   1 3.741294 1.168043
## 
##    Density
## Y       [,1]      [,2]
##   0 2.892272 0.3785458
##   1 2.940299 0.3180645
## 
##    age_birads
## Y       [,1]     [,2]
##   0 2.227166 1.126885
##   1 3.888060 1.166835
## 
##    age_shape
## Y       [,1]     [,2]
##   0 2.189696 1.152294
##   1 3.900498 1.100918
## 
##    age_margin
## Y       [,1]     [,2]
##   0 2.168618 1.162662
##   1 3.890547 1.077061
## 
##    age_density
## Y       [,1]     [,2]
##   0 2.435597 1.275426
##   1 3.694030 1.252793
## 
##    birads_shape
## Y       [,1]      [,2]
##   0 2.805621 0.8866068
##   1 4.211443 1.0605454
## 
##    birads_margin
## Y       [,1]     [,2]
##   0 2.583138 1.054610
##   1 4.164179 1.104473
## 
##    birads_density
## Y       [,1]      [,2]
##   0 1.913349 0.4742028
##   1 2.614428 0.6181711
## 
##    shape_margin
## Y       [,1]      [,2]
##   0 1.791569 0.6686291
##   1 2.532338 0.5825381
## 
##    shape_density
## Y       [,1]      [,2]
##   0 1.756440 0.7422049
##   1 2.609453 0.6427887
## 
##    margin_density
## Y       [,1]      [,2]
##   0 2.133489 0.5249438
##   1 2.659204 0.4951408
#Create a Classification Task for the model
#library() stops with an error when mlr is missing; require() would only
#return FALSE and let the script fail later with a less obvious message.
library(mlr)
task <- makeClassifTask(data = mammMassesFinal, target = "Class")

#initialize the NB Classifier
selected_model <- makeLearner("classif.naiveBayes")

#Train the Model:
#caret and mlr both export train(), so the mlr one is named explicitly and
#the call no longer depends on the order in which the packages were attached.
#Note that NBPred holds the trained model, not predictions.
NBPred <- mlr::train(selected_model, task)
NBPred
## Model for learner.id=classif.naiveBayes; learner.class=classif.naiveBayes
## Trained on: task.id = mammMassesFinal; obs = 829; features = 15
## Hyperparameters:
#Apply the trained model to columns 2:16 of mammMassesFinal, i.e. without
#passing on the target variable (Class, column 1).
#NOTE(review): these are the same rows the task was built from, so the
#figures below describe fit on the training data, not held-out accuracy.
predictions_mlr <- as.data.frame(predict(NBPred, newdata = mammMassesFinal[2:16]))

#Create a Confusion Matrix as First Test of Accuracy
#Rows come from the first column of predictions_mlr (presumably the predicted
#class), columns from the actual Class.
#library() instead of require(): a missing package stops the script at once.
library(caret)
table1 <- table(predictions_mlr[,1], mammMassesFinal$Class)
table1
##    
##       0   1
##   0 343  57
##   1  84 345
#Turn the confusion-matrix counts into proportions of all observations.
table2 <- prop.table(table1)
table2
##    
##              0          1
##   0 0.41375151 0.06875754
##   1 0.10132690 0.41616405
#Overall success rate in percent: the diagonal of table2 holds the cases where
#prediction and actual class agree. It is computed from table2 itself rather
#than from hand-copied, truncated numbers, so it stays correct if the model
#or the data change.
totalSuccessRate <- sum(diag(table2)) * 100
totalSuccessRate
## [1] 82.99156
#Everything that is not a correct prediction is an error.
totalErrorRate <- 100 - totalSuccessRate
totalErrorRate
## [1] 17.00844
#Save the trained model object (NBPred) as an RDS file (relative path).
#With a new dataset, the RDS model can be re-loaded and used 
#in a new predict() function:
saveRDS(NBPred, file = "initialNaiveBayesModel.rds")
#to restore: readRDS(file = "initialNaiveBayesModel.rds")

Total reduction of class error: 1%.

Part 3: RIDIT Transformation and Principal Component Analysis

#Create the input for the RIDIT transformation: every column of
#mammMassesFinal except the first one (the target variable, Class).
mammMassesRidit <- mammMassesFinal[-1]

#RIDIT-ize each variable: every column is passed to toridit() as a
#one-column data frame and kept in its own object (rid1 ... rid15).
#NOTE(review): toridit() is applied here to the raw per-row values, not to a
#table of category counts. The breaks that str() prints after the
#discretization step (roughly 0.2 / 0.4 / 0.6 / 0.8, with the first rows all
#in bin 1) look like a running total down the rows. Confirm this is the
#intended RIDIT score.
rid1 <- toridit(mammMassesRidit[1])
rid2 <- toridit(mammMassesRidit[2])
rid3 <- toridit(mammMassesRidit[3])
rid4 <- toridit(mammMassesRidit[4])
rid5 <- toridit(mammMassesRidit[5])
rid6 <- toridit(mammMassesRidit[6])
rid7 <- toridit(mammMassesRidit[7])
rid8 <- toridit(mammMassesRidit[8])
rid9 <- toridit(mammMassesRidit[9])
rid10 <- toridit(mammMassesRidit[10])
rid11 <- toridit(mammMassesRidit[11])
rid12 <- toridit(mammMassesRidit[12])
rid13 <- toridit(mammMassesRidit[13])
rid14 <- toridit(mammMassesRidit[14])
rid15 <- toridit(mammMassesRidit[15])

#Remake mammMassesRidit by column binding all transformed variables, then
#give every column its source name plus a "_RIDIT" suffix.
mammMassesRidit <- cbind(
  rid1, rid2, rid3, rid4, rid5,
  rid6, rid7, rid8, rid9, rid10,
  rid11, rid12, rid13, rid14, rid15
)
names(mammMassesRidit) <- c(
  "Age_RIDIT", "BI_RADS_RIDIT", "Shape_RIDIT", "Margin_RIDIT", "Density_RIDIT",
  "age_birads_RIDIT", "age_shape_RIDIT", "age_margin_RIDIT", "age_density_RIDIT",
  "birads_shape_RIDIT", "birads_margin_RIDIT", "birads_density_RIDIT",
  "shape_margin_RIDIT", "shape_density_RIDIT", "margin_density_RIDIT"
)
#Discretize the RIDITs:
#every column of mammMassesRidit is cut into 5 bins, one column at a time.
for (ridit_col in names(mammMassesRidit)) {
  mammMassesRidit[[ridit_col]] <- discretize(
    mammMassesRidit[[ridit_col]],
    breaks = 5
  )
}

str(mammMassesRidit)
## 'data.frame':    829 obs. of  15 variables:
##  $ Age_RIDIT           : Factor w/ 5 levels "[0.000886,0.21)",..: 1 1 1 1 1 1 1 1 1 1 ...
##   ..- attr(*, "discretized:breaks")= num  0.000886 0.209596 0.414172 0.602273 0.800266 ...
##   ..- attr(*, "discretized:method")= chr "frequency"
##  $ BI_RADS_RIDIT       : Factor w/ 5 levels "[0.000696,0.204)",..: 1 1 1 1 1 1 1 1 1 1 ...
##   ..- attr(*, "discretized:breaks")= num  0.000696 0.203703 0.405763 0.606487 0.807517 ...
##   ..- attr(*, "discretized:method")= chr "frequency"
##  $ Shape_RIDIT         : Factor w/ 5 levels "[0.000651,0.204)",..: 1 1 1 1 1 1 1 1 1 1 ...
##   ..- attr(*, "discretized:breaks")= num  0.000651 0.204252 0.41705 0.609197 0.800998 ...
##   ..- attr(*, "discretized:method")= chr "frequency"
##  $ Margin_RIDIT        : Factor w/ 5 levels "[0.00107,0.211)",..: 1 1 1 1 1 1 1 1 1 1 ...
##   ..- attr(*, "discretized:breaks")= num  0.00107 0.21055 0.41188 0.61728 0.80446 ...
##   ..- attr(*, "discretized:method")= chr "frequency"
##  $ Density_RIDIT       : Factor w/ 5 levels "[0.000621,0.194)",..: 1 1 1 1 1 1 1 1 1 1 ...
##   ..- attr(*, "discretized:breaks")= num  0.000621 0.194249 0.395987 0.601531 0.80211 ...
##   ..- attr(*, "discretized:method")= chr "frequency"
##  $ age_birads_RIDIT    : Factor w/ 5 levels "[0.000994,0.211)",..: 1 1 1 1 1 1 1 1 1 1 ...
##   ..- attr(*, "discretized:breaks")= num  0.000994 0.211337 0.417462 0.609626 0.81257 ...
##   ..- attr(*, "discretized:method")= chr "frequency"
##  $ age_shape_RIDIT     : Factor w/ 5 levels "[0.000799,0.204)",..: 1 1 1 1 1 1 1 1 1 1 ...
##   ..- attr(*, "discretized:breaks")= num  0.000799 0.203875 0.418698 0.606712 0.800759 ...
##   ..- attr(*, "discretized:method")= chr "frequency"
##  $ age_margin_RIDIT    : Factor w/ 5 levels "[0.001,0.209)",..: 1 1 1 1 1 1 1 1 1 1 ...
##   ..- attr(*, "discretized:breaks")= num  0.001 0.209 0.413 0.615 0.806 ...
##   ..- attr(*, "discretized:method")= chr "frequency"
##  $ age_density_RIDIT   : Factor w/ 5 levels "[0.000792,0.199)",..: 1 1 1 1 1 1 1 1 1 1 ...
##   ..- attr(*, "discretized:breaks")= num  0.000792 0.198535 0.406099 0.600871 0.804554 ...
##   ..- attr(*, "discretized:method")= chr "frequency"
##  $ birads_shape_RIDIT  : Factor w/ 5 levels "[0.000519,0.205)",..: 1 1 1 1 1 1 1 1 1 1 ...
##   ..- attr(*, "discretized:breaks")= num  0.000519 0.204739 0.414735 0.610688 0.808544 ...
##   ..- attr(*, "discretized:method")= chr "frequency"
##  $ birads_margin_RIDIT : Factor w/ 5 levels "[0.0009,0.21)",..: 1 1 1 1 1 1 1 1 1 1 ...
##   ..- attr(*, "discretized:breaks")= num  0.0009 0.21 0.4131 0.616 0.8063 ...
##   ..- attr(*, "discretized:method")= chr "frequency"
##  $ birads_density_RIDIT: Factor w/ 5 levels "[0.000803,0.196)",..: 1 1 1 1 1 1 1 1 1 1 ...
##   ..- attr(*, "discretized:breaks")= num  0.000803 0.195771 0.402623 0.613437 0.814668 ...
##   ..- attr(*, "discretized:method")= chr "frequency"
##  $ shape_margin_RIDIT  : Factor w/ 5 levels "[0.000561,0.205)",..: 1 1 1 1 1 1 1 1 1 1 ...
##   ..- attr(*, "discretized:breaks")= num  0.000561 0.204992 0.40903 0.606786 0.800673 ...
##   ..- attr(*, "discretized:method")= chr "frequency"
##  $ shape_density_RIDIT : Factor w/ 5 levels "[0.000556,0.199)",..: 1 1 1 1 1 1 1 1 1 1 ...
##   ..- attr(*, "discretized:breaks")= num  0.000556 0.198833 0.409283 0.607504 0.802446 ...
##   ..- attr(*, "discretized:method")= chr "frequency"
##  $ margin_density_RIDIT: Factor w/ 5 levels "[0.000758,0.2)",..: 1 1 1 1 1 1 1 1 1 1 ...
##   ..- attr(*, "discretized:breaks")= num  0.000758 0.200354 0.400505 0.606515 0.802778 ...
##   ..- attr(*, "discretized:method")= chr "frequency"

Principal Component Analysis

#Perform Simple PCA
#library() stops with an error if a package is missing; require() would only
#return FALSE and let the PCA() call fail later.
library(FactoMineR)
library(ggplot2)

#graph = TRUE is spelled out because T is an ordinary variable that can be
#reassigned, whereas TRUE is a reserved word.
#NOTE(review): the PCA runs on mammMassesFinal, i.e. all 16 columns including
#the target Class, not on the RIDIT-transformed mammMassesRidit. Confirm
#that this is the intended input.
pca1 <- PCA(mammMassesFinal, graph = TRUE)

#More Visualizations
pca1
## **Results for the Principal Component Analysis (PCA)**
## The analysis was performed on 829 individuals, described by 16 variables
## *The results are available in the following objects:
## 
##    name               description                          
## 1  "$eig"             "eigenvalues"                        
## 2  "$var"             "results for the variables"          
## 3  "$var$coord"       "coord. for the variables"           
## 4  "$var$cor"         "correlations variables - dimensions"
## 5  "$var$cos2"        "cos2 for the variables"             
## 6  "$var$contrib"     "contributions of the variables"     
## 7  "$ind"             "results for the individuals"        
## 8  "$ind$coord"       "coord. for the individuals"         
## 9  "$ind$cos2"        "cos2 for the individuals"           
## 10 "$ind$contrib"     "contributions of the individuals"   
## 11 "$call"            "summary statistics"                 
## 12 "$call$centre"     "mean of the variables"              
## 13 "$call$ecart.type" "standard error of the variables"    
## 14 "$call$row.w"      "weights for the individuals"        
## 15 "$call$col.w"      "weights for the variables"
#Eigenvalue, percentage of variance and cumulative percentage of variance
#for each principal component.
pca1$eig
##         eigenvalue percentage of variance
## comp 1  9.51082518             59.4426574
## comp 2  1.85791071             11.6119419
## comp 3  1.45355822              9.0847389
## comp 4  1.20277672              7.5173545
## comp 5  0.91716150              5.7322594
## comp 6  0.47728990              2.9830619
## comp 7  0.14531560              0.9082225
## comp 8  0.09184057              0.5740036
## comp 9  0.08055293              0.5034558
## comp 10 0.06605638              0.4128523
## comp 11 0.04855148              0.3034468
## comp 12 0.03704839              0.2315524
## comp 13 0.03519130              0.2199456
## comp 14 0.03047811              0.1904882
## comp 15 0.02407251              0.1504532
## comp 16 0.02137050              0.1335656
##         cumulative percentage of variance
## comp 1                           59.44266
## comp 2                           71.05460
## comp 3                           80.13934
## comp 4                           87.65669
## comp 5                           93.38895
## comp 6                           96.37201
## comp 7                           97.28024
## comp 8                           97.85424
## comp 9                           98.35770
## comp 10                          98.77055
## comp 11                          99.07399
## comp 12                          99.30555
## comp 13                          99.52549
## comp 14                          99.71598
## comp 15                          99.86643
## comp 16                         100.00000
#For each dimension, list the variables correlated with it together with
#the correlation and its p-value.
dimdesc(pca1)
## $Dim.1
## $Dim.1$quanti
##                correlation       p.value
## age_margin       0.9047214 1.026504e-308
## age_shape        0.8991275 5.385241e-299
## birads_margin    0.8765853 6.579247e-265
## Margin           0.8634782 4.918222e-248
## birads_shape     0.8615333 1.110260e-245
## Shape            0.8499196 2.460988e-232
## shape_density    0.8488544 3.617002e-231
## shape_margin     0.8368481 1.322346e-218
## age_birads       0.7805400 6.366265e-171
## margin_density   0.7645356 6.792762e-160
## Class            0.7139186 4.002362e-130
## birads_density   0.6935197 6.817905e-120
## age_density      0.6621935 9.494540e-106
## Age              0.6446897  1.445482e-98
## BI_RADS          0.6236180  1.602997e-90
## Density          0.2386945  3.328662e-12
## 
## 
## $Dim.2
## $Dim.2$quanti
##                correlation       p.value
## age_density     0.65746103 9.266088e-104
## Age             0.63214440  1.058572e-93
## age_birads      0.52767844  1.299775e-60
## birads_density  0.24632439  6.353569e-13
## Density         0.22459122  6.131868e-11
## BI_RADS         0.13757177  7.069079e-05
## Class           0.08024702  2.084710e-02
## margin_density -0.14609742  2.412646e-05
## birads_margin  -0.19833580  8.449009e-09
## Margin         -0.26451105  9.730901e-15
## birads_shape   -0.26932208  3.047710e-15
## shape_density  -0.30803451  1.113139e-19
## Shape          -0.36454376  1.867179e-27
## shape_margin   -0.42083559  6.486149e-37
## 
## 
## $Dim.3
## $Dim.3$quanti
##                correlation       p.value
## Density         0.85746413 7.176141e-241
## birads_density  0.48754476  1.029129e-50
## margin_density  0.42388016  1.765129e-37
## BI_RADS         0.07909796  2.275341e-02
## birads_margin   0.06861367  4.827991e-02
## age_margin     -0.06887923  4.741681e-02
## shape_margin   -0.07709764  2.643383e-02
## Class          -0.08366773  1.597079e-02
## birads_shape   -0.10356965  2.830712e-03
## Shape          -0.16410717  2.032728e-06
## age_birads     -0.21566134  3.518820e-10
## age_shape      -0.26512425  8.403457e-15
## Age            -0.33796385  1.347565e-23
#Part 4: Investigating the False Positives and False Negatives