#Install all packages:
#Every package is fetched from the same CRAN mirror in a single call.
#(The original ROCR line pointed at the misspelled host "cran.rstduio.com".)
install.packages(
  c(
    "ridittools", "Amelia", "ggvis", "lattice", "ggplot2", "e1071",
    "caret", "FactoMineR", "ROCR", "arules", "mlr"
  ),
  repos = "https://cran.rstudio.com"
)
#Load packages into working memory:
library("ridittools")
library("Amelia")
## Loading required package: Rcpp
## ##
## ## Amelia II: Multiple Imputation
## ## (Version 1.7.5, built: 2018-05-07)
## ## Copyright (C) 2005-2018 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##
library("ggvis")
library("lattice")
library("ggplot2")
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:ggvis':
##
## resolution
library("e1071")
library("caret")
library("FactoMineR")
library("ROCR")
## Loading required package: gplots
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
library("arules")
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following object is masked from 'package:ggvis':
##
## band
##
## Attaching package: 'arules'
## The following objects are masked from 'package:base':
##
## abbreviate, write
library("mlr")
## Loading required package: ParamHelpers
##
## Attaching package: 'mlr'
## The following object is masked from 'package:ROCR':
##
## performance
## The following object is masked from 'package:caret':
##
## train
## The following object is masked from 'package:e1071':
##
## impute
## The following object is masked from 'package:ridittools':
##
## acc
#import dataset into workspace
mammMasses <- read.csv("c:/users/Joshu/documents/datasets/mammMasses.csv")
str(mammMasses)
## 'data.frame': 829 obs. of 6 variables:
## $ BI_RADS: int 5 5 4 5 5 3 4 4 4 3 ...
## $ Age : int 67 58 28 57 76 42 36 60 54 52 ...
## $ Shape : int 3 4 1 1 1 2 3 2 1 3 ...
## $ Margin : int 5 5 1 5 4 1 1 1 1 4 ...
## $ Density: int 3 3 3 3 3 3 2 2 3 3 ...
## $ Class : int 1 1 0 1 1 1 0 0 0 0 ...
#reorder columns so the target (Class) is first, followed by Age and the
#remaining predictors. Columns are selected by name so the result does not
#depend on the column order of the CSV file.
mammMasses <- mammMasses[c("Class", "Age", "BI_RADS", "Shape", "Margin", "Density")]
str(mammMasses)
## 'data.frame': 829 obs. of 6 variables:
## $ Class : int 1 1 0 1 1 1 0 0 0 0 ...
## $ Age : int 67 58 28 57 76 42 36 60 54 52 ...
## $ BI_RADS: int 5 5 4 5 5 3 4 4 4 3 ...
## $ Shape : int 3 4 1 1 1 2 3 2 1 3 ...
## $ Margin : int 5 5 1 5 4 1 1 1 1 4 ...
## $ Density: int 3 3 3 3 3 3 2 2 3 3 ...
# Create and Transform Dot Products for Each Variable Combination
#Derive one product variable for every pair of the five predictors.
#Columns 2-6 of mammMasses are the predictors; column 1 (Class) is the target.
predictors <- mammMasses[2:6]
#combn() enumerates the 10 unordered column pairs in the order
# 1*2, 1*3, 1*4, 1*5, 2*3, 2*4, 2*5, 3*4, 3*5, 4*5
pair_index <- combn(ncol(predictors), 2)
#Names reflect the two variables from which each product is derived
product_names <- c(
  "age_birads", "age_shape", "age_margin", "age_density",
  "birads_shape", "birads_margin", "birads_density",
  "shape_margin", "shape_density", "margin_density"
)
#Element-wise product of the two columns in each pair
products <- lapply(
  seq_len(ncol(pair_index)),
  function(k) predictors[[pair_index[1, k]]] * predictors[[pair_index[2, k]]]
)
mammMassesDot <- as.data.frame(setNames(products, product_names))
#Append the product variables (mammMassesDot) onto mammMasses
mammMasses2 <- cbind(mammMasses, mammMassesDot)
str(mammMasses2)
## 'data.frame': 829 obs. of 16 variables:
## $ Class : int 1 1 0 1 1 1 0 0 0 0 ...
## $ Age : int 67 58 28 57 76 42 36 60 54 52 ...
## $ BI_RADS : int 5 5 4 5 5 3 4 4 4 3 ...
## $ Shape : int 3 4 1 1 1 2 3 2 1 3 ...
## $ Margin : int 5 5 1 5 4 1 1 1 1 4 ...
## $ Density : int 3 3 3 3 3 3 2 2 3 3 ...
## $ age_birads : int 335 290 112 285 380 126 144 240 216 156 ...
## $ age_shape : int 201 232 28 57 76 84 108 120 54 156 ...
## $ age_margin : int 335 290 28 285 304 42 36 60 54 208 ...
## $ age_density : int 201 174 84 171 228 126 72 120 162 156 ...
## $ birads_shape : int 15 20 4 5 5 6 12 8 4 9 ...
## $ birads_margin : int 25 25 4 25 20 3 4 4 4 12 ...
## $ birads_density: int 15 15 12 15 15 9 8 8 12 9 ...
## $ shape_margin : int 15 20 1 5 4 2 3 2 1 12 ...
## $ shape_density : int 9 12 3 3 3 6 6 4 3 9 ...
## $ margin_density: int 15 15 3 15 12 3 2 2 3 12 ...
#Discretize Age and the product variables.
#Each entry maps a column of mammMasses2 to the number of bins it is cut into;
#discretize() is called with its default method for every column.
bin_counts <- c(
  Age = 7,
  age_birads = 5,
  age_shape = 5,
  age_margin = 5,
  age_density = 5,
  birads_shape = 5,
  birads_margin = 5,
  birads_density = 3,
  shape_margin = 3,
  shape_density = 3,
  margin_density = 3
)
for (column in names(bin_counts)) {
  mammMasses2[[column]] <- discretize(
    mammMasses2[[column]],
    breaks = bin_counts[[column]]
  )
}
str(mammMasses2)
## 'data.frame': 829 obs. of 16 variables:
## $ Class : int 1 1 0 1 1 1 0 0 0 0 ...
## $ Age : Factor w/ 7 levels "[18,40)","[40,47)",..: 6 4 1 4 7 2 1 5 4 3 ...
## ..- attr(*, "discretized:breaks")= num 18 40 47 54 59 65 71 96
## ..- attr(*, "discretized:method")= chr "frequency"
## $ BI_RADS : int 5 5 4 5 5 3 4 4 4 3 ...
## $ Shape : int 3 4 1 1 1 2 3 2 1 3 ...
## $ Margin : int 5 5 1 5 4 1 1 1 1 4 ...
## $ Density : int 3 3 3 3 3 3 2 2 3 3 ...
## $ age_birads : Factor w/ 5 levels "[0,172)","[172,220)",..: 5 4 1 4 5 1 1 3 2 1 ...
## ..- attr(*, "discretized:breaks")= num 0 172 220 264 320 480
## ..- attr(*, "discretized:method")= chr "frequency"
## $ age_shape : Factor w/ 5 levels "[18,63)","[63,118)",..: 4 4 1 1 2 2 2 3 1 3 ...
## ..- attr(*, "discretized:breaks")= num 18 63 118 200 260 352
## ..- attr(*, "discretized:method")= chr "frequency"
## $ age_margin : Factor w/ 5 levels "[18,48)","[48,108)",..: 5 5 1 5 5 1 1 2 2 3 ...
## ..- attr(*, "discretized:breaks")= num 18 48 108 213 269 ...
## ..- attr(*, "discretized:method")= chr "frequency"
## $ age_density : Factor w/ 5 levels "[40,120)","[120,156)",..: 4 3 1 3 5 2 1 2 3 3 ...
## ..- attr(*, "discretized:breaks")= num 40 120 156 177 204 308
## ..- attr(*, "discretized:method")= chr "frequency"
## $ birads_shape : Factor w/ 5 levels "[0,4)","[4,8)",..: 3 5 2 2 2 2 3 3 2 3 ...
## ..- attr(*, "discretized:breaks")= num 0 4 8 16 20 24
## ..- attr(*, "discretized:method")= chr "frequency"
## $ birads_margin : Factor w/ 5 levels "[0,4)","[4,8)",..: 5 5 2 5 5 1 2 2 2 3 ...
## ..- attr(*, "discretized:breaks")= num 0 4 8 16 20 30
## ..- attr(*, "discretized:method")= chr "frequency"
## $ birads_density: Factor w/ 3 levels "[0,12)","[12,15)",..: 3 3 2 3 3 1 1 1 2 1 ...
## ..- attr(*, "discretized:breaks")= num 0 12 15 20
## ..- attr(*, "discretized:method")= chr "frequency"
## $ shape_margin : Factor w/ 3 levels "[1,2)","[2,16)",..: 2 3 1 2 2 2 2 2 1 2 ...
## ..- attr(*, "discretized:breaks")= num 1 2 16 20
## ..- attr(*, "discretized:method")= chr "frequency"
## $ shape_density : Factor w/ 3 levels "[1,6)","[6,12)",..: 2 3 1 1 1 2 2 1 1 2 ...
## ..- attr(*, "discretized:breaks")= num 1 6 12 16
## ..- attr(*, "discretized:method")= chr "frequency"
## $ margin_density: Factor w/ 3 levels "[1,3)","[3,12)",..: 3 3 2 3 3 2 1 1 2 3 ...
## ..- attr(*, "discretized:breaks")= num 1 3 12 20
## ..- attr(*, "discretized:method")= chr "frequency"
#Using ggvis, make a bar chart for each variable of mammMasses2.
#With this version of the dataset, the intervals of discretization are visualized in the charts.
#One chart per column, in column order (the Margin chart was missing before).
mammMasses2 %>% ggvis(~Class) %>% layer_bars()
## Warning: package 'bindrcpp' was built under R version 3.4.4
mammMasses2 %>% ggvis(~Age) %>% layer_bars()
mammMasses2 %>% ggvis(~BI_RADS) %>% layer_bars()
mammMasses2 %>% ggvis(~Shape) %>% layer_bars()
mammMasses2 %>% ggvis(~Margin) %>% layer_bars()
mammMasses2 %>% ggvis(~Density) %>% layer_bars()
mammMasses2 %>% ggvis(~age_birads) %>% layer_bars()
mammMasses2 %>% ggvis(~age_shape) %>% layer_bars()
mammMasses2 %>% ggvis(~age_margin) %>% layer_bars()
mammMasses2 %>% ggvis(~age_density) %>% layer_bars()
mammMasses2 %>% ggvis(~birads_shape) %>% layer_bars()
mammMasses2 %>% ggvis(~birads_margin) %>% layer_bars()
mammMasses2 %>% ggvis(~birads_density) %>% layer_bars()
mammMasses2 %>% ggvis(~shape_margin) %>% layer_bars()
mammMasses2 %>% ggvis(~shape_density) %>% layer_bars()
mammMasses2 %>% ggvis(~margin_density) %>% layer_bars()
#For modeling, the binned variables should be plain integers (tested below).
#Start from mammMasses2 and replace every discretized (factor) column by its
#integer level code; the result is saved as mammMassesFinal.
mammMassesFinal <- mammMasses2
binned_columns <- c(
  "Age",
  "age_birads", "age_shape", "age_margin", "age_density",
  "birads_shape", "birads_margin", "birads_density",
  "shape_margin", "shape_density", "margin_density"
)
mammMassesFinal[binned_columns] <- lapply(
  mammMassesFinal[binned_columns],
  as.integer
)
str(mammMassesFinal)
## 'data.frame': 829 obs. of 16 variables:
## $ Class : int 1 1 0 1 1 1 0 0 0 0 ...
## $ Age : int 6 4 1 4 7 2 1 5 4 3 ...
## $ BI_RADS : int 5 5 4 5 5 3 4 4 4 3 ...
## $ Shape : int 3 4 1 1 1 2 3 2 1 3 ...
## $ Margin : int 5 5 1 5 4 1 1 1 1 4 ...
## $ Density : int 3 3 3 3 3 3 2 2 3 3 ...
## $ age_birads : int 5 4 1 4 5 1 1 3 2 1 ...
## $ age_shape : int 4 4 1 1 2 2 2 3 1 3 ...
## $ age_margin : int 5 5 1 5 5 1 1 2 2 3 ...
## $ age_density : int 4 3 1 3 5 2 1 2 3 3 ...
## $ birads_shape : int 3 5 2 2 2 2 3 3 2 3 ...
## $ birads_margin : int 5 5 2 5 5 1 2 2 2 3 ...
## $ birads_density: int 3 3 2 3 3 1 1 1 2 1 ...
## $ shape_margin : int 2 3 1 2 2 2 2 2 1 2 ...
## $ shape_density : int 2 3 1 1 1 2 2 1 1 2 ...
## $ margin_density: int 3 3 2 3 3 2 1 1 2 3 ...
#Export mammMassesFinal to a CSV file in the working directory.
#NOTE(review): write.csv() writes row names as a leading column by default - confirm that is wanted.
write.csv(x = mammMassesFinal, file = "mammMassesFinal.csv")
#Create testing and training datasets (currently disabled)
#A 20:80 partition would give 166 training rows and 663 test rows
#train <- mammMassesFinal[1:166,]
#test <- mammMassesFinal[167:829,2:16]
#Fit a naive Bayes model with naiveBayes() on all of mammMassesFinal,
#using Class as the response and every other column as a predictor
NBModel <- naiveBayes(Class ~ ., data = mammMassesFinal)
print(NBModel)
##
## Naive Bayes Classifier for Discrete Predictors
##
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
##
## A-priori probabilities:
## Y
## 0 1
## 0.5150784 0.4849216
##
## Conditional probabilities:
## Age
## Y [,1] [,2]
## 0 3.189696 1.830643
## 1 5.037313 1.770104
##
## BI_RADS
## Y [,1] [,2]
## 0 3.983607 0.5282739
## 1 4.703980 0.6429623
##
## Shape
## Y [,1] [,2]
## 0 2.100703 1.1012901
## 1 3.502488 0.9402307
##
## Margin
## Y [,1] [,2]
## 0 1.939110 1.384367
## 1 3.741294 1.168043
##
## Density
## Y [,1] [,2]
## 0 2.892272 0.3785458
## 1 2.940299 0.3180645
##
## age_birads
## Y [,1] [,2]
## 0 2.227166 1.126885
## 1 3.888060 1.166835
##
## age_shape
## Y [,1] [,2]
## 0 2.189696 1.152294
## 1 3.900498 1.100918
##
## age_margin
## Y [,1] [,2]
## 0 2.168618 1.162662
## 1 3.890547 1.077061
##
## age_density
## Y [,1] [,2]
## 0 2.435597 1.275426
## 1 3.694030 1.252793
##
## birads_shape
## Y [,1] [,2]
## 0 2.805621 0.8866068
## 1 4.211443 1.0605454
##
## birads_margin
## Y [,1] [,2]
## 0 2.583138 1.054610
## 1 4.164179 1.104473
##
## birads_density
## Y [,1] [,2]
## 0 1.913349 0.4742028
## 1 2.614428 0.6181711
##
## shape_margin
## Y [,1] [,2]
## 0 1.791569 0.6686291
## 1 2.532338 0.5825381
##
## shape_density
## Y [,1] [,2]
## 0 1.756440 0.7422049
## 1 2.609453 0.6427887
##
## margin_density
## Y [,1] [,2]
## 0 2.133489 0.5249438
## 1 2.659204 0.4951408
#Create a Classification Task for the model
#library() stops with an error if mlr is unavailable; require() would only return FALSE
library("mlr")
task <- makeClassifTask(data = mammMassesFinal, target = "Class")
#initialize the NB Classifier
selected_model <- makeLearner("classif.naiveBayes")
#Train the Model:
#caret and mlr both export train(); qualify the call so it does not depend on
#the order in which the packages were attached
NBPred <- mlr::train(selected_model, task)
NBPred
## Model for learner.id=classif.naiveBayes; learner.class=classif.naiveBayes
## Trained on: task.id = mammMassesFinal; obs = 829; features = 15
## Hyperparameters:
#Apply Predictive model to mammMassesFinal without passing on the target variable
#NOTE(review): these are the same rows the model was trained on, so the rates
#below are training-set rates
predictions_mlr <- as.data.frame(predict(NBPred, newdata = mammMassesFinal[2:16]))
#Create a Confusion Matrix as First Test of Accuracy
library("caret")
#Rows are the predicted class, columns are the actual class
table1 <- table(predictions_mlr[,1], mammMassesFinal$Class)
table1
##
## 0 1
## 0 343 57
## 1 84 345
table2 <- prop.table(table1)
table2
##
## 0 1
## 0 0.41375151 0.06875754
## 1 0.10132690 0.41616405
#Share of agreeing predictions = sum of the diagonal of the proportion table.
#Computed from table2 instead of re-typing rounded values from the printout.
totalSuccessRate <- sum(diag(table2)) * 100
totalSuccessRate
## [1] 82.99156
totalErrorRate <- 100 - totalSuccessRate
totalErrorRate
## [1] 17.00844
#Persist the trained model (NBPred) to disk as an RDS object.
#With a new dataset, the saved model can be re-loaded and passed to predict().
#to restore: readRDS(file = "initialNaiveBayesModel.rds")
saveRDS(object = NBPred, file = "initialNaiveBayesModel.rds")
# Total reduction of class error: 1%.
#Create subset of mammMassesFinal for RIDIT-Transformation
ridit_source <- mammMassesFinal[2:16] #Leave off the target variable

#Return the ridit score of every observation in `values`.
#toridit() works on a vector of counts (one entry per ordered category), not on
#raw observations, so the observations are first tabulated per category and
#each observation is then mapped to the ridit of its own category.
#  values: vector of ordered category codes (NA is returned as NA)
#  returns: numeric vector of the same length as `values`
ridit_scores <- function(values) {
  categories <- sort(unique(values))
  position <- match(values, categories)
  counts <- tabulate(position, nbins = length(categories))
  toridit(counts)[position]
}

#RIDIT-ize Each Variable and collect the results in one data frame
mammMassesRidit <- as.data.frame(lapply(ridit_source, ridit_scores))
#Column names are the source names with a "_RIDIT" suffix
colnames(mammMassesRidit) <- paste0(colnames(ridit_source), "_RIDIT")
#Discretize the RIDITs:
#every column of mammMassesRidit is cut into 5 bins with discretize();
#assigning into mammMassesRidit[] keeps the data frame and its column names
mammMassesRidit[] <- lapply(mammMassesRidit, discretize, breaks = 5)
str(mammMassesRidit)
## 'data.frame': 829 obs. of 15 variables:
## $ Age_RIDIT : Factor w/ 5 levels "[0.000886,0.21)",..: 1 1 1 1 1 1 1 1 1 1 ...
## ..- attr(*, "discretized:breaks")= num 0.000886 0.209596 0.414172 0.602273 0.800266 ...
## ..- attr(*, "discretized:method")= chr "frequency"
## $ BI_RADS_RIDIT : Factor w/ 5 levels "[0.000696,0.204)",..: 1 1 1 1 1 1 1 1 1 1 ...
## ..- attr(*, "discretized:breaks")= num 0.000696 0.203703 0.405763 0.606487 0.807517 ...
## ..- attr(*, "discretized:method")= chr "frequency"
## $ Shape_RIDIT : Factor w/ 5 levels "[0.000651,0.204)",..: 1 1 1 1 1 1 1 1 1 1 ...
## ..- attr(*, "discretized:breaks")= num 0.000651 0.204252 0.41705 0.609197 0.800998 ...
## ..- attr(*, "discretized:method")= chr "frequency"
## $ Margin_RIDIT : Factor w/ 5 levels "[0.00107,0.211)",..: 1 1 1 1 1 1 1 1 1 1 ...
## ..- attr(*, "discretized:breaks")= num 0.00107 0.21055 0.41188 0.61728 0.80446 ...
## ..- attr(*, "discretized:method")= chr "frequency"
## $ Density_RIDIT : Factor w/ 5 levels "[0.000621,0.194)",..: 1 1 1 1 1 1 1 1 1 1 ...
## ..- attr(*, "discretized:breaks")= num 0.000621 0.194249 0.395987 0.601531 0.80211 ...
## ..- attr(*, "discretized:method")= chr "frequency"
## $ age_birads_RIDIT : Factor w/ 5 levels "[0.000994,0.211)",..: 1 1 1 1 1 1 1 1 1 1 ...
## ..- attr(*, "discretized:breaks")= num 0.000994 0.211337 0.417462 0.609626 0.81257 ...
## ..- attr(*, "discretized:method")= chr "frequency"
## $ age_shape_RIDIT : Factor w/ 5 levels "[0.000799,0.204)",..: 1 1 1 1 1 1 1 1 1 1 ...
## ..- attr(*, "discretized:breaks")= num 0.000799 0.203875 0.418698 0.606712 0.800759 ...
## ..- attr(*, "discretized:method")= chr "frequency"
## $ age_margin_RIDIT : Factor w/ 5 levels "[0.001,0.209)",..: 1 1 1 1 1 1 1 1 1 1 ...
## ..- attr(*, "discretized:breaks")= num 0.001 0.209 0.413 0.615 0.806 ...
## ..- attr(*, "discretized:method")= chr "frequency"
## $ age_density_RIDIT : Factor w/ 5 levels "[0.000792,0.199)",..: 1 1 1 1 1 1 1 1 1 1 ...
## ..- attr(*, "discretized:breaks")= num 0.000792 0.198535 0.406099 0.600871 0.804554 ...
## ..- attr(*, "discretized:method")= chr "frequency"
## $ birads_shape_RIDIT : Factor w/ 5 levels "[0.000519,0.205)",..: 1 1 1 1 1 1 1 1 1 1 ...
## ..- attr(*, "discretized:breaks")= num 0.000519 0.204739 0.414735 0.610688 0.808544 ...
## ..- attr(*, "discretized:method")= chr "frequency"
## $ birads_margin_RIDIT : Factor w/ 5 levels "[0.0009,0.21)",..: 1 1 1 1 1 1 1 1 1 1 ...
## ..- attr(*, "discretized:breaks")= num 0.0009 0.21 0.4131 0.616 0.8063 ...
## ..- attr(*, "discretized:method")= chr "frequency"
## $ birads_density_RIDIT: Factor w/ 5 levels "[0.000803,0.196)",..: 1 1 1 1 1 1 1 1 1 1 ...
## ..- attr(*, "discretized:breaks")= num 0.000803 0.195771 0.402623 0.613437 0.814668 ...
## ..- attr(*, "discretized:method")= chr "frequency"
## $ shape_margin_RIDIT : Factor w/ 5 levels "[0.000561,0.205)",..: 1 1 1 1 1 1 1 1 1 1 ...
## ..- attr(*, "discretized:breaks")= num 0.000561 0.204992 0.40903 0.606786 0.800673 ...
## ..- attr(*, "discretized:method")= chr "frequency"
## $ shape_density_RIDIT : Factor w/ 5 levels "[0.000556,0.199)",..: 1 1 1 1 1 1 1 1 1 1 ...
## ..- attr(*, "discretized:breaks")= num 0.000556 0.198833 0.409283 0.607504 0.802446 ...
## ..- attr(*, "discretized:method")= chr "frequency"
## $ margin_density_RIDIT: Factor w/ 5 levels "[0.000758,0.2)",..: 1 1 1 1 1 1 1 1 1 1 ...
## ..- attr(*, "discretized:breaks")= num 0.000758 0.200354 0.400505 0.606515 0.802778 ...
## ..- attr(*, "discretized:method")= chr "frequency"
# Principal Component Analysis
#Perform Simple PCA
#library() stops with an error if a package is unavailable; require() would only return FALSE
library("FactoMineR")
library("ggplot2")
#graph = TRUE also draws the default PCA plots (TRUE spelled out: T can be reassigned)
#NOTE(review): the target Class is passed in together with the predictors - confirm this is intended
pca1 <- PCA(mammMassesFinal, graph = TRUE)
#More Visualizations
pca1
## **Results for the Principal Component Analysis (PCA)**
## The analysis was performed on 829 individuals, described by 16 variables
## *The results are available in the following objects:
##
## name description
## 1 "$eig" "eigenvalues"
## 2 "$var" "results for the variables"
## 3 "$var$coord" "coord. for the variables"
## 4 "$var$cor" "correlations variables - dimensions"
## 5 "$var$cos2" "cos2 for the variables"
## 6 "$var$contrib" "contributions of the variables"
## 7 "$ind" "results for the individuals"
## 8 "$ind$coord" "coord. for the individuals"
## 9 "$ind$cos2" "cos2 for the individuals"
## 10 "$ind$contrib" "contributions of the individuals"
## 11 "$call" "summary statistics"
## 12 "$call$centre" "mean of the variables"
## 13 "$call$ecart.type" "standard error of the variables"
## 14 "$call$row.w" "weights for the individuals"
## 15 "$call$col.w" "weights for the variables"
#Eigenvalue of each component with its percentage and cumulative percentage of variance
print(pca1[["eig"]])
## eigenvalue percentage of variance
## comp 1 9.51082518 59.4426574
## comp 2 1.85791071 11.6119419
## comp 3 1.45355822 9.0847389
## comp 4 1.20277672 7.5173545
## comp 5 0.91716150 5.7322594
## comp 6 0.47728990 2.9830619
## comp 7 0.14531560 0.9082225
## comp 8 0.09184057 0.5740036
## comp 9 0.08055293 0.5034558
## comp 10 0.06605638 0.4128523
## comp 11 0.04855148 0.3034468
## comp 12 0.03704839 0.2315524
## comp 13 0.03519130 0.2199456
## comp 14 0.03047811 0.1904882
## comp 15 0.02407251 0.1504532
## comp 16 0.02137050 0.1335656
## cumulative percentage of variance
## comp 1 59.44266
## comp 2 71.05460
## comp 3 80.13934
## comp 4 87.65669
## comp 5 93.38895
## comp 6 96.37201
## comp 7 97.28024
## comp 8 97.85424
## comp 9 98.35770
## comp 10 98.77055
## comp 11 99.07399
## comp 12 99.30555
## comp 13 99.52549
## comp 14 99.71598
## comp 15 99.86643
## comp 16 100.00000
#Describe the leading dimensions by the variables correlated with them
print(dimdesc(pca1))
## $Dim.1
## $Dim.1$quanti
## correlation p.value
## age_margin 0.9047214 1.026504e-308
## age_shape 0.8991275 5.385241e-299
## birads_margin 0.8765853 6.579247e-265
## Margin 0.8634782 4.918222e-248
## birads_shape 0.8615333 1.110260e-245
## Shape 0.8499196 2.460988e-232
## shape_density 0.8488544 3.617002e-231
## shape_margin 0.8368481 1.322346e-218
## age_birads 0.7805400 6.366265e-171
## margin_density 0.7645356 6.792762e-160
## Class 0.7139186 4.002362e-130
## birads_density 0.6935197 6.817905e-120
## age_density 0.6621935 9.494540e-106
## Age 0.6446897 1.445482e-98
## BI_RADS 0.6236180 1.602997e-90
## Density 0.2386945 3.328662e-12
##
##
## $Dim.2
## $Dim.2$quanti
## correlation p.value
## age_density 0.65746103 9.266088e-104
## Age 0.63214440 1.058572e-93
## age_birads 0.52767844 1.299775e-60
## birads_density 0.24632439 6.353569e-13
## Density 0.22459122 6.131868e-11
## BI_RADS 0.13757177 7.069079e-05
## Class 0.08024702 2.084710e-02
## margin_density -0.14609742 2.412646e-05
## birads_margin -0.19833580 8.449009e-09
## Margin -0.26451105 9.730901e-15
## birads_shape -0.26932208 3.047710e-15
## shape_density -0.30803451 1.113139e-19
## Shape -0.36454376 1.867179e-27
## shape_margin -0.42083559 6.486149e-37
##
##
## $Dim.3
## $Dim.3$quanti
## correlation p.value
## Density 0.85746413 7.176141e-241
## birads_density 0.48754476 1.029129e-50
## margin_density 0.42388016 1.765129e-37
## BI_RADS 0.07909796 2.275341e-02
## birads_margin 0.06861367 4.827991e-02
## age_margin -0.06887923 4.741681e-02
## shape_margin -0.07709764 2.643383e-02
## Class -0.08366773 1.597079e-02
## birads_shape -0.10356965 2.830712e-03
## Shape -0.16410717 2.032728e-06
## age_birads -0.21566134 3.518820e-10
## age_shape -0.26512425 8.403457e-15
## Age -0.33796385 1.347565e-23
#Part 4: Investigating the False Positives and False Negatives