Install Packages
# Install every package this analysis uses.
# A single repository URL is shared by all packages so that one mistyped
# mirror host cannot break an individual install.
cran_repo <- "https://cran.rstudio.com"
required_packages <- c(
  "ridittools", "Amelia", "ggvis", "lattice", "ggplot2", "e1071", "caret",
  "FactoMineR", "ROCR", "arules", "mlr", "Matrix", "psych", "dplyr"
)
install.packages(required_packages, repos = cran_repo)
Load Packages Into Memory
library("ridittools")
library("Amelia")
## Loading required package: Rcpp
## ##
## ## Amelia II: Multiple Imputation
## ## (Version 1.7.5, built: 2018-05-07)
## ## Copyright (C) 2005-2018 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##
library("ggvis")
library("lattice")
library("ggplot2")
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:ggvis':
##
## resolution
library("e1071")
library("caret")
library("FactoMineR")
library("ROCR")
## Loading required package: gplots
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
library("arules")
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following object is masked from 'package:ggvis':
##
## band
##
## Attaching package: 'arules'
## The following objects are masked from 'package:base':
##
## abbreviate, write
library("mlr")
## Loading required package: ParamHelpers
##
## Attaching package: 'mlr'
## The following object is masked from 'package:ROCR':
##
## performance
## The following object is masked from 'package:caret':
##
## train
## The following object is masked from 'package:e1071':
##
## impute
## The following object is masked from 'package:ridittools':
##
## acc
library("Matrix")
library("psych")
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
library("dplyr")
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:arules':
##
## intersect, recode, setdiff, setequal, union
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
Load Dataset Into Working Memory
# Read the mammographic-masses CSV into a data frame and show its structure.
# NOTE(review): the path is absolute and specific to one machine; adjust it
# before running elsewhere.
mamm_masses_path <- "c:/users/Joshu/documents/datasets/mammMasses.csv"
mammMasses <- read.csv(file = mamm_masses_path)
str(mammMasses)
## 'data.frame': 829 obs. of 6 variables:
## $ BI_RADS: int 5 5 4 5 5 3 4 4 4 3 ...
## $ Age : int 67 58 28 57 76 42 36 60 54 52 ...
## $ Shape : int 3 4 1 1 1 2 3 2 1 3 ...
## $ Margin : int 5 5 1 5 4 1 1 1 1 4 ...
## $ Density: int 3 3 3 3 3 3 2 2 3 3 ...
## $ Class : int 1 1 0 1 1 1 0 0 0 0 ...
Reorder Columns so Age is First and Class is Last
# Swap the first two columns so Age leads; columns 3-6 keep their order,
# which leaves Class as the last column.
mammMasses <- mammMasses[, c(2, 1, 3:6)]
str(mammMasses)
## 'data.frame': 829 obs. of 6 variables:
## $ Age : int 67 58 28 57 76 42 36 60 54 52 ...
## $ BI_RADS: int 5 5 4 5 5 3 4 4 4 3 ...
## $ Shape : int 3 4 1 1 1 2 3 2 1 3 ...
## $ Margin : int 5 5 1 5 4 1 1 1 1 4 ...
## $ Density: int 3 3 3 3 3 3 2 2 3 3 ...
## $ Class : int 1 1 0 1 1 1 0 0 0 0 ...
Create Pairwise Element-wise Products (Interaction Variables)
# Work on a copy of mammMasses called "cross" (see str(mammMasses) above)
cross <- mammMasses
# Pull each of the first five columns out as its own standalone vector
dot1 <- cross[[1]]
dot2 <- cross[[2]]
dot3 <- cross[[3]]
dot4 <- cross[[4]]
dot5 <- cross[[5]]
str(dot1)
## int [1:829] 67 58 28 57 76 42 36 60 54 52 ...
str(dot2)
## int [1:829] 5 5 4 5 5 3 4 4 4 3 ...
str(dot3)
## int [1:829] 3 4 1 1 1 2 3 2 1 3 ...
str(dot4)
## int [1:829] 5 5 1 5 4 1 1 1 1 4 ...
str(dot5)
## int [1:829] 3 3 3 3 3 3 2 2 3 3 ...
# Create one product variable for every pair of the five column vectors.
#
# Each variable is the row-by-row (element-wise) product of two columns.
# diag(tcrossprod(a, b)) yields exactly a * b, but only after building a full
# n-by-n outer-product matrix per pair; multiplying the vectors directly gives
# the same values without allocating those matrices.
# as.numeric() keeps the results as doubles, the type tcrossprod() produced.
dotp1 <- as.numeric(dot1) * dot2
dotp2 <- as.numeric(dot1) * dot3
dotp3 <- as.numeric(dot1) * dot4
dotp4 <- as.numeric(dot1) * dot5
dotp5 <- as.numeric(dot2) * dot3
dotp6 <- as.numeric(dot2) * dot4
dotp7 <- as.numeric(dot2) * dot5
dotp8 <- as.numeric(dot3) * dot4
dotp9 <- as.numeric(dot3) * dot5
dotp10 <- as.numeric(dot4) * dot5
str(dotp1)
## num [1:829] 335 290 112 285 380 126 144 240 216 156 ...
str(dotp2)
## num [1:829] 201 232 28 57 76 84 108 120 54 156 ...
str(dotp3)
## num [1:829] 335 290 28 285 304 42 36 60 54 208 ...
str(dotp4)
## num [1:829] 201 174 84 171 228 126 72 120 162 156 ...
str(dotp5)
## num [1:829] 15 20 4 5 5 6 12 8 4 9 ...
str(dotp6)
## num [1:829] 25 25 4 25 20 3 4 4 4 12 ...
str(dotp7)
## num [1:829] 15 15 12 15 15 9 8 8 12 9 ...
str(dotp8)
## num [1:829] 15 20 1 5 4 2 3 2 1 12 ...
str(dotp9)
## num [1:829] 9 12 3 3 3 6 6 4 3 9 ...
str(dotp10)
## num [1:829] 15 15 3 15 12 3 2 2 3 12 ...
# Assemble the ten product variables into the data frame "cross2", naming
# each column after the pair of variables it multiplies
cross2 <- data.frame(
  age_birads = dotp1,
  age_shape = dotp2,
  age_margin = dotp3,
  age_density = dotp4,
  birads_shape = dotp5,
  birads_margin = dotp6,
  birads_density = dotp7,
  shape_margin = dotp8,
  shape_density = dotp9,
  margin_density = dotp10
)
str(cross2)
## 'data.frame': 829 obs. of 10 variables:
## $ age_birads : num 335 290 112 285 380 126 144 240 216 156 ...
## $ age_shape : num 201 232 28 57 76 84 108 120 54 156 ...
## $ age_margin : num 335 290 28 285 304 42 36 60 54 208 ...
## $ age_density : num 201 174 84 171 228 126 72 120 162 156 ...
## $ birads_shape : num 15 20 4 5 5 6 12 8 4 9 ...
## $ birads_margin : num 25 25 4 25 20 3 4 4 4 12 ...
## $ birads_density: num 15 15 12 15 15 9 8 8 12 9 ...
## $ shape_margin : num 15 20 1 5 4 2 3 2 1 12 ...
## $ shape_density : num 9 12 3 3 3 6 6 4 3 9 ...
## $ margin_density: num 15 15 3 15 12 3 2 2 3 12 ...
# Append the product columns to the original data, giving mammMasses2
mammMasses2 <- cbind(mammMasses, cross2)
# Move column 6 (Class) to the end: originals 1-5, then products 7-16, then 6
mammMasses2 <- mammMasses2[, c(1:5, 7:16, 6)]
str(mammMasses2)
## 'data.frame': 829 obs. of 16 variables:
## $ Age : int 67 58 28 57 76 42 36 60 54 52 ...
## $ BI_RADS : int 5 5 4 5 5 3 4 4 4 3 ...
## $ Shape : int 3 4 1 1 1 2 3 2 1 3 ...
## $ Margin : int 5 5 1 5 4 1 1 1 1 4 ...
## $ Density : int 3 3 3 3 3 3 2 2 3 3 ...
## $ age_birads : num 335 290 112 285 380 126 144 240 216 156 ...
## $ age_shape : num 201 232 28 57 76 84 108 120 54 156 ...
## $ age_margin : num 335 290 28 285 304 42 36 60 54 208 ...
## $ age_density : num 201 174 84 171 228 126 72 120 162 156 ...
## $ birads_shape : num 15 20 4 5 5 6 12 8 4 9 ...
## $ birads_margin : num 25 25 4 25 20 3 4 4 4 12 ...
## $ birads_density: num 15 15 12 15 15 9 8 8 12 9 ...
## $ shape_margin : num 15 20 1 5 4 2 3 2 1 12 ...
## $ shape_density : num 9 12 3 3 3 6 6 4 3 9 ...
## $ margin_density: num 15 15 3 15 12 3 2 2 3 12 ...
## $ Class : int 1 1 0 1 1 1 0 0 0 0 ...
# Write mammMasses2 to a CSV file in the current working directory
# (write.csv's default also writes the row names as a leading column)
write.csv(x = mammMasses2, file = "mammMasses2.csv")
Summary Statistics
describe(mammMasses2)
## vars n mean sd median trimmed mad min max range
## Age 1 829 55.79 14.68 57 56.15 14.83 18 96 78
## BI_RADS 2 829 4.33 0.69 4 4.37 0.00 0 6 6
## Shape 3 829 2.78 1.24 3 2.85 1.48 1 4 3
## Margin 4 829 2.81 1.57 3 2.77 2.97 1 5 4
## Density 5 829 2.92 0.35 3 3.00 0.00 1 4 3
## age_birads 6 829 244.97 85.05 244 243.93 88.96 0 480 480
## age_shape 7 829 162.07 92.30 159 159.39 128.99 18 352 334
## age_margin 8 829 166.63 110.24 177 160.42 164.57 18 465 447
## age_density 9 829 162.94 48.13 168 164.24 48.93 40 308 268
## birads_shape 10 829 12.40 6.46 12 12.52 11.86 0 24 24
## birads_margin 11 829 12.64 7.89 15 12.24 14.83 0 30 30
## birads_density 12 829 12.65 2.59 12 12.97 0.00 0 20 20
## shape_margin 13 829 9.26 7.13 9 8.95 10.38 1 20 19
## shape_density 14 829 8.14 3.83 9 8.29 4.45 1 16 15
## margin_density 15 829 8.27 4.78 9 8.14 8.90 1 20 19
## Class 16 829 0.48 0.50 0 0.48 0.00 0 1 1
## skew kurtosis se
## Age -0.22 -0.32 0.51
## BI_RADS -1.61 8.74 0.02
## Shape -0.30 -1.58 0.04
## Margin -0.05 -1.63 0.05
## Density -3.05 12.76 0.01
## age_birads 0.07 -0.22 2.95
## age_shape 0.14 -1.31 3.21
## age_margin 0.25 -1.16 3.83
## age_density -0.21 -0.37 1.67
## birads_shape -0.07 -1.54 0.22
## birads_margin 0.11 -1.46 0.27
## birads_density -1.22 4.04 0.09
## shape_margin 0.14 -1.58 0.25
## shape_density -0.18 -1.53 0.13
## margin_density 0.01 -1.56 0.17
## Class 0.06 -2.00 0.02
Create Tabular Summaries for Original Variables
table(mammMasses2$BI_RADS)
##
## 0 2 3 4 5 6
## 5 7 24 468 316 9
table(mammMasses2$Shape)
##
## 1 2 3 4
## 190 180 81 378
table(mammMasses2$Margin)
##
## 1 2 3 4 5
## 320 23 105 254 127
table(mammMasses2$Density)
##
## 1 2 3 4
## 11 56 754 8
table(mammMasses2$Class) #target variable
##
## 0 1
## 427 402
Visualize Cross Product New Variables
# Draw one ggvis bar layer per product variable in mammMasses2.
# Each pipeline is a separate top-level expression, so each plot is printed
# on its own.
mammMasses2 %>% ggvis(~age_birads) %>% layer_bars()
## Warning: package 'bindrcpp' was built under R version 3.4.4
mammMasses2 %>% ggvis(~age_shape) %>% layer_bars()
mammMasses2 %>% ggvis(~age_margin) %>% layer_bars()
mammMasses2 %>% ggvis(~age_density) %>% layer_bars()
mammMasses2 %>% ggvis(~birads_shape) %>% layer_bars()
mammMasses2 %>% ggvis(~birads_margin) %>% layer_bars()
mammMasses2 %>% ggvis(~birads_density) %>% layer_bars()
mammMasses2 %>% ggvis(~shape_margin) %>% layer_bars()
mammMasses2 %>% ggvis(~shape_density) %>% layer_bars()
mammMasses2 %>% ggvis(~margin_density) %>% layer_bars()
Perform NB Classifications
# Fit a naive Bayes model with naiveBayes(): Class is the response and every
# other column of mammMasses2 is a predictor
NBModel <- naiveBayes(formula = Class ~ ., data = mammMasses2)
NBModel
##
## Naive Bayes Classifier for Discrete Predictors
##
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
##
## A-priori probabilities:
## Y
## 0 1
## 0.5150784 0.4849216
##
## Conditional probabilities:
## Age
## Y [,1] [,2]
## 0 49.29742 13.70268
## 1 62.69403 12.35463
##
## BI_RADS
## Y [,1] [,2]
## 0 3.983607 0.5282739
## 1 4.703980 0.6429623
##
## Shape
## Y [,1] [,2]
## 0 2.100703 1.1012901
## 1 3.502488 0.9402307
##
## Margin
## Y [,1] [,2]
## 0 1.939110 1.384367
## 1 3.741294 1.168043
##
## Density
## Y [,1] [,2]
## 0 2.892272 0.3785458
## 1 2.940299 0.3180645
##
## age_birads
## Y [,1] [,2]
## 0 197.5012 64.92236
## 1 295.3905 74.23246
##
## age_shape
## Y [,1] [,2]
## 0 107.2248 71.07376
## 1 220.3358 74.95025
##
## age_margin
## Y [,1] [,2]
## 0 101.0796 85.14990
## 1 236.2562 89.20321
##
## age_density
## Y [,1] [,2]
## 0 142.4262 44.29734
## 1 184.7313 42.13235
##
## birads_shape
## Y [,1] [,2]
## 0 8.46370 4.892791
## 1 16.57214 5.183036
##
## birads_margin
## Y [,1] [,2]
## 0 7.838407 6.05721
## 1 17.748756 6.22953
##
## birads_density
## Y [,1] [,2]
## 0 11.53396 2.235153
## 1 13.82836 2.412125
##
## shape_margin
## Y [,1] [,2]
## 0 5.145199 5.833165
## 1 13.626866 5.624588
##
## shape_density
## Y [,1] [,2]
## 0 6.091335 3.382306
## 1 10.313433 2.997300
##
## margin_density
## Y [,1] [,2]
## 0 5.653396 4.178163
## 1 11.049751 3.681525
# Create a classification task over mammMasses2 with Class as the target.
# library() stops with an error if mlr is missing, whereas require() would
# only return FALSE and let the script fail later with a less clear message.
library(mlr)
task <- makeClassifTask(data = mammMasses2, target = "Class")
# Initialize the naive Bayes learner
selected_model <- makeLearner("classif.naiveBayes")
# Train the model. caret and mlr both export train(); calling mlr's version
# explicitly means the result does not depend on which package was attached
# last.
NBPred <- mlr::train(selected_model, task)
NBPred
## Model for learner.id=classif.naiveBayes; learner.class=classif.naiveBayes
## Trained on: task.id = mammMasses2; obs = 829; features = 15
## Hyperparameters:
# Apply the trained model to the first 15 columns of mammMasses2, i.e. the
# features without the target variable Class.
# NOTE(review): these are the same rows the model was trained on, so the table
# below measures fit on the training data, not performance on unseen data.
nb_prediction <- predict(NBPred, newdata = mammMasses2[1:15])
predictions_mlr <- as.data.frame(nb_prediction)
# Cross-tabulate predicted labels (rows) against the true Class (columns)
# as a first check of accuracy
require(caret)
table1 <- table(predictions_mlr[, 1], mammMasses2$Class)
table1
##
## 0 1
## 0 335 58
## 1 92 344
table2 <- prop.table(table1)
table2
##
## 0 1
## 0 0.40410133 0.06996381
## 1 0.11097708 0.41495778
# Persist the trained model to disk as an RDS file so it can be loaded again
# later and passed to predict() together with a new dataset.
saveRDS(object = NBPred, file = "initialNaiveBayesModel.rds")
# To restore it: readRDS(file = "initialNaiveBayesModel.rds")
Principal Component Analysis 1
# Run a principal component analysis on mammMasses2 and draw its graphs.
# library() fails loudly if FactoMineR is unavailable, and TRUE is used
# instead of T because T is an ordinary variable that can be reassigned.
# NOTE(review): all 16 columns are passed, so the target Class takes part as
# an active variable -- confirm that is intended rather than a supplementary
# variable.
library(FactoMineR)
pca1 <- PCA(mammMasses2, graph = TRUE)
pca1
## **Results for the Principal Component Analysis (PCA)**
## The analysis was performed on 829 individuals, described by 16 variables
## *The results are available in the following objects:
##
## name description
## 1 "$eig" "eigenvalues"
## 2 "$var" "results for the variables"
## 3 "$var$coord" "coord. for the variables"
## 4 "$var$cor" "correlations variables - dimensions"
## 5 "$var$cos2" "cos2 for the variables"
## 6 "$var$contrib" "contributions of the variables"
## 7 "$ind" "results for the individuals"
## 8 "$ind$coord" "coord. for the individuals"
## 9 "$ind$cos2" "cos2 for the individuals"
## 10 "$ind$contrib" "contributions of the individuals"
## 11 "$call" "summary statistics"
## 12 "$call$centre" "mean of the variables"
## 13 "$call$ecart.type" "standard error of the variables"
## 14 "$call$row.w" "weights for the individuals"
## 15 "$call$col.w" "weights for the variables"
pca1$eig
## eigenvalue percentage of variance
## comp 1 10.018617312 62.61635820
## comp 2 1.890055352 11.81284595
## comp 3 1.451912101 9.07445063
## comp 4 1.183667118 7.39791949
## comp 5 0.852726130 5.32953831
## comp 6 0.480116755 3.00072972
## comp 7 0.042654386 0.26658991
## comp 8 0.029902197 0.18688873
## comp 9 0.015635886 0.09772429
## comp 10 0.009461429 0.05913393
## comp 11 0.007497665 0.04686040
## comp 12 0.006433669 0.04021043
## comp 13 0.004657770 0.02911106
## comp 14 0.003040381 0.01900238
## comp 15 0.002002768 0.01251730
## comp 16 0.001619080 0.01011925
## cumulative percentage of variance
## comp 1 62.61636
## comp 2 74.42920
## comp 3 83.50365
## comp 4 90.90157
## comp 5 96.23111
## comp 6 99.23184
## comp 7 99.49843
## comp 8 99.68532
## comp 9 99.78305
## comp 10 99.84218
## comp 11 99.88904
## comp 12 99.92925
## comp 13 99.95836
## comp 14 99.97736
## comp 15 99.98988
## comp 16 100.00000
dimdesc(pca1)
## $Dim.1
## $Dim.1$quanti
## correlation p.value
## age_margin 0.9209818 0.000000e+00
## shape_margin 0.9160141 0.000000e+00
## birads_margin 0.9143718 0.000000e+00
## age_shape 0.9125522 1.976263e-323
## birads_shape 0.8981028 2.817671e-297
## margin_density 0.8876251 1.090546e-280
## Margin 0.8796297 4.190716e-269
## shape_density 0.8693846 2.064146e-255
## Shape 0.8536190 1.845006e-236
## age_birads 0.7698187 1.953404e-163
## Class 0.7129656 1.259599e-129
## age_density 0.6745759 3.944632e-111
## Age 0.6448679 1.228192e-98
## BI_RADS 0.6299985 6.830692e-93
## birads_density 0.6237140 1.477975e-90
## Density 0.2280895 3.029858e-11
##
##
## $Dim.2
## $Dim.2$quanti
## correlation p.value
## age_density 0.6254126 3.497463e-91
## Age 0.5310425 1.669291e-61
## age_birads 0.5273575 1.578991e-60
## birads_density 0.4326804 3.803449e-39
## Density 0.3874173 4.438235e-31
## BI_RADS 0.2590466 3.535899e-14
## birads_margin -0.1748138 4.089259e-07
## birads_shape -0.2087115 1.301851e-09
## margin_density -0.2187240 1.949007e-10
## shape_density -0.2474631 4.938116e-13
## Margin -0.2964410 2.804633e-18
## shape_margin -0.3565378 2.966786e-26
## Shape -0.3622267 4.191111e-27
##
##
## $Dim.3
## $Dim.3$quanti
## correlation p.value
## Density 0.68456693 1.143104e-115
## birads_density 0.62059307 2.040003e-89
## BI_RADS 0.28578247 4.797254e-17
## shape_density 0.13546747 9.131063e-05
## margin_density 0.10823520 1.803502e-03
## birads_margin 0.08716924 1.204573e-02
## birads_shape 0.08505383 1.429970e-02
## age_margin -0.17867355 2.238023e-07
## age_density -0.19412606 1.754657e-08
## age_shape -0.22953446 2.256775e-11
## age_birads -0.25634644 6.615855e-14
## Age -0.53007909 3.011728e-61
Alternative PCA Method: Get Loadings of Best Principal Components
# Principal components of the 15 feature columns (Class, column 16, is left
# out). All rows are selected without hard-coding the row count, so the call
# keeps working if the dataset grows or shrinks.
# NOTE(review): princomp() is called with its default settings here, while the
# columns have very different ranges -- confirm unscaled components are wanted.
model <- princomp(~ ., data = mammMasses2[, 1:15], na.action = na.omit)
model$loadings
##
## Loadings:
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7 Comp.8
## Age 0.115 0.122 0.705 0.565 0.197
## BI_RADS
## Shape -0.151
## Margin 0.102
## Density
## age_birads -0.463 0.743 -0.163 -0.446
## age_shape -0.536 -0.164 0.822
## age_margin -0.655 -0.524 -0.534
## age_density -0.240 0.352 0.879 -0.184
## birads_shape -0.329 0.264 0.305 -0.561
## birads_margin -0.346 0.422 0.360 0.599
## birads_density -0.319 -0.123
## shape_margin -0.216 0.550 -0.802
## shape_density -0.201 0.149 0.193 -0.509
## margin_density -0.199 0.260 0.217 0.150
## Comp.9 Comp.10 Comp.11 Comp.12 Comp.13 Comp.14 Comp.15
## Age -0.315
## BI_RADS -0.163 -0.111 -0.395 -0.652 0.280 -0.546
## Shape -0.545 -0.808
## Margin 0.109 -0.710 0.483 0.443 0.149
## Density 0.560 -0.162 -0.800
## age_birads
## age_shape
## age_margin
## age_density
## birads_shape 0.484 -0.389
## birads_margin 0.270 0.346
## birads_density -0.206 -0.860 -0.213 0.100 0.189
## shape_margin
## shape_density -0.441 0.642 0.160
## margin_density -0.665 0.317 -0.495 0.137
##
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7 Comp.8
## SS loadings 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000
## Proportion Var 0.067 0.067 0.067 0.067 0.067 0.067 0.067 0.067
## Cumulative Var 0.067 0.133 0.200 0.267 0.333 0.400 0.467 0.533
## Comp.9 Comp.10 Comp.11 Comp.12 Comp.13 Comp.14 Comp.15
## SS loadings 1.000 1.000 1.000 1.000 1.000 1.000 1.000
## Proportion Var 0.067 0.067 0.067 0.067 0.067 0.067 0.067
## Cumulative Var 0.600 0.667 0.733 0.800 0.867 0.933 1.000
Create mammMasses3
predictions_mlr[,1]
## [1] 1 1 0 1 1 0 0 0 0 0 0 0 0 1 0 1 1 1 0 1 1 0 0 0 1 0 0 0 1 1 1 0 1 0 1
## [36] 1 1 1 0 0 0 0 1 1 1 0 0 0 1 0 1 1 0 1 1 1 1 0 1 1 0 0 1 1 1 0 0 1 1 1
## [71] 0 0 1 0 1 1 1 1 1 1 1 1 0 1 1 0 0 1 1 1 0 1 1 0 0 0 1 1 0 0 0 1 0 0 1
## [106] 1 0 1 1 0 1 0 1 1 1 1 0 0 1 1 1 1 0 1 0 0 0 1 0 1 0 0 0 1 1 0 1 1 0 1
## [141] 0 0 0 1 0 0 0 0 0 0 1 1 0 0 1 0 0 1 1 1 0 1 1 1 1 0 1 0 0 0 1 0 0 1 0
## [176] 0 0 1 0 1 1 1 1 1 1 1 0 0 1 1 1 0 1 1 1 1 1 0 1 1 1 0 0 0 0 0 1 1 0 1
## [211] 1 1 0 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 1 1 0 1 1 0 1 0 1 1 1 0 1 0 0 0
## [246] 0 1 1 1 0 1 1 1 1 1 1 1 0 0 0 0 1 0 1 1 0 0 0 0 1 0 0 0 1 1 1 0 1 0 0
## [281] 0 1 1 1 1 0 1 0 1 1 1 0 1 1 0 0 1 0 1 1 1 0 1 1 0 0 1 0 1 1 1 1 1 1 1
## [316] 1 1 1 1 1 1 1 1 0 1 1 1 0 0 0 1 1 0 1 1 0 0 1 0 1 1 1 1 1 0 1 1 0 0 0
## [351] 1 0 0 1 1 1 0 0 0 1 1 1 0 1 0 0 1 1 0 1 0 1 0 0 0 0 0 1 0 1 0 0 0 0 0
## [386] 0 0 1 1 1 1 0 1 1 0 1 1 1 0 0 0 0 1 1 1 1 0 0 1 1 1 0 0 1 0 1 0 0 1 1
## [421] 1 1 1 0 0 0 0 1 0 0 0 0 0 0 1 0 0 1 1 1 1 0 0 0 1 0 1 1 1 1 1 1 1 0 1
## [456] 0 0 1 0 0 0 1 0 0 1 0 0 0 0 1 0 0 1 1 1 1 1 0 1 1 1 1 1 1 1 0 1 1 0 0
## [491] 1 1 0 0 1 1 1 1 1 0 1 0 1 0 1 0 1 1 1 1 1 0 0 1 0 1 1 0 1 0 0 0 1 1 0
## [526] 1 0 0 0 1 1 1 1 0 1 0 0 0 1 1 1 0 1 1 1 1 0 1 0 1 1 1 0 1 1 1 0 0 0 0
## [561] 0 1 0 1 1 1 0 1 1 1 0 0 1 0 0 1 1 0 0 1 0 1 0 0 0 0 0 0 1 1 1 0 0 1 1
## [596] 0 0 0 1 1 0 1 1 0 0 0 1 0 0 0 0 0 1 1 0 0 0 0 0 0 1 0 1 1 0 0 1 0 1 1
## [631] 1 1 1 1 1 1 1 0 1 1 0 1 1 0 0 1 1 1 0 0 0 0 0 0 1 0 0 0 0 1 1 0 1 0 0
## [666] 0 0 1 1 1 0 0 1 1 0 0 0 0 1 0 1 0 1 1 1 0 1 1 0 0 0 1 0 0 0 0 1 1 0 1
## [701] 1 0 1 1 0 1 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 1 1 1 1 1 0 1
## [736] 1 1 0 0 0 1 0 1 1 1 0 0 0 0 1 1 1 0 1 1 0 1 1 1 1 0 1 0 0 1 1 1 0 1 1
## [771] 1 0 1 1 0 0 0 1 0 1 0 1 1 0 1 0 0 0 0 0 1 0 1 0 0 0 1 0 1 1 0 0 0 0 1
## [806] 1 0 0 1 1 1 0 1 0 0 1 1 0 1 1 1 0 0 1 0 1 1 1 1
## Levels: 0 1
# predictions_mlr[, 1] is a factor whose levels are "0" and "1". Read the
# level labels back as numbers so Preds holds 0/1 (as doubles) rather than the
# factor's internal codes 1/2.
Preds <- as.numeric(as.character(predictions_mlr[, 1]))
Preds
## [1] 1 1 0 1 1 0 0 0 0 0 0 0 0 1 0 1 1 1 0 1 1 0 0 0 1 0 0 0 1 1 1 0 1 0 1
## [36] 1 1 1 0 0 0 0 1 1 1 0 0 0 1 0 1 1 0 1 1 1 1 0 1 1 0 0 1 1 1 0 0 1 1 1
## [71] 0 0 1 0 1 1 1 1 1 1 1 1 0 1 1 0 0 1 1 1 0 1 1 0 0 0 1 1 0 0 0 1 0 0 1
## [106] 1 0 1 1 0 1 0 1 1 1 1 0 0 1 1 1 1 0 1 0 0 0 1 0 1 0 0 0 1 1 0 1 1 0 1
## [141] 0 0 0 1 0 0 0 0 0 0 1 1 0 0 1 0 0 1 1 1 0 1 1 1 1 0 1 0 0 0 1 0 0 1 0
## [176] 0 0 1 0 1 1 1 1 1 1 1 0 0 1 1 1 0 1 1 1 1 1 0 1 1 1 0 0 0 0 0 1 1 0 1
## [211] 1 1 0 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 1 1 0 1 1 0 1 0 1 1 1 0 1 0 0 0
## [246] 0 1 1 1 0 1 1 1 1 1 1 1 0 0 0 0 1 0 1 1 0 0 0 0 1 0 0 0 1 1 1 0 1 0 0
## [281] 0 1 1 1 1 0 1 0 1 1 1 0 1 1 0 0 1 0 1 1 1 0 1 1 0 0 1 0 1 1 1 1 1 1 1
## [316] 1 1 1 1 1 1 1 1 0 1 1 1 0 0 0 1 1 0 1 1 0 0 1 0 1 1 1 1 1 0 1 1 0 0 0
## [351] 1 0 0 1 1 1 0 0 0 1 1 1 0 1 0 0 1 1 0 1 0 1 0 0 0 0 0 1 0 1 0 0 0 0 0
## [386] 0 0 1 1 1 1 0 1 1 0 1 1 1 0 0 0 0 1 1 1 1 0 0 1 1 1 0 0 1 0 1 0 0 1 1
## [421] 1 1 1 0 0 0 0 1 0 0 0 0 0 0 1 0 0 1 1 1 1 0 0 0 1 0 1 1 1 1 1 1 1 0 1
## [456] 0 0 1 0 0 0 1 0 0 1 0 0 0 0 1 0 0 1 1 1 1 1 0 1 1 1 1 1 1 1 0 1 1 0 0
## [491] 1 1 0 0 1 1 1 1 1 0 1 0 1 0 1 0 1 1 1 1 1 0 0 1 0 1 1 0 1 0 0 0 1 1 0
## [526] 1 0 0 0 1 1 1 1 0 1 0 0 0 1 1 1 0 1 1 1 1 0 1 0 1 1 1 0 1 1 1 0 0 0 0
## [561] 0 1 0 1 1 1 0 1 1 1 0 0 1 0 0 1 1 0 0 1 0 1 0 0 0 0 0 0 1 1 1 0 0 1 1
## [596] 0 0 0 1 1 0 1 1 0 0 0 1 0 0 0 0 0 1 1 0 0 0 0 0 0 1 0 1 1 0 0 1 0 1 1
## [631] 1 1 1 1 1 1 1 0 1 1 0 1 1 0 0 1 1 1 0 0 0 0 0 0 1 0 0 0 0 1 1 0 1 0 0
## [666] 0 0 1 1 1 0 0 1 1 0 0 0 0 1 0 1 0 1 1 1 0 1 1 0 0 0 1 0 0 0 0 1 1 0 1
## [701] 1 0 1 1 0 1 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 1 1 1 1 1 0 1
## [736] 1 1 0 0 0 1 0 1 1 1 0 0 0 0 1 1 1 0 1 1 0 1 1 1 1 0 1 0 0 1 1 1 0 1 1
## [771] 1 0 1 1 0 0 0 1 0 1 0 1 1 0 1 0 0 0 0 0 1 0 1 0 0 0 1 0 1 1 0 0 0 0 1
## [806] 1 0 0 1 1 1 0 1 0 0 1 1 0 1 1 1 0 0 1 0 1 1 1 1
table(Preds)
## Preds
## 0 1
## 393 436
# mammMasses3 is mammMasses2 plus the predicted labels as a final Preds column
mammMasses3 <- mammMasses2
mammMasses3$Preds <- Preds
str(mammMasses3)
## 'data.frame': 829 obs. of 17 variables:
## $ Age : int 67 58 28 57 76 42 36 60 54 52 ...
## $ BI_RADS : int 5 5 4 5 5 3 4 4 4 3 ...
## $ Shape : int 3 4 1 1 1 2 3 2 1 3 ...
## $ Margin : int 5 5 1 5 4 1 1 1 1 4 ...
## $ Density : int 3 3 3 3 3 3 2 2 3 3 ...
## $ age_birads : num 335 290 112 285 380 126 144 240 216 156 ...
## $ age_shape : num 201 232 28 57 76 84 108 120 54 156 ...
## $ age_margin : num 335 290 28 285 304 42 36 60 54 208 ...
## $ age_density : num 201 174 84 171 228 126 72 120 162 156 ...
## $ birads_shape : num 15 20 4 5 5 6 12 8 4 9 ...
## $ birads_margin : num 25 25 4 25 20 3 4 4 4 12 ...
## $ birads_density: num 15 15 12 15 15 9 8 8 12 9 ...
## $ shape_margin : num 15 20 1 5 4 2 3 2 1 12 ...
## $ shape_density : num 9 12 3 3 3 6 6 4 3 9 ...
## $ margin_density: num 15 15 3 15 12 3 2 2 3 12 ...
## $ Class : int 1 1 0 1 1 1 0 0 0 0 ...
## $ Preds : num 1 1 0 1 1 0 0 0 0 0 ...
Create errorTable: A Subset of mammMasses3 Where Class != Preds
# Build errorTable: the rows of mammMasses3 that were misclassified, i.e.
# where the predicted label (Preds) differs from the true label (Class).
# The filtered rows are stored under the name errorTable (capital T), which is
# the object the str() and describe() calls that follow inspect.
# dplyr::filter is called explicitly because stats also defines filter().
# NOTE: the str()/describe() output recorded below in this document lists 829
# observations, i.e. it was captured from an unfiltered table; re-run to
# refresh it.
errorTable <- dplyr::filter(mammMasses3, Class != Preds)
#get str
str(errorTable)
## 'data.frame': 829 obs. of 17 variables:
## $ Age : int 67 58 28 57 76 42 36 60 54 52 ...
## $ BI_RADS : int 5 5 4 5 5 3 4 4 4 3 ...
## $ Shape : int 3 4 1 1 1 2 3 2 1 3 ...
## $ Margin : int 5 5 1 5 4 1 1 1 1 4 ...
## $ Density : int 3 3 3 3 3 3 2 2 3 3 ...
## $ age_birads : num 335 290 112 285 380 126 144 240 216 156 ...
## $ age_shape : num 201 232 28 57 76 84 108 120 54 156 ...
## $ age_margin : num 335 290 28 285 304 42 36 60 54 208 ...
## $ age_density : num 201 174 84 171 228 126 72 120 162 156 ...
## $ birads_shape : num 15 20 4 5 5 6 12 8 4 9 ...
## $ birads_margin : num 25 25 4 25 20 3 4 4 4 12 ...
## $ birads_density: num 15 15 12 15 15 9 8 8 12 9 ...
## $ shape_margin : num 15 20 1 5 4 2 3 2 1 12 ...
## $ shape_density : num 9 12 3 3 3 6 6 4 3 9 ...
## $ margin_density: num 15 15 3 15 12 3 2 2 3 12 ...
## $ Class : int 1 1 0 1 1 1 0 0 0 0 ...
## $ Preds : num 1 1 0 1 1 0 0 0 0 0 ...
describe(errorTable)
## vars n mean sd median trimmed mad min max range
## Age 1 829 55.79 14.68 57 56.15 14.83 18 96 78
## BI_RADS 2 829 4.33 0.69 4 4.37 0.00 0 6 6
## Shape 3 829 2.78 1.24 3 2.85 1.48 1 4 3
## Margin 4 829 2.81 1.57 3 2.77 2.97 1 5 4
## Density 5 829 2.92 0.35 3 3.00 0.00 1 4 3
## age_birads 6 829 244.97 85.05 244 243.93 88.96 0 480 480
## age_shape 7 829 162.07 92.30 159 159.39 128.99 18 352 334
## age_margin 8 829 166.63 110.24 177 160.42 164.57 18 465 447
## age_density 9 829 162.94 48.13 168 164.24 48.93 40 308 268
## birads_shape 10 829 12.40 6.46 12 12.52 11.86 0 24 24
## birads_margin 11 829 12.64 7.89 15 12.24 14.83 0 30 30
## birads_density 12 829 12.65 2.59 12 12.97 0.00 0 20 20
## shape_margin 13 829 9.26 7.13 9 8.95 10.38 1 20 19
## shape_density 14 829 8.14 3.83 9 8.29 4.45 1 16 15
## margin_density 15 829 8.27 4.78 9 8.14 8.90 1 20 19
## Class 16 829 0.48 0.50 0 0.48 0.00 0 1 1
## Preds 17 829 0.53 0.50 1 0.53 0.00 0 1 1
## skew kurtosis se
## Age -0.22 -0.32 0.51
## BI_RADS -1.61 8.74 0.02
## Shape -0.30 -1.58 0.04
## Margin -0.05 -1.63 0.05
## Density -3.05 12.76 0.01
## age_birads 0.07 -0.22 2.95
## age_shape 0.14 -1.31 3.21
## age_margin 0.25 -1.16 3.83
## age_density -0.21 -0.37 1.67
## birads_shape -0.07 -1.54 0.22
## birads_margin 0.11 -1.46 0.27
## birads_density -1.22 4.04 0.09
## shape_margin 0.14 -1.58 0.25
## shape_density -0.18 -1.53 0.13
## margin_density 0.01 -1.56 0.17
## Class 0.06 -2.00 0.02
## Preds -0.10 -1.99 0.02
Ridit Transformation
# Ridit-transform each of the 15 feature columns of mammMasses2.
#
# toridit() converts a vector of category COUNTS into ridits. Passing a raw
# column to it makes every row count as its own category, so the result is a
# running total that depends on row order instead of a score for the value.
# ridit_scores() therefore tabulates the column first, converts those counts,
# and then gives every observation the ridit of its own value.
# NOTE: the str()/PCA output recorded below in this document was captured from
# the row-order-dependent values; re-run to refresh it.
library(ridittools)

# Return one ridit score per element of x, based on the frequency of each
# distinct value of x (distinct values are taken in ascending order).
ridit_scores <- function(x) {
  categories <- sort(unique(x))
  position <- match(x, categories)
  counts <- tabulate(position, nbins = length(categories))
  toridit(counts)[position]
}

# Each result is kept as a one-column data frame so the pieces can be
# column-bound afterwards.
rid1 <- as.data.frame(ridit_scores(mammMasses2[[1]]))
rid2 <- as.data.frame(ridit_scores(mammMasses2[[2]]))
rid3 <- as.data.frame(ridit_scores(mammMasses2[[3]]))
rid4 <- as.data.frame(ridit_scores(mammMasses2[[4]]))
rid5 <- as.data.frame(ridit_scores(mammMasses2[[5]]))
rid6 <- as.data.frame(ridit_scores(mammMasses2[[6]]))
rid7 <- as.data.frame(ridit_scores(mammMasses2[[7]]))
rid8 <- as.data.frame(ridit_scores(mammMasses2[[8]]))
rid9 <- as.data.frame(ridit_scores(mammMasses2[[9]]))
rid10 <- as.data.frame(ridit_scores(mammMasses2[[10]]))
rid11 <- as.data.frame(ridit_scores(mammMasses2[[11]]))
rid12 <- as.data.frame(ridit_scores(mammMasses2[[12]]))
rid13 <- as.data.frame(ridit_scores(mammMasses2[[13]]))
rid14 <- as.data.frame(ridit_scores(mammMasses2[[14]]))
rid15 <- as.data.frame(ridit_scores(mammMasses2[[15]]))

# The target (column 16, Class) is carried over untransformed.
ridClass <- mammMasses2[, 16]
ridClass <- as.data.frame(ridClass)
Bind into mammMassesRidit and Rename Columns
# Column-bind the fifteen transformed features and the untransformed target
# into one data frame, then give every column its final name
ridit_parts <- list(
  rid1, rid2, rid3, rid4, rid5, rid6, rid7, rid8,
  rid9, rid10, rid11, rid12, rid13, rid14, rid15, ridClass
)
mammMassesRidit <- do.call(cbind, ridit_parts)
names(mammMassesRidit) <- c(
  "Age_RIDIT", "BI_RADS_RIDIT", "Shape_RIDIT", "Margin_RIDIT",
  "Density_RIDIT", "age_birads_RIDIT", "age_shape_RIDIT", "age_margin_RIDIT",
  "age_density_RIDIT", "birads_shape_RIDIT", "birads_margin_RIDIT",
  "birads_density_RIDIT", "shape_margin_RIDIT", "shape_density_RIDIT",
  "margin_density_RIDIT", "Class"
)
str(mammMassesRidit)
## 'data.frame': 829 obs. of 16 variables:
## $ Age_RIDIT : num 0.000724 0.002076 0.003005 0.003924 0.005362 ...
## $ BI_RADS_RIDIT : num 0.000696 0.002088 0.003341 0.004594 0.005986 ...
## $ Shape_RIDIT : num 0.000651 0.002169 0.003254 0.003688 0.004121 ...
## $ Margin_RIDIT : num 0.00107 0.00322 0.0045 0.00579 0.00772 ...
## $ Density_RIDIT : num 0.000621 0.001862 0.003103 0.004344 0.005585 ...
## $ age_birads_RIDIT : num 0.000825 0.002364 0.003353 0.004331 0.005968 ...
## $ age_shape_RIDIT : num 0.000748 0.002359 0.003327 0.003643 0.004138 ...
## $ age_margin_RIDIT : num 0.00121 0.00347 0.00463 0.00576 0.00789 ...
## $ age_density_RIDIT : num 0.000744 0.002132 0.003087 0.004031 0.005508 ...
## $ birads_shape_RIDIT : num 0.00073 0.00243 0.0036 0.00404 0.00453 ...
## $ birads_margin_RIDIT : num 0.00119 0.00358 0.00496 0.00634 0.00849 ...
## $ birads_density_RIDIT: num 0.000715 0.002146 0.003434 0.004721 0.006152 ...
## $ shape_margin_RIDIT : num 0.000977 0.003257 0.004625 0.005016 0.005603 ...
## $ shape_density_RIDIT : num 0.000667 0.002223 0.003335 0.003779 0.004224 ...
## $ margin_density_RIDIT: num 0.00109 0.00328 0.00459 0.00591 0.00788 ...
## $ Class : int 1 1 0 1 1 1 0 0 0 0 ...
# Write the ridit-transformed table to a CSV file in the working directory
write.csv(x = mammMassesRidit, file = "mammMassesRidit.csv")
PCA for mammMassesRidit
# Run a principal component analysis on mammMassesRidit and draw its graphs.
# library() fails loudly if FactoMineR is unavailable, and TRUE is used
# instead of T because T is an ordinary variable that can be reassigned.
# NOTE(review): all 16 columns are passed, so the target Class takes part as
# an active variable -- confirm that is intended.
library(FactoMineR)
pca2 <- PCA(mammMassesRidit, graph = TRUE)
pca2
## **Results for the Principal Component Analysis (PCA)**
## The analysis was performed on 829 individuals, described by 16 variables
## *The results are available in the following objects:
##
## name description
## 1 "$eig" "eigenvalues"
## 2 "$var" "results for the variables"
## 3 "$var$coord" "coord. for the variables"
## 4 "$var$cor" "correlations variables - dimensions"
## 5 "$var$cos2" "cos2 for the variables"
## 6 "$var$contrib" "contributions of the variables"
## 7 "$ind" "results for the individuals"
## 8 "$ind$coord" "coord. for the individuals"
## 9 "$ind$cos2" "cos2 for the individuals"
## 10 "$ind$contrib" "contributions of the individuals"
## 11 "$call" "summary statistics"
## 12 "$call$centre" "mean of the variables"
## 13 "$call$ecart.type" "standard error of the variables"
## 14 "$call$row.w" "weights for the individuals"
## 15 "$call$col.w" "weights for the variables"
pca2$eig
## eigenvalue percentage of variance
## comp 1 1.499805e+01 9.373780e+01
## comp 2 9.993216e-01 6.245760e+00
## comp 3 1.758248e-03 1.098905e-02
## comp 4 6.419525e-04 4.012203e-03
## comp 5 1.406426e-04 8.790164e-04
## comp 6 6.017135e-05 3.760709e-04
## comp 7 1.549394e-05 9.683716e-05
## comp 8 8.063074e-06 5.039421e-05
## comp 9 3.300163e-06 2.062602e-05
## comp 10 1.102408e-06 6.890050e-06
## comp 11 1.045593e-06 6.534955e-06
## comp 12 5.697123e-07 3.560702e-06
## comp 13 1.701547e-07 1.063467e-06
## comp 14 1.201093e-07 7.506832e-07
## comp 15 8.450345e-08 5.281466e-07
## comp 16 3.727988e-08 2.329993e-07
## cumulative percentage of variance
## comp 1 93.73780
## comp 2 99.98356
## comp 3 99.99455
## comp 4 99.99856
## comp 5 99.99944
## comp 6 99.99981
## comp 7 99.99991
## comp 8 99.99996
## comp 9 99.99998
## comp 10 99.99999
## comp 11 99.99999
## comp 12 100.00000
## comp 13 100.00000
## comp 14 100.00000
## comp 15 100.00000
## comp 16 100.00000
dimdesc(pca2)
## $Dim.1
## $Dim.1$quanti
## correlation p.value
## shape_density_RIDIT 0.9999624 0
## birads_shape_RIDIT 0.9999615 0
## Shape_RIDIT 0.9999596 0
## Margin_RIDIT 0.9999554 0
## age_margin_RIDIT 0.9999373 0
## age_birads_RIDIT 0.9999343 0
## age_density_RIDIT 0.9999307 0
## birads_density_RIDIT 0.9999134 0
## BI_RADS_RIDIT 0.9999117 0
## margin_density_RIDIT 0.9999010 0
## age_shape_RIDIT 0.9998903 0
## Age_RIDIT 0.9998860 0
## birads_margin_RIDIT 0.9998857 0
## Density_RIDIT 0.9998611 0
## shape_margin_RIDIT 0.9997687 0
##
##
## $Dim.2
## $Dim.2$quanti
## correlation p.value
## Class 0.9996353 0
Alternative PCA Method: Get Loadings of Best Principal Components
# Principal components of the 15 transformed feature columns (Class, column
# 16, is left out). All rows are selected without hard-coding the row count,
# so the call keeps working if the dataset grows or shrinks.
model2 <- princomp(~ ., data = mammMassesRidit[, 1:15], na.action = na.omit)
model2$loadings
##
## Loadings:
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7
## Age_RIDIT -0.256 -0.317 -0.199 -0.240 0.415 -0.138
## BI_RADS_RIDIT -0.259 -0.304 -0.144 0.408 0.261
## Shape_RIDIT -0.258 -0.324 0.176 0.343 -0.272
## Margin_RIDIT -0.256 0.159 0.225 0.347 0.279 -0.302
## Density_RIDIT -0.260 -0.356 0.211 0.405 0.145 0.264
## age_birads_RIDIT -0.258 -0.224 -0.102 -0.472
## age_shape_RIDIT -0.257 0.130 -0.530 -0.370 0.129
## age_margin_RIDIT -0.257 0.221 0.178 -0.266 0.235 -0.423 0.138
## age_density_RIDIT -0.259 -0.273 -0.429 -0.173
## birads_shape_RIDIT -0.259 -0.238 -0.106 -0.530 0.195 -0.180
## birads_margin_RIDIT -0.258 0.271 0.342 -0.344 -0.318
## birads_density_RIDIT -0.261 -0.249 0.280 0.166 -0.390 -0.126 0.268
## shape_margin_RIDIT -0.257 0.501 -0.114 0.197 0.208 0.592
## shape_density_RIDIT -0.259 0.102 -0.181 0.454 -0.183 -0.149
## margin_density_RIDIT -0.258 0.241 0.362 0.242 -0.104 -0.359
## Comp.8 Comp.9 Comp.10 Comp.11 Comp.12 Comp.13 Comp.14
## Age_RIDIT -0.140 0.268 0.166 -0.185 0.519
## BI_RADS_RIDIT -0.230 0.265 -0.252 -0.274
## Shape_RIDIT 0.321 0.147 -0.175 -0.171 -0.271 -0.519
## Margin_RIDIT -0.349 0.321 -0.125 0.557
## Density_RIDIT 0.314 0.177 -0.128 0.563 0.140
## age_birads_RIDIT -0.274 -0.139 -0.330 0.159 0.230 0.484 -0.332
## age_shape_RIDIT 0.161 -0.107 0.574 0.308 -0.102
## age_margin_RIDIT 0.510 -0.248 -0.362 -0.173 -0.120 -0.129 0.124
## age_density_RIDIT -0.333 0.228 -0.491 -0.216 -0.181
## birads_shape_RIDIT -0.243 -0.429 -0.101 0.179 0.462
## birads_margin_RIDIT 0.184 0.640 0.180 0.148
## birads_density_RIDIT -0.107 -0.183 0.139 -0.359 -0.106
## shape_margin_RIDIT -0.409 0.141 -0.185
## shape_density_RIDIT -0.468 0.352 0.419 -0.181 0.183
## margin_density_RIDIT -0.300 -0.169 0.291 -0.522 0.247
## Comp.15
## Age_RIDIT -0.323
## BI_RADS_RIDIT 0.553
## Shape_RIDIT -0.263
## Margin_RIDIT
## Density_RIDIT
## age_birads_RIDIT -0.104
## age_shape_RIDIT
## age_margin_RIDIT
## age_density_RIDIT 0.384
## birads_shape_RIDIT
## birads_margin_RIDIT
## birads_density_RIDIT -0.564
## shape_margin_RIDIT
## shape_density_RIDIT 0.170
## margin_density_RIDIT
##
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7 Comp.8
## SS loadings 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000
## Proportion Var 0.067 0.067 0.067 0.067 0.067 0.067 0.067 0.067
## Cumulative Var 0.067 0.133 0.200 0.267 0.333 0.400 0.467 0.533
## Comp.9 Comp.10 Comp.11 Comp.12 Comp.13 Comp.14 Comp.15
## SS loadings 1.000 1.000 1.000 1.000 1.000 1.000 1.000
## Proportion Var 0.067 0.067 0.067 0.067 0.067 0.067 0.067
## Cumulative Var 0.600 0.667 0.733 0.800 0.867 0.933 1.000
Other Modeling to be Done in Rattle