The dataset for this model was downloaded from Kaggle.com. I wanted to use a Linear Discriminant Analysis (LDA) model to predict breast cancer type (benign or malignant). First, I split the dataset into training and test sets. I then fit an LDA model on the training set and used it to make predictions on both the training and test sets.

My LDA model was about 96% accurate on the training dataset (accuracy 0.9649) and about 94% accurate on the held-out test dataset (accuracy 0.9366).

# Load the breast-cancer CSV into a data frame and show its structure.
data <- read.csv(
  file = "~/1 UW Tacoma/560 data mining/data/Data sets/breast-cancer.csv"
)
str(object = data)

## 'data.frame':    569 obs. of  32 variables:
##  $ id                     : int  842302 842517 84300903 84348301 84358402 843786 844359 84458202 844981 84501001 ...
##  $ diagnosis              : Factor w/ 2 levels "B","M": 2 2 2 2 2 2 2 2 2 2 ...
##  $ radius_mean            : num  18 20.6 19.7 11.4 20.3 ...
##  $ texture_mean           : num  10.4 17.8 21.2 20.4 14.3 ...
##  $ perimeter_mean         : num  122.8 132.9 130 77.6 135.1 ...
##  $ area_mean              : num  1001 1326 1203 386 1297 ...
##  $ smoothness_mean        : num  0.1184 0.0847 0.1096 0.1425 0.1003 ...
##  $ compactness_mean       : num  0.2776 0.0786 0.1599 0.2839 0.1328 ...
##  $ concavity_mean         : num  0.3001 0.0869 0.1974 0.2414 0.198 ...
##  $ concave.points_mean    : num  0.1471 0.0702 0.1279 0.1052 0.1043 ...
##  $ symmetry_mean          : num  0.242 0.181 0.207 0.26 0.181 ...
##  $ fractal_dimension_mean : num  0.0787 0.0567 0.06 0.0974 0.0588 ...
##  $ radius_se              : num  1.095 0.543 0.746 0.496 0.757 ...
##  $ texture_se             : num  0.905 0.734 0.787 1.156 0.781 ...
##  $ perimeter_se           : num  8.59 3.4 4.58 3.44 5.44 ...
##  $ area_se                : num  153.4 74.1 94 27.2 94.4 ...
##  $ smoothness_se          : num  0.0064 0.00522 0.00615 0.00911 0.01149 ...
##  $ compactness_se         : num  0.049 0.0131 0.0401 0.0746 0.0246 ...
##  $ concavity_se           : num  0.0537 0.0186 0.0383 0.0566 0.0569 ...
##  $ concave.points_se      : num  0.0159 0.0134 0.0206 0.0187 0.0188 ...
##  $ symmetry_se            : num  0.03 0.0139 0.0225 0.0596 0.0176 ...
##  $ fractal_dimension_se   : num  0.00619 0.00353 0.00457 0.00921 0.00511 ...
##  $ radius_worst           : num  25.4 25 23.6 14.9 22.5 ...
##  $ texture_worst          : num  17.3 23.4 25.5 26.5 16.7 ...
##  $ perimeter_worst        : num  184.6 158.8 152.5 98.9 152.2 ...
##  $ area_worst             : num  2019 1956 1709 568 1575 ...
##  $ smoothness_worst       : num  0.162 0.124 0.144 0.21 0.137 ...
##  $ compactness_worst      : num  0.666 0.187 0.424 0.866 0.205 ...
##  $ concavity_worst        : num  0.712 0.242 0.45 0.687 0.4 ...
##  $ concave.points_worst   : num  0.265 0.186 0.243 0.258 0.163 ...
##  $ symmetry_worst         : num  0.46 0.275 0.361 0.664 0.236 ...
##  $ fractal_dimension_worst: num  0.1189 0.089 0.0876 0.173 0.0768 ...

# Keep only the rows that have no missing value in any column.
data <- data[which(complete.cases(data)), ]
# Sanity check: prints FALSE when no NA is left anywhere in the data frame.
sum(is.na(data)) > 0

## [1] FALSE

# Drop the first column (the patient id), which is not a predictor.
data <- data[, -1]
# Relabel the outcome: "B" becomes "Benign", anything else "Malignant".
data[["diagnosis"]] <- factor(
  ifelse(data[["diagnosis"]] == "B", "Benign", "Malignant")
)

# Class counts after relabelling.
table(data[["diagnosis"]])

## 
##    Benign Malignant 
##       357       212

library(caTools)
library(ISLR)
library(MASS)
library(caret)

## Loading required package: lattice

## Loading required package: ggplot2

## Warning: package 'ggplot2' was built under R version 3.4.4

# Fix the RNG seed so the train/test split below is reproducible.
# (A preceding set.seed(88) was removed: it was immediately overridden by
# this call and therefore had no effect.)
set.seed(101)
# Logical vector over the rows of `data`: TRUE marks a training row
# (SplitRatio = 0.75), FALSE marks a test row.
split <- sample.split(data$diagnosis, SplitRatio = 0.75)

# Get training and test data
datatrain <- subset(data, split == TRUE)
datatest <- subset(data, split == FALSE)

# Fit an LDA classifier with caret, using every remaining column as a
# predictor of diagnosis; predictors are scaled and centered first.
LDA <- train(
  diagnosis ~ .,
  data = datatrain,
  method = "lda",
  preProcess = c("scale", "center")
)
# Show the resampling summary of the fitted model.
print(LDA)

## Linear Discriminant Analysis 
## 
## 427 samples
##  30 predictor
##   2 classes: 'Benign', 'Malignant' 
## 
## Pre-processing: scaled (30), centered (30) 
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 427, 427, 427, 427, 427, 427, ... 
## Resampling results:
## 
##   Accuracy   Kappa    
##   0.9462222  0.8826542

# Confusion matrix of the model's predictions on the training data.
# NOTE(review): no `positive` class is given, so the recorded output reports
# 'Benign' as the positive class; consider positive = "Malignant" if
# sensitivity should refer to detecting cancer.
confusionMatrix(
  data = predict(LDA, newdata = datatrain),
  reference = datatrain$diagnosis
)

## Confusion Matrix and Statistics
## 
##            Reference
## Prediction  Benign Malignant
##   Benign       267        14
##   Malignant      1       145
##                                           
##                Accuracy : 0.9649          
##                  95% CI : (0.9427, 0.9802)
##     No Information Rate : 0.6276          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9236          
##  Mcnemar's Test P-Value : 0.001946        
##                                           
##             Sensitivity : 0.9963          
##             Specificity : 0.9119          
##          Pos Pred Value : 0.9502          
##          Neg Pred Value : 0.9932          
##              Prevalence : 0.6276          
##          Detection Rate : 0.6253          
##    Detection Prevalence : 0.6581          
##       Balanced Accuracy : 0.9541          
##                                           
##        'Positive' Class : Benign          
##

# Confusion matrix of the model's predictions on the held-out test data.
# NOTE(review): no `positive` class is given, so the recorded output reports
# 'Benign' as the positive class; consider positive = "Malignant" if
# sensitivity should refer to detecting cancer.
confusionMatrix(
  data = predict(LDA, newdata = datatest),
  reference = datatest$diagnosis
)

## Confusion Matrix and Statistics
## 
##            Reference
## Prediction  Benign Malignant
##   Benign        88         8
##   Malignant      1        45
##                                           
##                Accuracy : 0.9366          
##                  95% CI : (0.8831, 0.9706)
##     No Information Rate : 0.6268          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.8608          
##  Mcnemar's Test P-Value : 0.0455          
##                                           
##             Sensitivity : 0.9888          
##             Specificity : 0.8491          
##          Pos Pred Value : 0.9167          
##          Neg Pred Value : 0.9783          
##              Prevalence : 0.6268          
##          Detection Rate : 0.6197          
##    Detection Prevalence : 0.6761          
##       Balanced Accuracy : 0.9189          
##                                           
##        'Positive' Class : Benign          
##

library(PerformanceAnalytics)

## Loading required package: xts

## Loading required package: zoo

## 
## Attaching package: 'zoo'

## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric

## 
## Attaching package: 'PerformanceAnalytics'

## The following object is masked from 'package:graphics':
## 
##     legend

# Correlation chart (with histograms) for columns 2 to 11 of `data`.
chart.Correlation(data[, 2:11], histogram = TRUE)

library(psych)

## 
## Attaching package: 'psych'

## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha

# Pairwise scatter-plot matrix for columns 12 to 21 of `data`.
# TRUE/FALSE are spelled out because T and F are ordinary variables that can
# be reassigned.
pairs.panels(
  data[, 12:21],
  ellipses = TRUE,
  pch = 1,
  lm = TRUE,
  cex.cor = 1,
  smoother = FALSE,
  stars = TRUE,
  main = "Cancer SE"
)

Breast cancer classification

My LDA model was about 96% accurate on the training dataset (accuracy 0.9649) and about 94% accurate on the held-out test dataset (accuracy 0.9366).