1. Intro & Purpose

In this assignment we will use the lda algorithm from the MASS package, the knn algorithm from the class package, the glm function for logistic regression, and the svm function from the e1071 package in R to classify malignantvs. benign breast tumors.

The dataset we will use can be found at the UCI Machine Learning Repository, https://archive.ics.uci.edu/ml/datasets.html, and is labeled Breast Cancer Wisconsin (Diagnostic).It contains 569 cases of cancer biopsies, each with 32 features. The first feature is an ID number, the second is the cancer diagnosis, and 30 are numeric-valued laboratory measurements. The diagnosis is coded as “B” to indicate benignor “M” to indicate malignant.The remaining 30 numeric measurements consist of the mean, standard error, and worst value for 10 different characteristics of the digitized cell nuclei which are: Radius, Texture, Perimeter, Area, Smoothness, Compactness, Concavity, Concave points, Symmetry, and Fractal dimension.

library(tidyr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(latticeExtra)
## Loading required package: lattice
## Loading required package: RColorBrewer
## 
## Attaching package: 'latticeExtra'
## The following object is masked from 'package:ggplot2':
## 
##     layer
library(psych)
## 
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
library(funModeling)# for Exploratory Data Analysis and Data Preparation
## Loading required package: Hmisc
## Loading required package: survival
## Loading required package: Formula
## 
## Attaching package: 'Hmisc'
## The following object is masked from 'package:psych':
## 
##     describe
## The following objects are masked from 'package:dplyr':
## 
##     src, summarize
## The following objects are masked from 'package:base':
## 
##     format.pval, units
## funModeling v.1.7 :)
## Examples and tutorials at livebook.datascienceheroes.com
library(GGally)
## 
## Attaching package: 'GGally'
## The following object is masked from 'package:funModeling':
## 
##     range01
## The following object is masked from 'package:dplyr':
## 
##     nasa
library(corrplot)
## corrplot 0.84 loaded
library(class)
library(gmodels)
library(magrittr)
## 
## Attaching package: 'magrittr'
## The following object is masked from 'package:tidyr':
## 
##     extract
library(leaps)
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:psych':
## 
##     logit
## The following object is masked from 'package:dplyr':
## 
##     recode
library(ROCR)
## Loading required package: gplots
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
library(e1071)
## 
## Attaching package: 'e1071'
## The following object is masked from 'package:Hmisc':
## 
##     impute
library(caret)
## 
## Attaching package: 'caret'
## The following object is masked from 'package:survival':
## 
##     cluster
library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select

since each variables has 10 features ,we are going to create a feature’vector.

setwd("~/myRprojects")
cancer <- read.csv("wdbc.txt")
#since each variables has 10 features ,we are going to create a feature'vector
feature <- c("radius","texture","perimeter","area","smoothness","compactness","concavity",
             "concave.points","symmetry","fractal.dimension")
#We are creating a vector with which we are going to compute with each vector
measure <-c("mean","se","worst")
attribute <- paste(rep(feature, 3),".",rep(measure, each=10))
colnames(cancer) <- c("id","diagnosis",attribute)
wdbc <- cancer[-1]#we are deleting the id column 
str(wdbc)
## 'data.frame':    568 obs. of  31 variables:
##  $ diagnosis                : Factor w/ 2 levels "B","M": 2 2 2 2 2 2 2 2 2 2 ...
##  $ radius . mean            : num  20.6 19.7 11.4 20.3 12.4 ...
##  $ texture . mean           : num  17.8 21.2 20.4 14.3 15.7 ...
##  $ perimeter . mean         : num  132.9 130 77.6 135.1 82.6 ...
##  $ area . mean              : num  1326 1203 386 1297 477 ...
##  $ smoothness . mean        : num  0.0847 0.1096 0.1425 0.1003 0.1278 ...
##  $ compactness . mean       : num  0.0786 0.1599 0.2839 0.1328 0.17 ...
##  $ concavity . mean         : num  0.0869 0.1974 0.2414 0.198 0.1578 ...
##  $ concave.points . mean    : num  0.0702 0.1279 0.1052 0.1043 0.0809 ...
##  $ symmetry . mean          : num  0.181 0.207 0.26 0.181 0.209 ...
##  $ fractal.dimension . mean : num  0.0567 0.06 0.0974 0.0588 0.0761 ...
##  $ radius . se              : num  0.543 0.746 0.496 0.757 0.335 ...
##  $ texture . se             : num  0.734 0.787 1.156 0.781 0.89 ...
##  $ perimeter . se           : num  3.4 4.58 3.44 5.44 2.22 ...
##  $ area . se                : num  74.1 94 27.2 94.4 27.2 ...
##  $ smoothness . se          : num  0.00522 0.00615 0.00911 0.01149 0.00751 ...
##  $ compactness . se         : num  0.0131 0.0401 0.0746 0.0246 0.0335 ...
##  $ concavity . se           : num  0.0186 0.0383 0.0566 0.0569 0.0367 ...
##  $ concave.points . se      : num  0.0134 0.0206 0.0187 0.0188 0.0114 ...
##  $ symmetry . se            : num  0.0139 0.0225 0.0596 0.0176 0.0216 ...
##  $ fractal.dimension . se   : num  0.00353 0.00457 0.00921 0.00511 0.00508 ...
##  $ radius . worst           : num  25 23.6 14.9 22.5 15.5 ...
##  $ texture . worst          : num  23.4 25.5 26.5 16.7 23.8 ...
##  $ perimeter . worst        : num  158.8 152.5 98.9 152.2 103.4 ...
##  $ area . worst             : num  1956 1709 568 1575 742 ...
##  $ smoothness . worst       : num  0.124 0.144 0.21 0.137 0.179 ...
##  $ compactness . worst      : num  0.187 0.424 0.866 0.205 0.525 ...
##  $ concavity . worst        : num  0.242 0.45 0.687 0.4 0.535 ...
##  $ concave.points . worst   : num  0.186 0.243 0.258 0.163 0.174 ...
##  $ symmetry . worst         : num  0.275 0.361 0.664 0.236 0.399 ...
##  $ fractal.dimension . worst: num  0.089 0.0876 0.173 0.0768 0.1244 ...

EXPLORATORY ANALYSIS

summary

summary(wdbc)
##  diagnosis radius . mean    texture . mean  perimeter . mean
##  B:357     Min.   : 6.981   Min.   : 9.71   Min.   : 43.79  
##  M:211     1st Qu.:11.697   1st Qu.:16.18   1st Qu.: 75.14  
##            Median :13.355   Median :18.86   Median : 86.21  
##            Mean   :14.120   Mean   :19.31   Mean   : 91.91  
##            3rd Qu.:15.780   3rd Qu.:21.80   3rd Qu.:103.88  
##            Max.   :28.110   Max.   :39.28   Max.   :188.50  
##   area . mean     smoothness . mean compactness . mean concavity . mean 
##  Min.   : 143.5   Min.   :0.05263   Min.   :0.01938    Min.   :0.00000  
##  1st Qu.: 420.2   1st Qu.:0.08629   1st Qu.:0.06481    1st Qu.:0.02954  
##  Median : 548.8   Median :0.09587   Median :0.09252    Median :0.06140  
##  Mean   : 654.3   Mean   :0.09632   Mean   :0.10404    Mean   :0.08843  
##  3rd Qu.: 782.6   3rd Qu.:0.10530   3rd Qu.:0.13040    3rd Qu.:0.12965  
##  Max.   :2501.0   Max.   :0.16340   Max.   :0.34540    Max.   :0.42680  
##  concave.points . mean symmetry . mean  fractal.dimension . mean
##  Min.   :0.00000       Min.   :0.1060   Min.   :0.04996         
##  1st Qu.:0.02031       1st Qu.:0.1619   1st Qu.:0.05770         
##  Median :0.03345       Median :0.1792   Median :0.06152         
##  Mean   :0.04875       Mean   :0.1811   Mean   :0.06277         
##  3rd Qu.:0.07373       3rd Qu.:0.1956   3rd Qu.:0.06612         
##  Max.   :0.20120       Max.   :0.3040   Max.   :0.09744         
##   radius . se      texture . se    perimeter . se     area . se      
##  Min.   :0.1115   Min.   :0.3602   Min.   : 0.757   Min.   :  6.802  
##  1st Qu.:0.2324   1st Qu.:0.8331   1st Qu.: 1.605   1st Qu.: 17.850  
##  Median :0.3240   Median :1.1095   Median : 2.285   Median : 24.485  
##  Mean   :0.4040   Mean   :1.2174   Mean   : 2.856   Mean   : 40.138  
##  3rd Qu.:0.4773   3rd Qu.:1.4743   3rd Qu.: 3.337   3rd Qu.: 45.017  
##  Max.   :2.8730   Max.   :4.8850   Max.   :21.980   Max.   :542.200  
##  smoothness . se    compactness . se   concavity . se   
##  Min.   :0.001713   Min.   :0.002252   Min.   :0.00000  
##  1st Qu.:0.005166   1st Qu.:0.013048   1st Qu.:0.01506  
##  Median :0.006374   Median :0.020435   Median :0.02587  
##  Mean   :0.007042   Mean   :0.025437   Mean   :0.03186  
##  3rd Qu.:0.008151   3rd Qu.:0.032218   3rd Qu.:0.04176  
##  Max.   :0.031130   Max.   :0.135400   Max.   :0.39600  
##  concave.points . se symmetry . se      fractal.dimension . se
##  Min.   :0.000000    Min.   :0.007882   Min.   :0.0008948     
##  1st Qu.:0.007634    1st Qu.:0.015128   1st Qu.:0.0022445     
##  Median :0.010920    Median :0.018725   Median :0.0031615     
##  Mean   :0.011789    Mean   :0.020526   Mean   :0.0037907     
##  3rd Qu.:0.014710    3rd Qu.:0.023398   3rd Qu.:0.0045258     
##  Max.   :0.052790    Max.   :0.078950   Max.   :0.0298400     
##  radius . worst  texture . worst perimeter . worst  area . worst   
##  Min.   : 7.93   Min.   :12.02   Min.   : 50.41    Min.   : 185.2  
##  1st Qu.:13.01   1st Qu.:21.09   1st Qu.: 84.10    1st Qu.: 515.0  
##  Median :14.96   Median :25.43   Median : 97.66    Median : 685.5  
##  Mean   :16.25   Mean   :25.69   Mean   :107.13    Mean   : 878.6  
##  3rd Qu.:18.77   3rd Qu.:29.76   3rd Qu.:125.17    3rd Qu.:1073.5  
##  Max.   :36.04   Max.   :49.54   Max.   :251.20    Max.   :4254.0  
##  smoothness . worst compactness . worst concavity . worst
##  Min.   :0.07117    Min.   :0.02729     Min.   :0.0000   
##  1st Qu.:0.11660    1st Qu.:0.14690     1st Qu.:0.1145   
##  Median :0.13130    Median :0.21185     Median :0.2266   
##  Mean   :0.13232    Mean   :0.25354     Mean   :0.2714   
##  3rd Qu.:0.14600    3rd Qu.:0.33760     3rd Qu.:0.3814   
##  Max.   :0.22260    Max.   :1.05800     Max.   :1.2520   
##  concave.points . worst symmetry . worst fractal.dimension . worst
##  Min.   :0.00000        Min.   :0.1565   Min.   :0.05504          
##  1st Qu.:0.06473        1st Qu.:0.2504   1st Qu.:0.07141          
##  Median :0.09984        Median :0.2821   Median :0.08002          
##  Mean   :0.11434        Mean   :0.2898   Mean   :0.08388          
##  3rd Qu.:0.16132        3rd Qu.:0.3177   3rd Qu.:0.09206          
##  Max.   :0.29100        Max.   :0.6638   Max.   :0.20750

Correlation between each variables

We have 30 features with specific charateristics(mean standard,deviation and worst). We chose to check the correlations within each characteristic

Mean

ggpairs(wdbc[,c(1,2:11)], aes(color=diagnosis, alpha=0.75), lower=list(continuous="smooth"))+ theme_bw()+
labs(title="Cancer Mean")+
theme(plot.title=element_text(face='bold',color='black',hjust=0.5,size=12))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

SE

ggpairs(wdbc[,c(1,12:21)], aes(color=diagnosis, alpha=0.75), lower=list(continuous="smooth"))+ theme_bw()+
labs(title="Cancer Mean")+
theme(plot.title=element_text(face='bold',color='black',hjust=0.5,size=12))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

WORST

ggpairs(wdbc[,c(1,22:31)], aes(color=diagnosis, alpha=0.75), lower=list(continuous="smooth"))+ theme_bw()+
labs(title="Cancer Mean")+
theme(plot.title=element_text(face='bold',color='black',hjust=0.5,size=12))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Histogram

plot_num(wdbc)

Correlation plot

Mean

# Correlation plot with color and correlation MIC
aa <- wdbc[,2:11]
names(aa) <- c("radius","texture","perimeter","area","smoothness","compactness","concavity",
             "concave.points","symmetry","frac.dimension")
ggcorr(aa[,1:10], geom = "blank", label = TRUE, hjust = 0.75) +
  geom_point(size = 10, aes(color = coefficient > 0, alpha = abs(coefficient) > 0.5)) +
  scale_alpha_manual(values = c("TRUE" = 0.25, "FALSE" = 0)) +
  guides(color = FALSE, alpha = FALSE)

SE

ab <- wdbc[,12:21]
names(ab) <- c("radius","texture","perimeter","area","smoothness","compactness","concavity",
             "concave.points","symmetry","frac.dimension")
ggcorr(ab[,1:10], geom = "blank", label = TRUE, hjust = 0.75) +
  geom_point(size = 10, aes(color = coefficient > 0, alpha = abs(coefficient) > 0.5)) +
  scale_alpha_manual(values = c("TRUE" = 0.25, "FALSE" = 0)) +
  guides(color = FALSE, alpha = FALSE)

WORST

# Correlation plot with color and correlation MIC
ac <- wdbc[,22:31]
names(ac) <- c("radius","texture","perimeter","area","smoothness","compactness","concavity",
             "concave.points","symmetry","frac.dimension")
ggcorr(ac[,1:10], geom = "blank", label = TRUE, hjust = 0.75) +
  geom_point(size = 10, aes(color = coefficient > 0, alpha = abs(coefficient) > 0.5)) +
  scale_alpha_manual(values = c("TRUE" = 0.25, "FALSE" = 0)) +
  guides(color = FALSE, alpha = FALSE)

Graphical Analysis

We are going to analyse the

MEAN

#varquant1 <- cbind(varquant,AHD=varqual$AHD)
cross_plot(wdbc[,1:11], input=names(wdbc[,1:11]), target="diagnosis")
## Plotting transformed variable 'radius . mean' with 'equal_freq', (too many values). Disable with 'auto_binning=FALSE'

## Plotting transformed variable 'texture . mean' with 'equal_freq', (too many values). Disable with 'auto_binning=FALSE'

## Plotting transformed variable 'perimeter . mean' with 'equal_freq', (too many values). Disable with 'auto_binning=FALSE'

## Plotting transformed variable 'area . mean' with 'equal_freq', (too many values). Disable with 'auto_binning=FALSE'

## Plotting transformed variable 'smoothness . mean' with 'equal_freq', (too many values). Disable with 'auto_binning=FALSE'

## Plotting transformed variable 'compactness . mean' with 'equal_freq', (too many values). Disable with 'auto_binning=FALSE'

## Plotting transformed variable 'concavity . mean' with 'equal_freq', (too many values). Disable with 'auto_binning=FALSE'

## Plotting transformed variable 'concave.points . mean' with 'equal_freq', (too many values). Disable with 'auto_binning=FALSE'

## Plotting transformed variable 'symmetry . mean' with 'equal_freq', (too many values). Disable with 'auto_binning=FALSE'

## Plotting transformed variable 'fractal.dimension . mean' with 'equal_freq', (too many values). Disable with 'auto_binning=FALSE'

Radius :Frome the plot on the leftf side we remark that the Malignant’s diagnosid increase as the radius.mean increase.When a patient’s radius.mean is between [6.99,10.3),the probability to be diagnosed with a benign tumor reaches its highest level (almost 100%). The likelihood to be diagnosed with a malignant given the patient radius’ mean is 100% for radius.mean in [19.55,28.1] and 98.2% for radius.mean in [15.06,17.1).56 of the 57 patient in this range [15.06,17.1) are diadnosed with a malignant tumor.

Texture: The Malignant diagnosis increase as the textur’s mean increase.We noticed a decreasing motion after a texture’mean in [24.47,25.1). This could be a noise in the data that we will need to focus on.Patient with a texture’s mean between [24.47,25.1) are more likely to be diagnosed with a malignant tumor(42 case out of 57 patients). When the patient texture.mean is in [9.71,14.1) the chance to be diagnosed with a malignant tumor is 1.8%

Perimeter:The perimeter’mean and Malignant diagnosis are positively increasing in the same direction.When a perimeter.mean is between [43.8,73.3) the malignant diagnosis rate is at its lowest (1.8%).This is the complete opposit when the perimeter mean is [111.5,129.5) which means that 56 of the 57 are diagnosed with malignant tumor.

Area: The likelihood to be diagnosed with a malignant tumor increase as the area.mean increase.This output shows that when the patient’s result is in [919,1192),the likelihood to be diagnosed with a malignant become 98.2%.

Smoothness:The likelihood to be diagnosed with a malignant tumor increase as the smoothness.mean increase.64.3% of the patient in the smoothnessmean group are diagnosed with a malignant tumor.In other words 36 of the 56 patient in this range have a malignant tumor.

Compactness: 89.3% in the compactness mean range [0.1765,0.3454)are diagnosed with a malignant tumor.We also noticed that the malignant diagnosis increase when the compactness.mean increase.

Concavity:As the cavity mean increase the malignant diagnosis increase.The highest malignant diagnosis rate happened to be in the range of cavity.mean[0.1508,0.2032) which is 94.7%.

Concave.point: All the patients with a concave point mean between [0,0.0179) are diagnosed with a Benign tumor.The likelihood to be diagnosed with a malignant tumor given a concave.point.mean in [0.0847,0.1009) is 98.2% . 56 out of 57 people in this concave.point.mean range are diagnosed with a malignant tumor.

Symmetry:Here the lowest positive diagnosis rate is 8.8% ; it correspond to the symmetry.mean range [0.106,0.150). 69.6% is the highest rate for the malignant diagnosis. 17 of 56 people are diagnosed with a benign tumor where as 36 malignant case are recorded among the patient with symmetry.mean in [0.215,0.304] .

Fractal.dimension:This is the only one within the mean group that we do not observe a parallel evolution between the numeric measurement and the diagnosis factor.Here we obseved two different motions .The first one is decreasing motion with a highest positve rate of 61.4% and a lowest positive rate of 20.7% of the patient in [0.0500,0.0553).The second motion looks llike an increasing motion. [0.0631,0.0651) seems to be a noise in our data set

SE

cross_plot(wdbc[,c(1,12:21)], input=names(wdbc[,c(1,12:21)]), target="diagnosis")
## Plotting transformed variable 'radius . se' with 'equal_freq', (too many values). Disable with 'auto_binning=FALSE'

## Plotting transformed variable 'texture . se' with 'equal_freq', (too many values). Disable with 'auto_binning=FALSE'

## Plotting transformed variable 'perimeter . se' with 'equal_freq', (too many values). Disable with 'auto_binning=FALSE'

## Plotting transformed variable 'area . se' with 'equal_freq', (too many values). Disable with 'auto_binning=FALSE'

## Plotting transformed variable 'smoothness . se' with 'equal_freq', (too many values). Disable with 'auto_binning=FALSE'

## Plotting transformed variable 'compactness . se' with 'equal_freq', (too many values). Disable with 'auto_binning=FALSE'

## Plotting transformed variable 'concavity . se' with 'equal_freq', (too many values). Disable with 'auto_binning=FALSE'

## Plotting transformed variable 'concave.points . se' with 'equal_freq', (too many values). Disable with 'auto_binning=FALSE'

## Plotting transformed variable 'symmetry . se' with 'equal_freq', (too many values). Disable with 'auto_binning=FALSE'

## Plotting transformed variable 'fractal.dimension . se' with 'equal_freq', (too many values). Disable with 'auto_binning=FALSE'

Radius :The malignant tumor diagnosis increase as the radius standard deviation increase.Radius.se in [0.755,2.873] happen to have the highest diagnostis rate of malignant tumor(96.4%).When a patient is diagnosed with a radius.se in [0.112,0.183) ; he is 100% more likely to be classified as a benign tumor.

Texture: Here we noticed that the likelihood to be diagnosed with a malignant tumor does not evoluate that much along side the texture.se.When a patient texture.se is in [0.380-1.916] , she/he is 40% likely to be diagnosed with a malignant tumor.

Perimeter:The likelihood to be diagnosed with a malignant tumor increases as the perimeter.se increases.Patients with a perimeter.se in [0.757,1.28)are more likely to be diagnosed with a benign tumor(100%).77% of the patients in the perimeter range[3.769,5.14) are diagnosed with a malignant tumor whereas all the patient that the perimeter.se fall into [5.144,21.98]are diagnosed with a malignant tumor.

Area:This output shows that likelihood tobe diagnosed with the benign tumor decreases as the area.se increase.The opposit phenomen is observed with the malignant tumor.The likelihood to be fully diagnosed with a benign tumor/ malignant tumor given the area.se fall respectively in [6.8,13.2) ,[92.8,542.2] is 100%. 98.2% of the patients in the area.se range[54.2,92.8) are positively diadnosed with malignant tumor.

Smoothness:The likelihood to be diagnosed with a malignant tumore does not evoluate along side the smoothness.se.Here we noticed that the likelihood to be diagnosis with a malignant tumor is les than 50% across the smoothness.se metric. 42 of 56 patients in the section are diagnosed with a benign tumor which is 75% in [0.01052,0.03113]

Compactness: Patient in [0.00225,0.00918) are 96.6% more likely to be diagnosed with a benign tumor than the one outside of that range.We also observe that the benign dianosis and the malignant diagnosis respectively decreases and increase along side the compactness.se.For any patient with compactness.se in [0.02045,0.02449) the probability of getting malignant diagnosis is 63.2% which is the highest under this metric.

Concavity:Any concavity.se that is fallen into [0,0.00774) is 100% subjected to a benign tumor.66.7% (38 of 57) of the patients in [0.04644,0.05915) are diagnosed with a malignant tumor.This latter happen to be the highest one under the concavity.se metric

Concave.point: The chance to be classified as malignant given the concave.point.se metric .When the concsve.poin.se is between [0.01583,0.05279] the likelihhod of being malignant is around 66%. in other words 38 of 56 in that segement are diagnosed with malignant.

Symmetry:The likelihood of being diagnosed with a malignant tumor given the symmetry.se is less than 50%.Thus this metric shows that most of the patients are most likely to be diagnosed with benign tumor.Here the highest probability of being diagnosed with benign tumor is 77.2% given the patients symmetry.se is in [0.02571,0.0306).

Fractal.dimension:When the fractal.dimension.se is in [0.003614,0.00361) or [0.004831,0.00619) the likelihood of being diagnosed with malignant is respectively 52.6% and 54.4%.

WORST

cross_plot(wdbc[,c(1,22:31)], input=names(wdbc[,c(1,22:31)]), target="diagnosis")
## Plotting transformed variable 'radius . worst' with 'equal_freq', (too many values). Disable with 'auto_binning=FALSE'

## Plotting transformed variable 'texture . worst' with 'equal_freq', (too many values). Disable with 'auto_binning=FALSE'

## Plotting transformed variable 'perimeter . worst' with 'equal_freq', (too many values). Disable with 'auto_binning=FALSE'

## Plotting transformed variable 'area . worst' with 'equal_freq', (too many values). Disable with 'auto_binning=FALSE'

## Plotting transformed variable 'smoothness . worst' with 'equal_freq', (too many values). Disable with 'auto_binning=FALSE'

## Plotting transformed variable 'compactness . worst' with 'equal_freq', (too many values). Disable with 'auto_binning=FALSE'

## Plotting transformed variable 'concavity . worst' with 'equal_freq', (too many values). Disable with 'auto_binning=FALSE'

## Plotting transformed variable 'concave.points . worst' with 'equal_freq', (too many values). Disable with 'auto_binning=FALSE'

## Plotting transformed variable 'symmetry . worst' with 'equal_freq', (too many values). Disable with 'auto_binning=FALSE'

## Plotting transformed variable 'fractal.dimension . worst' with 'equal_freq', (too many values). Disable with 'auto_binning=FALSE'

Radius :The likelihood of being diagnosed with a malignant tumor given the radius.worst is in [20.33,23.7) or [23.69,36.0] is 100%.All the 57 patients in the sample that fall in radius.worst [7.93,11.2) and [11.24,12.5) are diagnosed with benign tumor.

Texture: The likelihood of being diagnosed with malignant tumor tumor given the texture.worse increases as the the texture.worse increases.Patients in the texture.worse range [33.8,49.5] have the highest probability(67.9%) of being diagnosed with malignant tumor. we also noted that 5.2% which is the lowest rate of having a malignant tumor is happening in the texture.worse[12,17.8)

Perimeter:When the perimeter.worse is either in [50.4,72.2) or [134.9,251.2] we respectively note 100% of benign tumor and 100% of malignant tumor. 54 out of the 57 which is 94.7% patients in the perimeter.worst [116.2,134.9)are diagnosed with malignant tumor.

Smoothness:Patients i the smoothness.worst range in [0.1624,0.223] recode the the highest likelihood to be diagnosed with a malignant tumor(75%) while patients in [0.0712,0.103) have the highest likelihood to be diagnosed with a benign tumor(93%)

Compactness: 52 of 56 of the patients with a compactness.worst in [0.4480,1.0580] are diagnosed with malignant tumor. This repesent 92.9% of the patients in that range.The lowest likelihood to be diagnosed with a malignant tumor(3.5%) is noticed in the group [0.0273,0.0937) in which we have 96.5% of the patient with a benign tumor(55 cases of 57)

Concavity:When the patients is iether in [0,0.0458)or, [0.0458,0.0920)or,[0.0920,0.1373), she/he is 98.2% more likely to be benign than malignant.Thus 1 patient out of 57 in these groups is malignant.When patien’s concavity.worst is greater or equal to 0.3508 the likelihood to be diagnosed with malignant become greater than 80%

Concave.point: All patients with a concave.point.worst in iether [a.1776,o.2091) or, [0.2091,0.2910] are malignant. We also noticed that all the 56 patients in [0.0392,0.0581) are diagnosed with a benign tumor.

Symmetry:here we noticed that when the symmetry.worst increases , the malignant diagnosis increase and the benign diagnosis decreases.Patient’s symmetry.worst in [0.360,0.684] are 91.1% more likely to be malignant that the one in the remaining groups.The highest likelihood to be diagnosed with a benign tumor given the symmetry.worst is 86% and is noted in the range [0.156,0.228).

Fractal.dimension:87.7% is the hihgest likelihood to be diagnosed with a benign tumor under this fractal.dimension metric,thus 50 patients out of 57 are positively diagnosed “Benign”.Patients in [0.1064,0.2075] are 69.6% more likely to be identified as malignant with 39 cases of 56

Normalizatin

wdbc.nor <- as.data.frame(lapply(wdbc[2:31],function(x){return((x-min(x))/(max(x)))}))

Function to plot the ROC

plot_roc <- function(train_roc,train_auc,test_roc,test_auc){
  plot(train_roc,col="blue",lty="solid",main="",lwd=2,xlab="False Positive Rate",
       ylab="True Positive Rate")
  plot(test_roc,col="red",lty="dashed",lwd=2,add=TRUE)
  abline(c(0,1))  
  #legend
  train.legend <- paste("Training AUC = ", round(train_auc,digits = 3))
  test.legend <- paste("Test AUC = ", round(test_auc,digits = 3))
  legend("bottomright",legend = c(train.legend,test.legend),
         lty=c("solid","dashed"),lwd=2,col=c("blue","red"))
}

KNN

The knn() function in the class package provides a standard, classic implementation of the KNN algorithm. For each instance in the test data, the function will identify the k-Nearest Neighbors, using Euclidean distance, where k is a user-specified number.

Creation of training and test datasets

We will divide our data into two portions: a training dataset that will be used to build the KNN model and a test dataset that will be used to evaluate the predictive accuracy of the model.we will consider 80% of data as training dataset while the other 20% as test dataset to simulate new patients.

set.seed(123) # set seed to replicate results
trainingIndex <- sample(1:nrow(wdbc.nor), 0.8*nrow(wdbc.nor)) # indices for 80% training data
wdbctraining <- wdbc.nor[trainingIndex, ] # training data
wdbctest <- wdbc.nor[-trainingIndex, ] # test data

Note that we excluded the target diagnosis variable. For training the KNN model, we will need to store these class labels in factor vectors, split between the training and test datasets.

train_label <- wdbc[trainingIndex,1]
test_label <- wdbc[-trainingIndex,1]
length(train_label)
## [1] 454

Value of K

For the value of k we are using an odd number near the square root of size of the training set.

sqrt(nrow(wdbctraining))
## [1] 21.30728

In the following code, I arbitrary choose a k value of 21. The results are stored in the vector pred.

set.seed(999)
wdbc_pred <- knn(train = wdbctraining, test = wdbctest, cl = train_label, k = 21)
#wdbc_pred1 <- knn(train = wdbctraining, test = wdbctest, cl = train_label, k = 10)
plot(tune.knn(wdbctraining,train_label,k=2:21))

As we can see this give a factor vector of predicted labels for each of the examples in the test dataset.

Model performance evaluation

We do this by preparing a confusion matrix or a cross table which helps us understand how the predicted labels are with respect to the known labels of the test data.Here we use CrossTable() function which is present in the gmodels library.The This function returns the result of cross tabulation of predicted and observed classifications. The number in each cell can be used for the calculation of four basic parameters true positive (TP), true negative (TN),false negative (FN) and false positive (FP).

table <- CrossTable(x = test_label, y = wdbc_pred, prop.chisq=FALSE)
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |           N / Row Total |
## |           N / Col Total |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  114 
## 
##  
##              | wdbc_pred 
##   test_label |         B |         M | Row Total | 
## -------------|-----------|-----------|-----------|
##            B |        74 |         0 |        74 | 
##              |     1.000 |     0.000 |     0.649 | 
##              |     0.937 |     0.000 |           | 
##              |     0.649 |     0.000 |           | 
## -------------|-----------|-----------|-----------|
##            M |         5 |        35 |        40 | 
##              |     0.125 |     0.875 |     0.351 | 
##              |     0.063 |     1.000 |           | 
##              |     0.044 |     0.307 |           | 
## -------------|-----------|-----------|-----------|
## Column Total |        79 |        35 |       114 | 
##              |     0.693 |     0.307 |           | 
## -------------|-----------|-----------|-----------|
## 
## 
#table1 <- CrossTable(x = test_label, y = wdbc_pred1, prop.chisq=FALSE)

The values that we usually see on the diagonal of the confusion matrix are 74 and 35.The firrst one(74) indicates the true negative results which mean that 74 of 114 values are cases where the mass was benign (B) and the KNN algorithm correctly identified it also as benign (B). The seconde one(35) indicates the true positive results, where our classifier labels and the true label agree that the mass is malignant.Here we noticed that 35 of 114 prediction were true positives.shows number of correctly classified instances out of total 114 instances. The values not on the diagonal implies that they have been incorrectly instances.

The lower-left cell are false negative results,here we see that 5 of the 114 predicted value were benign, but the tumors were actually malignant. False negative errors could be extremely dangerous and not accepted as they might lead a patient to believe that she is cancer-free, but in reality, the disease may continue to spread.

The top-right cell are false positive results,here we do not have any cases. If we had a case ; it could mean that the model classifies as malignant, but in reality, they were benign.We also noted that these kind of errors are less dangerous than a false negative result.Nevertheless they could lead to additional cost on the health care system or additional stress for the patient as additional tests or treatment may have to be provided. ### Accuracy and Performance

aa <- table(wdbc_pred,test_label)
accuracy <- (aa[1,1]+aa[2,2])/sum(aa)
accuracy
## [1] 0.9561404

The process repeated for each category. Finally, the accuracy is 0.97.

SUPPORT VECTOR MACHINE MODEL

Determine the tuning parametre prior to fitting model to train set

fit the svm to the train set

wdbc_completed <- cbind(wdbc$diagnosis  ,wdbc.nor)# bind the label to the normalized data
names(wdbc_completed)[names(wdbc_completed) == "wdbc$diagnosis"] <- "diagnosis"

We split the data into training set and test set

set.seed(123) # set seed to replicate results
trainingIndex <- sample(1:nrow(wdbc_completed), 0.8*nrow(wdbc_completed)) # indices for 80% training data
wdbctraining <- wdbc_completed[trainingIndex, ] # training data
wdbctest <- wdbc_completed[-trainingIndex, ] # test data

We tune the model using a linear kernel

att_spec <- {diagnosis ~ .}
tun.train <- tune(svm,att_spec,data=wdbctraining,kernel="linear",ranges = list(gamma=2^(-8:1),cost=2^(0:4)))
plot(tun.train)

#fit the svm to the train set
train_svm_fit <- svm(att_spec,data=wdbctraining,cost=tun.train$best.parameters$cost,
                     gamma=tun.train$best.parameters$gamma,probability=TRUE)
svm_predict_train <- predict(train_svm_fit,wdbctraining,probability=TRUE)
svm_predict_prob <- attr(svm_predict_train,"probabilities")[,1]
svm_train_prediction <- prediction(svm_predict_prob, wdbctraining$diagnosis)
svm_train_AUC <- as.numeric(performance(svm_train_prediction,"auc")@y.values)

Evaluation on the test data

#we evaluation on the test data
svm_predict_test <- predict(train_svm_fit,wdbctest,probability=TRUE)
svm_predict_testprob <- attr(svm_predict_test,"probabilities")[,1]
svm_test_prediction <- prediction(svm_predict_testprob,wdbctest$diagnosis)
svm_test_AUC <- as.numeric(performance(svm_test_prediction ,"auc")@y.values)
svm_test_AUC
## [1] 0.997973

Confusion matrix and accuracy

#Checking over all prediction
confusion_matrix <- table(svm_predict_test,wdbctest$diagnosis)
print(confusion_matrix)
##                 
## svm_predict_test  B  M
##                B 74  1
##                M  0 39
#Checking over all accuracy of the prediction
predictive_accuracy <- (confusion_matrix[1,1]+confusion_matrix[2,2])/sum(confusion_matrix)
print(predictive_accuracy)
## [1] 0.9912281

Based on the Confusion matrix, our overall accuracy of our SVM model is 90%, this states that 90% of the time our prediction of AHD is accurate; which is really good, even better than our logistic regression model in part 1 . Let’s look at our ROC curve ###ROC for svm

#ROC for svm
trains_vm_roc <- performance(svm_train_prediction,"tpr","fpr")  
test_svm_roc <- performance(svm_test_prediction,"tpr","fpr")
plot_roc(trains_vm_roc,svm_train_AUC,test_svm_roc,svm_test_AUC)

## LOGISTIC REGRESSION MODEL ###Determine the tuning parametre prior to fitting model to train set ###fit the logistic regression to the train set

#specify and fit the logistic regresion
cancer_model <- {diagnosis ~ .}
cancer_fit <- glm(cancer_model,family ="binomial",data=wdbctraining)
print(summary(cancer_fit))
## 
## Call:
## glm(formula = cancer_model, family = "binomial", data = wdbctraining)
## 
## Deviance Residuals: 
##       Min         1Q     Median         3Q        Max  
## -2.12e-04  -2.10e-08  -2.10e-08   2.10e-08   2.40e-04  
## 
## Coefficients:
##                             Estimate Std. Error z value Pr(>|z|)
## (Intercept)                  -852.75  157975.37  -0.005    0.996
## radius...mean              -31371.92 6140718.62  -0.005    0.996
## texture...mean                -34.18  211541.28   0.000    1.000
## perimeter...mean            25113.96 5691855.22   0.004    0.996
## area...mean                  4630.98 1653376.32   0.003    0.998
## smoothness...mean            1179.58  351258.27   0.003    0.997
## compactness...mean          -2574.87  319795.72  -0.008    0.994
## concavity...mean              183.77  441771.93   0.000    1.000
## concave.points...mean         542.48  439809.86   0.001    0.999
## symmetry...mean              -671.07  214720.58  -0.003    0.998
## fractal.dimension...mean      798.48  414418.20   0.002    0.998
## radius...se                   459.39 3269385.79   0.000    1.000
## texture...se                 -234.12  146111.16  -0.002    0.999
## perimeter...se                -98.81 1698930.49   0.000    1.000
## area...se                     931.47 3334314.57   0.000    1.000
## smoothness...se              -307.05  415725.57  -0.001    0.999
## compactness...se              792.32  531122.60   0.001    0.999
## concavity...se              -1943.23  404053.88  -0.005    0.996
## concave.points...se          2409.80  861590.01   0.003    0.998
## symmetry...se                -855.38  207860.27  -0.004    0.997
## fractal.dimension...se      -2651.56  695744.95  -0.004    0.997
## radius...worst              10360.54 4103380.13   0.003    0.998
## texture...worst               731.43  187911.66   0.004    0.997
## perimeter...worst           -2856.24 2347552.33  -0.001    0.999
## area...worst                -4053.35 2959602.61  -0.001    0.999
## smoothness...worst           -297.23  265297.54  -0.001    0.999
## compactness...worst          -471.14  557631.91  -0.001    0.999
## concavity...worst            1091.89  294117.30   0.004    0.997
## concave.points...worst       -538.94  537608.88  -0.001    0.999
## symmetry...worst             1194.67  221500.26   0.005    0.996
## fractal.dimension...worst    1229.28  313434.00   0.004    0.997
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 6.0146e+02  on 453  degrees of freedom
## Residual deviance: 5.3022e-07  on 423  degrees of freedom
## AIC: 62
## 
## Number of Fisher Scoring iterations: 25

Check which features are significant?

AIC can be used for model selection. Here we are performing the Stepwise Variable Selection

step_model <- step(cancer_fit, data = wdbctraining)
## Start:  AIC=62
## diagnosis ~ radius...mean + texture...mean + perimeter...mean + 
##     area...mean + smoothness...mean + compactness...mean + concavity...mean + 
##     concave.points...mean + symmetry...mean + fractal.dimension...mean + 
##     radius...se + texture...se + perimeter...se + area...se + 
##     smoothness...se + compactness...se + concavity...se + concave.points...se + 
##     symmetry...se + fractal.dimension...se + radius...worst + 
##     texture...worst + perimeter...worst + area...worst + smoothness...worst + 
##     compactness...worst + concavity...worst + concave.points...worst + 
##     symmetry...worst + fractal.dimension...worst
## 
##                             Df Deviance    AIC
## - texture...mean             1     0.00  60.00
## - perimeter...se             1     0.00  60.00
## - radius...se                1     0.00  60.00
## - area...se                  1     0.00  60.00
## - concavity...mean           1     0.00  60.00
## - concave.points...mean      1     0.00  60.00
## - concave.points...worst     1     0.00  60.00
## - smoothness...se            1     0.00  60.00
## - compactness...worst        1     0.00  60.00
## - perimeter...worst          1     0.00  60.00
## - texture...se               1     0.00  60.00
## - smoothness...worst         1     0.00  60.00
## - area...worst               1     0.00  60.00
## - compactness...se           1     0.00  60.00
## - fractal.dimension...mean   1     0.00  60.00
## - texture...worst            1     0.00  60.00
## - area...mean                1     0.00  60.00
## - radius...worst             1     0.00  60.00
## - concavity...worst          1     0.00  60.00
## - fractal.dimension...worst  1     0.00  60.00
## - smoothness...mean          1     0.00  60.00
## - symmetry...se              1     0.00  60.00
## - symmetry...mean            1     0.00  60.00
## - perimeter...mean           1     0.00  60.00
## - fractal.dimension...se     1     0.00  60.00
## - concavity...se             1     0.00  60.00
## - compactness...mean         1     0.00  60.00
## - radius...mean              1     0.00  60.00
## - concave.points...se        1     0.00  60.00
## <none>                             0.00  62.00
## - symmetry...worst           1   865.05 925.05
## 
## Step:  AIC=60
## diagnosis ~ radius...mean + perimeter...mean + area...mean + 
##     smoothness...mean + compactness...mean + concavity...mean + 
##     concave.points...mean + symmetry...mean + fractal.dimension...mean + 
##     radius...se + texture...se + perimeter...se + area...se + 
##     smoothness...se + compactness...se + concavity...se + concave.points...se + 
##     symmetry...se + fractal.dimension...se + radius...worst + 
##     texture...worst + perimeter...worst + area...worst + smoothness...worst + 
##     compactness...worst + concavity...worst + concave.points...worst + 
##     symmetry...worst + fractal.dimension...worst
## 
##                             Df   Deviance AIC
## - area...se                  1 5.3192e-07  58
## - radius...se                1 5.3754e-07  58
## - perimeter...se             1 5.3781e-07  58
## - concavity...mean           1 5.4135e-07  58
## - concave.points...worst     1 5.4863e-07  58
## - smoothness...se            1 5.5625e-07  58
## - smoothness...worst         1 5.6402e-07  58
## - compactness...worst        1 5.6409e-07  58
## - texture...se               1 5.6776e-07  58
## - perimeter...worst          1 5.6919e-07  58
## - concave.points...mean      1 5.7018e-07  58
## - area...worst               1 5.8809e-07  58
## - compactness...se           1 5.9913e-07  58
## - fractal.dimension...mean   1 6.3804e-07  58
## - area...mean                1 6.5366e-07  58
## - concavity...worst          1 6.9519e-07  58
## - fractal.dimension...worst  1 7.1294e-07  58
## - radius...worst             1 7.3362e-07  58
## - smoothness...mean          1 7.4246e-07  58
## - symmetry...se              1 8.6574e-07  58
## - symmetry...mean            1 1.0315e-06  58
## - texture...worst            1 1.0369e-06  58
## - fractal.dimension...se     1 1.1301e-06  58
## - compactness...mean         1 1.1631e-06  58
## - symmetry...worst           1 1.1975e-06  58
## - concavity...se             1 1.2396e-06  58
## - perimeter...mean           1 1.3249e-06  58
## - radius...mean              1 1.8042e-06  58
## - concave.points...se        1 1.9708e-06  58
## <none>                         5.3235e-07  60
## 
## Step:  AIC=58
## diagnosis ~ radius...mean + perimeter...mean + area...mean + 
##     smoothness...mean + compactness...mean + concavity...mean + 
##     concave.points...mean + symmetry...mean + fractal.dimension...mean + 
##     radius...se + texture...se + perimeter...se + smoothness...se + 
##     compactness...se + concavity...se + concave.points...se + 
##     symmetry...se + fractal.dimension...se + radius...worst + 
##     texture...worst + perimeter...worst + area...worst + smoothness...worst + 
##     compactness...worst + concavity...worst + concave.points...worst + 
##     symmetry...worst + fractal.dimension...worst
## 
##                             Df   Deviance AIC
## - perimeter...se             1 5.4174e-07  56
## - concavity...mean           1 5.4468e-07  56
## - smoothness...worst         1 5.6238e-07  56
## - radius...se                1 5.6589e-07  56
## - texture...se               1 5.6615e-07  56
## - smoothness...se            1 5.6618e-07  56
## - perimeter...worst          1 5.6653e-07  56
## - concave.points...worst     1 5.7018e-07  56
## - concave.points...mean      1 5.7298e-07  56
## - area...worst               1 5.9355e-07  56
## - compactness...worst        1 6.0433e-07  56
## - area...mean                1 6.4990e-07  56
## - compactness...se           1 6.6142e-07  56
## - concavity...worst          1 7.0657e-07  56
## - fractal.dimension...mean   1 7.1202e-07  56
## - radius...worst             1 7.3578e-07  56
## - smoothness...mean          1 7.5866e-07  56
## - fractal.dimension...worst  1 8.4955e-07  56
## - symmetry...se              1 1.0031e-06  56
## - symmetry...mean            1 1.0968e-06  56
## - compactness...mean         1 1.1594e-06  56
## - symmetry...worst           1 1.2022e-06  56
## - concavity...se             1 1.2830e-06  56
## - perimeter...mean           1 1.3390e-06  56
## - fractal.dimension...se     1 1.3503e-06  56
## - texture...worst            1 1.3746e-06  56
## - radius...mean              1 1.7801e-06  56
## - concave.points...se        1 2.0890e-06  56
## <none>                         5.3192e-07  58
## 
## Step:  AIC=56
## diagnosis ~ radius...mean + perimeter...mean + area...mean + 
##     smoothness...mean + compactness...mean + concavity...mean + 
##     concave.points...mean + symmetry...mean + fractal.dimension...mean + 
##     radius...se + texture...se + smoothness...se + compactness...se + 
##     concavity...se + concave.points...se + symmetry...se + fractal.dimension...se + 
##     radius...worst + texture...worst + perimeter...worst + area...worst + 
##     smoothness...worst + compactness...worst + concavity...worst + 
##     concave.points...worst + symmetry...worst + fractal.dimension...worst
## 
##                             Df   Deviance AIC
## - smoothness...worst         1 5.5930e-07  54
## - concavity...mean           1 5.7250e-07  54
## - texture...se               1 5.7520e-07  54
## - concave.points...worst     1 5.7730e-07  54
## - smoothness...se            1 5.7740e-07  54
## - concave.points...mean      1 5.8190e-07  54
## - area...worst               1 5.9710e-07  54
## - compactness...worst        1 6.0140e-07  54
## - area...mean                1 6.4760e-07  54
## - compactness...se           1 6.5900e-07  54
## - fractal.dimension...mean   1 7.1120e-07  54
## - perimeter...worst          1 7.1470e-07  54
## - radius...se                1 7.3040e-07  54
## - concavity...worst          1 7.3200e-07  54
## - smoothness...mean          1 7.7820e-07  54
## - radius...worst             1 8.1710e-07  54
## - fractal.dimension...worst  1 8.9370e-07  54
## - symmetry...mean            1 1.0773e-06  54
## - symmetry...se              1 1.1611e-06  54
## - compactness...mean         1 1.1846e-06  54
## - symmetry...worst           1 1.1936e-06  54
## - fractal.dimension...se     1 1.2920e-06  54
## - concavity...se             1 1.3181e-06  54
## - perimeter...mean           1 1.3914e-06  54
## - texture...worst            1 1.7137e-06  54
## - radius...mean              1 1.8495e-06  54
## - concave.points...se        1 3.1663e-06  54
## <none>                         5.4170e-07  56
## 
## Step:  AIC=54
## diagnosis ~ radius...mean + perimeter...mean + area...mean + 
##     smoothness...mean + compactness...mean + concavity...mean + 
##     concave.points...mean + symmetry...mean + fractal.dimension...mean + 
##     radius...se + texture...se + smoothness...se + compactness...se + 
##     concavity...se + concave.points...se + symmetry...se + fractal.dimension...se + 
##     radius...worst + texture...worst + perimeter...worst + area...worst + 
##     compactness...worst + concavity...worst + concave.points...worst + 
##     symmetry...worst + fractal.dimension...worst
## 
##                             Df   Deviance AIC
## - texture...se               1 6.0580e-07  52
## - area...worst               1 6.0740e-07  52
## - compactness...worst        1 6.0790e-07  52
## - concavity...mean           1 6.1100e-07  52
## - concave.points...worst     1 6.3770e-07  52
## - area...mean                1 6.5650e-07  52
## - compactness...se           1 6.6190e-07  52
## - concave.points...mean      1 6.6590e-07  52
## - radius...se                1 7.0370e-07  52
## - concavity...worst          1 7.2940e-07  52
## - smoothness...se            1 7.6560e-07  52
## - fractal.dimension...mean   1 7.7820e-07  52
## - smoothness...mean          1 8.0050e-07  52
## - perimeter...worst          1 8.4980e-07  52
## - radius...worst             1 8.7930e-07  52
## - symmetry...mean            1 1.0930e-06  52
## - fractal.dimension...worst  1 1.1079e-06  52
## - symmetry...worst           1 1.1602e-06  52
## - concavity...se             1 1.3094e-06  52
## - fractal.dimension...se     1 1.4397e-06  52
## - symmetry...se              1 1.4577e-06  52
## - compactness...mean         1 1.5559e-06  52
## - perimeter...mean           1 1.5932e-06  52
## - texture...worst            1 1.6975e-06  52
## - radius...mean              1 1.9991e-06  52
## - concave.points...se        1 5.5434e-06  52
## <none>                         5.5930e-07  54
## 
## Step:  AIC=52
## diagnosis ~ radius...mean + perimeter...mean + area...mean + 
##     smoothness...mean + compactness...mean + concavity...mean + 
##     concave.points...mean + symmetry...mean + fractal.dimension...mean + 
##     radius...se + smoothness...se + compactness...se + concavity...se + 
##     concave.points...se + symmetry...se + fractal.dimension...se + 
##     radius...worst + texture...worst + perimeter...worst + area...worst + 
##     compactness...worst + concavity...worst + concave.points...worst + 
##     symmetry...worst + fractal.dimension...worst
## 
##                             Df Deviance    AIC
## - area...worst               1    0.000 50.000
## - concave.points...worst     1    0.000 50.000
## - compactness...worst        1    0.000 50.000
## - concavity...mean           1    0.000 50.000
## - concave.points...mean      1    0.000 50.000
## - compactness...se           1    0.000 50.000
## - area...mean                1    0.000 50.000
## - smoothness...se            1    0.000 50.000
## - smoothness...mean          1    0.000 50.000
## - radius...se                1    0.000 50.000
## - concavity...worst          1    0.000 50.000
## - radius...worst             1    0.000 50.000
## - perimeter...worst          1    0.000 50.000
## - fractal.dimension...mean   1    0.000 50.000
## - symmetry...mean            1    0.000 50.000
## - fractal.dimension...worst  1    0.000 50.000
## - concavity...se             1    0.000 50.000
## - symmetry...se              1    0.000 50.000
## - compactness...mean         1    0.000 50.000
## - symmetry...worst           1    0.000 50.000
## - fractal.dimension...se     1    0.000 50.000
## - perimeter...mean           1    0.000 50.000
## - radius...mean              1    0.000 50.000
## - concave.points...se        1    0.000 50.000
## <none>                            0.000 52.000
## - texture...worst            1   34.679 84.679
## 
## Step:  AIC=50
## diagnosis ~ radius...mean + perimeter...mean + area...mean + 
##     smoothness...mean + compactness...mean + concavity...mean + 
##     concave.points...mean + symmetry...mean + fractal.dimension...mean + 
##     radius...se + smoothness...se + compactness...se + concavity...se + 
##     concave.points...se + symmetry...se + fractal.dimension...se + 
##     radius...worst + texture...worst + perimeter...worst + compactness...worst + 
##     concavity...worst + concave.points...worst + symmetry...worst + 
##     fractal.dimension...worst
## 
##                             Df Deviance    AIC
## - concave.points...worst     1    0.000 48.000
## - compactness...worst        1    0.000 48.000
## - concavity...mean           1    0.000 48.000
## - concave.points...mean      1    0.000 48.000
## - compactness...se           1    0.000 48.000
## - smoothness...se            1    0.000 48.000
## - area...mean                1    0.000 48.000
## - concavity...worst          1    0.000 48.000
## - radius...se                1    0.000 48.000
## - fractal.dimension...mean   1    0.000 48.000
## - smoothness...mean          1    0.000 48.000
## - perimeter...worst          1    0.000 48.000
## - fractal.dimension...worst  1    0.000 48.000
## - symmetry...mean            1    0.000 48.000
## - concavity...se             1    0.000 48.000
## - symmetry...se              1    0.000 48.000
## - symmetry...worst           1    0.000 48.000
## - fractal.dimension...se     1    0.000 48.000
## - perimeter...mean           1    0.000 48.000
## - concave.points...se        1    0.000 48.000
## - compactness...mean         1    0.001 48.001
## - radius...worst             1    0.145 48.145
## <none>                            0.000 50.000
## - radius...mean              1   25.579 73.579
## - texture...worst            1   38.192 86.192
## 
## Step:  AIC=48
## diagnosis ~ radius...mean + perimeter...mean + area...mean + 
##     smoothness...mean + compactness...mean + concavity...mean + 
##     concave.points...mean + symmetry...mean + fractal.dimension...mean + 
##     radius...se + smoothness...se + compactness...se + concavity...se + 
##     concave.points...se + symmetry...se + fractal.dimension...se + 
##     radius...worst + texture...worst + perimeter...worst + compactness...worst + 
##     concavity...worst + symmetry...worst + fractal.dimension...worst
## 
##                             Df Deviance    AIC
## - concave.points...mean      1    0.000 46.000
## - concavity...mean           1    0.000 46.000
## - compactness...se           1    0.000 46.000
## - smoothness...se            1    0.000 46.000
## - compactness...worst        1    0.000 46.000
## - area...mean                1    0.000 46.000
## - concavity...worst          1    0.000 46.000
## - smoothness...mean          1    0.000 46.000
## - fractal.dimension...mean   1    0.000 46.000
## - perimeter...worst          1    0.000 46.000
## - fractal.dimension...worst  1    0.000 46.000
## - concavity...se             1    0.000 46.000
## - symmetry...mean            1    0.000 46.000
## - symmetry...se              1    0.000 46.000
## - fractal.dimension...se     1    0.000 46.000
## - symmetry...worst           1    0.000 46.000
## - perimeter...mean           1    0.000 46.000
## <none>                            0.000 48.000
## - compactness...mean         1   18.578 64.578
## - radius...worst             1   22.506 68.506
## - radius...se                1   24.532 70.532
## - radius...mean              1   25.587 71.587
## - concave.points...se        1   33.177 79.177
## - texture...worst            1   38.437 84.437
## 
## Step:  AIC=46
## diagnosis ~ radius...mean + perimeter...mean + area...mean + 
##     smoothness...mean + compactness...mean + concavity...mean + 
##     symmetry...mean + fractal.dimension...mean + radius...se + 
##     smoothness...se + compactness...se + concavity...se + concave.points...se + 
##     symmetry...se + fractal.dimension...se + radius...worst + 
##     texture...worst + perimeter...worst + compactness...worst + 
##     concavity...worst + symmetry...worst + fractal.dimension...worst
## 
##                             Df Deviance    AIC
## - compactness...worst        1    0.000 44.000
## - compactness...se           1    0.000 44.000
## - smoothness...se            1    0.000 44.000
## - area...mean                1    0.000 44.000
## - concavity...worst          1    0.000 44.000
## - concavity...mean           1    0.000 44.000
## - perimeter...worst          1    0.000 44.000
## - smoothness...mean          1    0.000 44.000
## - fractal.dimension...mean   1    0.000 44.000
## - fractal.dimension...worst  1    0.000 44.000
## - symmetry...se              1    0.000 44.000
## - symmetry...mean            1    0.000 44.000
## - concavity...se             1    0.000 44.000
## - symmetry...worst           1    0.000 44.000
## - fractal.dimension...se     1    0.000 44.000
## - perimeter...mean           1    0.000 44.000
## <none>                            0.000 46.000
## - compactness...mean         1   21.512 65.512
## - radius...se                1   24.585 68.585
## - radius...worst             1   24.664 68.664
## - radius...mean              1   30.046 74.046
## - texture...worst            1   38.901 82.901
## - concave.points...se        1   41.193 85.193
## 
## Step:  AIC=44
## diagnosis ~ radius...mean + perimeter...mean + area...mean + 
##     smoothness...mean + compactness...mean + concavity...mean + 
##     symmetry...mean + fractal.dimension...mean + radius...se + 
##     smoothness...se + compactness...se + concavity...se + concave.points...se + 
##     symmetry...se + fractal.dimension...se + radius...worst + 
##     texture...worst + perimeter...worst + concavity...worst + 
##     symmetry...worst + fractal.dimension...worst
## 
##                             Df Deviance    AIC
## - compactness...se           1    0.000 42.000
## - smoothness...se            1    0.000 42.000
## - area...mean                1    0.000 42.000
## - concavity...worst          1    0.000 42.000
## - perimeter...worst          1    0.000 42.000
## - fractal.dimension...worst  1    0.000 42.000
## - smoothness...mean          1    0.000 42.000
## - symmetry...se              1    0.000 42.000
## - concavity...mean           1    0.000 42.000
## - symmetry...mean            1    0.000 42.000
## - concavity...se             1    0.000 42.000
## - fractal.dimension...mean   1    0.000 42.000
## - symmetry...worst           1    0.000 42.000
## - fractal.dimension...se     1    0.000 42.000
## - perimeter...mean           1    0.000 42.000
## <none>                            0.000 44.000
## - radius...worst             1   25.691 67.691
## - radius...se                1   26.265 68.265
## - compactness...mean         1   26.560 68.560
## - radius...mean              1   30.402 72.402
## - texture...worst            1   40.001 82.001
## - concave.points...se        1   41.288 83.288
## 
## Step:  AIC=42
## diagnosis ~ radius...mean + perimeter...mean + area...mean + 
##     smoothness...mean + compactness...mean + concavity...mean + 
##     symmetry...mean + fractal.dimension...mean + radius...se + 
##     smoothness...se + concavity...se + concave.points...se + 
##     symmetry...se + fractal.dimension...se + radius...worst + 
##     texture...worst + perimeter...worst + concavity...worst + 
##     symmetry...worst + fractal.dimension...worst
## 
##                             Df Deviance    AIC
## - smoothness...se            1     0.00  40.00
## - concavity...worst          1     0.00  40.00
## - fractal.dimension...worst  1     0.00  40.00
## - perimeter...worst          1     0.00  40.00
## - symmetry...se              1     0.00  40.00
## - concavity...mean           1     0.00  40.00
## - area...mean                1     0.00  40.00
## - symmetry...mean            1     0.00  40.00
## - concavity...se             1     0.00  40.00
## - symmetry...worst           1     0.00  40.00
## <none>                             0.00  42.00
## - perimeter...mean           1    21.00  61.00
## - fractal.dimension...se     1    23.14  63.14
## - radius...se                1    26.43  66.43
## - radius...worst             1    29.53  69.53
## - radius...mean              1    30.43  70.43
## - compactness...mean         1    33.67  73.67
## - texture...worst            1    40.00  80.00
## - concave.points...se        1    41.93  81.93
## - fractal.dimension...mean   1   576.70 616.70
## - smoothness...mean          1   720.87 760.87
## 
## Step:  AIC=40
## diagnosis ~ radius...mean + perimeter...mean + area...mean + 
##     smoothness...mean + compactness...mean + concavity...mean + 
##     symmetry...mean + fractal.dimension...mean + radius...se + 
##     concavity...se + concave.points...se + symmetry...se + fractal.dimension...se + 
##     radius...worst + texture...worst + perimeter...worst + concavity...worst + 
##     symmetry...worst + fractal.dimension...worst
## 
##                             Df Deviance    AIC
## - concavity...worst          1     0.00  38.00
## - perimeter...worst          1     0.00  38.00
## - fractal.dimension...worst  1     0.00  38.00
## - concavity...mean           1     0.00  38.00
## - symmetry...mean            1     0.00  38.00
## - area...mean                1     0.00  38.00
## - symmetry...se              1     0.00  38.00
## - concavity...se             1     0.00  38.00
## - fractal.dimension...mean   1     0.00  38.00
## - symmetry...worst           1     0.00  38.00
## <none>                             0.00  40.00
## - perimeter...mean           1    23.39  61.39
## - fractal.dimension...se     1    27.53  65.53
## - radius...se                1    28.47  66.47
## - radius...mean              1    30.44  68.44
## - radius...worst             1    30.85  68.85
## - compactness...mean         1    33.68  71.68
## - texture...worst            1    40.02  78.02
## - concave.points...se        1    47.01  85.01
## - smoothness...mean          1   576.70 614.70
## 
## Step:  AIC=38
## diagnosis ~ radius...mean + perimeter...mean + area...mean + 
##     smoothness...mean + compactness...mean + concavity...mean + 
##     symmetry...mean + fractal.dimension...mean + radius...se + 
##     concavity...se + concave.points...se + symmetry...se + fractal.dimension...se + 
##     radius...worst + texture...worst + perimeter...worst + symmetry...worst + 
##     fractal.dimension...worst
## 
##                             Df Deviance    AIC
## - perimeter...worst          1     0.00  36.00
## - area...mean                1     0.00  36.00
## - fractal.dimension...worst  1     0.00  36.00
## - symmetry...mean            1     0.00  36.00
## - concavity...se             1     0.00  36.00
## - concavity...mean           1     0.00  36.00
## <none>                             0.00  38.00
## - symmetry...se              1    21.82  57.82
## - perimeter...mean           1    23.48  59.48
## - symmetry...worst           1    24.36  60.36
## - radius...se                1    28.65  64.65
## - fractal.dimension...se     1    30.59  66.59
## - radius...mean              1    30.85  66.85
## - radius...worst             1    30.86  66.86
## - compactness...mean         1    33.97  69.97
## - texture...worst            1    41.06  77.06
## - concave.points...se        1    47.13  83.13
## - smoothness...mean          1   648.79 684.79
## - fractal.dimension...mean   1   648.79 684.79
## 
## Step:  AIC=36
## diagnosis ~ radius...mean + perimeter...mean + area...mean + 
##     smoothness...mean + compactness...mean + concavity...mean + 
##     symmetry...mean + fractal.dimension...mean + radius...se + 
##     concavity...se + concave.points...se + symmetry...se + fractal.dimension...se + 
##     radius...worst + texture...worst + symmetry...worst + fractal.dimension...worst
## 
##                             Df Deviance    AIC
## - symmetry...mean            1    0.000 34.000
## - area...mean                1    0.000 34.000
## - fractal.dimension...worst  1    0.000 34.000
## - concavity...se             1    0.001 34.001
## <none>                            0.000 36.000
## - concavity...mean           1   18.900 52.900
## - smoothness...mean          1   19.477 53.477
## - fractal.dimension...mean   1   19.647 53.647
## - symmetry...se              1   21.997 55.997
## - perimeter...mean           1   24.052 58.052
## - symmetry...worst           1   25.964 59.964
## - radius...se                1   29.762 63.762
## - radius...mean              1   33.496 67.496
## - compactness...mean         1   35.218 69.218
## - fractal.dimension...se     1   35.328 69.328
## - radius...worst             1   35.549 69.549
## - texture...worst            1   45.393 79.393
## - concave.points...se        1   47.647 81.647
## 
## Step:  AIC=34
## diagnosis ~ radius...mean + perimeter...mean + area...mean + 
##     smoothness...mean + compactness...mean + concavity...mean + 
##     fractal.dimension...mean + radius...se + concavity...se + 
##     concave.points...se + symmetry...se + fractal.dimension...se + 
##     radius...worst + texture...worst + symmetry...worst + fractal.dimension...worst
## 
##                             Df Deviance    AIC
## <none>                            0.000 34.000
## - concavity...se             1   19.441 51.441
## - area...mean                1   19.496 51.496
## - concavity...mean           1   22.203 54.203
## - symmetry...se              1   24.082 56.082
## - fractal.dimension...worst  1   24.379 56.379
## - smoothness...mean          1   24.989 56.989
## - symmetry...worst           1   26.006 58.006
## - fractal.dimension...mean   1   27.096 59.096
## - perimeter...mean           1   29.294 61.294
## - radius...se                1   31.223 63.223
## - radius...mean              1   36.433 68.433
## - fractal.dimension...se     1   39.344 71.344
## - compactness...mean         1   39.389 71.389
## - radius...worst             1   41.418 73.418
## - texture...worst            1   47.999 79.999
## - concave.points...se        1   48.146 80.146
summary(step_model)
## 
## Call:
## glm(formula = diagnosis ~ radius...mean + perimeter...mean + 
##     area...mean + smoothness...mean + compactness...mean + concavity...mean + 
##     fractal.dimension...mean + radius...se + concavity...se + 
##     concave.points...se + symmetry...se + fractal.dimension...se + 
##     radius...worst + texture...worst + symmetry...worst + fractal.dimension...worst, 
##     family = "binomial", data = wdbctraining)
## 
## Deviance Residuals: 
##        Min          1Q      Median          3Q         Max  
## -1.300e-03  -2.000e-08  -2.000e-08   2.000e-08   1.183e-03  
## 
## Coefficients:
##                           Estimate Std. Error z value Pr(>|z|)
## (Intercept)                  -4466      83435  -0.054    0.957
## radius...mean              -129433    2460278  -0.053    0.958
## perimeter...mean             97645    1837053   0.053    0.958
## area...mean                  25373     536691   0.047    0.962
## smoothness...mean             2354      43973   0.054    0.957
## compactness...mean          -11335     202247  -0.056    0.955
## concavity...mean              4990      92183   0.054    0.957
## fractal.dimension...mean      7273     131333   0.055    0.956
## radius...se                   5731     104294   0.055    0.956
## concavity...se               -4704      90054  -0.052    0.958
## concave.points...se           8967     164377   0.055    0.956
## symmetry...se                -4610      84671  -0.054    0.957
## fractal.dimension...se      -12493     230173  -0.054    0.957
## radius...worst               20917     394836   0.053    0.958
## texture...worst               3828      71037   0.054    0.957
## symmetry...worst              4306      82025   0.052    0.958
## fractal.dimension...worst     5096      96652   0.053    0.958
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 6.0146e+02  on 453  degrees of freedom
## Residual deviance: 1.1654e-05  on 437  degrees of freedom
## AIC: 34
## 
## Number of Fisher Scoring iterations: 25
#Predicting the probability of having a heart desease
Prob_can_train <- predict.glm(step_model,newdata= wdbctraining,type = "response")
can_train_prediction <- prediction(Prob_can_train,wdbctraining$diagnosis)
can_train_auc <- as.numeric(performance(can_train_prediction,"auc")@y.values)
#Prob_can_test <- predict.glm(model2,newdata= wdbctest,type = "response")
logis_anova <- anova(step_model,test = "Chisq")
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
print(anova(step_model,test = "Chisq"))
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Analysis of Deviance Table
## 
## Model: binomial, link: logit
## 
## Response: diagnosis
## 
## Terms added sequentially (first to last)
## 
## 
##                           Df Deviance Resid. Df Resid. Dev  Pr(>Chi)    
## NULL                                        453     601.46              
## radius...mean              1   338.59       452     262.87 < 2.2e-16 ***
## perimeter...mean           1    64.57       451     198.29 9.303e-16 ***
## area...mean                1     6.67       450     191.62  0.009803 ** 
## smoothness...mean          1    21.85       449     169.77 2.941e-06 ***
## compactness...mean         1     0.03       448     169.74  0.858556    
## concavity...mean           1     9.33       447     160.41  0.002252 ** 
## fractal.dimension...mean   1     3.05       446     157.36  0.080743 .  
## radius...se                1     2.11       445     155.24  0.146181    
## concavity...se             1    45.00       444     110.24 1.966e-11 ***
## concave.points...se        1     0.03       443     110.21  0.868337    
## symmetry...se              1     3.84       442     106.38  0.050190 .  
## fractal.dimension...se     1     3.59       441     102.79  0.058134 .  
## radius...worst             1    38.57       440      64.22 5.277e-10 ***
## texture...worst            1    22.61       439      41.60 1.981e-06 ***
## symmetry...worst           1    17.22       438      24.38 3.324e-05 ***
## fractal.dimension...worst  1    24.38       437       0.00 7.912e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Predicting the probability of having a heart desease
Prob_can_test <- predict.glm(step_model,newdata= wdbctest,type = "response")
can_test_prediction <- prediction(Prob_can_test,wdbctest$diagnosis)
can_test_auc <- as.numeric(performance(can_test_prediction,"auc")@y.values)
#ROC for svm
trains_roc <- performance(can_train_prediction,"tpr","fpr")  
test_roc <- performance(can_test_prediction,"tpr","fpr")
plot_roc(trains_roc,can_train_auc,test_roc,can_test_auc)

Predict_cancer <- ifelse((Prob_can_test>0.5),1,0)
Predict_cancer <-factor(Predict_cancer,levels=c(0,1),labels=c("B","M"))
#Checking over all prediction
confusion_matrix <- table(Predict_cancer,wdbctest$diagnosis)
print(confusion_matrix)
##               
## Predict_cancer  B  M
##              B 71  2
##              M  3 38
#Checking over all accuracy of the prediction
predictive_accuracy <- (confusion_matrix[1,1]+confusion_matrix[2,2])/sum(confusion_matrix)
print(predictive_accuracy)
## [1] 0.9561404
lda_model <- lda(diagnosis ~., data = wdbc_completed)
lda_model
## Call:
## lda(diagnosis ~ ., data = wdbc_completed)
## 
## Prior probabilities of groups:
##         B         M 
## 0.6285211 0.3714789 
## 
## Group means:
##   radius...mean texture...mean perimeter...mean area...mean
## B     0.1837611      0.2088789        0.1818854   0.1276650
## M     0.3727973      0.3041778        0.3795233   0.3337742
##   smoothness...mean compactness...mean concavity...mean
## B         0.2438656          0.1757517        0.1079138
## M         0.3071911          0.3624211        0.3751509
##   concave.points...mean symmetry...mean fractal.dimension...mean
## B             0.1278201        0.224296                0.1324651
## M             0.4359337        0.285121                0.1297632
##   radius...se texture...se perimeter...se  area...se smoothness...se
## B  0.06007043    0.1760860     0.05656603 0.02643517       0.1761292
## M  0.17239110    0.1744448     0.16136104 0.12078165       0.1628301
##   compactness...se concavity...se concave.points...se symmetry...se
## B        0.1417005     0.06564832           0.1867333     0.1608842
## M        0.2211945     0.10547369           0.2852176     0.1588994
##   fractal.dimension...se radius...worst texture...worst perimeter...worst
## B             0.09186499      0.1512153       0.2320361         0.1456845
## M             0.10581461      0.3658350       0.3503234         0.3612876
##   area...worst smoothness...worst compactness...worst concavity...worst
## B    0.0878466          0.2416419           0.1468644         0.1327777
## M    0.2901406          0.3306064           0.3271796         0.3589195
##   concave.points...worst symmetry...worst fractal.dimension...worst
## B              0.2558225        0.1713557                 0.1176004
## M              0.6248906        0.2505580                 0.1752301
## 
## Coefficients of linear discriminants:
##                                    LD1
## radius...mean             -29.24639714
## texture...mean              0.86956617
## perimeter...mean           21.28147866
## area...mean                 3.70473614
## smoothness...mean           0.03721572
## compactness...mean         -7.14765557
## concavity...mean            2.97205049
## concave.points...mean       2.14067928
## symmetry...mean             0.13818811
## fractal.dimension...mean    0.03916040
## radius...se                 6.14332672
## texture...se               -0.15414036
## perimeter...se             -2.43500030
## area...se                  -2.40326046
## smoothness...se             2.44897462
## compactness...se            0.02608046
## concavity...se             -6.98079828
## concave.points...se         2.73554436
## symmetry...se               0.65514666
## fractal.dimension...se     -1.03885774
## radius...worst             34.43829527
## texture...worst             1.74371696
## perimeter...worst          -2.78072505
## area...worst              -21.14126028
## smoothness...worst          0.59308291
## compactness...worst         0.34510686
## concavity...worst           2.34981071
## concave.points...worst      0.66484656
## symmetry...worst            1.84867868
## fractal.dimension...worst   4.38611209

The probability of being diagnoses with a malignant and the likelihood to be a benign in the sample are respectively 0.6285211 and 0.3714789

Making a dataframe out of the LDA for visualization purpose.

# Making a df out of the LDA for visualization purpose.
predict_lda_df <- predict(lda_model, wdbc_completed)$x %>% 
  as_data_frame() %>% 
  cbind(diagnosis = wdbc_completed$diagnosis) 
## Warning: `as_data_frame()` is deprecated, use `as_tibble()` (but mind the new semantics).
## This warning is displayed once per session.
ggplot(predict_lda_df, aes(x=LD1, fill=diagnosis)) + geom_density(alpha=0.5)

LDA (Linear Discriminant Analysis) model

fit the LDA to the train set

#fit the LDA to the train set
lda.train <- lda(diagnosis ~., data = wdbctraining)
lda.pred.train <- predict(lda.train,wdbctraining)
lda.prediction <- prediction(lda.pred.train$posterior[,2],wdbctraining$diagnosis)
lda.AUC <- performance(lda.prediction,"auc")@y.values
#perf <- performance(lda.prediction,"tpr","fpr")
#plot(perf,colorize=TRUE)

Evaluation on the test data

#we evaluation on the test data
lda.pred.test <- predict(lda.train,wdbctest)
lda.prediction.test <- prediction(lda.pred.test$posterior[,2],wdbctest$diagnosis)
lda_test_AUC <- performance(lda.prediction.test ,"auc")@y.values
lda_test_AUC
## [[1]]
## [1] 0.9986486

Confusion matrix and accuracy

#Checking over all prediction
confusion_matrix <- table(lda.pred.test$class,wdbctest$diagnosis)
dimnames(confusion_matrix ) <- list(Actual = c("B", "M"), "Predicted" = c("B","M"))
print(confusion_matrix)
##       Predicted
## Actual  B  M
##      B 74  4
##      M  0 36
#Checking over all accuracy of the prediction
predictive_accuracy <- (confusion_matrix[1,1]+confusion_matrix[2,2])/sum(confusion_matrix)
print(predictive_accuracy)
## [1] 0.9649123
plot_rocs <- function(train_roc,train_auc,test_roc,test_auc){
  plot(train_roc,col="blue",lty="solid",main="",lwd=2,xlab="False Positive Rate",
       ylab="True Positive Rate")
  plot(test_roc,col="red",lty="dashed",lwd=2,add=TRUE)
  abline(c(0,1))  
  #legend
  train.legend <- paste("Training AUC = ",train_auc)
  test.legend <- paste("Test AUC = ", test_auc)
  legend("bottomright",legend = c(train.legend,test.legend),
         lty=c("solid","dashed"),lwd=2,col=c("blue","red"))
}

ROC for lda

#ROC for lda
trains_lda_roc <- performance(lda.prediction,"tpr","fpr")  
test_lda_roc <- performance(lda.prediction.test,"tpr","fpr")
plot_rocs(trains_lda_roc,lda.AUC,test_lda_roc,lda_test_AUC)