Load the data

library(readxl)
library(naniar)
# common_na_strings (from naniar) covers typical missing-value codes
wis <- read.csv("Breast_Cancer_Data.csv", na.strings = common_na_strings, stringsAsFactors = TRUE)
str(wis)
## 'data.frame':    569 obs. of  33 variables:
##  $ id                     : int  842302 842517 84300903 84348301 84358402 843786 844359 84458202 844981 84501001 ...
##  $ diagnosis              : Factor w/ 2 levels "B","M": 2 2 2 2 2 2 2 2 2 2 ...
##  $ radius_mean            : num  18 20.6 19.7 11.4 20.3 ...
##  $ texture_mean           : num  10.4 17.8 21.2 20.4 14.3 ...
##  $ perimeter_mean         : num  122.8 132.9 130 77.6 135.1 ...
##  $ area_mean              : num  1001 1326 1203 386 1297 ...
##  $ smoothness_mean        : num  0.1184 0.0847 0.1096 0.1425 0.1003 ...
##  $ compactness_mean       : num  0.2776 0.0786 0.1599 0.2839 0.1328 ...
##  $ concavity_mean         : num  0.3001 0.0869 0.1974 0.2414 0.198 ...
##  $ concave.points_mean    : num  0.1471 0.0702 0.1279 0.1052 0.1043 ...
##  $ symmetry_mean          : num  0.242 0.181 0.207 0.26 0.181 ...
##  $ fractal_dimension_mean : num  0.0787 0.0567 0.06 0.0974 0.0588 ...
##  $ radius_se              : num  1.095 0.543 0.746 0.496 0.757 ...
##  $ texture_se             : num  0.905 0.734 0.787 1.156 0.781 ...
##  $ perimeter_se           : num  8.59 3.4 4.58 3.44 5.44 ...
##  $ area_se                : num  153.4 74.1 94 27.2 94.4 ...
##  $ smoothness_se          : num  0.0064 0.00522 0.00615 0.00911 0.01149 ...
##  $ compactness_se         : num  0.049 0.0131 0.0401 0.0746 0.0246 ...
##  $ concavity_se           : num  0.0537 0.0186 0.0383 0.0566 0.0569 ...
##  $ concave.points_se      : num  0.0159 0.0134 0.0206 0.0187 0.0188 ...
##  $ symmetry_se            : num  0.03 0.0139 0.0225 0.0596 0.0176 ...
##  $ fractal_dimension_se   : num  0.00619 0.00353 0.00457 0.00921 0.00511 ...
##  $ radius_worst           : num  25.4 25 23.6 14.9 22.5 ...
##  $ texture_worst          : num  17.3 23.4 25.5 26.5 16.7 ...
##  $ perimeter_worst        : num  184.6 158.8 152.5 98.9 152.2 ...
##  $ area_worst             : num  2019 1956 1709 568 1575 ...
##  $ smoothness_worst       : num  0.162 0.124 0.144 0.21 0.137 ...
##  $ compactness_worst      : num  0.666 0.187 0.424 0.866 0.205 ...
##  $ concavity_worst        : num  0.712 0.242 0.45 0.687 0.4 ...
##  $ concave.points_worst   : num  0.265 0.186 0.243 0.258 0.163 ...
##  $ symmetry_worst         : num  0.46 0.275 0.361 0.664 0.236 ...
##  $ fractal_dimension_worst: num  0.1189 0.089 0.0876 0.173 0.0768 ...
##  $ X                      : logi  NA NA NA NA NA NA ...
data <- wis
dim(data)
## [1] 569  33

Data Pre-Processing

library(caret)

# Checking Missing Data 
colSums(is.na(data))
##                      id               diagnosis             radius_mean 
##                       0                       0                       0 
##            texture_mean          perimeter_mean               area_mean 
##                       0                       0                       0 
##         smoothness_mean        compactness_mean          concavity_mean 
##                       0                       0                       0 
##     concave.points_mean           symmetry_mean  fractal_dimension_mean 
##                       0                       0                       0 
##               radius_se              texture_se            perimeter_se 
##                       0                       0                       0 
##                 area_se           smoothness_se          compactness_se 
##                       0                       0                       0 
##            concavity_se       concave.points_se             symmetry_se 
##                       0                       0                       0 
##    fractal_dimension_se            radius_worst           texture_worst 
##                       0                       0                       0 
##         perimeter_worst              area_worst        smoothness_worst 
##                       0                       0                       0 
##       compactness_worst         concavity_worst    concave.points_worst 
##                       0                       0                       0 
##          symmetry_worst fractal_dimension_worst                       X 
##                       0                       0                     569
# To see which rows contain missing values:
# which(is.na(data), arr.ind = TRUE)

###### Mean or median replacement for NA values (generic template; "rating" is a placeholder column, not part of this dataset)
#  data$rating[is.na(data$rating)] <- mean(data$rating, na.rm = TRUE)
#  data$rating[is.na(data$rating)] <- median(data$rating, na.rm = TRUE)

# X is missing for all 569 rows (100% NA), so let us remove it, along with the id column, which carries no predictive information.

data$X<- NULL
data$id <- NULL
str(data)
## 'data.frame':    569 obs. of  31 variables:
##  $ diagnosis              : Factor w/ 2 levels "B","M": 2 2 2 2 2 2 2 2 2 2 ...
##  $ radius_mean            : num  18 20.6 19.7 11.4 20.3 ...
##  $ texture_mean           : num  10.4 17.8 21.2 20.4 14.3 ...
##  $ perimeter_mean         : num  122.8 132.9 130 77.6 135.1 ...
##  $ area_mean              : num  1001 1326 1203 386 1297 ...
##  $ smoothness_mean        : num  0.1184 0.0847 0.1096 0.1425 0.1003 ...
##  $ compactness_mean       : num  0.2776 0.0786 0.1599 0.2839 0.1328 ...
##  $ concavity_mean         : num  0.3001 0.0869 0.1974 0.2414 0.198 ...
##  $ concave.points_mean    : num  0.1471 0.0702 0.1279 0.1052 0.1043 ...
##  $ symmetry_mean          : num  0.242 0.181 0.207 0.26 0.181 ...
##  $ fractal_dimension_mean : num  0.0787 0.0567 0.06 0.0974 0.0588 ...
##  $ radius_se              : num  1.095 0.543 0.746 0.496 0.757 ...
##  $ texture_se             : num  0.905 0.734 0.787 1.156 0.781 ...
##  $ perimeter_se           : num  8.59 3.4 4.58 3.44 5.44 ...
##  $ area_se                : num  153.4 74.1 94 27.2 94.4 ...
##  $ smoothness_se          : num  0.0064 0.00522 0.00615 0.00911 0.01149 ...
##  $ compactness_se         : num  0.049 0.0131 0.0401 0.0746 0.0246 ...
##  $ concavity_se           : num  0.0537 0.0186 0.0383 0.0566 0.0569 ...
##  $ concave.points_se      : num  0.0159 0.0134 0.0206 0.0187 0.0188 ...
##  $ symmetry_se            : num  0.03 0.0139 0.0225 0.0596 0.0176 ...
##  $ fractal_dimension_se   : num  0.00619 0.00353 0.00457 0.00921 0.00511 ...
##  $ radius_worst           : num  25.4 25 23.6 14.9 22.5 ...
##  $ texture_worst          : num  17.3 23.4 25.5 26.5 16.7 ...
##  $ perimeter_worst        : num  184.6 158.8 152.5 98.9 152.2 ...
##  $ area_worst             : num  2019 1956 1709 568 1575 ...
##  $ smoothness_worst       : num  0.162 0.124 0.144 0.21 0.137 ...
##  $ compactness_worst      : num  0.666 0.187 0.424 0.866 0.205 ...
##  $ concavity_worst        : num  0.712 0.242 0.45 0.687 0.4 ...
##  $ concave.points_worst   : num  0.265 0.186 0.243 0.258 0.163 ...
##  $ symmetry_worst         : num  0.46 0.275 0.361 0.664 0.236 ...
##  $ fractal_dimension_worst: num  0.1189 0.089 0.0876 0.173 0.0768 ...
nearZeroVar(data, saveMetrics= TRUE)
##                         freqRatio percentUnique zeroVar   nzv
## diagnosis                1.683962     0.3514938   FALSE FALSE
## radius_mean              1.333333    80.1405975   FALSE FALSE
## texture_mean             1.000000    84.1827768   FALSE FALSE
## perimeter_mean           1.000000    91.7398946   FALSE FALSE
## area_mean                1.500000    94.7275923   FALSE FALSE
## smoothness_mean          1.250000    83.3040422   FALSE FALSE
## compactness_mean         1.000000    94.3760984   FALSE FALSE
## concavity_mean           4.333333    94.3760984   FALSE FALSE
## concave.points_mean      4.333333    95.2548330   FALSE FALSE
## symmetry_mean            1.000000    75.9226714   FALSE FALSE
## fractal_dimension_mean   1.000000    87.6977153   FALSE FALSE
## radius_se                1.000000    94.9033392   FALSE FALSE
## texture_se               1.000000    91.2126538   FALSE FALSE
## perimeter_se             2.000000    93.6731107   FALSE FALSE
## area_se                  1.000000    92.7943761   FALSE FALSE
## smoothness_se            1.000000    96.1335677   FALSE FALSE
## compactness_se           1.000000    95.0790861   FALSE FALSE
## concavity_se             6.500000    93.6731107   FALSE FALSE
## concave.points_se        4.333333    89.1036907   FALSE FALSE
## symmetry_se              1.333333    87.5219684   FALSE FALSE
## fractal_dimension_se     1.000000    95.7820738   FALSE FALSE
## radius_worst             1.250000    80.3163445   FALSE FALSE
## texture_worst            1.000000    89.8066784   FALSE FALSE
## perimeter_worst          1.000000    90.3339192   FALSE FALSE
## area_worst               1.000000    95.6063269   FALSE FALSE
## smoothness_worst         1.000000    72.2319859   FALSE FALSE
## compactness_worst        1.000000    92.9701230   FALSE FALSE
## concavity_worst          4.333333    94.7275923   FALSE FALSE
## concave.points_worst     4.333333    86.4674868   FALSE FALSE
## symmetry_worst           1.000000    87.8734622   FALSE FALSE
## fractal_dimension_worst  1.500000    94.0246046   FALSE FALSE
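# No zero- or near-zero-variance predictors were flagged, so nothing needs to be
# dropped on that basis. Next, check for exact linear combinations among the
# numeric features: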
feature_map <- unlist(lapply(data, is.numeric)) 
findLinearCombos(data[, feature_map])
## $linearCombos
## list()
## 
## $remove
## NULL
# No linear combinations were found either. Now subset just the numeric variables

num_cols <- unlist(lapply(data, is.numeric))  
only_numeric<- data[, num_cols]
str(only_numeric)
## 'data.frame':    569 obs. of  30 variables:
##  $ radius_mean            : num  18 20.6 19.7 11.4 20.3 ...
##  $ texture_mean           : num  10.4 17.8 21.2 20.4 14.3 ...
##  $ perimeter_mean         : num  122.8 132.9 130 77.6 135.1 ...
##  $ area_mean              : num  1001 1326 1203 386 1297 ...
##  $ smoothness_mean        : num  0.1184 0.0847 0.1096 0.1425 0.1003 ...
##  $ compactness_mean       : num  0.2776 0.0786 0.1599 0.2839 0.1328 ...
##  $ concavity_mean         : num  0.3001 0.0869 0.1974 0.2414 0.198 ...
##  $ concave.points_mean    : num  0.1471 0.0702 0.1279 0.1052 0.1043 ...
##  $ symmetry_mean          : num  0.242 0.181 0.207 0.26 0.181 ...
##  $ fractal_dimension_mean : num  0.0787 0.0567 0.06 0.0974 0.0588 ...
##  $ radius_se              : num  1.095 0.543 0.746 0.496 0.757 ...
##  $ texture_se             : num  0.905 0.734 0.787 1.156 0.781 ...
##  $ perimeter_se           : num  8.59 3.4 4.58 3.44 5.44 ...
##  $ area_se                : num  153.4 74.1 94 27.2 94.4 ...
##  $ smoothness_se          : num  0.0064 0.00522 0.00615 0.00911 0.01149 ...
##  $ compactness_se         : num  0.049 0.0131 0.0401 0.0746 0.0246 ...
##  $ concavity_se           : num  0.0537 0.0186 0.0383 0.0566 0.0569 ...
##  $ concave.points_se      : num  0.0159 0.0134 0.0206 0.0187 0.0188 ...
##  $ symmetry_se            : num  0.03 0.0139 0.0225 0.0596 0.0176 ...
##  $ fractal_dimension_se   : num  0.00619 0.00353 0.00457 0.00921 0.00511 ...
##  $ radius_worst           : num  25.4 25 23.6 14.9 22.5 ...
##  $ texture_worst          : num  17.3 23.4 25.5 26.5 16.7 ...
##  $ perimeter_worst        : num  184.6 158.8 152.5 98.9 152.2 ...
##  $ area_worst             : num  2019 1956 1709 568 1575 ...
##  $ smoothness_worst       : num  0.162 0.124 0.144 0.21 0.137 ...
##  $ compactness_worst      : num  0.666 0.187 0.424 0.866 0.205 ...
##  $ concavity_worst        : num  0.712 0.242 0.45 0.687 0.4 ...
##  $ concave.points_worst   : num  0.265 0.186 0.243 0.258 0.163 ...
##  $ symmetry_worst         : num  0.46 0.275 0.361 0.664 0.236 ...
##  $ fractal_dimension_worst: num  0.1189 0.089 0.0876 0.173 0.0768 ...
# find correlations to exclude from the model 
findCorrelation( cor(only_numeric), cutoff = .75, names = TRUE )
##  [1] "concavity_mean"         "concave.points_mean"    "compactness_mean"      
##  [4] "concave.points_worst"   "concavity_worst"        "perimeter_worst"       
##  [7] "radius_worst"           "perimeter_mean"         "compactness_worst"     
## [10] "area_worst"             "radius_mean"            "perimeter_se"          
## [13] "concave.points_se"      "compactness_se"         "area_se"               
## [16] "smoothness_mean"        "fractal_dimension_mean" "texture_mean"
# [1] "concavity_mean"         "concave.points_mean"   
# [3] "compactness_mean"       "concave.points_worst"  
# [5] "concavity_worst"        "perimeter_worst"       
# [7] "radius_worst"           "perimeter_mean"        
# [9] "compactness_worst"      "area_worst"            
#[11] "radius_mean"            "perimeter_se"          
#[13] "concave.points_se"      "compactness_se"        
#[15] "area_se"                "smoothness_mean"       
# [17] "fractal_dimension_mean" "texture_mean"  

# Many variables are highly correlated; this is likely because several features
# (radius, perimeter, area) are all measures of tumor size.
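# A minimal sketch of how one could drop the flagged columns while keeping the
# target (the high_corr / data_trim names are just for illustration; below we
# reduce dimensionality with PCA instead):
high_corr <- findCorrelation(cor(only_numeric), cutoff = .75, names = TRUE)
data_trim <- data[, !(names(data) %in% high_corr)]
dim(data_trim)   # 569 x 13: diagnosis plus the 12 remaining predictors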

############# Quick EDA

library(skimr)
skim(data)
Data summary

Name: data
Number of rows: 569
Number of columns: 31
Column type frequency: factor 1, numeric 30
Group variables: None

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
diagnosis 0 1 FALSE 2 B: 357, M: 212

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
radius_mean 0 1 14.13 3.52 6.98 11.70 13.37 15.78 28.11 ▂▇▃▁▁
texture_mean 0 1 19.29 4.30 9.71 16.17 18.84 21.80 39.28 ▃▇▃▁▁
perimeter_mean 0 1 91.97 24.30 43.79 75.17 86.24 104.10 188.50 ▃▇▃▁▁
area_mean 0 1 654.89 351.91 143.50 420.30 551.10 782.70 2501.00 ▇▃▂▁▁
smoothness_mean 0 1 0.10 0.01 0.05 0.09 0.10 0.11 0.16 ▁▇▇▁▁
compactness_mean 0 1 0.10 0.05 0.02 0.06 0.09 0.13 0.35 ▇▇▂▁▁
concavity_mean 0 1 0.09 0.08 0.00 0.03 0.06 0.13 0.43 ▇▃▂▁▁
concave.points_mean 0 1 0.05 0.04 0.00 0.02 0.03 0.07 0.20 ▇▃▂▁▁
symmetry_mean 0 1 0.18 0.03 0.11 0.16 0.18 0.20 0.30 ▁▇▅▁▁
fractal_dimension_mean 0 1 0.06 0.01 0.05 0.06 0.06 0.07 0.10 ▆▇▂▁▁
radius_se 0 1 0.41 0.28 0.11 0.23 0.32 0.48 2.87 ▇▁▁▁▁
texture_se 0 1 1.22 0.55 0.36 0.83 1.11 1.47 4.88 ▇▅▁▁▁
perimeter_se 0 1 2.87 2.02 0.76 1.61 2.29 3.36 21.98 ▇▁▁▁▁
area_se 0 1 40.34 45.49 6.80 17.85 24.53 45.19 542.20 ▇▁▁▁▁
smoothness_se 0 1 0.01 0.00 0.00 0.01 0.01 0.01 0.03 ▇▃▁▁▁
compactness_se 0 1 0.03 0.02 0.00 0.01 0.02 0.03 0.14 ▇▃▁▁▁
concavity_se 0 1 0.03 0.03 0.00 0.02 0.03 0.04 0.40 ▇▁▁▁▁
concave.points_se 0 1 0.01 0.01 0.00 0.01 0.01 0.01 0.05 ▇▇▁▁▁
symmetry_se 0 1 0.02 0.01 0.01 0.02 0.02 0.02 0.08 ▇▃▁▁▁
fractal_dimension_se 0 1 0.00 0.00 0.00 0.00 0.00 0.00 0.03 ▇▁▁▁▁
radius_worst 0 1 16.27 4.83 7.93 13.01 14.97 18.79 36.04 ▆▇▃▁▁
texture_worst 0 1 25.68 6.15 12.02 21.08 25.41 29.72 49.54 ▃▇▆▁▁
perimeter_worst 0 1 107.26 33.60 50.41 84.11 97.66 125.40 251.20 ▇▇▃▁▁
area_worst 0 1 880.58 569.36 185.20 515.30 686.50 1084.00 4254.00 ▇▂▁▁▁
smoothness_worst 0 1 0.13 0.02 0.07 0.12 0.13 0.15 0.22 ▂▇▇▂▁
compactness_worst 0 1 0.25 0.16 0.03 0.15 0.21 0.34 1.06 ▇▅▁▁▁
concavity_worst 0 1 0.27 0.21 0.00 0.11 0.23 0.38 1.25 ▇▅▂▁▁
concave.points_worst 0 1 0.11 0.07 0.00 0.06 0.10 0.16 0.29 ▅▇▅▃▁
symmetry_worst 0 1 0.29 0.06 0.16 0.25 0.28 0.32 0.66 ▅▇▁▁▁
fractal_dimension_worst 0 1 0.08 0.02 0.06 0.07 0.08 0.09 0.21 ▇▃▁▁▁
# library(DataExplorer)
# create_report(data)

###################

Checking Correlation between Numeric Variables

library(GGally)
# Plot correlation heatmap
ggcorr(data, label = TRUE, 
       palette = "RdBu", 
       name = "Correlation", 
       hjust = 0.75, 
       label_size = 2, 
       label_round = 2)

### A lot of variables are highly correlated
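### A quick sketch to quantify this (the 0.9 threshold and object names are
### chosen just for illustration): list the strongest pairwise correlations.
cor_mat <- cor(only_numeric)
cor_mat[upper.tri(cor_mat, diag = TRUE)] <- NA            # keep each pair once
high_pairs <- which(abs(cor_mat) > 0.9, arr.ind = TRUE)
data.frame(var1 = rownames(cor_mat)[high_pairs[, 1]],
           var2 = colnames(cor_mat)[high_pairs[, 2]],
           r    = round(cor_mat[high_pairs], 2))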

Since there are 30 predictors, let us try some dimensionality reduction

library(factoextra)

wdbc.pr <- prcomp(data[, 2:31], center = TRUE, scale. = TRUE)
summary(wdbc.pr)
## Importance of components:
##                           PC1    PC2     PC3     PC4     PC5     PC6     PC7
## Standard deviation     3.6444 2.3857 1.67867 1.40735 1.28403 1.09880 0.82172
## Proportion of Variance 0.4427 0.1897 0.09393 0.06602 0.05496 0.04025 0.02251
## Cumulative Proportion  0.4427 0.6324 0.72636 0.79239 0.84734 0.88759 0.91010
##                            PC8    PC9    PC10   PC11    PC12    PC13    PC14
## Standard deviation     0.69037 0.6457 0.59219 0.5421 0.51104 0.49128 0.39624
## Proportion of Variance 0.01589 0.0139 0.01169 0.0098 0.00871 0.00805 0.00523
## Cumulative Proportion  0.92598 0.9399 0.95157 0.9614 0.97007 0.97812 0.98335
##                           PC15    PC16    PC17    PC18    PC19    PC20   PC21
## Standard deviation     0.30681 0.28260 0.24372 0.22939 0.22244 0.17652 0.1731
## Proportion of Variance 0.00314 0.00266 0.00198 0.00175 0.00165 0.00104 0.0010
## Cumulative Proportion  0.98649 0.98915 0.99113 0.99288 0.99453 0.99557 0.9966
##                           PC22    PC23   PC24    PC25    PC26    PC27    PC28
## Standard deviation     0.16565 0.15602 0.1344 0.12442 0.09043 0.08307 0.03987
## Proportion of Variance 0.00091 0.00081 0.0006 0.00052 0.00027 0.00023 0.00005
## Cumulative Proportion  0.99749 0.99830 0.9989 0.99942 0.99969 0.99992 0.99997
##                           PC29    PC30
## Standard deviation     0.02736 0.01153
## Proportion of Variance 0.00002 0.00000
## Cumulative Proportion  1.00000 1.00000
##### This shows that the first 6 PCs capture around 89% of the variance in the data.

##### Note: since an eigenvalue < 1 means that a component explains less variance than a single original variable, we would like to discard such components.

###### If our data is well suited to PCA, we should be able to discard those components while still retaining at least 70–80% of the cumulative variance.
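###### A quick check (sketch; the eigenvalues object is just an illustrative name):
###### the eigenvalues are the squared standard deviations of the components.
eigenvalues <- wdbc.pr$sdev^2
sum(eigenvalues > 1)   # 6 components have an eigenvalue above 1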

### let us plot these 

screeplot(wdbc.pr, type = "l", npcs = 15, main = "Screeplot of the first 15 PCs")
abline(h = 1, col="red", lty=5)
legend("topright", legend=c("Eigenvalue = 1"),
       col=c("red"), lty=5, cex=0.6)

cumpro <- cumsum(wdbc.pr$sdev^2 / sum(wdbc.pr$sdev^2))
plot(cumpro[1:15], xlab = "PC #", ylab = "Amount of explained variance", main = "Cumulative variance plot")
abline(v = 6, col="blue", lty=5)
abline(h = 0.88759, col="blue", lty=5)
legend("topleft", legend=c("Cut-off @ PC6"),
       col=c("blue"), lty=5, cex=0.6)

###### Each of the first 6 components has an eigenvalue > 1, and together they explain almost 90% of the variance. We can effectively reduce the dimensionality from 30 to 6 while only "losing" about 10% of the variance.

We can also explain about 63% of the variance with just the first 2 components.

# First two PCs 
plot(wdbc.pr$x[,1],wdbc.pr$x[,2], xlab="PC1 (44.3%)", ylab = "PC2 (19%)", main = "PC1 / PC2 - plot")

fviz_pca_ind(wdbc.pr, geom.ind = "point", pointshape = 21, 
             pointsize = 2, 
             fill.ind = data$diagnosis, 
             col.ind = "black", 
             palette = "jco", 
             addEllipses = TRUE,
             label = "var",
             col.var = "black",
             repel = TRUE,
             legend.title = "Diagnosis") +
  ggtitle("2D PCA-plot from 30 feature dataset") +
  theme(plot.title = element_text(hjust = 0.5))

#### Here we see the value of PCA: with just the first two components there is already clear separation between the benign and malignant tumors. This is a good indication that the data is well suited to a classification model such as discriminant analysis.

LDA

library(MASS) # for LDA 

# let us split the data into Training and Testing data
set.seed(1234)
index <- sample(1:2, nrow(data), replace = TRUE, prob = c(0.7,0.3))
train <- data[index == 1, ]
test <- data[index == 2, ]

wdbc_raw.lda <- lda(diagnosis~., data = train)

print(wdbc_raw.lda) # print the LDA
## Call:
## lda(diagnosis ~ ., data = train)
## 
## Prior probabilities of groups:
##         B         M 
## 0.6479592 0.3520408 
## 
## Group means:
##   radius_mean texture_mean perimeter_mean area_mean smoothness_mean
## B    12.21941     18.00465       78.53476  467.6327      0.09207587
## M    17.46493     21.35101      115.49391  980.3304      0.10353957
##   compactness_mean concavity_mean concave.points_mean symmetry_mean
## B       0.07975531     0.04568185          0.02586663     0.1739382
## M       0.14983188     0.16425435          0.08978297     0.1942261
##   fractal_dimension_mean radius_se texture_se perimeter_se  area_se
## B             0.06252114 0.2818071   1.205663     1.965192 21.03904
## M             0.06317594 0.6129370   1.171064     4.386007 72.77007
##   smoothness_se compactness_se concavity_se concave.points_se symmetry_se
## B   0.006989016     0.02131469   0.02560055       0.009800461  0.02035035
## M   0.006492993     0.03319930   0.04148522       0.014883986  0.02021338
##   fractal_dimension_se radius_worst texture_worst perimeter_worst area_worst
## B          0.003552607     13.47448      23.63315        87.58291   566.0039
## M          0.004133464     21.32384      29.22348       142.75899  1444.6877
##   smoothness_worst compactness_worst concavity_worst concave.points_worst
## B        0.1242542         0.1862876       0.1678460           0.07534906
## M        0.1467007         0.3972314       0.4695159           0.18687290
##   symmetry_worst fractal_dimension_worst
## B      0.2704665              0.07929543
## M      0.3296362              0.09417819
## 
## Coefficients of linear discriminants:
##                                   LD1
## radius_mean              -0.903870650
## texture_mean             -0.018671996
## perimeter_mean            0.017448490
## area_mean                 0.004358580
## smoothness_mean          -4.147707916
## compactness_mean        -22.356931825
## concavity_mean            9.292530916
## concave.points_mean      10.559487069
## symmetry_mean             6.292532389
## fractal_dimension_mean   15.660599159
## radius_se                 0.711397863
## texture_se                0.121328535
## perimeter_se             -0.057709792
## area_se                   0.002390755
## smoothness_se            -4.047998676
## compactness_se            0.776279239
## concavity_se            -20.363396592
## concave.points_se        40.763205610
## symmetry_se              22.095113470
## fractal_dimension_se      2.076362150
## radius_worst              1.271649182
## texture_worst             0.057946697
## perimeter_worst           0.003132556
## area_worst               -0.007329593
## smoothness_worst          9.547275942
## compactness_worst         0.736729982
## concavity_worst           1.446347774
## concave.points_worst      4.186167608
## symmetry_worst           -0.744362907
## fractal_dimension_worst  11.705728639
# Plot the LDA 
plot(wdbc_raw.lda, col = as.integer(train$diagnosis))

# Prediction and confusion matrix

pred <- predict(wdbc_raw.lda, newdata = test)

caret::confusionMatrix(pred$class,test$diagnosis,positive="M")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   B   M
##          B 103  12
##          M   0  62
##                                           
##                Accuracy : 0.9322          
##                  95% CI : (0.8846, 0.9645)
##     No Information Rate : 0.5819          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.8574          
##                                           
##  Mcnemar's Test P-Value : 0.001496        
##                                           
##             Sensitivity : 0.8378          
##             Specificity : 1.0000          
##          Pos Pred Value : 1.0000          
##          Neg Pred Value : 0.8957          
##              Prevalence : 0.4181          
##          Detection Rate : 0.3503          
##    Detection Prevalence : 0.3503          
##       Balanced Accuracy : 0.9189          
##                                           
##        'Positive' Class : M               
## 
##### We are still classifying 12 malignant cases as benign (false negatives).
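##### One way to trade a little specificity for extra sensitivity (a sketch; the 0.3
##### cut-off and the object names are arbitrary illustrations, not part of the
##### original analysis) is to lower the posterior-probability threshold for
##### calling a case malignant:
post_M <- predict(wdbc_raw.lda, newdata = test)$posterior[, "M"]
pred_lower_cutoff <- factor(ifelse(post_M > 0.3, "M", "B"), levels = c("B", "M"))
caret::confusionMatrix(pred_lower_cutoff, test$diagnosis, positive = "M")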


######### Let us plot a ROC curve

# ROC curve with the ROCR package
library(ROCR)
pred <- predict(wdbc_raw.lda, newdata = test)
pref <- prediction(pred$posterior[, 2], test$diagnosis)

# Calculating AUC
auc = performance(pref, "auc")
# Now converting S4 class to a vector
auc = unlist(slot(auc, "y.values"))

# ROC Curve 
eval <- performance(pref,"tpr","fpr")
plot(eval,col = "black",main="ROC Curve") 
abline(0,1)
text(x = 0.25, y = 0.65, paste0("AUC = ", round(auc[[1]], 3)))

### There is still scope for improvement

#### PCA with LDA

#### Let us create a data frame from the first 6 PCs

wdbc.pcst <- wdbc.pr$x[,1:6]
wdbc.pcst <- cbind(wdbc.pcst, as.numeric(data$diagnosis) - 1) # the B/M factor becomes 0/1, so M = 1
colnames(wdbc.pcst)[7] <- "diagnosis" # add the target variable

data <- as.data.frame(wdbc.pcst)

# let us split the data into Training and Testing data
set.seed(1234)
index <- sample(1:2, nrow(data), replace = TRUE, prob = c(0.7,0.3))
train <- data[index == 1, ]
test <- data[index == 2, ]

# Model
wdbc.lda <- lda(diagnosis ~ PC1 + PC2 + PC3 + PC4 + PC5 + PC6, data = train)

# Plot the LDA 
plot(wdbc.lda, col = as.integer(train$diagnosis))

# Prediction and confusion matrix

pred <- predict(wdbc.lda, newdata = test)
caret::confusionMatrix(pred$class,factor(test$diagnosis))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 103  12
##          1   0  62
##                                           
##                Accuracy : 0.9322          
##                  95% CI : (0.8846, 0.9645)
##     No Information Rate : 0.5819          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.8574          
##                                           
##  Mcnemar's Test P-Value : 0.001496        
##                                           
##             Sensitivity : 1.0000          
##             Specificity : 0.8378          
##          Pos Pred Value : 0.8957          
##          Neg Pred Value : 1.0000          
##              Prevalence : 0.5819          
##          Detection Rate : 0.5819          
##    Detection Prevalence : 0.6497          
##       Balanced Accuracy : 0.9189          
##                                           
##        'Positive' Class : 0               
## 
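##### Note that with the 0/1 coding the 'Positive' class defaults to "0" (benign), so the
##### Sensitivity and Specificity above are reported for the benign class. To report them
##### for the malignant class instead, we could pass positive = "1" (assuming the 1 = M
##### coding above); a sketch, not run here:
# caret::confusionMatrix(pred$class, factor(test$diagnosis), positive = "1")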
# ROC Curve with ROCR package 

library(ROCR)
pred <- predict(wdbc.lda, newdata = test)
pref <- prediction(pred$posterior[, 2], test$diagnosis)

# Calculating AUC
auc1 = performance(pref, "auc")
# Now converting S4 class to a vector
auc1 = unlist(slot(auc1, "y.values"))

# ROC Curve 
eval <- performance(pref,"tpr","fpr")
plot(eval,col = "black",main="ROC Curve") 
abline(0,1)
text(x = 0.25, y = 0.65, paste0("AUC = ", round(auc1[[1]], 3)))

### This has significantly enhanced our AUC. Good!
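### A quick side-by-side of the two AUCs (a sketch, assuming the auc and auc1
### objects from the chunks above are still in the workspace):
round(c(raw_LDA = auc, PCA_LDA = auc1), 3)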