Breast Cancer Data

Load the data

library(readxl)
library(naniar)
# Read the Wisconsin breast-cancer CSV.
# - `na.strings` is spelled out in full instead of relying on partial matching
#   of `na`; `common_na_strings` is naniar's vector of common NA spellings.
# - `stringsAsFactors = TRUE` keeps `diagnosis` a factor on R >= 4.0 as well;
#   later steps call as.integer()/as.numeric() on it and need factor codes.
wis <- read.csv(
  "Breast_Cancer_Data.csv",
  na.strings = common_na_strings,
  stringsAsFactors = TRUE
)
str(wis)

## 'data.frame':    569 obs. of  33 variables:
##  $ id                     : int  842302 842517 84300903 84348301 84358402 843786 844359 84458202 844981 84501001 ...
##  $ diagnosis              : Factor w/ 2 levels "B","M": 2 2 2 2 2 2 2 2 2 2 ...
##  $ radius_mean            : num  18 20.6 19.7 11.4 20.3 ...
##  $ texture_mean           : num  10.4 17.8 21.2 20.4 14.3 ...
##  $ perimeter_mean         : num  122.8 132.9 130 77.6 135.1 ...
##  $ area_mean              : num  1001 1326 1203 386 1297 ...
##  $ smoothness_mean        : num  0.1184 0.0847 0.1096 0.1425 0.1003 ...
##  $ compactness_mean       : num  0.2776 0.0786 0.1599 0.2839 0.1328 ...
##  $ concavity_mean         : num  0.3001 0.0869 0.1974 0.2414 0.198 ...
##  $ concave.points_mean    : num  0.1471 0.0702 0.1279 0.1052 0.1043 ...
##  $ symmetry_mean          : num  0.242 0.181 0.207 0.26 0.181 ...
##  $ fractal_dimension_mean : num  0.0787 0.0567 0.06 0.0974 0.0588 ...
##  $ radius_se              : num  1.095 0.543 0.746 0.496 0.757 ...
##  $ texture_se             : num  0.905 0.734 0.787 1.156 0.781 ...
##  $ perimeter_se           : num  8.59 3.4 4.58 3.44 5.44 ...
##  $ area_se                : num  153.4 74.1 94 27.2 94.4 ...
##  $ smoothness_se          : num  0.0064 0.00522 0.00615 0.00911 0.01149 ...
##  $ compactness_se         : num  0.049 0.0131 0.0401 0.0746 0.0246 ...
##  $ concavity_se           : num  0.0537 0.0186 0.0383 0.0566 0.0569 ...
##  $ concave.points_se      : num  0.0159 0.0134 0.0206 0.0187 0.0188 ...
##  $ symmetry_se            : num  0.03 0.0139 0.0225 0.0596 0.0176 ...
##  $ fractal_dimension_se   : num  0.00619 0.00353 0.00457 0.00921 0.00511 ...
##  $ radius_worst           : num  25.4 25 23.6 14.9 22.5 ...
##  $ texture_worst          : num  17.3 23.4 25.5 26.5 16.7 ...
##  $ perimeter_worst        : num  184.6 158.8 152.5 98.9 152.2 ...
##  $ area_worst             : num  2019 1956 1709 568 1575 ...
##  $ smoothness_worst       : num  0.162 0.124 0.144 0.21 0.137 ...
##  $ compactness_worst      : num  0.666 0.187 0.424 0.866 0.205 ...
##  $ concavity_worst        : num  0.712 0.242 0.45 0.687 0.4 ...
##  $ concave.points_worst   : num  0.265 0.186 0.243 0.258 0.163 ...
##  $ symmetry_worst         : num  0.46 0.275 0.361 0.664 0.236 ...
##  $ fractal_dimension_worst: num  0.1189 0.089 0.0876 0.173 0.0768 ...
##  $ X                      : logi  NA NA NA NA NA NA ...

# Work on a copy so the freshly loaded `wis` stays untouched.
data <- wis
# Rows and columns of the working copy.
c(nrow(data), ncol(data))

## [1] 569  33

Data Pre Processing

library(caret)

# Count the missing values in every column.
vapply(data, function(column) sum(is.na(column)), numeric(1))

##                      id               diagnosis             radius_mean 
##                       0                       0                       0 
##            texture_mean          perimeter_mean               area_mean 
##                       0                       0                       0 
##         smoothness_mean        compactness_mean          concavity_mean 
##                       0                       0                       0 
##     concave.points_mean           symmetry_mean  fractal_dimension_mean 
##                       0                       0                       0 
##               radius_se              texture_se            perimeter_se 
##                       0                       0                       0 
##                 area_se           smoothness_se          compactness_se 
##                       0                       0                       0 
##            concavity_se       concave.points_se             symmetry_se 
##                       0                       0                       0 
##    fractal_dimension_se            radius_worst           texture_worst 
##                       0                       0                       0 
##         perimeter_worst              area_worst        smoothness_worst 
##                       0                       0                       0 
##       compactness_worst         concavity_worst    concave.points_worst 
##                       0                       0                       0 
##          symmetry_worst fractal_dimension_worst                       X 
##                       0                       0                     569

# To locate the individual missing cells (row/column positions):
# which(is.na(data), arr.ind=TRUE)

###### Template: mean / median replacement for NA values in a column
#  data$rating[is.na(data$rating)]=mean(data$rating,na.rm=TRUE)
#  data$rating[is.na(data$rating)]=median(data$rating,na.rm=TRUE)

# X is missing in every one of the 569 rows, so it carries no information;
# id is dropped along with it so that only diagnosis and the measurements remain.
data <- data[, setdiff(names(data), c("X", "id"))]
str(data)

## 'data.frame':    569 obs. of  31 variables:
##  $ diagnosis              : Factor w/ 2 levels "B","M": 2 2 2 2 2 2 2 2 2 2 ...
##  $ radius_mean            : num  18 20.6 19.7 11.4 20.3 ...
##  $ texture_mean           : num  10.4 17.8 21.2 20.4 14.3 ...
##  $ perimeter_mean         : num  122.8 132.9 130 77.6 135.1 ...
##  $ area_mean              : num  1001 1326 1203 386 1297 ...
##  $ smoothness_mean        : num  0.1184 0.0847 0.1096 0.1425 0.1003 ...
##  $ compactness_mean       : num  0.2776 0.0786 0.1599 0.2839 0.1328 ...
##  $ concavity_mean         : num  0.3001 0.0869 0.1974 0.2414 0.198 ...
##  $ concave.points_mean    : num  0.1471 0.0702 0.1279 0.1052 0.1043 ...
##  $ symmetry_mean          : num  0.242 0.181 0.207 0.26 0.181 ...
##  $ fractal_dimension_mean : num  0.0787 0.0567 0.06 0.0974 0.0588 ...
##  $ radius_se              : num  1.095 0.543 0.746 0.496 0.757 ...
##  $ texture_se             : num  0.905 0.734 0.787 1.156 0.781 ...
##  $ perimeter_se           : num  8.59 3.4 4.58 3.44 5.44 ...
##  $ area_se                : num  153.4 74.1 94 27.2 94.4 ...
##  $ smoothness_se          : num  0.0064 0.00522 0.00615 0.00911 0.01149 ...
##  $ compactness_se         : num  0.049 0.0131 0.0401 0.0746 0.0246 ...
##  $ concavity_se           : num  0.0537 0.0186 0.0383 0.0566 0.0569 ...
##  $ concave.points_se      : num  0.0159 0.0134 0.0206 0.0187 0.0188 ...
##  $ symmetry_se            : num  0.03 0.0139 0.0225 0.0596 0.0176 ...
##  $ fractal_dimension_se   : num  0.00619 0.00353 0.00457 0.00921 0.00511 ...
##  $ radius_worst           : num  25.4 25 23.6 14.9 22.5 ...
##  $ texture_worst          : num  17.3 23.4 25.5 26.5 16.7 ...
##  $ perimeter_worst        : num  184.6 158.8 152.5 98.9 152.2 ...
##  $ area_worst             : num  2019 1956 1709 568 1575 ...
##  $ smoothness_worst       : num  0.162 0.124 0.144 0.21 0.137 ...
##  $ compactness_worst      : num  0.666 0.187 0.424 0.866 0.205 ...
##  $ concavity_worst        : num  0.712 0.242 0.45 0.687 0.4 ...
##  $ concave.points_worst   : num  0.265 0.186 0.243 0.258 0.163 ...
##  $ symmetry_worst         : num  0.46 0.275 0.361 0.664 0.236 ...
##  $ fractal_dimension_worst: num  0.1189 0.089 0.0876 0.173 0.0768 ...

# Per-column frequency ratio, percent unique, and zero / near-zero variance flags.
nearZeroVar(x = data, saveMetrics = TRUE)

##                         freqRatio percentUnique zeroVar   nzv
## diagnosis                1.683962     0.3514938   FALSE FALSE
## radius_mean              1.333333    80.1405975   FALSE FALSE
## texture_mean             1.000000    84.1827768   FALSE FALSE
## perimeter_mean           1.000000    91.7398946   FALSE FALSE
## area_mean                1.500000    94.7275923   FALSE FALSE
## smoothness_mean          1.250000    83.3040422   FALSE FALSE
## compactness_mean         1.000000    94.3760984   FALSE FALSE
## concavity_mean           4.333333    94.3760984   FALSE FALSE
## concave.points_mean      4.333333    95.2548330   FALSE FALSE
## symmetry_mean            1.000000    75.9226714   FALSE FALSE
## fractal_dimension_mean   1.000000    87.6977153   FALSE FALSE
## radius_se                1.000000    94.9033392   FALSE FALSE
## texture_se               1.000000    91.2126538   FALSE FALSE
## perimeter_se             2.000000    93.6731107   FALSE FALSE
## area_se                  1.000000    92.7943761   FALSE FALSE
## smoothness_se            1.000000    96.1335677   FALSE FALSE
## compactness_se           1.000000    95.0790861   FALSE FALSE
## concavity_se             6.500000    93.6731107   FALSE FALSE
## concave.points_se        4.333333    89.1036907   FALSE FALSE
## symmetry_se              1.333333    87.5219684   FALSE FALSE
## fractal_dimension_se     1.000000    95.7820738   FALSE FALSE
## radius_worst             1.250000    80.3163445   FALSE FALSE
## texture_worst            1.000000    89.8066784   FALSE FALSE
## perimeter_worst          1.000000    90.3339192   FALSE FALSE
## area_worst               1.000000    95.6063269   FALSE FALSE
## smoothness_worst         1.000000    72.2319859   FALSE FALSE
## compactness_worst        1.000000    92.9701230   FALSE FALSE
## concavity_worst          4.333333    94.7275923   FALSE FALSE
## concave.points_worst     4.333333    86.4674868   FALSE FALSE
## symmetry_worst           1.000000    87.8734622   FALSE FALSE
## fractal_dimension_worst  1.500000    94.0246046   FALSE FALSE

# Logical mask of the numeric columns, then look for exact linear dependencies
# among them.
feature_map <- vapply(data, is.numeric, logical(1))
findLinearCombos(data[, feature_map])

## $linearCombos
## list()
## 
## $remove
## NULL

# Keep only the numeric columns of the data frame and inspect them.

num_cols <- vapply(data, is.numeric, logical(1))
only_numeric <- data[num_cols]
str(only_numeric)

## 'data.frame':    569 obs. of  30 variables:
##  $ radius_mean            : num  18 20.6 19.7 11.4 20.3 ...
##  $ texture_mean           : num  10.4 17.8 21.2 20.4 14.3 ...
##  $ perimeter_mean         : num  122.8 132.9 130 77.6 135.1 ...
##  $ area_mean              : num  1001 1326 1203 386 1297 ...
##  $ smoothness_mean        : num  0.1184 0.0847 0.1096 0.1425 0.1003 ...
##  $ compactness_mean       : num  0.2776 0.0786 0.1599 0.2839 0.1328 ...
##  $ concavity_mean         : num  0.3001 0.0869 0.1974 0.2414 0.198 ...
##  $ concave.points_mean    : num  0.1471 0.0702 0.1279 0.1052 0.1043 ...
##  $ symmetry_mean          : num  0.242 0.181 0.207 0.26 0.181 ...
##  $ fractal_dimension_mean : num  0.0787 0.0567 0.06 0.0974 0.0588 ...
##  $ radius_se              : num  1.095 0.543 0.746 0.496 0.757 ...
##  $ texture_se             : num  0.905 0.734 0.787 1.156 0.781 ...
##  $ perimeter_se           : num  8.59 3.4 4.58 3.44 5.44 ...
##  $ area_se                : num  153.4 74.1 94 27.2 94.4 ...
##  $ smoothness_se          : num  0.0064 0.00522 0.00615 0.00911 0.01149 ...
##  $ compactness_se         : num  0.049 0.0131 0.0401 0.0746 0.0246 ...
##  $ concavity_se           : num  0.0537 0.0186 0.0383 0.0566 0.0569 ...
##  $ concave.points_se      : num  0.0159 0.0134 0.0206 0.0187 0.0188 ...
##  $ symmetry_se            : num  0.03 0.0139 0.0225 0.0596 0.0176 ...
##  $ fractal_dimension_se   : num  0.00619 0.00353 0.00457 0.00921 0.00511 ...
##  $ radius_worst           : num  25.4 25 23.6 14.9 22.5 ...
##  $ texture_worst          : num  17.3 23.4 25.5 26.5 16.7 ...
##  $ perimeter_worst        : num  184.6 158.8 152.5 98.9 152.2 ...
##  $ area_worst             : num  2019 1956 1709 568 1575 ...
##  $ smoothness_worst       : num  0.162 0.124 0.144 0.21 0.137 ...
##  $ compactness_worst      : num  0.666 0.187 0.424 0.866 0.205 ...
##  $ concavity_worst        : num  0.712 0.242 0.45 0.687 0.4 ...
##  $ concave.points_worst   : num  0.265 0.186 0.243 0.258 0.163 ...
##  $ symmetry_worst         : num  0.46 0.275 0.361 0.664 0.236 ...
##  $ fractal_dimension_worst: num  0.1189 0.089 0.0876 0.173 0.0768 ...

# Names of the columns caret suggests removing so that the remaining pairwise
# correlations stay at or below the 0.75 cutoff.
findCorrelation(
  x = cor(only_numeric),
  names = TRUE,
  cutoff = 0.75
)

##  [1] "concavity_mean"         "concave.points_mean"    "compactness_mean"      
##  [4] "concave.points_worst"   "concavity_worst"        "perimeter_worst"       
##  [7] "radius_worst"           "perimeter_mean"         "compactness_worst"     
## [10] "area_worst"             "radius_mean"            "perimeter_se"          
## [13] "concave.points_se"      "compactness_se"         "area_se"               
## [16] "smoothness_mean"        "fractal_dimension_mean" "texture_mean"

# [1] "concavity_mean"         "concave.points_mean"   
# [3] "compactness_mean"       "concave.points_worst"  
# [5] "concavity_worst"        "perimeter_worst"       
# [7] "radius_worst"           "perimeter_mean"        
# [9] "compactness_worst"      "area_worst"            
#[11] "radius_mean"            "perimeter_se"          
#[13] "concave.points_se"      "compactness_se"        
#[15] "area_se"                "smoothness_mean"       
# [17] "fractal_dimension_mean" "texture_mean"  

# There are many variables with high correlations; this is probably because
# the variables related to the size of the tumor are driving it.

############# Quick EDA
# skim() prints a per-column summary of `data`: missing counts, quantiles and
# an inline histogram for each numeric column, level counts for the factor.

library(skimr)
skim(data)

Data summary
Name	data
Number of rows	569
Number of columns	31
_______________________
Column type frequency:
factor	1
numeric	30
________________________
Group variables	None

Variable type: factor

skim_variable	n_missing	complete_rate	ordered	n_unique	top_counts
diagnosis	0	1	FALSE	2	B: 357, M: 212

Variable type: numeric

skim_variable	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
radius_mean	1	14.13	3.52	6.98	11.70	13.37	15.78	28.11	▂▇▃▁▁
texture_mean	1	19.29	4.30	9.71	16.17	18.84	21.80	39.28	▃▇▃▁▁
perimeter_mean	1	91.97	24.30	43.79	75.17	86.24	104.10	188.50	▃▇▃▁▁
area_mean	1	654.89	351.91	143.50	420.30	551.10	782.70	2501.00	▇▃▂▁▁
smoothness_mean	1	0.10	0.01	0.05	0.09	0.10	0.11	0.16	▁▇▇▁▁
compactness_mean	1	0.10	0.05	0.02	0.06	0.09	0.13	0.35	▇▇▂▁▁
concavity_mean	1	0.09	0.08	0.00	0.03	0.06	0.13	0.43	▇▃▂▁▁
concave.points_mean	1	0.05	0.04	0.00	0.02	0.03	0.07	0.20	▇▃▂▁▁
symmetry_mean	1	0.18	0.03	0.11	0.16	0.18	0.20	0.30	▁▇▅▁▁
fractal_dimension_mean	1	0.06	0.01	0.05	0.06	0.06	0.07	0.10	▆▇▂▁▁
radius_se	1	0.41	0.28	0.11	0.23	0.32	0.48	2.87	▇▁▁▁▁
texture_se	1	1.22	0.55	0.36	0.83	1.11	1.47	4.88	▇▅▁▁▁
perimeter_se	1	2.87	2.02	0.76	1.61	2.29	3.36	21.98	▇▁▁▁▁
area_se	1	40.34	45.49	6.80	17.85	24.53	45.19	542.20	▇▁▁▁▁
smoothness_se	1	0.01	0.00	0.00	0.01	0.01	0.01	0.03	▇▃▁▁▁
compactness_se	1	0.03	0.02	0.00	0.01	0.02	0.03	0.14	▇▃▁▁▁
concavity_se	1	0.03	0.03	0.00	0.02	0.03	0.04	0.40	▇▁▁▁▁
concave.points_se	1	0.01	0.01	0.00	0.01	0.01	0.01	0.05	▇▇▁▁▁
symmetry_se	1	0.02	0.01	0.01	0.02	0.02	0.02	0.08	▇▃▁▁▁
fractal_dimension_se	1	0.00	0.00	0.00	0.00	0.00	0.00	0.03	▇▁▁▁▁
radius_worst	1	16.27	4.83	7.93	13.01	14.97	18.79	36.04	▆▇▃▁▁
texture_worst	1	25.68	6.15	12.02	21.08	25.41	29.72	49.54	▃▇▆▁▁
perimeter_worst	1	107.26	33.60	50.41	84.11	97.66	125.40	251.20	▇▇▃▁▁
area_worst	1	880.58	569.36	185.20	515.30	686.50	1084.00	4254.00	▇▂▁▁▁
smoothness_worst	1	0.13	0.02	0.07	0.12	0.13	0.15	0.22	▂▇▇▂▁
compactness_worst	1	0.25	0.16	0.03	0.15	0.21	0.34	1.06	▇▅▁▁▁
concavity_worst	1	0.27	0.21	0.00	0.11	0.23	0.38	1.25	▇▅▂▁▁
concave.points_worst	1	0.11	0.07	0.00	0.06	0.10	0.16	0.29	▅▇▅▃▁
symmetry_worst	1	0.29	0.06	0.16	0.25	0.28	0.32	0.66	▅▇▁▁▁
fractal_dimension_worst	1	0.08	0.02	0.06	0.07	0.08	0.09	0.21	▇▃▁▁▁

# library(DataExplorer)
# create_report(data)

###################

Checking Correlation between Numeric Variables

library(GGally)
# Plot a correlation heatmap of the numeric predictors.
# Only numeric columns are passed in: ggcorr() cannot correlate the `diagnosis`
# factor and would otherwise discard it with a warning.
ggcorr(data[, vapply(data, is.numeric, logical(1))],
       label = TRUE,
       palette = "RdBu",
       name = "Correlation",
       hjust = 0.75,
       label_size = 2,
       label_round = 2)

### A lot of variables are highly correlated

Since there are 30 variables, let us try to do some dimensionality reduction

library(factoextra)

# PCA on every predictor (all columns except the `diagnosis` label), centred
# and scaled to unit variance. The columns are selected by name rather than by
# the hard-coded positions 2:31, and prcomp()'s argument is spelled `scale.`
# in full instead of relying on partial matching of `scale`.
wdbc.pr <- prcomp(data[, names(data) != "diagnosis"],
                  center = TRUE, scale. = TRUE)
summary(wdbc.pr)

## Importance of components:
##                           PC1    PC2     PC3     PC4     PC5     PC6     PC7
## Standard deviation     3.6444 2.3857 1.67867 1.40735 1.28403 1.09880 0.82172
## Proportion of Variance 0.4427 0.1897 0.09393 0.06602 0.05496 0.04025 0.02251
## Cumulative Proportion  0.4427 0.6324 0.72636 0.79239 0.84734 0.88759 0.91010
##                            PC8    PC9    PC10   PC11    PC12    PC13    PC14
## Standard deviation     0.69037 0.6457 0.59219 0.5421 0.51104 0.49128 0.39624
## Proportion of Variance 0.01589 0.0139 0.01169 0.0098 0.00871 0.00805 0.00523
## Cumulative Proportion  0.92598 0.9399 0.95157 0.9614 0.97007 0.97812 0.98335
##                           PC15    PC16    PC17    PC18    PC19    PC20   PC21
## Standard deviation     0.30681 0.28260 0.24372 0.22939 0.22244 0.17652 0.1731
## Proportion of Variance 0.00314 0.00266 0.00198 0.00175 0.00165 0.00104 0.0010
## Cumulative Proportion  0.98649 0.98915 0.99113 0.99288 0.99453 0.99557 0.9966
##                           PC22    PC23   PC24    PC25    PC26    PC27    PC28
## Standard deviation     0.16565 0.15602 0.1344 0.12442 0.09043 0.08307 0.03987
## Proportion of Variance 0.00091 0.00081 0.0006 0.00052 0.00027 0.00023 0.00005
## Cumulative Proportion  0.99749 0.99830 0.9989 0.99942 0.99969 0.99992 0.99997
##                           PC29    PC30
## Standard deviation     0.02736 0.01153
## Proportion of Variance 0.00002 0.00000
## Cumulative Proportion  1.00000 1.00000

##### This shows that with just 6 PCs I am able to represent around 89% of the variance in the data

##### Note: since an eigenvalue < 1 means that the component explains less variance than a single (standardized) explanatory variable, we would like to discard those components.

###### If our data is well suited for PCA we should be able to discard these components while retaining at least 70–80% of cumulative variance.

### let us plot these 

# Scree plot of the first 15 components with a reference line at eigenvalue 1.
# The title now matches `npcs = 15`.
screeplot(wdbc.pr, type = "lines", npcs = 15,
          main = "Screeplot of the first 15 PCs")
abline(h = 1, col = "red", lty = 5)
legend("topright", legend = "Eigenvalue = 1",
       col = "red", lty = 5, cex = 0.6)

# Cumulative proportion of variance explained by the first 15 components.
cumpro <- cumsum(wdbc.pr$sdev^2 / sum(wdbc.pr$sdev^2))
plot(cumpro[1:15], xlab = "PC #", ylab = "Amount of explained variance",
     main = "Cumulative variance plot")
# Mark the cut-off at PC6; the horizontal line is read from `cumpro` itself
# instead of a hard-coded copy of the printed value.
abline(v = 6, col = "blue", lty = 5)
abline(h = cumpro[6], col = "blue", lty = 5)
legend("topleft", legend = "Cut-off @ PC6",
       col = "blue", lty = 5, cex = 0.6)

###### We can see that first 6 components explain around 90% of the data 


###### We notice that the first 6 components each have an eigenvalue > 1 and together explain almost 90% of the variance, this is great! We can effectively reduce dimensionality from 30 to 6 while only “losing” about 11% of the variance!

We can also see that we can explain over 60% of the variance with just 2 components

# Scatter of the observations on the first two principal components (base plot).
plot(
  x = wdbc.pr$x[, 1],
  y = wdbc.pr$x[, 2],
  main = "PC1 / PC2 - plot",
  xlab = "PC1 (44.3%)",
  ylab = "PC2 (19%)"
)

# Same projection drawn with factoextra: points filled by diagnosis, with an
# ellipse around each group.
fviz_pca_ind(
  wdbc.pr,
  geom.ind = "point",
  pointshape = 21,
  pointsize = 2,
  col.ind = "black",
  fill.ind = data$diagnosis,
  addEllipses = TRUE,
  palette = "jco",
  legend.title = "Diagnosis",
  label = "var",
  col.var = "black",
  repel = TRUE
) +
  ggtitle("2D PCA-plot from 30 feature dataset") +
  theme(plot.title = element_text(hjust = 0.5))

#### we see the beauty of PCA. With just the first two components we can clearly see some separation between the benign and malignant tumors. This is a clear indication that the data is well-suited for some kind of classification model (like discriminant analysis).

LDA

library(MASS) # for LDA 

# Random ~70/30 train/test partition: every row independently draws label 1
# (train) or 2 (test). The seed makes the partition reproducible.
set.seed(1234)
index <- sample(1:2, size = nrow(data), replace = TRUE, prob = c(0.7, 0.3))
train <- data[which(index == 1), ]
test <- data[which(index == 2), ]

# LDA of diagnosis on all 30 raw features, fitted on the training rows.
wdbc_raw.lda <- lda(diagnosis~., data = train)

# Show priors, group means and discriminant coefficients.
print(wdbc_raw.lda)

## Call:
## lda(diagnosis ~ ., data = train)
## 
## Prior probabilities of groups:
##         B         M 
## 0.6479592 0.3520408 
## 
## Group means:
##   radius_mean texture_mean perimeter_mean area_mean smoothness_mean
## B    12.21941     18.00465       78.53476  467.6327      0.09207587
## M    17.46493     21.35101      115.49391  980.3304      0.10353957
##   compactness_mean concavity_mean concave.points_mean symmetry_mean
## B       0.07975531     0.04568185          0.02586663     0.1739382
## M       0.14983188     0.16425435          0.08978297     0.1942261
##   fractal_dimension_mean radius_se texture_se perimeter_se  area_se
## B             0.06252114 0.2818071   1.205663     1.965192 21.03904
## M             0.06317594 0.6129370   1.171064     4.386007 72.77007
##   smoothness_se compactness_se concavity_se concave.points_se symmetry_se
## B   0.006989016     0.02131469   0.02560055       0.009800461  0.02035035
## M   0.006492993     0.03319930   0.04148522       0.014883986  0.02021338
##   fractal_dimension_se radius_worst texture_worst perimeter_worst area_worst
## B          0.003552607     13.47448      23.63315        87.58291   566.0039
## M          0.004133464     21.32384      29.22348       142.75899  1444.6877
##   smoothness_worst compactness_worst concavity_worst concave.points_worst
## B        0.1242542         0.1862876       0.1678460           0.07534906
## M        0.1467007         0.3972314       0.4695159           0.18687290
##   symmetry_worst fractal_dimension_worst
## B      0.2704665              0.07929543
## M      0.3296362              0.09417819
## 
## Coefficients of linear discriminants:
##                                   LD1
## radius_mean              -0.903870650
## texture_mean             -0.018671996
## perimeter_mean            0.017448490
## area_mean                 0.004358580
## smoothness_mean          -4.147707916
## compactness_mean        -22.356931825
## concavity_mean            9.292530916
## concave.points_mean      10.559487069
## symmetry_mean             6.292532389
## fractal_dimension_mean   15.660599159
## radius_se                 0.711397863
## texture_se                0.121328535
## perimeter_se             -0.057709792
## area_se                   0.002390755
## smoothness_se            -4.047998676
## compactness_se            0.776279239
## concavity_se            -20.363396592
## concave.points_se        40.763205610
## symmetry_se              22.095113470
## fractal_dimension_se      2.076362150
## radius_worst              1.271649182
## texture_worst             0.057946697
## perimeter_worst           0.003132556
## area_worst               -0.007329593
## smoothness_worst          9.547275942
## compactness_worst         0.736729982
## concavity_worst           1.446347774
## concave.points_worst      4.186167608
## symmetry_worst           -0.744362907
## fractal_dimension_worst  11.705728639

# Plot the fitted discriminant for the training rows.
plot(wdbc_raw.lda, col = as.integer(train$diagnosis))

# Predict the held-out rows and tabulate predictions against the truth,
# treating malignant ("M") as the positive class.

pred <- predict(wdbc_raw.lda, newdata = test)

caret::confusionMatrix(
  data = pred$class,
  reference = test$diagnosis,
  positive = "M"
)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   B   M
##          B 103  12
##          M   0  62
##                                           
##                Accuracy : 0.9322          
##                  95% CI : (0.8846, 0.9645)
##     No Information Rate : 0.5819          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.8574          
##                                           
##  Mcnemar's Test P-Value : 0.001496        
##                                           
##             Sensitivity : 0.8378          
##             Specificity : 1.0000          
##          Pos Pred Value : 1.0000          
##          Neg Pred Value : 0.8957          
##              Prevalence : 0.4181          
##          Detection Rate : 0.3503          
##    Detection Prevalence : 0.3503          
##       Balanced Accuracy : 0.9189          
##                                           
##        'Positive' Class : M               
##

##### We are still classifying 12 individuals as benign even though their tumors are malignant (false negatives)


######### Let us plot a ROC curve (ROCR package)
library(ROCR)

# Posterior class probabilities for the held-out rows. predict() for an lda
# fit has no `type` argument, so the former type="response" had no effect and
# is dropped.
pred <- predict(wdbc_raw.lda, newdata = test)
# Score each row by its posterior probability of the malignant class, selected
# by name ("M") rather than by column position.
pref <- prediction(pred$posterior[, "M"], test$diagnosis)

# Calculating AUC
auc <- performance(pref, "auc")
# Now converting S4 class to a vector
auc <- unlist(slot(auc, "y.values"))

# ROC curve with the chance diagonal and the AUC printed on the plot
eval <- performance(pref, "tpr", "fpr")
plot(eval, col = "black", main = "ROC Curve")
abline(0, 1)
text(x = .25, y = .65, paste0("AUC = ", round(auc[[1]], 3)))

### There is still scope for improvement

####PCA with LDA

#### Let us create a data frame with our PCAs

# NOTE(review): wdbc.pr was fitted on all rows before the train/test split
# below, so the test rows influenced the components (information leakage).
# Consider fitting prcomp() on the training rows only and projecting the test
# rows with predict() -- TODO confirm and re-run.

# Scores on the first six principal components.
wdbc.pcst <- wdbc.pr$x[, 1:6]
# Add the target as 0/1 with malignant ("M") = 1. The explicit comparison does
# not depend on factor level order and also works if diagnosis is character.
wdbc.pcst <- cbind(wdbc.pcst, diagnosis = as.numeric(data$diagnosis == "M"))

# From here on `data` holds the PCA data set (PC1..PC6 + diagnosis) and
# replaces the raw-feature data frame.
data <- as.data.frame(wdbc.pcst)

# let us split the data into Training and Testing data
# (same seed and probabilities as for the raw-feature model)
set.seed(1234)
index <- sample(1:2, nrow(data), replace = TRUE, prob = c(0.7, 0.3))
train <- data[index == 1, ]
test <- data[index == 2, ]

# Model: LDA of the 0/1 diagnosis on the six principal-component scores,
# fitted on the training rows only.
wdbc.lda <- lda(diagnosis ~ PC1 + PC2 + PC3 + PC4 + PC5 + PC6, data = train)

# Plot the LDA
# NOTE(review): diagnosis is numeric 0/1 here, so the colour codes passed are
# 0 and 1 (not 1 and 2 as for the raw-feature plot) -- confirm this is intended.
plot(wdbc.lda, col = as.integer(train$diagnosis))

# Prediction and confusion matrix

pred <- predict(wdbc.lda, newdata = test)
# Report malignant (coded 1) as the positive class. Without `positive`, caret
# takes the first level ("0", benign), which swaps Sensitivity/Specificity
# relative to the raw-feature model that was evaluated with positive = "M".
caret::confusionMatrix(pred$class, factor(test$diagnosis), positive = "1")

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 103  12
##          1   0  62
##                                           
##                Accuracy : 0.9322          
##                  95% CI : (0.8846, 0.9645)
##     No Information Rate : 0.5819          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.8574          
##                                           
##  Mcnemar's Test P-Value : 0.001496        
##                                           
##             Sensitivity : 0.8378          
##             Specificity : 1.0000          
##          Pos Pred Value : 1.0000          
##          Neg Pred Value : 0.8957          
##              Prevalence : 0.4181          
##          Detection Rate : 0.3503          
##    Detection Prevalence : 0.3503          
##       Balanced Accuracy : 0.9189          
##                                           
##        'Positive' Class : 1               
##

# ROC curve with the ROCR package for the PCA + LDA model

library(ROCR)
# Posterior class probabilities for the held-out rows. predict() for an lda
# fit has no `type` argument, so the former type="response" had no effect and
# is dropped.
pred <- predict(wdbc.lda, newdata = test)
# Score each row by its posterior probability of class "1" (malignant),
# selected by name rather than by column position.
pref <- prediction(pred$posterior[, "1"], test$diagnosis)

# Calculating AUC
auc1 <- performance(pref, "auc")
# Now converting S4 class to a vector
auc1 <- unlist(slot(auc1, "y.values"))

# ROC curve with the chance diagonal and the AUC printed on the plot
eval <- performance(pref, "tpr", "fpr")
plot(eval, col = "black", main = "ROC Curve")
abline(0, 1)
text(x = .25, y = .65, paste0("AUC = ", round(auc1[[1]], 3)))

### This has significantly enhanced our AUC, good!

Breast Cancer Data

Chirag Ahluwalia

5/5/2020

Load the data

Data Pre Processing

Checking Correlation between Numeric Variables

Since there are 30 variables, let us try to do some dimensionality reduction

We can also see that we can explain over 60% of the variance with just 2 components

LDA