library(readxl)
library(naniar)
# Load the breast-cancer data set. Every string listed in naniar's
# `common_na_strings` is read as NA.
# `stringsAsFactors = TRUE` is explicit because later steps need `diagnosis`
# to be a factor with levels B and M; since R 4.0 the default is FALSE, which
# would import it as character.
wis <- read.csv(
  "Breast_Cancer_Data.csv",
  na.strings = common_na_strings,
  stringsAsFactors = TRUE
)
str(wis)
## 'data.frame': 569 obs. of 33 variables:
## $ id : int 842302 842517 84300903 84348301 84358402 843786 844359 84458202 844981 84501001 ...
## $ diagnosis : Factor w/ 2 levels "B","M": 2 2 2 2 2 2 2 2 2 2 ...
## $ radius_mean : num 18 20.6 19.7 11.4 20.3 ...
## $ texture_mean : num 10.4 17.8 21.2 20.4 14.3 ...
## $ perimeter_mean : num 122.8 132.9 130 77.6 135.1 ...
## $ area_mean : num 1001 1326 1203 386 1297 ...
## $ smoothness_mean : num 0.1184 0.0847 0.1096 0.1425 0.1003 ...
## $ compactness_mean : num 0.2776 0.0786 0.1599 0.2839 0.1328 ...
## $ concavity_mean : num 0.3001 0.0869 0.1974 0.2414 0.198 ...
## $ concave.points_mean : num 0.1471 0.0702 0.1279 0.1052 0.1043 ...
## $ symmetry_mean : num 0.242 0.181 0.207 0.26 0.181 ...
## $ fractal_dimension_mean : num 0.0787 0.0567 0.06 0.0974 0.0588 ...
## $ radius_se : num 1.095 0.543 0.746 0.496 0.757 ...
## $ texture_se : num 0.905 0.734 0.787 1.156 0.781 ...
## $ perimeter_se : num 8.59 3.4 4.58 3.44 5.44 ...
## $ area_se : num 153.4 74.1 94 27.2 94.4 ...
## $ smoothness_se : num 0.0064 0.00522 0.00615 0.00911 0.01149 ...
## $ compactness_se : num 0.049 0.0131 0.0401 0.0746 0.0246 ...
## $ concavity_se : num 0.0537 0.0186 0.0383 0.0566 0.0569 ...
## $ concave.points_se : num 0.0159 0.0134 0.0206 0.0187 0.0188 ...
## $ symmetry_se : num 0.03 0.0139 0.0225 0.0596 0.0176 ...
## $ fractal_dimension_se : num 0.00619 0.00353 0.00457 0.00921 0.00511 ...
## $ radius_worst : num 25.4 25 23.6 14.9 22.5 ...
## $ texture_worst : num 17.3 23.4 25.5 26.5 16.7 ...
## $ perimeter_worst : num 184.6 158.8 152.5 98.9 152.2 ...
## $ area_worst : num 2019 1956 1709 568 1575 ...
## $ smoothness_worst : num 0.162 0.124 0.144 0.21 0.137 ...
## $ compactness_worst : num 0.666 0.187 0.424 0.866 0.205 ...
## $ concavity_worst : num 0.712 0.242 0.45 0.687 0.4 ...
## $ concave.points_worst : num 0.265 0.186 0.243 0.258 0.163 ...
## $ symmetry_worst : num 0.46 0.275 0.361 0.664 0.236 ...
## $ fractal_dimension_worst: num 0.1189 0.089 0.0876 0.173 0.0768 ...
## $ X : logi NA NA NA NA NA NA ...
# Work on a copy so the raw import in `wis` stays untouched
data <- wis
dim(data)
## [1] 569 33
library(caret)
# Count the missing values in every column
vapply(data, function(column) sum(is.na(column)), numeric(1))
## id diagnosis radius_mean
## 0 0 0
## texture_mean perimeter_mean area_mean
## 0 0 0
## smoothness_mean compactness_mean concavity_mean
## 0 0 0
## concave.points_mean symmetry_mean fractal_dimension_mean
## 0 0 0
## radius_se texture_se perimeter_se
## 0 0 0
## area_se smoothness_se compactness_se
## 0 0 0
## concavity_se concave.points_se symmetry_se
## 0 0 0
## fractal_dimension_se radius_worst texture_worst
## 0 0 0
## perimeter_worst area_worst smoothness_worst
## 0 0 0
## compactness_worst concavity_worst concave.points_worst
## 0 0 0
## symmetry_worst fractal_dimension_worst X
## 0 0 569
# go get which rows contain the missing data
# which(is.na(data), arr.ind=TRUE)
###### Mean and Median Replacement for NA Values
# data$rating[is.na(data$rating)]=mean(data$rating,na.rm=TRUE)
# data$rating[is.na(data$rating)]=median(data$rating,na.rm=TRUE)
# X is entirely missing (569 of 569 values are NA), so let us remove it!
# Keep every column except `X` and `id`
data <- data[setdiff(names(data), c("X", "id"))]
str(data)
## 'data.frame': 569 obs. of 31 variables:
## $ diagnosis : Factor w/ 2 levels "B","M": 2 2 2 2 2 2 2 2 2 2 ...
## $ radius_mean : num 18 20.6 19.7 11.4 20.3 ...
## $ texture_mean : num 10.4 17.8 21.2 20.4 14.3 ...
## $ perimeter_mean : num 122.8 132.9 130 77.6 135.1 ...
## $ area_mean : num 1001 1326 1203 386 1297 ...
## $ smoothness_mean : num 0.1184 0.0847 0.1096 0.1425 0.1003 ...
## $ compactness_mean : num 0.2776 0.0786 0.1599 0.2839 0.1328 ...
## $ concavity_mean : num 0.3001 0.0869 0.1974 0.2414 0.198 ...
## $ concave.points_mean : num 0.1471 0.0702 0.1279 0.1052 0.1043 ...
## $ symmetry_mean : num 0.242 0.181 0.207 0.26 0.181 ...
## $ fractal_dimension_mean : num 0.0787 0.0567 0.06 0.0974 0.0588 ...
## $ radius_se : num 1.095 0.543 0.746 0.496 0.757 ...
## $ texture_se : num 0.905 0.734 0.787 1.156 0.781 ...
## $ perimeter_se : num 8.59 3.4 4.58 3.44 5.44 ...
## $ area_se : num 153.4 74.1 94 27.2 94.4 ...
## $ smoothness_se : num 0.0064 0.00522 0.00615 0.00911 0.01149 ...
## $ compactness_se : num 0.049 0.0131 0.0401 0.0746 0.0246 ...
## $ concavity_se : num 0.0537 0.0186 0.0383 0.0566 0.0569 ...
## $ concave.points_se : num 0.0159 0.0134 0.0206 0.0187 0.0188 ...
## $ symmetry_se : num 0.03 0.0139 0.0225 0.0596 0.0176 ...
## $ fractal_dimension_se : num 0.00619 0.00353 0.00457 0.00921 0.00511 ...
## $ radius_worst : num 25.4 25 23.6 14.9 22.5 ...
## $ texture_worst : num 17.3 23.4 25.5 26.5 16.7 ...
## $ perimeter_worst : num 184.6 158.8 152.5 98.9 152.2 ...
## $ area_worst : num 2019 1956 1709 568 1575 ...
## $ smoothness_worst : num 0.162 0.124 0.144 0.21 0.137 ...
## $ compactness_worst : num 0.666 0.187 0.424 0.866 0.205 ...
## $ concavity_worst : num 0.712 0.242 0.45 0.687 0.4 ...
## $ concave.points_worst : num 0.265 0.186 0.243 0.258 0.163 ...
## $ symmetry_worst : num 0.46 0.275 0.361 0.664 0.236 ...
## $ fractal_dimension_worst: num 0.1189 0.089 0.0876 0.173 0.0768 ...
# Per-column frequency ratio and percentage of unique values, together with
# the zero-variance and near-zero-variance flags, for every column of `data`
nearZeroVar(
  x = data,
  saveMetrics = TRUE
)
## freqRatio percentUnique zeroVar nzv
## diagnosis 1.683962 0.3514938 FALSE FALSE
## radius_mean 1.333333 80.1405975 FALSE FALSE
## texture_mean 1.000000 84.1827768 FALSE FALSE
## perimeter_mean 1.000000 91.7398946 FALSE FALSE
## area_mean 1.500000 94.7275923 FALSE FALSE
## smoothness_mean 1.250000 83.3040422 FALSE FALSE
## compactness_mean 1.000000 94.3760984 FALSE FALSE
## concavity_mean 4.333333 94.3760984 FALSE FALSE
## concave.points_mean 4.333333 95.2548330 FALSE FALSE
## symmetry_mean 1.000000 75.9226714 FALSE FALSE
## fractal_dimension_mean 1.000000 87.6977153 FALSE FALSE
## radius_se 1.000000 94.9033392 FALSE FALSE
## texture_se 1.000000 91.2126538 FALSE FALSE
## perimeter_se 2.000000 93.6731107 FALSE FALSE
## area_se 1.000000 92.7943761 FALSE FALSE
## smoothness_se 1.000000 96.1335677 FALSE FALSE
## compactness_se 1.000000 95.0790861 FALSE FALSE
## concavity_se 6.500000 93.6731107 FALSE FALSE
## concave.points_se 4.333333 89.1036907 FALSE FALSE
## symmetry_se 1.333333 87.5219684 FALSE FALSE
## fractal_dimension_se 1.000000 95.7820738 FALSE FALSE
## radius_worst 1.250000 80.3163445 FALSE FALSE
## texture_worst 1.000000 89.8066784 FALSE FALSE
## perimeter_worst 1.000000 90.3339192 FALSE FALSE
## area_worst 1.000000 95.6063269 FALSE FALSE
## smoothness_worst 1.000000 72.2319859 FALSE FALSE
## compactness_worst 1.000000 92.9701230 FALSE FALSE
## concavity_worst 4.333333 94.7275923 FALSE FALSE
## concave.points_worst 4.333333 86.4674868 FALSE FALSE
## symmetry_worst 1.000000 87.8734622 FALSE FALSE
## fractal_dimension_worst 1.500000 94.0246046 FALSE FALSE
# Logical mask marking the numeric columns of `data`
feature_map <- vapply(data, is.numeric, logical(1))
# Look for linear dependencies among the numeric columns
findLinearCombos(data[, feature_map])
## $linearCombos
## list()
##
## $remove
## NULL
# Keep a numeric-only copy of the data (same mask as above)
num_cols <- feature_map
only_numeric <- data[, num_cols]
str(only_numeric)
## 'data.frame': 569 obs. of 30 variables:
## $ radius_mean : num 18 20.6 19.7 11.4 20.3 ...
## $ texture_mean : num 10.4 17.8 21.2 20.4 14.3 ...
## $ perimeter_mean : num 122.8 132.9 130 77.6 135.1 ...
## $ area_mean : num 1001 1326 1203 386 1297 ...
## $ smoothness_mean : num 0.1184 0.0847 0.1096 0.1425 0.1003 ...
## $ compactness_mean : num 0.2776 0.0786 0.1599 0.2839 0.1328 ...
## $ concavity_mean : num 0.3001 0.0869 0.1974 0.2414 0.198 ...
## $ concave.points_mean : num 0.1471 0.0702 0.1279 0.1052 0.1043 ...
## $ symmetry_mean : num 0.242 0.181 0.207 0.26 0.181 ...
## $ fractal_dimension_mean : num 0.0787 0.0567 0.06 0.0974 0.0588 ...
## $ radius_se : num 1.095 0.543 0.746 0.496 0.757 ...
## $ texture_se : num 0.905 0.734 0.787 1.156 0.781 ...
## $ perimeter_se : num 8.59 3.4 4.58 3.44 5.44 ...
## $ area_se : num 153.4 74.1 94 27.2 94.4 ...
## $ smoothness_se : num 0.0064 0.00522 0.00615 0.00911 0.01149 ...
## $ compactness_se : num 0.049 0.0131 0.0401 0.0746 0.0246 ...
## $ concavity_se : num 0.0537 0.0186 0.0383 0.0566 0.0569 ...
## $ concave.points_se : num 0.0159 0.0134 0.0206 0.0187 0.0188 ...
## $ symmetry_se : num 0.03 0.0139 0.0225 0.0596 0.0176 ...
## $ fractal_dimension_se : num 0.00619 0.00353 0.00457 0.00921 0.00511 ...
## $ radius_worst : num 25.4 25 23.6 14.9 22.5 ...
## $ texture_worst : num 17.3 23.4 25.5 26.5 16.7 ...
## $ perimeter_worst : num 184.6 158.8 152.5 98.9 152.2 ...
## $ area_worst : num 2019 1956 1709 568 1575 ...
## $ smoothness_worst : num 0.162 0.124 0.144 0.21 0.137 ...
## $ compactness_worst : num 0.666 0.187 0.424 0.866 0.205 ...
## $ concavity_worst : num 0.712 0.242 0.45 0.687 0.4 ...
## $ concave.points_worst : num 0.265 0.186 0.243 0.258 0.163 ...
## $ symmetry_worst : num 0.46 0.275 0.361 0.664 0.236 ...
## $ fractal_dimension_worst: num 0.1189 0.089 0.0876 0.173 0.0768 ...
# Names of the numeric columns that caret suggests excluding from the model
# because of high pairwise correlation (cutoff 0.75)
findCorrelation(
  x = cor(only_numeric),
  cutoff = 0.75,
  names = TRUE
)
## [1] "concavity_mean" "concave.points_mean" "compactness_mean"
## [4] "concave.points_worst" "concavity_worst" "perimeter_worst"
## [7] "radius_worst" "perimeter_mean" "compactness_worst"
## [10] "area_worst" "radius_mean" "perimeter_se"
## [13] "concave.points_se" "compactness_se" "area_se"
## [16] "smoothness_mean" "fractal_dimension_mean" "texture_mean"
# [1] "concavity_mean" "concave.points_mean"
# [3] "compactness_mean" "concave.points_worst"
# [5] "concavity_worst" "perimeter_worst"
# [7] "radius_worst" "perimeter_mean"
# [9] "compactness_worst" "area_worst"
#[11] "radius_mean" "perimeter_se"
#[13] "concave.points_se" "compactness_se"
#[15] "area_se" "smoothness_mean"
# [17] "fractal_dimension_mean" "texture_mean"
# There are many highly correlated variables; this is probably because
# the size of the tumor might be driving many of them
#############Quick EDA
library(skimr)
# Print skimr's summary of `data` (column types and per-column statistics)
skim(data)
| Name | data |
| Number of rows | 569 |
| Number of columns | 31 |
| _______________________ | |
| Column type frequency: | |
| factor | 1 |
| numeric | 30 |
| ________________________ | |
| Group variables | None |
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
|---|---|---|---|---|---|
| diagnosis | 0 | 1 | FALSE | 2 | B: 357, M: 212 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| radius_mean | 0 | 1 | 14.13 | 3.52 | 6.98 | 11.70 | 13.37 | 15.78 | 28.11 | ▂▇▃▁▁ |
| texture_mean | 0 | 1 | 19.29 | 4.30 | 9.71 | 16.17 | 18.84 | 21.80 | 39.28 | ▃▇▃▁▁ |
| perimeter_mean | 0 | 1 | 91.97 | 24.30 | 43.79 | 75.17 | 86.24 | 104.10 | 188.50 | ▃▇▃▁▁ |
| area_mean | 0 | 1 | 654.89 | 351.91 | 143.50 | 420.30 | 551.10 | 782.70 | 2501.00 | ▇▃▂▁▁ |
| smoothness_mean | 0 | 1 | 0.10 | 0.01 | 0.05 | 0.09 | 0.10 | 0.11 | 0.16 | ▁▇▇▁▁ |
| compactness_mean | 0 | 1 | 0.10 | 0.05 | 0.02 | 0.06 | 0.09 | 0.13 | 0.35 | ▇▇▂▁▁ |
| concavity_mean | 0 | 1 | 0.09 | 0.08 | 0.00 | 0.03 | 0.06 | 0.13 | 0.43 | ▇▃▂▁▁ |
| concave.points_mean | 0 | 1 | 0.05 | 0.04 | 0.00 | 0.02 | 0.03 | 0.07 | 0.20 | ▇▃▂▁▁ |
| symmetry_mean | 0 | 1 | 0.18 | 0.03 | 0.11 | 0.16 | 0.18 | 0.20 | 0.30 | ▁▇▅▁▁ |
| fractal_dimension_mean | 0 | 1 | 0.06 | 0.01 | 0.05 | 0.06 | 0.06 | 0.07 | 0.10 | ▆▇▂▁▁ |
| radius_se | 0 | 1 | 0.41 | 0.28 | 0.11 | 0.23 | 0.32 | 0.48 | 2.87 | ▇▁▁▁▁ |
| texture_se | 0 | 1 | 1.22 | 0.55 | 0.36 | 0.83 | 1.11 | 1.47 | 4.88 | ▇▅▁▁▁ |
| perimeter_se | 0 | 1 | 2.87 | 2.02 | 0.76 | 1.61 | 2.29 | 3.36 | 21.98 | ▇▁▁▁▁ |
| area_se | 0 | 1 | 40.34 | 45.49 | 6.80 | 17.85 | 24.53 | 45.19 | 542.20 | ▇▁▁▁▁ |
| smoothness_se | 0 | 1 | 0.01 | 0.00 | 0.00 | 0.01 | 0.01 | 0.01 | 0.03 | ▇▃▁▁▁ |
| compactness_se | 0 | 1 | 0.03 | 0.02 | 0.00 | 0.01 | 0.02 | 0.03 | 0.14 | ▇▃▁▁▁ |
| concavity_se | 0 | 1 | 0.03 | 0.03 | 0.00 | 0.02 | 0.03 | 0.04 | 0.40 | ▇▁▁▁▁ |
| concave.points_se | 0 | 1 | 0.01 | 0.01 | 0.00 | 0.01 | 0.01 | 0.01 | 0.05 | ▇▇▁▁▁ |
| symmetry_se | 0 | 1 | 0.02 | 0.01 | 0.01 | 0.02 | 0.02 | 0.02 | 0.08 | ▇▃▁▁▁ |
| fractal_dimension_se | 0 | 1 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.03 | ▇▁▁▁▁ |
| radius_worst | 0 | 1 | 16.27 | 4.83 | 7.93 | 13.01 | 14.97 | 18.79 | 36.04 | ▆▇▃▁▁ |
| texture_worst | 0 | 1 | 25.68 | 6.15 | 12.02 | 21.08 | 25.41 | 29.72 | 49.54 | ▃▇▆▁▁ |
| perimeter_worst | 0 | 1 | 107.26 | 33.60 | 50.41 | 84.11 | 97.66 | 125.40 | 251.20 | ▇▇▃▁▁ |
| area_worst | 0 | 1 | 880.58 | 569.36 | 185.20 | 515.30 | 686.50 | 1084.00 | 4254.00 | ▇▂▁▁▁ |
| smoothness_worst | 0 | 1 | 0.13 | 0.02 | 0.07 | 0.12 | 0.13 | 0.15 | 0.22 | ▂▇▇▂▁ |
| compactness_worst | 0 | 1 | 0.25 | 0.16 | 0.03 | 0.15 | 0.21 | 0.34 | 1.06 | ▇▅▁▁▁ |
| concavity_worst | 0 | 1 | 0.27 | 0.21 | 0.00 | 0.11 | 0.23 | 0.38 | 1.25 | ▇▅▂▁▁ |
| concave.points_worst | 0 | 1 | 0.11 | 0.07 | 0.00 | 0.06 | 0.10 | 0.16 | 0.29 | ▅▇▅▃▁ |
| symmetry_worst | 0 | 1 | 0.29 | 0.06 | 0.16 | 0.25 | 0.28 | 0.32 | 0.66 | ▅▇▁▁▁ |
| fractal_dimension_worst | 0 | 1 | 0.08 | 0.02 | 0.06 | 0.07 | 0.08 | 0.09 | 0.21 | ▇▃▁▁▁ |
# library(DataExplorer)
# create_report(data)
###################
library(GGally)
# Plot the correlation heatmap of the numeric columns. ggcorr() only handles
# numeric data, so the `diagnosis` factor is left out explicitly instead of
# letting ggcorr() discard it with a warning.
ggcorr(
  data[vapply(data, is.numeric, logical(1))],
  label = TRUE,
  palette = "RdBu",
  name = "Correlation",
  hjust = 0.75,
  label_size = 2,
  label_round = 2
)
### a lot of variables are highly correlated
library(factoextra)
# PCA on every column except the `diagnosis` label, centred and scaled to
# unit variance. The columns are selected by name rather than by the fixed
# range 2:31, and the scaling argument is spelled `scale.` as prcomp()
# defines it (`scale = TRUE` only worked through partial argument matching).
wdbc.pr <- prcomp(
  data[setdiff(names(data), "diagnosis")],
  center = TRUE,
  scale. = TRUE
)
summary(wdbc.pr)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 3.6444 2.3857 1.67867 1.40735 1.28403 1.09880 0.82172
## Proportion of Variance 0.4427 0.1897 0.09393 0.06602 0.05496 0.04025 0.02251
## Cumulative Proportion 0.4427 0.6324 0.72636 0.79239 0.84734 0.88759 0.91010
## PC8 PC9 PC10 PC11 PC12 PC13 PC14
## Standard deviation 0.69037 0.6457 0.59219 0.5421 0.51104 0.49128 0.39624
## Proportion of Variance 0.01589 0.0139 0.01169 0.0098 0.00871 0.00805 0.00523
## Cumulative Proportion 0.92598 0.9399 0.95157 0.9614 0.97007 0.97812 0.98335
## PC15 PC16 PC17 PC18 PC19 PC20 PC21
## Standard deviation 0.30681 0.28260 0.24372 0.22939 0.22244 0.17652 0.1731
## Proportion of Variance 0.00314 0.00266 0.00198 0.00175 0.00165 0.00104 0.0010
## Cumulative Proportion 0.98649 0.98915 0.99113 0.99288 0.99453 0.99557 0.9966
## PC22 PC23 PC24 PC25 PC26 PC27 PC28
## Standard deviation 0.16565 0.15602 0.1344 0.12442 0.09043 0.08307 0.03987
## Proportion of Variance 0.00091 0.00081 0.0006 0.00052 0.00027 0.00023 0.00005
## Cumulative Proportion 0.99749 0.99830 0.9989 0.99942 0.99969 0.99992 0.99997
## PC29 PC30
## Standard deviation 0.02736 0.01153
## Proportion of Variance 0.00002 0.00000
## Cumulative Proportion 1.00000 1.00000
##### This shows that with just 6 PCs I am able to capture around 89% of the variance in the data
##### Note: since an eigenvalue < 1 means that the component explains less variance than a single explanatory variable, we would like to discard those components.
###### If our data is well suited for PCA we should be able to discard these components while retaining at least 70–80% of cumulative variance.
### let us plot these
# Scree plot of the first 15 component variances with a reference line at
# eigenvalue = 1 (the title now matches the number of components drawn)
screeplot(wdbc.pr, type = "l", npcs = 15,
          main = "Screeplot of the first 15 PCs")
abline(h = 1, col = "red", lty = 5)
legend("topright", legend = "Eigenvalue = 1",
       col = "red", lty = 5, cex = 0.6)
# Cumulative share of the total variance explained by the leading components
cumpro <- cumsum(wdbc.pr$sdev^2 / sum(wdbc.pr$sdev^2))
plot(cumpro[seq_len(15)], xlab = "PC #", ylab = "Amount of explained variance",
     main = "Cumulative variance plot")
# Mark the sixth component and the cumulative variance it reaches; the height
# is read from `cumpro` instead of being typed in by hand
abline(v = 6, col = "blue", lty = 5)
abline(h = cumpro[6], col = "blue", lty = 5)
legend("topleft", legend = "Cut-off @ PC6",
       col = "blue", lty = 5, cex = 0.6)
###### We can see that the first 6 components explain around 89% of the variance in the data
###### We notice that the first 6 components each have an eigenvalue > 1 and together explain almost 90% of the variance, which is great! We can effectively reduce dimensionality from 30 to 6 while only “losing” about 11% of the variance!
# First two PCs
# Base-graphics scatter of the scores on the first two components
plot(
  x = wdbc.pr$x[, "PC1"],
  y = wdbc.pr$x[, "PC2"],
  main = "PC1 / PC2 - plot",
  xlab = "PC1 (44.3%)",
  ylab = "PC2 (19%)"
)
# The same PCA drawn with factoextra: points filled by diagnosis, with
# ellipses added per group
fviz_pca_ind(
  wdbc.pr,
  geom.ind = "point",
  pointshape = 21,
  pointsize = 2,
  col.ind = "black",
  fill.ind = data$diagnosis,
  palette = "jco",
  addEllipses = TRUE,
  legend.title = "Diagnosis",
  label = "var",
  col.var = "black",
  repel = TRUE
) +
  ggtitle("2D PCA-plot from 30 feature dataset") +
  theme(plot.title = element_text(hjust = 0.5))
#### we see the beauty of PCA. With just the first two components we can clearly see some separation between the benign and malignant tumors. This is a clear indication that the data is well-suited for some kind of classification model (like discriminant analysis).
library(MASS) # for LDA
# Randomly label each row 1 (training) or 2 (test) with probabilities
# 0.7 / 0.3; the fixed seed makes the split reproducible
set.seed(1234)
index <- sample(1:2, size = nrow(data), replace = TRUE, prob = c(0.7, 0.3))
train <- data[which(index == 1), ]
test <- data[which(index == 2), ]
# Fit the linear discriminant model on all remaining columns of `train`
wdbc_raw.lda <- lda(diagnosis ~ ., data = train)
print(wdbc_raw.lda) # print the LDA
## Call:
## lda(diagnosis ~ ., data = train)
##
## Prior probabilities of groups:
## B M
## 0.6479592 0.3520408
##
## Group means:
## radius_mean texture_mean perimeter_mean area_mean smoothness_mean
## B 12.21941 18.00465 78.53476 467.6327 0.09207587
## M 17.46493 21.35101 115.49391 980.3304 0.10353957
## compactness_mean concavity_mean concave.points_mean symmetry_mean
## B 0.07975531 0.04568185 0.02586663 0.1739382
## M 0.14983188 0.16425435 0.08978297 0.1942261
## fractal_dimension_mean radius_se texture_se perimeter_se area_se
## B 0.06252114 0.2818071 1.205663 1.965192 21.03904
## M 0.06317594 0.6129370 1.171064 4.386007 72.77007
## smoothness_se compactness_se concavity_se concave.points_se symmetry_se
## B 0.006989016 0.02131469 0.02560055 0.009800461 0.02035035
## M 0.006492993 0.03319930 0.04148522 0.014883986 0.02021338
## fractal_dimension_se radius_worst texture_worst perimeter_worst area_worst
## B 0.003552607 13.47448 23.63315 87.58291 566.0039
## M 0.004133464 21.32384 29.22348 142.75899 1444.6877
## smoothness_worst compactness_worst concavity_worst concave.points_worst
## B 0.1242542 0.1862876 0.1678460 0.07534906
## M 0.1467007 0.3972314 0.4695159 0.18687290
## symmetry_worst fractal_dimension_worst
## B 0.2704665 0.07929543
## M 0.3296362 0.09417819
##
## Coefficients of linear discriminants:
## LD1
## radius_mean -0.903870650
## texture_mean -0.018671996
## perimeter_mean 0.017448490
## area_mean 0.004358580
## smoothness_mean -4.147707916
## compactness_mean -22.356931825
## concavity_mean 9.292530916
## concave.points_mean 10.559487069
## symmetry_mean 6.292532389
## fractal_dimension_mean 15.660599159
## radius_se 0.711397863
## texture_se 0.121328535
## perimeter_se -0.057709792
## area_se 0.002390755
## smoothness_se -4.047998676
## compactness_se 0.776279239
## concavity_se -20.363396592
## concave.points_se 40.763205610
## symmetry_se 22.095113470
## fractal_dimension_se 2.076362150
## radius_worst 1.271649182
## texture_worst 0.057946697
## perimeter_worst 0.003132556
## area_worst -0.007329593
## smoothness_worst 9.547275942
## compactness_worst 0.736729982
## concavity_worst 1.446347774
## concave.points_worst 4.186167608
## symmetry_worst -0.744362907
## fractal_dimension_worst 11.705728639
# Plot the fitted LDA, passing the integer codes of the training labels as
# colours
plot(wdbc_raw.lda, col = as.integer(train$diagnosis))
# Predict the test rows and tabulate predictions against the true labels,
# treating malignant ("M") as the positive class
pred <- predict(wdbc_raw.lda, newdata = test)
caret::confusionMatrix(
  data = pred$class,
  reference = test$diagnosis,
  positive = "M"
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction B M
## B 103 12
## M 0 62
##
## Accuracy : 0.9322
## 95% CI : (0.8846, 0.9645)
## No Information Rate : 0.5819
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.8574
##
## Mcnemar's Test P-Value : 0.001496
##
## Sensitivity : 0.8378
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 0.8957
## Prevalence : 0.4181
## Detection Rate : 0.3503
## Detection Prevalence : 0.3503
## Balanced Accuracy : 0.9189
##
## 'Positive' Class : M
##
##### We are still classifying 12 individuals as Benign even though they have malignant cells
#########let us plot a ROC CUrve
# ROC curve with ROCR package
library(ROCR)
# predict.lda() has no `type` argument (the old `type = "response"` was
# silently ignored), so it is no longer passed
pred <- predict(wdbc_raw.lda, newdata = test)
# Score each test row by its posterior probability of class "M"; the column
# is selected by name rather than by position
pref <- prediction(pred$posterior[, "M"], test$diagnosis)
# Calculating AUC
auc <- performance(pref, "auc")
# Now converting S4 class to a vector
auc <- unlist(slot(auc, "y.values"))
# ROC curve: true-positive rate against false-positive rate
eval <- performance(pref, "tpr", "fpr")
plot(eval, col = "black", main = "ROC Curve")
abline(0, 1) # diagonal reference line
# `lty` has no meaning for text(), so it is dropped
text(x = 0.25, y = 0.65, labels = paste0("AUC = ", round(auc[[1]], 3)))
### There is still scope of improvement
####PCA with LDA
#### Let us create a data frame with our PCAs
# Build a data frame with the first six principal-component scores plus the
# target, coded 1 for malignant ("M") and 0 otherwise. The comparison with
# "M" works whether `diagnosis` is a factor or a character vector.
# NOTE(review): `wdbc.pr` was fitted on all rows before the train/test split,
# so the test scores are not independent of the training rows; consider
# fitting the PCA on the training rows only.
wdbc.pcst <- wdbc.pr$x[, 1:6]
wdbc.pcst <- cbind(wdbc.pcst, diagnosis = as.integer(data$diagnosis == "M"))
data <- as.data.frame(wdbc.pcst)
# Split the data into training and test partitions (seeded for
# reproducibility)
set.seed(1234)
index <- sample(1:2, nrow(data), replace = TRUE, prob = c(0.7, 0.3))
train <- data[index == 1, ]
test <- data[index == 2, ]
# Model
wdbc.lda <- lda(diagnosis ~ PC1 + PC2 + PC3 + PC4 + PC5 + PC6, data = train)
# Plot the LDA; the 0/1 codes are shifted to 1/2 because colour 0 is the
# background colour in base graphics
plot(wdbc.lda, col = as.integer(train$diagnosis) + 1L)
# Prediction and confusion matrix. Malignant ("1") is set as the positive
# class so that sensitivity and specificity refer to malignant cases, as in
# the raw-feature model; without `positive`, caret would use the first
# level ("0", benign).
pred <- predict(wdbc.lda, newdata = test)
caret::confusionMatrix(pred$class, factor(test$diagnosis), positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 103 12
## 1 0 62
##
## Accuracy : 0.9322
## 95% CI : (0.8846, 0.9645)
## No Information Rate : 0.5819
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.8574
##
## Mcnemar's Test P-Value : 0.001496
##
## Sensitivity : 1.0000
## Specificity : 0.8378
## Pos Pred Value : 0.8957
## Neg Pred Value : 1.0000
## Prevalence : 0.5819
## Detection Rate : 0.5819
## Detection Prevalence : 0.6497
## Balanced Accuracy : 0.9189
##
## 'Positive' Class : 0
##
# ROC Curve with ROCR package
library(ROCR)
# predict.lda() has no `type` argument (the old `type = "response"` was
# silently ignored), so it is no longer passed
pred <- predict(wdbc.lda, newdata = test)
# Score each test row by its posterior probability of class "1"; the column
# is selected by name rather than by position
pref <- prediction(pred$posterior[, "1"], test$diagnosis)
# Calculating AUC
auc1 <- performance(pref, "auc")
# Now converting S4 class to a vector
auc1 <- unlist(slot(auc1, "y.values"))
# ROC curve: true-positive rate against false-positive rate
eval <- performance(pref, "tpr", "fpr")
plot(eval, col = "black", main = "ROC Curve")
abline(0, 1) # diagonal reference line
# `lty` has no meaning for text(), so it is dropped
text(x = 0.25, y = 0.65, labels = paste0("AUC = ", round(auc1[[1]], 3)))
### This has significantly enhanced our AUC. Good!