Breast Cancer detection

Introduction

Features are computed from a digitized image of a fine needle aspirate (FNA) of a breast mass. They describe characteristics of the cell nuclei present in the image.

Importing and Cleaning the data

Before importing, let’s first load the required libraries.

#url <- "http://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data"
#download.file(url, 'wdbc.csv')

# use read_csv to the read into a dataframe
# columnNames are missing in the above link, so we need to give them manually.
columnNames <- c("id","diagnosis","radius_mean","texture_mean","perimeter_mean",
                 "area_mean","smoothness_mean","compactness_mean","concavity_mean",
                 "concave_points_mean","symmetry_mean","fractal_dimension_mean",
                 "radius_se","texture_se","perimeter_se","area_se","smoothness_se",
                 "compactness_se","concavity_se","concave_points_se","symmetry_se",
                 "fractal_dimension_se","radius_worst","texture_worst","perimeter_worst",
                 "area_worst","smoothness_worst","compactness_worst","concavity_worst",
                 "concave_points_worst","symmetry_worst","fractal_dimension_worst")
#wdbc <- read_csv(url, col_names = columnNames, col_types = NULL)
wdbc <- read.csv('wdbc.csv', header = FALSE, col.names = columnNames)

Lets have alook at a peak

library(dplyr) 
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
glimpse(wdbc)
## Rows: 569
## Columns: 32
## $ id                      <int> 842302, 842517, 84300903, 84348301, 84358402,…
## $ diagnosis               <chr> "M", "M", "M", "M", "M", "M", "M", "M", "M", …
## $ radius_mean             <dbl> 17.990, 20.570, 19.690, 11.420, 20.290, 12.45…
## $ texture_mean            <dbl> 10.38, 17.77, 21.25, 20.38, 14.34, 15.70, 19.…
## $ perimeter_mean          <dbl> 122.80, 132.90, 130.00, 77.58, 135.10, 82.57,…
## $ area_mean               <dbl> 1001.0, 1326.0, 1203.0, 386.1, 1297.0, 477.1,…
## $ smoothness_mean         <dbl> 0.11840, 0.08474, 0.10960, 0.14250, 0.10030, …
## $ compactness_mean        <dbl> 0.27760, 0.07864, 0.15990, 0.28390, 0.13280, …
## $ concavity_mean          <dbl> 0.30010, 0.08690, 0.19740, 0.24140, 0.19800, …
## $ concave_points_mean     <dbl> 0.14710, 0.07017, 0.12790, 0.10520, 0.10430, …
## $ symmetry_mean           <dbl> 0.2419, 0.1812, 0.2069, 0.2597, 0.1809, 0.208…
## $ fractal_dimension_mean  <dbl> 0.07871, 0.05667, 0.05999, 0.09744, 0.05883, …
## $ radius_se               <dbl> 1.0950, 0.5435, 0.7456, 0.4956, 0.7572, 0.334…
## $ texture_se              <dbl> 0.9053, 0.7339, 0.7869, 1.1560, 0.7813, 0.890…
## $ perimeter_se            <dbl> 8.589, 3.398, 4.585, 3.445, 5.438, 2.217, 3.1…
## $ area_se                 <dbl> 153.40, 74.08, 94.03, 27.23, 94.44, 27.19, 53…
## $ smoothness_se           <dbl> 0.006399, 0.005225, 0.006150, 0.009110, 0.011…
## $ compactness_se          <dbl> 0.049040, 0.013080, 0.040060, 0.074580, 0.024…
## $ concavity_se            <dbl> 0.05373, 0.01860, 0.03832, 0.05661, 0.05688, …
## $ concave_points_se       <dbl> 0.015870, 0.013400, 0.020580, 0.018670, 0.018…
## $ symmetry_se             <dbl> 0.03003, 0.01389, 0.02250, 0.05963, 0.01756, …
## $ fractal_dimension_se    <dbl> 0.006193, 0.003532, 0.004571, 0.009208, 0.005…
## $ radius_worst            <dbl> 25.38, 24.99, 23.57, 14.91, 22.54, 15.47, 22.…
## $ texture_worst           <dbl> 17.33, 23.41, 25.53, 26.50, 16.67, 23.75, 27.…
## $ perimeter_worst         <dbl> 184.60, 158.80, 152.50, 98.87, 152.20, 103.40…
## $ area_worst              <dbl> 2019.0, 1956.0, 1709.0, 567.7, 1575.0, 741.6,…
## $ smoothness_worst        <dbl> 0.1622, 0.1238, 0.1444, 0.2098, 0.1374, 0.179…
## $ compactness_worst       <dbl> 0.6656, 0.1866, 0.4245, 0.8663, 0.2050, 0.524…
## $ concavity_worst         <dbl> 0.71190, 0.24160, 0.45040, 0.68690, 0.40000, …
## $ concave_points_worst    <dbl> 0.26540, 0.18600, 0.24300, 0.25750, 0.16250, …
## $ symmetry_worst          <dbl> 0.4601, 0.2750, 0.3613, 0.6638, 0.2364, 0.398…
## $ fractal_dimension_worst <dbl> 0.11890, 0.08902, 0.08758, 0.17300, 0.07678, …

Exploratory Data Analysis

Our response variable is diagnosis: Benign (B) or Malignant (M). We have 3 sets of 10 numeric variables: mean, se, worst Let’s first collect all the 30 numeric variables into a matrix

# Convert the features of the data: wdbc.data
wdbc.data <- as.matrix(wdbc[,c(3:32)])

# Set the row names of wdbc.data
row.names(wdbc.data) <- wdbc$id

# Create diagnosis vector
diagnosis <- as.numeric(wdbc$diagnosis == "M")

Here are some answers to basic questions like below:

How many observationsare in this dataset ?

number of rows

nrow(wdbc.data)
## [1] 569

How many variables/features in the data are suffixed with _mean, _se, _worst?

sum(endsWith(colnames(wdbc.data), "_mean"))
## [1] 10
sum(endsWith(colnames(wdbc.data), "_se"))
## [1] 10
sum(endsWith(colnames(wdbc.data), "_worst"))
## [1] 10

How many observations have benign or malignant diagnosis ?

table(wdbc$diagnosis)
## 
##   B   M 
## 357 212

What is the mean of each of the numeric columns ?

round(colMeans(wdbc.data),2)
##             radius_mean            texture_mean          perimeter_mean 
##                   14.13                   19.29                   91.97 
##               area_mean         smoothness_mean        compactness_mean 
##                  654.89                    0.10                    0.10 
##          concavity_mean     concave_points_mean           symmetry_mean 
##                    0.09                    0.05                    0.18 
##  fractal_dimension_mean               radius_se              texture_se 
##                    0.06                    0.41                    1.22 
##            perimeter_se                 area_se           smoothness_se 
##                    2.87                   40.34                    0.01 
##          compactness_se            concavity_se       concave_points_se 
##                    0.03                    0.03                    0.01 
##             symmetry_se    fractal_dimension_se            radius_worst 
##                    0.02                    0.00                   16.27 
##           texture_worst         perimeter_worst              area_worst 
##                   25.68                  107.26                  880.58 
##        smoothness_worst       compactness_worst         concavity_worst 
##                    0.13                    0.25                    0.27 
##    concave_points_worst          symmetry_worst fractal_dimension_worst 
##                    0.11                    0.29                    0.08

What is the sd of each of the numeric columns ?

roundSD <- function(x){
  round(sd(x),2)
}
apply(wdbc.data, 2, roundSD)
##             radius_mean            texture_mean          perimeter_mean 
##                    3.52                    4.30                   24.30 
##               area_mean         smoothness_mean        compactness_mean 
##                  351.91                    0.01                    0.05 
##          concavity_mean     concave_points_mean           symmetry_mean 
##                    0.08                    0.04                    0.03 
##  fractal_dimension_mean               radius_se              texture_se 
##                    0.01                    0.28                    0.55 
##            perimeter_se                 area_se           smoothness_se 
##                    2.02                   45.49                    0.00 
##          compactness_se            concavity_se       concave_points_se 
##                    0.02                    0.03                    0.01 
##             symmetry_se    fractal_dimension_se            radius_worst 
##                    0.01                    0.00                    4.83 
##           texture_worst         perimeter_worst              area_worst 
##                    6.15                   33.60                  569.36 
##        smoothness_worst       compactness_worst         concavity_worst 
##                    0.02                    0.16                    0.21 
##    concave_points_worst          symmetry_worst fractal_dimension_worst 
##                    0.07                    0.06                    0.02

PCA

Why PCA? Due to the number of variables in the model, we can try using a dimensionality reduction technique to unveil any patterns in the data. As mentioned in the Exploratory Data Analysis section, there are thirty variables that when combined can be used to model each patient’s diagnosis. Using PCA we can combine our many variables into different linear combinations that each explain a part of the variance of the model. By proceeding with PCA we are assuming the linearity of the combinations of our variables within the dataset. By choosing only the linear combinations that provide a majority (>= 85%) of the co-variance, we can reduce the complexity of our model. We can then more easily see how the model works and provide meaningful graphs and representations of our complex dataset.

The first step in doing a PCA, is to ask ourselves whether or not the data should be scaled to unit variance. That is, to bring all the numeric variables to the same scale. In other words, we are trying to determine whether we should use a correlation matrix or a covariance matrix in our calculations of eigen values and eigen vectors (aka principal components).

Running a PCA using covariance matrix:

When the covariance matrix is used to calculate the eigen values and eigen vectors, we use the princomp() function. Let’s take a look at the summary of the princomp output.

wdbc.pcov <- princomp(wdbc.data, scores = TRUE)
summary(wdbc.pcov)
## Importance of components:
##                             Comp.1      Comp.2       Comp.3      Comp.4
## Standard deviation     665.5844581 85.42395908 26.506542133 7.385979582
## Proportion of Variance   0.9820447  0.01617649  0.001557511 0.000120932
## Cumulative Proportion    0.9820447  0.99822116  0.999778672 0.999899604
##                              Comp.5       Comp.6       Comp.7       Comp.8
## Standard deviation     6.310302e+00 1.731851e+00 1.346157e+00 6.089449e-01
## Proportion of Variance 8.827245e-05 6.648840e-06 4.017137e-06 8.220172e-07
## Cumulative Proportion  9.999879e-01 9.999945e-01 9.999985e-01 9.999994e-01
##                              Comp.9      Comp.10      Comp.11      Comp.12
## Standard deviation     3.940054e-01 2.896782e-01 1.776328e-01 8.651121e-02
## Proportion of Variance 3.441353e-07 1.860187e-07 6.994732e-08 1.659089e-08
## Cumulative Proportion  9.999997e-01 9.999999e-01 1.000000e+00 1.000000e+00
##                             Comp.13      Comp.14      Comp.15      Comp.16
## Standard deviation     5.617918e-02 4.645111e-02 3.638966e-02 2.528130e-02
## Proportion of Variance 6.996416e-09 4.783183e-09 2.935492e-09 1.416849e-09
## Cumulative Proportion  1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00
##                             Comp.17      Comp.18      Comp.19      Comp.20
## Standard deviation     1.934488e-02 1.532176e-02 1.357421e-02 1.280201e-02
## Proportion of Variance 8.295777e-10 5.204059e-10 4.084640e-10 3.633134e-10
## Cumulative Proportion  1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00
##                             Comp.21      Comp.22     Comp.23      Comp.24
## Standard deviation     8.830228e-03 7.583529e-03 5.90389e-03 5.324037e-03
## Proportion of Variance 1.728497e-10 1.274875e-10 7.72683e-11 6.283577e-11
## Cumulative Proportion  1.000000e+00 1.000000e+00 1.00000e+00 1.000000e+00
##                             Comp.25      Comp.26      Comp.27      Comp.28
## Standard deviation     4.014722e-03 3.531047e-03 1.916772e-03 1.686090e-03
## Proportion of Variance 3.573023e-11 2.763960e-11 8.144523e-12 6.302115e-12
## Cumulative Proportion  1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00
##                             Comp.29      Comp.30
## Standard deviation     1.414706e-03 8.371162e-04
## Proportion of Variance 4.436669e-12 1.553447e-12
## Cumulative Proportion  1.000000e+00 1.000000e+00

Bi-plot using covariance matrix: Looking at the descriptive statistics of “area_mean” and “area_worst”, we can observe that they have unusually large values for both mean and standard deviation. The units of measurements for these variables are different than the units of measurements of the other numeric variables. The effect of using variables with different scales can lead to amplified variances. This can be visually assessed by looking at the bi-plot of PC1 vs PC2, calculated from using non-scaled data (vs) scaled data. Below output shows non-scaled data since we are using a covariance matrix.

cex.before <- par("cex")
par(cex = 0.7)
biplot(wdbc.pcov)
## Warning in arrows(0, 0, y[, 1L] * 0.8, y[, 2L] * 0.8, col = col[2L], length =
## arrow.len): zero-length arrow is of indeterminate angle and so skipped

## Warning in arrows(0, 0, y[, 1L] * 0.8, y[, 2L] * 0.8, col = col[2L], length =
## arrow.len): zero-length arrow is of indeterminate angle and so skipped

## Warning in arrows(0, 0, y[, 1L] * 0.8, y[, 2L] * 0.8, col = col[2L], length =
## arrow.len): zero-length arrow is of indeterminate angle and so skipped

## Warning in arrows(0, 0, y[, 1L] * 0.8, y[, 2L] * 0.8, col = col[2L], length =
## arrow.len): zero-length arrow is of indeterminate angle and so skipped

## Warning in arrows(0, 0, y[, 1L] * 0.8, y[, 2L] * 0.8, col = col[2L], length =
## arrow.len): zero-length arrow is of indeterminate angle and so skipped

## Warning in arrows(0, 0, y[, 1L] * 0.8, y[, 2L] * 0.8, col = col[2L], length =
## arrow.len): zero-length arrow is of indeterminate angle and so skipped

## Warning in arrows(0, 0, y[, 1L] * 0.8, y[, 2L] * 0.8, col = col[2L], length =
## arrow.len): zero-length arrow is of indeterminate angle and so skipped

## Warning in arrows(0, 0, y[, 1L] * 0.8, y[, 2L] * 0.8, col = col[2L], length =
## arrow.len): zero-length arrow is of indeterminate angle and so skipped

## Warning in arrows(0, 0, y[, 1L] * 0.8, y[, 2L] * 0.8, col = col[2L], length =
## arrow.len): zero-length arrow is of indeterminate angle and so skipped

## Warning in arrows(0, 0, y[, 1L] * 0.8, y[, 2L] * 0.8, col = col[2L], length =
## arrow.len): zero-length arrow is of indeterminate angle and so skipped

## Warning in arrows(0, 0, y[, 1L] * 0.8, y[, 2L] * 0.8, col = col[2L], length =
## arrow.len): zero-length arrow is of indeterminate angle and so skipped

## Warning in arrows(0, 0, y[, 1L] * 0.8, y[, 2L] * 0.8, col = col[2L], length =
## arrow.len): zero-length arrow is of indeterminate angle and so skipped

## Warning in arrows(0, 0, y[, 1L] * 0.8, y[, 2L] * 0.8, col = col[2L], length =
## arrow.len): zero-length arrow is of indeterminate angle and so skipped

## Warning in arrows(0, 0, y[, 1L] * 0.8, y[, 2L] * 0.8, col = col[2L], length =
## arrow.len): zero-length arrow is of indeterminate angle and so skipped

## Warning in arrows(0, 0, y[, 1L] * 0.8, y[, 2L] * 0.8, col = col[2L], length =
## arrow.len): zero-length arrow is of indeterminate angle and so skipped

## Warning in arrows(0, 0, y[, 1L] * 0.8, y[, 2L] * 0.8, col = col[2L], length =
## arrow.len): zero-length arrow is of indeterminate angle and so skipped

## Warning in arrows(0, 0, y[, 1L] * 0.8, y[, 2L] * 0.8, col = col[2L], length =
## arrow.len): zero-length arrow is of indeterminate angle and so skipped

## Warning in arrows(0, 0, y[, 1L] * 0.8, y[, 2L] * 0.8, col = col[2L], length =
## arrow.len): zero-length arrow is of indeterminate angle and so skipped

## Warning in arrows(0, 0, y[, 1L] * 0.8, y[, 2L] * 0.8, col = col[2L], length =
## arrow.len): zero-length arrow is of indeterminate angle and so skipped

## Warning in arrows(0, 0, y[, 1L] * 0.8, y[, 2L] * 0.8, col = col[2L], length =
## arrow.len): zero-length arrow is of indeterminate angle and so skipped

par(cex = cex.before)

Scree plots can be useful in deciding how many PC’s we should keep in the model. Let’s create the scree-plots in R. As there is no R function to create a scree-plot, we need to prepare the data for the plot.

# Set up 1 x 2 plotting grid
par(mfrow = c(1, 2))

# Calculate variability of each component
pr.cvar <- wdbc.pcov$sdev ^ 2

# Variance explained by each principal component: pve
pve_cov <- pr.cvar/sum(pr.cvar)

lets check the values before visualizing the scree-plot:

# Eigen values
round(pr.cvar, 2)
##    Comp.1    Comp.2    Comp.3    Comp.4    Comp.5    Comp.6    Comp.7    Comp.8 
## 443002.67   7297.25    702.60     54.55     39.82      3.00      1.81      0.37 
##    Comp.9   Comp.10   Comp.11   Comp.12   Comp.13   Comp.14   Comp.15   Comp.16 
##      0.16      0.08      0.03      0.01      0.00      0.00      0.00      0.00 
##   Comp.17   Comp.18   Comp.19   Comp.20   Comp.21   Comp.22   Comp.23   Comp.24 
##      0.00      0.00      0.00      0.00      0.00      0.00      0.00      0.00 
##   Comp.25   Comp.26   Comp.27   Comp.28   Comp.29   Comp.30 
##      0.00      0.00      0.00      0.00      0.00      0.00
# Percent variance explained
round(pve_cov, 2)
##  Comp.1  Comp.2  Comp.3  Comp.4  Comp.5  Comp.6  Comp.7  Comp.8  Comp.9 Comp.10 
##    0.98    0.02    0.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00 
## Comp.11 Comp.12 Comp.13 Comp.14 Comp.15 Comp.16 Comp.17 Comp.18 Comp.19 Comp.20 
##    0.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00 
## Comp.21 Comp.22 Comp.23 Comp.24 Comp.25 Comp.26 Comp.27 Comp.28 Comp.29 Comp.30 
##    0.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00
# Cummulative percent explained
round(cumsum(pve_cov), 2)
##  Comp.1  Comp.2  Comp.3  Comp.4  Comp.5  Comp.6  Comp.7  Comp.8  Comp.9 Comp.10 
##    0.98    1.00    1.00    1.00    1.00    1.00    1.00    1.00    1.00    1.00 
## Comp.11 Comp.12 Comp.13 Comp.14 Comp.15 Comp.16 Comp.17 Comp.18 Comp.19 Comp.20 
##    1.00    1.00    1.00    1.00    1.00    1.00    1.00    1.00    1.00    1.00 
## Comp.21 Comp.22 Comp.23 Comp.24 Comp.25 Comp.26 Comp.27 Comp.28 Comp.29 Comp.30 
##    1.00    1.00    1.00    1.00    1.00    1.00    1.00    1.00    1.00    1.00

Now will reate a plot of variance explained for each principal component.

Scree plot using covariance matrix:

# Plot variance explained for each principal component
plot(pve_cov, xlab = "Principal Component",
     ylab = "Proportion of Variance Explained",
     ylim = c(0, 1), type = "b")

# Plot cumulative proportion of variance explained
plot(cumsum(pve_cov), xlab = "Principal Component",
     ylab = "Cumulative Proportion of Variance Explained",
     ylim = c(0, 1), type = "b")

Lets run PCA using correlation matrix:

wdbc.pr <- prcomp(wdbc.data, scale = TRUE, center = TRUE)
summary(wdbc.pr)
## Importance of components:
##                           PC1    PC2     PC3     PC4     PC5     PC6     PC7
## Standard deviation     3.6444 2.3857 1.67867 1.40735 1.28403 1.09880 0.82172
## Proportion of Variance 0.4427 0.1897 0.09393 0.06602 0.05496 0.04025 0.02251
## Cumulative Proportion  0.4427 0.6324 0.72636 0.79239 0.84734 0.88759 0.91010
##                            PC8    PC9    PC10   PC11    PC12    PC13    PC14
## Standard deviation     0.69037 0.6457 0.59219 0.5421 0.51104 0.49128 0.39624
## Proportion of Variance 0.01589 0.0139 0.01169 0.0098 0.00871 0.00805 0.00523
## Cumulative Proportion  0.92598 0.9399 0.95157 0.9614 0.97007 0.97812 0.98335
##                           PC15    PC16    PC17    PC18    PC19    PC20   PC21
## Standard deviation     0.30681 0.28260 0.24372 0.22939 0.22244 0.17652 0.1731
## Proportion of Variance 0.00314 0.00266 0.00198 0.00175 0.00165 0.00104 0.0010
## Cumulative Proportion  0.98649 0.98915 0.99113 0.99288 0.99453 0.99557 0.9966
##                           PC22    PC23   PC24    PC25    PC26    PC27    PC28
## Standard deviation     0.16565 0.15602 0.1344 0.12442 0.09043 0.08307 0.03987
## Proportion of Variance 0.00091 0.00081 0.0006 0.00052 0.00027 0.00023 0.00005
## Cumulative Proportion  0.99749 0.99830 0.9989 0.99942 0.99969 0.99992 0.99997
##                           PC29    PC30
## Standard deviation     0.02736 0.01153
## Proportion of Variance 0.00002 0.00000
## Cumulative Proportion  1.00000 1.00000

84.73% of the variation is explained by the first five PC’s.

Get the eigen values of correlation matrix:

# Eigen values using covariance matrix

round(wdbc.pr$sdev ^2,4)
##  [1] 13.2816  5.6914  2.8179  1.9806  1.6487  1.2074  0.6752  0.4766  0.4169
## [10]  0.3507  0.2939  0.2612  0.2414  0.1570  0.0941  0.0799  0.0594  0.0526
## [19]  0.0495  0.0312  0.0300  0.0274  0.0243  0.0181  0.0155  0.0082  0.0069
## [28]  0.0016  0.0007  0.0001

Bi-Plot

Let’s create a bi-plot to visualize this :

cex.before <- par("cex")
par(cex = 0.7)
biplot(wdbc.pr)

par(cex = cex.before)

From the above bi-plot of PC1 vs PC2, we can see that all these variables are trending in the same direction and most of them are highly correlated (More on this .. we can visualize this in a corrplot)

Create a scatter plot of observations by components 1 and 2

# Scatter plot observations by components 1 and 2
plot(wdbc.pr$x[, c(1, 2)], col = (diagnosis + 1),
     xlab = "PC1", ylab = "PC2")
legend(x="topleft", pch=1, col = c("red", "black"), legend = c("B", "M"))

There is a clear seperation of diagnosis (M or B) that is evident in the PC1 vs PC2 plot.

Let’s also take PC1 vs PC3 plot:

# Repeat for components 1 and 3
plot(wdbc.pr$x[, c(1,3)], col = (diagnosis + 1),
     xlab = "PC1", ylab = "PC3")
legend(x="topleft", pch=1, col = c("red", "black"), legend = c("B", "M"))

You can see that the first plot has a cleaner cut separating the two subgroups,because principal component 2 explains more variance in the original data than principal component 3.

# Repeat for components 1 and 3
plot(wdbc.pr$x[, c(1,4)], col = (diagnosis + 1),
     xlab = "PC1", ylab = "PC4")
legend(x="topleft", pch=1, col = c("red", "black"), legend = c("B", "M"))

# Repeat for components 1 and 3
plot(wdbc.pr$x[, c(1,5)], col = (diagnosis + 1),
     xlab = "PC1", ylab = "PC5")
legend(x="topleft", pch=1, col = c("red", "black"), legend = c("B", "M"))

# Repeat for components 1 and 6
plot(wdbc.pr$x[, c(1,6)], col = (diagnosis + 1),
     xlab = "PC1", ylab = "PC6")
legend(x="topleft", pch=1, col = c("red", "black"), legend = c("B", "M"))

Scree plots

Scree plots can be useful in deciding how many PC’s we should keep in the model. Let’s create the scree-plots in R. As there is no R function to create a scree-plot, we need to prepare the data for the plot.

# Set up 1 x 2 plotting grid
par(mfrow = c(1, 2))

# Calculate variability of each component
pr.var <- wdbc.pr$sdev ^ 2

# Assign names to the columns to be consistent with princomp.
# This is done for reporting purposes.
names(pr.var) <- names(pr.cvar)

# Variance explained by each principal component: pve
pve <- pr.var/sum(pr.var)

# Assign names to the columns as it is not done by default.
# This is done to be consistent with princomp.
names(pve) <- names(pve_cov)

Let’s see the values, before creating the plot.

# Eigen values
round(pr.var, 2)
##  Comp.1  Comp.2  Comp.3  Comp.4  Comp.5  Comp.6  Comp.7  Comp.8  Comp.9 Comp.10 
##   13.28    5.69    2.82    1.98    1.65    1.21    0.68    0.48    0.42    0.35 
## Comp.11 Comp.12 Comp.13 Comp.14 Comp.15 Comp.16 Comp.17 Comp.18 Comp.19 Comp.20 
##    0.29    0.26    0.24    0.16    0.09    0.08    0.06    0.05    0.05    0.03 
## Comp.21 Comp.22 Comp.23 Comp.24 Comp.25 Comp.26 Comp.27 Comp.28 Comp.29 Comp.30 
##    0.03    0.03    0.02    0.02    0.02    0.01    0.01    0.00    0.00    0.00
# Percent variance explained
round(pve, 2)
##  Comp.1  Comp.2  Comp.3  Comp.4  Comp.5  Comp.6  Comp.7  Comp.8  Comp.9 Comp.10 
##    0.44    0.19    0.09    0.07    0.05    0.04    0.02    0.02    0.01    0.01 
## Comp.11 Comp.12 Comp.13 Comp.14 Comp.15 Comp.16 Comp.17 Comp.18 Comp.19 Comp.20 
##    0.01    0.01    0.01    0.01    0.00    0.00    0.00    0.00    0.00    0.00 
## Comp.21 Comp.22 Comp.23 Comp.24 Comp.25 Comp.26 Comp.27 Comp.28 Comp.29 Comp.30 
##    0.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00
# Cummulative percent explained
round(cumsum(pve), 2)
##  Comp.1  Comp.2  Comp.3  Comp.4  Comp.5  Comp.6  Comp.7  Comp.8  Comp.9 Comp.10 
##    0.44    0.63    0.73    0.79    0.85    0.89    0.91    0.93    0.94    0.95 
## Comp.11 Comp.12 Comp.13 Comp.14 Comp.15 Comp.16 Comp.17 Comp.18 Comp.19 Comp.20 
##    0.96    0.97    0.98    0.98    0.99    0.99    0.99    0.99    0.99    1.00 
## Comp.21 Comp.22 Comp.23 Comp.24 Comp.25 Comp.26 Comp.27 Comp.28 Comp.29 Comp.30 
##    1.00    1.00    1.00    1.00    1.00    1.00    1.00    1.00    1.00    1.00

Create a plot of variance explained for each principal component.

# Plot variance explained for each principal component
plot(pve, xlab = "Principal Component",
     ylab = "Proportion of Variance Explained",
     ylim = c(0, 1), type = "b")

# Plot cumulative proportion of variance explained
plot(cumsum(pve), xlab = "Principal Component",
     ylab = "Cumulative Proportion of Variance Explained",
     ylim = c(0, 1), type = "b")

As we can see, scree-plots suggest that 80% of the variation in the numeric data is captured in the first 5 PCs.

LDA

As found in the PCA analysis, we can keep 5 PCs in the model. Our next task is to use the first 5 PCs to build a Linear discriminant function using the lda() function in R.

From the wdbc.pr object, we need to extract the first five PC’s. To do this, let’s first check the variables available for this object.

ls(wdbc.pr)
## [1] "center"   "rotation" "scale"    "sdev"     "x"

Here we are interested in the rotation (also called loadings) of the first five principal components multiplied by the scaled data, which are called scores (basically PC transformed data)

wdbc.pcs <- wdbc.pr$x[,1:6]
head(wdbc.pcs, 20)
##                 PC1          PC2        PC3         PC4         PC5         PC6
## 842302   -9.1847552  -1.94687003 -1.1221788  3.63053641  1.19405948  1.41018364
## 842517   -2.3857026   3.76485906 -0.5288274  1.11728077 -0.62122836  0.02863116
## 84300903 -5.7288555   1.07422859 -0.5512625  0.91128084  0.17693022  0.54097615
## 84348301 -7.1166913 -10.26655564 -3.2299475  0.15241292  2.95827543  3.05073750
## 84358402 -3.9318425   1.94635898  1.3885450  2.93805417 -0.54626674 -1.22541641
## 843786   -2.3781546  -3.94645643 -2.9322967  0.94020959  1.05511354 -0.45064213
## 844359   -2.2369151   2.68766641 -1.6384712  0.14920860 -0.04032404 -0.12883507
## 84458202 -2.1414143  -2.33818665 -0.8711807 -0.12693117  1.42618178 -1.25593410
## 844981   -3.1721332  -3.38883114 -3.1172431 -0.60076844  1.52095211  0.55905282
## 84501001 -6.3461628  -7.72038095 -4.3380987 -3.37223437 -1.70875961 -0.72327234
## 845636    0.8097013   2.65693767 -0.4884001 -1.67109618 -0.27556910  0.12721990
## 84610002 -2.6487698  -0.06650941 -1.5251134  0.05121650 -0.33165929  0.76419423
## 846226   -8.1778388  -2.69860201  5.7251932 -1.11127875 -1.04255311  2.59229030
## 846381   -0.3418251   0.96742803  1.7156620 -0.59447987 -0.46759907  1.00677426
## 84667401 -4.3385617  -4.85680983 -2.8136398 -1.45327830 -1.28892873 -0.34940880
## 84799002 -4.0720732  -2.97444398 -3.1225267 -2.45590991  0.40798314  0.49534213
## 848406   -0.2298528   1.56338211 -0.8018136 -0.65001097  0.49427614 -0.76152096
## 84862001 -4.4141269  -1.41742315 -2.2683231 -0.18610866  1.42260945 -0.75182778
## 849014   -4.9443530   4.11071653 -0.3144724 -0.08812897  0.05666532 -1.13668869
## 8510426   1.2359758   0.18804949 -0.5927619  1.59494272  0.44176553 -0.04859402

Here, the rownames help us see how the PC transformed data looks like.

Now, we need to append the diagnosis column to this PC transformed data frame wdbc.pcs. Let’s call the new data frame as wdbc.pcst.

wdbc.pcst <- wdbc.pcs
wdbc.pcst <- cbind(wdbc.pcs, diagnosis)
head(wdbc.pcst)
##                PC1        PC2        PC3       PC4        PC5         PC6
## 842302   -9.184755  -1.946870 -1.1221788 3.6305364  1.1940595  1.41018364
## 842517   -2.385703   3.764859 -0.5288274 1.1172808 -0.6212284  0.02863116
## 84300903 -5.728855   1.074229 -0.5512625 0.9112808  0.1769302  0.54097615
## 84348301 -7.116691 -10.266556 -3.2299475 0.1524129  2.9582754  3.05073750
## 84358402 -3.931842   1.946359  1.3885450 2.9380542 -0.5462667 -1.22541641
## 843786   -2.378155  -3.946456 -2.9322967 0.9402096  1.0551135 -0.45064213
##          diagnosis
## 842302           1
## 842517           1
## 84300903         1
## 84348301         1
## 84358402         1
## 843786           1

Here, diagnosis == 1 represents malignant and diagnosis == 0 represents benign.

Model Validation

To evaluate the effectiveness of our model in predicting the diagnosis of breast cancer, we can split the original data set into training and test data. Using the training data, we will build the model and predict using the test data. We will then compare the predictions with the original data to check the accuracy of our predictions. We will use three approaches to split and validate the data. In the first approach, we use 75% of the data as our training dataset and 25% as our test dataset. In the second approach, we use 3-fold cross validation and in the third approach we extend that to a 10-fold cross validation.

Splitting the dataset into training/test data

When creating the LDA model, we can split the data into training and test data. Using the training data we can build the LDA function. Next, we use the test data to make predictions.

# Calculate N
N <- nrow(wdbc.pcst)

# Create a random number vector
rvec <- runif(N)

# Select rows from the dataframe
wdbc.pcst.train <- wdbc.pcst[rvec < 0.75,]
wdbc.pcst.test <- wdbc.pcst[rvec >= 0.75,]

# Check the number of observations
nrow(wdbc.pcst.train)
## [1] 440
nrow(wdbc.pcst.test)
## [1] 129

So, 434 observations are in training dataset and 135 observations are in the test dataset. We will use the training dataset to calculate the linear discriminant function by passing it to the lda() function of the MASS package.

library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
wdbc.pcst.train.df <- wdbc.pcst.train

# convert matrix to a dataframe
wdbc.pcst.train.df <- as.data.frame(wdbc.pcst.train)

# Perform LDA on diagnosis
wdbc.lda <- lda(diagnosis ~ PC1 + PC2 + PC3 + PC4 + PC5 + PC6, data = wdbc.pcst.train.df)

Now let’s summarize the LDA output:

wdbc.lda
## Call:
## lda(diagnosis ~ PC1 + PC2 + PC3 + PC4 + PC5 + PC6, data = wdbc.pcst.train.df)
## 
## Prior probabilities of groups:
##     0     1 
## 0.625 0.375 
## 
## Group means:
##         PC1        PC2        PC3        PC4        PC5          PC6
## 0  2.170929 -0.3590041  0.2736395  0.1025355 -0.1214482  0.007290631
## 1 -3.898347  0.4966642 -0.3375847 -0.2740145  0.2328661 -0.039850284
## 
## Coefficients of linear discriminants:
##             LD1
## PC1 -0.47189013
## PC2  0.16762385
## PC3 -0.23865226
## PC4 -0.19885872
## PC5  0.14852912
## PC6 -0.01942277

Let’s use this to predict by passing the predict function’s newdata as the testing dataset.

wdbc.pcst.test.df <- wdbc.pcst.test

# convert matrix to a dataframe
wdbc.pcst.test.df <- as.data.frame(wdbc.pcst.test)

wdbc.lda.predict <- predict(wdbc.lda, newdata = wdbc.pcst.test.df)

Let’s check what functions we can invoke on this predict object:

ls(wdbc.lda.predict)
## [1] "class"     "posterior" "x"

Our predictions are contained in the class attribute.

# print the predictions
(wdbc.lda.predict.class <- wdbc.lda.predict$class)
##   [1] 1 1 1 0 1 1 1 1 1 0 0 0 1 0 1 0 1 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 1
##  [38] 1 0 0 1 0 1 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 0 0 1 0 0 1 0 0 0
##  [75] 0 1 1 0 0 1 0 0 0 1 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 1 1 0 0 0 0 0 0 1 0 1
## [112] 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0
## Levels: 0 1

Next, compare the accuracy of these predictions with the original data.

A simple way to validate the accuracy of our model in predicting diagnosis (M or B) is to compare the test data result to the observed data. Find the proportion of the errors in prediction and see whether our model is acceptable.

(confusionMat <- table(wdbc.lda.predict.class, wdbc.pcst.test.df$diagnosis))
##                       
## wdbc.lda.predict.class  0  1
##                      0 82  8
##                      1  0 39

So according to this output, the model predicted 78 times that the diagnosis is 1 (benign) when the actual observation was 1 (benign) and 9 times it predicted incorrectly. Similarly, the model predicted that the diagnosis is 1 (malignant) 52 times correctly and 1 predicted incorrectly.

The accuracy of this model in predicting benign tumors is 0.884615385 or 88,4615385% accurate. The accuracy of this model in predicting malignant tumors is 1 or 100% accurate.

3-fold cross validation

We can implement a cross-validation plan using the vtreat package’s kWayCrossValidation function. Syntax: kWayCrossValidation(nRows, nSplits, dframe, y)

nRows - number of rows in the training data nSplits - number of folds (partitions) in the cross-validation. E.g, 3 for 3-way CV remaining 2 arguments not needed.

The function returns indicies for training and test data for each fold. Use the data with the training indicies to fit the model and then make predictions using the test indicies. If the estimated model performance looks good, then use all the data to fit a final model. You can’t evaluate this final model, becuase you don’t have data to evaluate it with. Cross Validation only tests the modeling process, while the test/train split tests the final model

library(vtreat)
## Loading required package: wrapr
## 
## Attaching package: 'wrapr'
## The following object is masked from 'package:dplyr':
## 
##     coalesce
# convert wdbc.pcst to a dataframe
wdbc.pcst.df <- as.data.frame(wdbc.pcst)

nRows <- nrow(wdbc.pcst.df)
splitPlan <- kWayCrossValidation(nRows, 3, NULL, NULL)

# examine the split plan
str(splitPlan)
## List of 3
##  $ :List of 2
##   ..$ train: int [1:380] 1 2 4 5 6 7 8 12 14 15 ...
##   ..$ app  : int [1:189] 206 281 537 412 157 117 149 61 297 535 ...
##  $ :List of 2
##   ..$ train: int [1:379] 2 3 6 9 10 11 12 13 14 16 ...
##   ..$ app  : int [1:190] 433 291 32 128 533 226 519 44 413 301 ...
##  $ :List of 2
##   ..$ train: int [1:379] 1 3 4 5 7 8 9 10 11 13 ...
##   ..$ app  : int [1:190] 38 367 12 448 404 373 19 69 569 315 ...
##  - attr(*, "splitmethod")= chr "kwaycross"

Here, k is the number of folds and splitplan is the cross validation plan

# Run a 3-fold cross validation plan from splitPlan
k <- 3

for ( i in 1:k ) {
  split <- splitPlan[[i]]
  model <- lda(diagnosis ~ PC1 + PC2 + PC3 + PC4 + PC5 + PC6, data = wdbc.pcst.df[split$train,])
  model.pred.cv <- predict(model, newdata = wdbc.pcst.df[split$app,])

  confMat <- table(model.pred.cv$class, wdbc.pcst.df$diagnosis[split$app])
  print(confMat)
}
##    
##       0   1
##   0 115  12
##   1   0  62
##    
##       0   1
##   0 118   5
##   1   0  67
##    
##       0   1
##   0 124  13
##   1   0  53

10-fold cross validation

# Run a 10-fold cross validation plan from splitPlan
k <- 10
nRows <- nrow(wdbc.pcst.df)
splitPlan <- kWayCrossValidation(nRows, 10, NULL, NULL)

# Run a 10-fold cross validation plan from splitPlan

for ( i in 1:k ) {
  split <- splitPlan[[i]]
  model <- lda(diagnosis ~ PC1 + PC2 + PC3 + PC4 + PC5 + PC6, data = wdbc.pcst.df[split$train,])
  model.pred.cv <- predict(model, newdata = wdbc.pcst.df[split$app,])

  confMat <- table(model.pred.cv$class, wdbc.pcst.df$diagnosis[split$app])
  print(confMat)
}
##    
##      0  1
##   0 36  1
##   1  0 19
##    
##      0  1
##   0 30  6
##   1  0 21
##    
##      0  1
##   0 39  1
##   1  0 17
##    
##      0  1
##   0 38  2
##   1  0 17
##    
##      0  1
##   0 32  5
##   1  0 20
##    
##      0  1
##   0 34  1
##   1  0 22
##    
##      0  1
##   0 37  5
##   1  0 15
##    
##      0  1
##   0 36  3
##   1  0 18
##    
##      0  1
##   0 36  1
##   1  1 19
##    
##      0  1
##   0 38  2
##   1  0 17

Conclusion

An advanced way of validating the accuracy of our model is by using a k-fold cross-validation.

When we split the data into training and test data set, we are essentially doing 1 out of sample test. However, this process is a little fragile. A better approach than a simple train/test split, using multiple test sets and averaging out of sample error - which gives us a more precise estimate of the true out of sample error. One of the most common approaches for multiple test sets is Cross Validation.