Breast Cancer Project Part 1

Bookmark this page The brca dataset from the dslabs package contains information about breast cancer diagnosis biopsy samples for tumors that were determined to be either benign (not cancer) and malignant (cancer). The brca object is a list consisting of:

brca\(y: a vector of sample classifications ("B" = benign or "M" = malignant) brca\)x: a matrix of numeric features describing properties of the shape and size of cell nuclei extracted from biopsy microscope images For these exercises, load the data by setting your options and loading the libraries and data as shown in the code here:

options(digits = 3) library(matrixStats) library(tidyverse) library(caret) library(dslabs) data(brca) The exercises in this assessment are available to Verified Learners only and are split into four parts, all of which use the data described here.

IMPORTANT: Some of these exercises use dslabs datasets that were added in a July 2019 update. Make sure your package is up to date with the command install.packages(“dslabs”).

Load Libraries and data

options(digits = 3)
library(matrixStats)
## Warning: package 'matrixStats' was built under R version 3.6.3
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 3.6.3
## -- Attaching packages ------------------------------------------------------------------------------ tidyverse 1.3.0 --
## v ggplot2 3.3.0     v purrr   0.3.3
## v tibble  3.0.0     v dplyr   0.8.5
## v tidyr   1.0.2     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.4.0
## Warning: package 'ggplot2' was built under R version 3.6.3
## Warning: package 'tibble' was built under R version 3.6.3
## Warning: package 'tidyr' was built under R version 3.6.3
## Warning: package 'purrr' was built under R version 3.6.3
## Warning: package 'dplyr' was built under R version 3.6.3
## -- Conflicts --------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::count()  masks matrixStats::count()
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(caret)
## Warning: package 'caret' was built under R version 3.6.3
## Loading required package: lattice
## Warning: package 'lattice' was built under R version 3.6.3
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
library(dslabs)
## Warning: package 'dslabs' was built under R version 3.6.3
data(brca)

1.How many samples are in the dataset? 569

2.How many predictors are in the matrix? 30

3.What proportion of the samples are malignant? 37.3%

4.Which column number has the highest mean? 24

5.Which column number has the lowest standard deviation? 20

dim(brca[["x"]])
## [1] 569  30
table(brca[["y"]])
## 
##   B   M 
## 357 212
mean(brca[["y"]]=="M")
## [1] 0.373
which.max(colMeans(brca$x))
## area_worst 
##         24
which.min(colSds(brca$x))
## [1] 20

normalization without sweeping function Question 2: Scaling the matrix Use sweep() two times to scale each column: subtract the column mean, then divide by the column standard deviation.

After scaling, what is the standard deviation of the first column? After scaling, what is the median value of the first column?

brca_x <- as.data.frame(unlist(brca[["x"]]))
scale <- function(x){
  (x-mean(x))/sd(x)
  (x-mean(x))/sd(x)
}
brca_x1 <-brca_x %>% mutate_all(scale)
#sd
sd(brca_x1[,1])
## [1] 1
#median
median(brca_x1[,1])
## [1] -0.215
#or it can be done by sweep function like
x_centered <- sweep(brca$x, 2, colMeans(brca$x))
x_scaled <- sweep(x_centered, 2, colSds(brca$x), FUN = "/")
#x_scaled$y <- as.vector(unlist(brca[["y"]]))
sd(x_scaled[,1])
## [1] 1
median(x_scaled[,1])
## [1] -0.215

Question 3: Distance Calculate the distance between all samples using the scaled matrix. What is the average distance between the first sample, which is benign, and other benign samples? What is the average distance between the first sample and malignant samples?

d_samples <- dist(x_scaled)
dist_BtoB <- as.matrix(d_samples)[1, brca$y == "B"]
mean(dist_BtoB[2:length(dist_BtoB)])
## [1] 4.41
dist_BtoM <- as.matrix(d_samples)[1, brca$y == "M"]
mean(dist_BtoM)
## [1] 7.12

Question 4: Heatmap of features Make a heatmap of the relationship between features using the scaled matrix. Which of these heatmaps is correct? To remove column and row labels like the images below, use labRow = NA and labCol = NA.

d_features <- dist(t(x_scaled))
heatmap(as.matrix(d_features), labRow = NA, labCol = NA)

Question 5: Hierarchical clustering Perform hierarchical clustering on the 30 features. Cut the tree into 5 groups. All but one of the answer options are in the same group. Which is in a different group?

# hirericial Clustering of Feature
h <- hclust(d_features)
groups <- cutree(h, k = 5)
split(names(groups), groups)
## $`1`
##  [1] "radius_mean"       "perimeter_mean"    "area_mean"        
##  [4] "concavity_mean"    "concave_pts_mean"  "radius_se"        
##  [7] "perimeter_se"      "area_se"           "radius_worst"     
## [10] "perimeter_worst"   "area_worst"        "concave_pts_worst"
## 
## $`2`
## [1] "texture_mean"  "texture_worst"
## 
## $`3`
## [1] "smoothness_mean"   "compactness_mean"  "symmetry_mean"    
## [4] "fractal_dim_mean"  "smoothness_worst"  "compactness_worst"
## [7] "concavity_worst"   "symmetry_worst"    "fractal_dim_worst"
## 
## $`4`
## [1] "texture_se"    "smoothness_se" "symmetry_se"  
## 
## $`5`
## [1] "compactness_se" "concavity_se"   "concave_pts_se" "fractal_dim_se"
pca<-prcomp(x_scaled)
summary(pca)
## Importance of components:
##                          PC1   PC2    PC3   PC4   PC5    PC6    PC7    PC8
## Standard deviation     3.644 2.386 1.6787 1.407 1.284 1.0988 0.8217 0.6904
## Proportion of Variance 0.443 0.190 0.0939 0.066 0.055 0.0403 0.0225 0.0159
## Cumulative Proportion  0.443 0.632 0.7264 0.792 0.847 0.8876 0.9101 0.9260
##                           PC9   PC10   PC11    PC12    PC13    PC14    PC15
## Standard deviation     0.6457 0.5922 0.5421 0.51104 0.49128 0.39624 0.30681
## Proportion of Variance 0.0139 0.0117 0.0098 0.00871 0.00805 0.00523 0.00314
## Cumulative Proportion  0.9399 0.9516 0.9614 0.97007 0.97812 0.98335 0.98649
##                           PC16    PC17    PC18    PC19    PC20  PC21    PC22
## Standard deviation     0.28260 0.24372 0.22939 0.22244 0.17652 0.173 0.16565
## Proportion of Variance 0.00266 0.00198 0.00175 0.00165 0.00104 0.001 0.00091
## Cumulative Proportion  0.98915 0.99113 0.99288 0.99453 0.99557 0.997 0.99749
##                           PC23   PC24    PC25    PC26    PC27    PC28    PC29
## Standard deviation     0.15602 0.1344 0.12442 0.09043 0.08307 0.03987 0.02736
## Proportion of Variance 0.00081 0.0006 0.00052 0.00027 0.00023 0.00005 0.00002
## Cumulative Proportion  0.99830 0.9989 0.99942 0.99969 0.99992 0.99997 1.00000
##                          PC30
## Standard deviation     0.0115
## Proportion of Variance 0.0000
## Cumulative Proportion  1.0000
data.frame(pca$x[,1:2], type = brca$y) %>%
  ggplot(aes(PC1, PC2, color = type)) +
  geom_point()

data.frame(type = brca$y, pca$x[,1:10]) %>%
    gather(key = "PC", value = "value", -type) %>%
    ggplot(aes(PC, value, fill = type)) +
    geom_boxplot()

## Part3 Question 9: Training and test sets Check that the training and test sets have similar proportions of benign and malignant tumors. What proportion of the training set is benign? 0.628 What proportion of the test set is benign? 0.626

set.seed(1) # if using R 3.5 or earlier
set.seed(1, sample.kind = "Rounding")    # if using R 3.6 or later
## Warning in set.seed(1, sample.kind = "Rounding"): non-uniform 'Rounding' sampler
## used
test_index <- createDataPartition(brca$y, times = 1, p = 0.2, list = FALSE)
test_x <- x_scaled[test_index,]
test_y <- brca$y[test_index]
train_x <- x_scaled[-test_index,]
train_y <- brca$y[-test_index]
mean(train_y=="B")
## [1] 0.628
mean(test_y=="B")
## [1] 0.626

Question 10a: K-means Clustering The predict_kmeans() function defined here takes two arguments - a matrix of observations x and a k-means object k - and assigns each row of x to a cluster from k. Set the seed to 3. Perform k-means clustering on the training set with 2 centers and assign the output to k. Then use the predict_kmeans() function to make predictions on the test set.

What is the overall accuracy? 0.922

predict_kmeans <- function(x, k) {
    centers <- k$centers    # extract cluster centers
    # calculate distance to cluster centers
    distances <- sapply(1:nrow(x), function(i){
                        apply(centers, 1, function(y) dist(rbind(x[i,], y)))
                 })
  max.col(-t(distances))  # select cluster with min distance to center
}
set.seed(3, sample.kind = "Rounding")    # if using R 3.6 or later
## Warning in set.seed(3, sample.kind = "Rounding"): non-uniform 'Rounding' sampler
## used
k <- kmeans(train_x, centers = 2)
kmeans_preds <- ifelse(predict_kmeans(test_x, k) == 1, "B", "M")
mean(kmeans_preds == test_y)
## [1] 0.922
table(test_y,kmeans_preds)
##       kmeans_preds
## test_y  B  M
##      B 71  1
##      M  8 35

Question 10b: K-means Clustering What proportion of benign tumors are correctly identified? 0.986 What proportion of malignant tumors are correctly identified? 0.814

sensitivity(factor(kmeans_preds), test_y, positive = "B")
## [1] 0.986
sensitivity(factor(kmeans_preds), test_y, positive = "M")
## [1] 0.814