Breast Cancer Project Part 1

Load Libraries and data

options(digits = 3)
library(matrixStats)

## Warning: package 'matrixStats' was built under R version 3.6.3

library(tidyverse)

## Warning: package 'tidyverse' was built under R version 3.6.3

## -- Attaching packages ------------------------------------------------------------------------------ tidyverse 1.3.0 --

## v ggplot2 3.3.0     v purrr   0.3.3
## v tibble  3.0.0     v dplyr   0.8.5
## v tidyr   1.0.2     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.4.0

## Warning: package 'ggplot2' was built under R version 3.6.3

## Warning: package 'tibble' was built under R version 3.6.3

## Warning: package 'tidyr' was built under R version 3.6.3

## Warning: package 'purrr' was built under R version 3.6.3

## Warning: package 'dplyr' was built under R version 3.6.3

## -- Conflicts --------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::count()  masks matrixStats::count()
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(caret)

## Warning: package 'caret' was built under R version 3.6.3

## Loading required package: lattice

## Warning: package 'lattice' was built under R version 3.6.3

## 
## Attaching package: 'caret'

## The following object is masked from 'package:purrr':
## 
##     lift

library(dslabs)

## Warning: package 'dslabs' was built under R version 3.6.3

data(brca)

1.How many samples are in the dataset? 569

2.How many predictors are in the matrix? 30

3.What proportion of the samples are malignant? 37.3%

4.Which column number has the highest mean? 24

5.Which column number has the lowest standard deviation? 20

dim(brca[["x"]])

## [1] 569  30

table(brca[["y"]])

## 
##   B   M 
## 357 212

mean(brca[["y"]]=="M")

## [1] 0.373

which.max(colMeans(brca$x))

## area_worst 
##         24

which.min(colSds(brca$x))

## [1] 20

normalization without sweeping function Question 2: Scaling the matrix Use sweep() two times to scale each column: subtract the column mean, then divide by the column standard deviation.

After scaling, what is the standard deviation of the first column? After scaling, what is the median value of the first column?

brca_x <- as.data.frame(unlist(brca[["x"]]))
scale <- function(x){
  (x-mean(x))/sd(x)
  (x-mean(x))/sd(x)
}
brca_x1 <-brca_x %>% mutate_all(scale)
#sd
sd(brca_x1[,1])

## [1] 1

#median
median(brca_x1[,1])

## [1] -0.215

#or it can be done by sweep function like
x_centered <- sweep(brca$x, 2, colMeans(brca$x))
x_scaled <- sweep(x_centered, 2, colSds(brca$x), FUN = "/")
#x_scaled$y <- as.vector(unlist(brca[["y"]]))
sd(x_scaled[,1])

## [1] 1

median(x_scaled[,1])

## [1] -0.215

Question 3: Distance Calculate the distance between all samples using the scaled matrix. What is the average distance between the first sample, which is benign, and other benign samples? What is the average distance between the first sample and malignant samples?

d_samples <- dist(x_scaled)
dist_BtoB <- as.matrix(d_samples)[1, brca$y == "B"]
mean(dist_BtoB[2:length(dist_BtoB)])

## [1] 4.41

dist_BtoM <- as.matrix(d_samples)[1, brca$y == "M"]
mean(dist_BtoM)

## [1] 7.12

Question 4: Heatmap of features Make a heatmap of the relationship between features using the scaled matrix. Which of these heatmaps is correct? To remove column and row labels like the images below, use labRow = NA and labCol = NA.

d_features <- dist(t(x_scaled))
heatmap(as.matrix(d_features), labRow = NA, labCol = NA)

Question 5: Hierarchical clustering Perform hierarchical clustering on the 30 features. Cut the tree into 5 groups. All but one of the answer options are in the same group. Which is in a different group?

# hirericial Clustering of Feature
h <- hclust(d_features)
groups <- cutree(h, k = 5)
split(names(groups), groups)

## $`1`
##  [1] "radius_mean"       "perimeter_mean"    "area_mean"        
##  [4] "concavity_mean"    "concave_pts_mean"  "radius_se"        
##  [7] "perimeter_se"      "area_se"           "radius_worst"     
## [10] "perimeter_worst"   "area_worst"        "concave_pts_worst"
## 
## $`2`
## [1] "texture_mean"  "texture_worst"
## 
## $`3`
## [1] "smoothness_mean"   "compactness_mean"  "symmetry_mean"    
## [4] "fractal_dim_mean"  "smoothness_worst"  "compactness_worst"
## [7] "concavity_worst"   "symmetry_worst"    "fractal_dim_worst"
## 
## $`4`
## [1] "texture_se"    "smoothness_se" "symmetry_se"  
## 
## $`5`
## [1] "compactness_se" "concavity_se"   "concave_pts_se" "fractal_dim_se"

pca<-prcomp(x_scaled)
summary(pca)

## Importance of components:
##                          PC1   PC2    PC3   PC4   PC5    PC6    PC7    PC8
## Standard deviation     3.644 2.386 1.6787 1.407 1.284 1.0988 0.8217 0.6904
## Proportion of Variance 0.443 0.190 0.0939 0.066 0.055 0.0403 0.0225 0.0159
## Cumulative Proportion  0.443 0.632 0.7264 0.792 0.847 0.8876 0.9101 0.9260
##                           PC9   PC10   PC11    PC12    PC13    PC14    PC15
## Standard deviation     0.6457 0.5922 0.5421 0.51104 0.49128 0.39624 0.30681
## Proportion of Variance 0.0139 0.0117 0.0098 0.00871 0.00805 0.00523 0.00314
## Cumulative Proportion  0.9399 0.9516 0.9614 0.97007 0.97812 0.98335 0.98649
##                           PC16    PC17    PC18    PC19    PC20  PC21    PC22
## Standard deviation     0.28260 0.24372 0.22939 0.22244 0.17652 0.173 0.16565
## Proportion of Variance 0.00266 0.00198 0.00175 0.00165 0.00104 0.001 0.00091
## Cumulative Proportion  0.98915 0.99113 0.99288 0.99453 0.99557 0.997 0.99749
##                           PC23   PC24    PC25    PC26    PC27    PC28    PC29
## Standard deviation     0.15602 0.1344 0.12442 0.09043 0.08307 0.03987 0.02736
## Proportion of Variance 0.00081 0.0006 0.00052 0.00027 0.00023 0.00005 0.00002
## Cumulative Proportion  0.99830 0.9989 0.99942 0.99969 0.99992 0.99997 1.00000
##                          PC30
## Standard deviation     0.0115
## Proportion of Variance 0.0000
## Cumulative Proportion  1.0000

data.frame(pca$x[,1:2], type = brca$y) %>%
  ggplot(aes(PC1, PC2, color = type)) +
  geom_point()

data.frame(type = brca$y, pca$x[,1:10]) %>%
    gather(key = "PC", value = "value", -type) %>%
    ggplot(aes(PC, value, fill = type)) +
    geom_boxplot()

## Part3 Question 9: Training and test sets Check that the training and test sets have similar proportions of benign and malignant tumors. What proportion of the training set is benign? 0.628 What proportion of the test set is benign? 0.626

set.seed(1) # if using R 3.5 or earlier
set.seed(1, sample.kind = "Rounding")    # if using R 3.6 or later

## Warning in set.seed(1, sample.kind = "Rounding"): non-uniform 'Rounding' sampler
## used

test_index <- createDataPartition(brca$y, times = 1, p = 0.2, list = FALSE)
test_x <- x_scaled[test_index,]
test_y <- brca$y[test_index]
train_x <- x_scaled[-test_index,]
train_y <- brca$y[-test_index]
mean(train_y=="B")

## [1] 0.628

mean(test_y=="B")

## [1] 0.626

Question 10a: K-means Clustering The predict_kmeans() function defined here takes two arguments - a matrix of observations x and a k-means object k - and assigns each row of x to a cluster from k. Set the seed to 3. Perform k-means clustering on the training set with 2 centers and assign the output to k. Then use the predict_kmeans() function to make predictions on the test set.

What is the overall accuracy? 0.922

predict_kmeans <- function(x, k) {
    centers <- k$centers    # extract cluster centers
    # calculate distance to cluster centers
    distances <- sapply(1:nrow(x), function(i){
                        apply(centers, 1, function(y) dist(rbind(x[i,], y)))
                 })
  max.col(-t(distances))  # select cluster with min distance to center
}
set.seed(3, sample.kind = "Rounding")    # if using R 3.6 or later

## Warning in set.seed(3, sample.kind = "Rounding"): non-uniform 'Rounding' sampler
## used

k <- kmeans(train_x, centers = 2)
kmeans_preds <- ifelse(predict_kmeans(test_x, k) == 1, "B", "M")
mean(kmeans_preds == test_y)

## [1] 0.922

table(test_y,kmeans_preds)

##       kmeans_preds
## test_y  B  M
##      B 71  1
##      M  8 35

Question 10b: K-means Clustering What proportion of benign tumors are correctly identified? 0.986 What proportion of malignant tumors are correctly identified? 0.814

sensitivity(factor(kmeans_preds), test_y, positive = "B")

## [1] 0.986

sensitivity(factor(kmeans_preds), test_y, positive = "M")

## [1] 0.814

Breast Cancer Project Part 1

Saqib Kareem

May 14, 2020

Breast Cancer Project Part 1

Load Libraries and data