Breast Cancer Project Part 1

The brca dataset from the dslabs package contains information about breast cancer diagnosis biopsy samples for tumors that were determined to be either benign (not cancer) or malignant (cancer). The brca object is a list consisting of:

brca$y: a vector of sample classifications ("B" = benign or "M" = malignant)
brca$x: a matrix of numeric features describing properties of the shape and size of cell nuclei extracted from biopsy microscope images

For these exercises, set your options and load the libraries and data as shown in the code below. The exercises in this assessment are available to Verified Learners only and are split into four parts, all of which use the data described here.

IMPORTANT: Some of these exercises use dslabs datasets that were added in a July 2019 update. Make sure your package is up to date with the command install.packages("dslabs").

Load libraries and data

options(warn = -1)   # suppress warnings (e.g., glm convergence warnings later)
options(digits = 3)
library(matrixStats)
library(tidyverse)
library(caret)
library(dslabs)
data(brca)

Question 1: Dimensions and properties

1. How many samples are in the dataset? 569

2. How many predictors are in the matrix? 30

3. What proportion of the samples are malignant? 37.3%

4. Which column number has the highest mean? 24

5. Which column number has the lowest standard deviation? 20

dim(brca[["x"]])              # samples x predictors
## [1] 569  30
table(brca[["y"]])            # benign vs malignant counts
## 
##   B   M 
## 357 212
mean(brca[["y"]] == "M")      # proportion malignant
## [1] 0.373
which.max(colMeans(brca$x))   # column with the highest mean
## area_worst 
##         24
which.min(colSds(brca$x))     # column with the lowest standard deviation
## [1] 20
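
Note that colSds() returns an unnamed vector, which is why the answer above prints as a bare column index rather than a feature name. A minimal sketch to recover the name with base R's colnames():

colnames(brca$x)[which.min(colSds(brca$x))]   # name of column 20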

Question 2: Scaling the matrix
Use sweep() two times to scale each column: subtract the column mean, then divide by the column standard deviation.

After scaling, what is the standard deviation of the first column? After scaling, what is the median value of the first column?

First, normalization without the sweep() function:

brca_x <- as.data.frame(brca$x)    # the matrix converts directly to a data frame
standardize <- function(x){        # renamed to avoid masking base::scale()
  (x - mean(x)) / sd(x)
}
brca_x1 <- brca_x %>% mutate_all(standardize)
# standard deviation of the first column after scaling
sd(brca_x1[,1])
## [1] 1
# median of the first column after scaling
median(brca_x1[,1])
## [1] -0.215
# or, equivalently, with sweep() as the exercise asks:
x_centered <- sweep(brca$x, 2, colMeans(brca$x))
x_scaled <- sweep(x_centered, 2, colSds(brca$x), FUN = "/")
sd(x_scaled[,1])
## [1] 1
median(x_scaled[,1])
## [1] -0.215
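
As a sanity check, every column of x_scaled should now have mean approximately 0 and standard deviation exactly 1; a minimal sketch using only base R and matrixStats:

max(abs(colMeans(x_scaled)))   # numerically ~0
range(colSds(x_scaled))        # both ends should be 1

Base R's scale() performs the same centering and scaling with its default arguments, so scale(brca$x) is a one-line alternative to the two sweep() calls.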

Question 3: Distance
Calculate the distance between all samples using the scaled matrix. What is the average distance between the first sample, which is benign, and other benign samples? What is the average distance between the first sample and malignant samples?

d_samples <- dist(x_scaled)   # pairwise distances between all samples
# distances from sample 1 to the benign samples; drop the first entry,
# which is sample 1's zero distance to itself
dist_BtoB <- as.matrix(d_samples)[1, brca$y == "B"]
mean(dist_BtoB[2:length(dist_BtoB)])
## [1] 4.41
# distances from sample 1 to the malignant samples
dist_BtoM <- as.matrix(d_samples)[1, brca$y == "M"]
mean(dist_BtoM)
## [1] 7.12

Question 4: Heatmap of features
Make a heatmap of the relationship between features using the scaled matrix. Which of these heatmaps is correct? To remove column and row labels like the images below, use labRow = NA and labCol = NA.

d_features <- dist(t(x_scaled))
heatmap(as.matrix(d_features), labRow = NA, labCol = NA)

Question 5: Hierarchical clustering
Perform hierarchical clustering on the 30 features. Cut the tree into 5 groups. All but one of the answer options are in the same group. Which is in a different group?

# hierarchical clustering of the features
h <- hclust(d_features)
groups <- cutree(h, k = 5)
split(names(groups), groups)
## $`1`
##  [1] "radius_mean"       "perimeter_mean"    "area_mean"        
##  [4] "concavity_mean"    "concave_pts_mean"  "radius_se"        
##  [7] "perimeter_se"      "area_se"           "radius_worst"     
## [10] "perimeter_worst"   "area_worst"        "concave_pts_worst"
## 
## $`2`
## [1] "texture_mean"  "texture_worst"
## 
## $`3`
## [1] "smoothness_mean"   "compactness_mean"  "symmetry_mean"    
## [4] "fractal_dim_mean"  "smoothness_worst"  "compactness_worst"
## [7] "concavity_worst"   "symmetry_worst"    "fractal_dim_worst"
## 
## $`4`
## [1] "texture_se"    "smoothness_se" "symmetry_se"  
## 
## $`5`
## [1] "compactness_se" "concavity_se"   "concave_pts_se" "fractal_dim_se"
Part 2, Questions 6-8: Principal component analysis
Perform PCA on the scaled matrix. What proportion of variance is explained by the first principal component, and how many principal components are required to explain at least 90% of the variance? Then plot the first two principal components colored by tumor type, and make boxplots of the first 10 PCs grouped by tumor type.

pca <- prcomp(x_scaled)
summary(pca)
## Importance of components:
##                          PC1   PC2    PC3   PC4   PC5    PC6    PC7    PC8
## Standard deviation     3.644 2.386 1.6787 1.407 1.284 1.0988 0.8217 0.6904
## Proportion of Variance 0.443 0.190 0.0939 0.066 0.055 0.0403 0.0225 0.0159
## Cumulative Proportion  0.443 0.632 0.7264 0.792 0.847 0.8876 0.9101 0.9260
##                           PC9   PC10   PC11    PC12    PC13    PC14    PC15
## Standard deviation     0.6457 0.5922 0.5421 0.51104 0.49128 0.39624 0.30681
## Proportion of Variance 0.0139 0.0117 0.0098 0.00871 0.00805 0.00523 0.00314
## Cumulative Proportion  0.9399 0.9516 0.9614 0.97007 0.97812 0.98335 0.98649
##                           PC16    PC17    PC18    PC19    PC20  PC21    PC22
## Standard deviation     0.28260 0.24372 0.22939 0.22244 0.17652 0.173 0.16565
## Proportion of Variance 0.00266 0.00198 0.00175 0.00165 0.00104 0.001 0.00091
## Cumulative Proportion  0.98915 0.99113 0.99288 0.99453 0.99557 0.997 0.99749
##                           PC23   PC24    PC25    PC26    PC27    PC28    PC29
## Standard deviation     0.15602 0.1344 0.12442 0.09043 0.08307 0.03987 0.02736
## Proportion of Variance 0.00081 0.0006 0.00052 0.00027 0.00023 0.00005 0.00002
## Cumulative Proportion  0.99830 0.9989 0.99942 0.99969 0.99992 0.99997 1.00000
##                          PC30
## Standard deviation     0.0115
## Proportion of Variance 0.0000
## Cumulative Proportion  1.0000
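
From the summary above, PC1 explains 0.443 of the variance, and the cumulative proportion first reaches 90% at PC7. Both answers can also be computed directly from the prcomp object; a minimal sketch:

pve <- pca$sdev^2 / sum(pca$sdev^2)    # proportion of variance explained
pve[1]                                 # PC1: ~0.443
which(cumsum(pve) >= 0.9)[1]           # first PC reaching 90% cumulative: 7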
# scatterplot of the first two PCs, colored by tumor type
data.frame(pca$x[,1:2], type = brca$y) %>%
  ggplot(aes(PC1, PC2, color = type)) +
  geom_point()

# boxplots of the first 10 PCs, grouped by tumor type
data.frame(type = brca$y, pca$x[,1:10]) %>%
    gather(key = "PC", value = "value", -type) %>%
    ggplot(aes(PC, value, fill = type)) +
    geom_boxplot()

Part 3, Question 9: Training and test sets
Set the seed to 1, then create a 20% test partition with createDataPartition(). Check that the training and test sets have similar proportions of benign and malignant tumors.

What proportion of the training set is benign? 0.628
What proportion of the test set is benign? 0.626

set.seed(1) # if using R 3.5 or earlier
set.seed(1, sample.kind = "Rounding")    # if using R 3.6 or later
test_index <- createDataPartition(brca$y, times = 1, p = 0.2, list = FALSE)
test_x <- x_scaled[test_index,]
test_y <- brca$y[test_index]
train_x <- x_scaled[-test_index,]
train_y <- brca$y[-test_index]
mean(train_y=="B")
## [1] 0.628
mean(test_y=="B")
## [1] 0.626

Question 10a: K-means Clustering
The predict_kmeans() function defined here takes two arguments - a matrix of observations x and a k-means object k - and assigns each row of x to a cluster from k. Set the seed to 3. Perform k-means clustering on the training set with 2 centers and assign the output to k. Then use the predict_kmeans() function to make predictions on the test set.

What is the overall accuracy? 0.922

predict_kmeans <- function(x, k) {
    centers <- k$centers    # extract cluster centers
    # calculate distance to cluster centers
    distances <- sapply(1:nrow(x), function(i){
                        apply(centers, 1, function(y) dist(rbind(x[i,], y)))
                 })
  max.col(-t(distances))  # select cluster with min distance to center
}
set.seed(3, sample.kind = "Rounding")    # if using R 3.6 or later
k <- kmeans(train_x, centers = 2)
kmeans_preds <- ifelse(predict_kmeans(test_x, k) == 1, "B", "M")
mean(kmeans_preds == test_y)
## [1] 0.922
table(test_y,kmeans_preds)
##       kmeans_preds
## test_y  B  M
##      B 71  1
##      M  8 35
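
One caveat: k-means cluster numbers are arbitrary, so the ifelse() above assumes cluster 1 happens to be the mostly-benign cluster under this seed. A seed-independent sketch would map each cluster to the majority class of the training rows assigned to it (cluster_to_class is a hypothetical helper name, not part of the assessment):

train_clusters <- predict_kmeans(train_x, k)                  # cluster of each training row
cluster_to_class <- sapply(1:2, function(cl)                  # majority class per cluster
  names(which.max(table(train_y[train_clusters == cl]))))
kmeans_preds2 <- cluster_to_class[predict_kmeans(test_x, k)]  # remapped test predictions
mean(kmeans_preds2 == test_y)                                 # should match the accuracy above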

Question 10b: K-means Clustering
What proportion of benign tumors are correctly identified? 0.986
What proportion of malignant tumors are correctly identified? 0.814

sensitivity(factor(kmeans_preds), test_y, positive = "B")  # benign correctly identified
## [1] 0.986
sensitivity(factor(kmeans_preds), test_y, positive = "M")  # malignant correctly identified
## [1] 0.814
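
caret's confusionMatrix() reports both of these values (as sensitivity and specificity), along with the overall accuracy, in a single call; a minimal sketch:

confusionMatrix(factor(kmeans_preds), test_y, positive = "B")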

Question 11: Logistic regression model
Fit a logistic regression model on the training set using all predictors. Ignore warnings about the algorithm not converging. Make predictions on the test set.

What is the accuracy of the logistic regression model on the test set?

train_glm <- train(train_x, train_y,
                     method = "glm")
glm_preds <- predict(train_glm, test_x)
mean(glm_preds == test_y)
## [1] 0.957
Question 12: LDA and QDA models
Train LDA and QDA models on the training set. What are their accuracies on the test set?

train_lda <- train(train_x, train_y,
                     method = "lda")
lda_preds <- predict(train_lda, test_x)
mean(lda_preds == test_y)
## [1] 0.991
train_qda <- train(train_x, train_y,
                     method = "qda")
qda_preds <- predict(train_qda, test_x)
mean(qda_preds == test_y)
## [1] 0.957
Question 13: Loess model
Fit a loess model on the training set with method = "gamLoess", then report the test-set accuracy. Ignore any warnings during fitting.

library(gam)
train_loess <- train(train_x, train_y, 
                   method = "gamLoess")
loess_preds <- predict(train_loess, test_x)
mean(loess_preds == test_y)
## [1] 0.983
Question 14: K-nearest neighbors model
Set the seed to 7, then train a kNN model on the training set, tuning over odd values of k from 3 to 21. What is the test-set accuracy?

set.seed(7, sample.kind = "Rounding")    # if using R 3.6 or later
train_knn <- train(train_x, train_y, 
                   method = "knn",
                   tuneGrid = data.frame(k = seq(3, 21, 2)))
knn_preds <- predict(train_knn, test_x)
mean(knn_preds == test_y)
## [1] 0.948
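
The cross-validated choice of k is stored on the train object; a minimal sketch:

train_knn$bestTune   # k selected by resampling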
Question 15: Random forest model
Set the seed to 9, then train a random forest on the training set, tuning mtry over c(3, 5, 7, 9). What is the test-set accuracy, which value of mtry is best, and which variables matter most?

set.seed(9, sample.kind = "Rounding")    # if using R 3.6 or later
train_rf <- train(train_x, train_y, 
                   method = "rf",
                   tuneGrid = data.frame(mtry = c(3,5,7,9)))
rf_preds <- predict(train_rf, test_x)
mean(rf_preds == test_y)
## [1] 0.974
train_rf$bestTune
##   mtry
## 1    3
varImp(train_rf)
## rf variable importance
## 
##   only 20 most important variables shown (out of 30)
## 
##                   Overall
## perimeter_worst    100.00
## concave_pts_worst   99.78
## area_worst          90.14
## radius_worst        85.67
## concave_pts_mean    72.50
## area_mean           55.22
## perimeter_mean      54.21
## radius_mean         53.45
## concavity_mean      40.05
## area_se             39.83
## concavity_worst     35.08
## compactness_worst   24.17
## compactness_mean    15.85
## radius_se           15.32
## texture_worst       14.27
## perimeter_se        12.54
## texture_mean        10.68
## smoothness_worst    10.23
## symmetry_worst       8.42
## concavity_se         4.49
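
From the table above, perimeter_worst is the most important variable in this random forest fit. The ranking can also be extracted programmatically; a minimal sketch (varImp() on a train object returns a list whose importance element is a data frame with an Overall column):

imp <- varImp(train_rf)$importance
rownames(imp)[which.max(imp$Overall)]   # top variable by importance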

Question 16: Creating an ensemble
Compute each model's test-set accuracy, then build an ensemble that predicts "B" when more than half of the seven models predict benign. What is the accuracy of the ensemble?

model <- c("kmeans_preds","glm_preds","lda_preds","qda_preds","rf_preds","knn_preds","loess_preds")
pred <- sapply(1:7, function(x){
  as.factor(get(model[x]))})
dim(pred)
## [1] 115   7
pred <- as.data.frame(pred)
names(pred) <-c("kmeans_preds","glm_preds","lda_preds","qda_preds","rf_preds","knn_preds","loess_preds")
acc <- colMeans(as.matrix(pred)==test_y)
acc
## kmeans_preds    glm_preds    lda_preds    qda_preds     rf_preds    knn_preds 
##        0.922        0.957        0.991        0.957        0.974        0.948 
##  loess_preds 
##        0.983
mean(acc)
## [1] 0.961
ensemble <- cbind(glm = glm_preds == "B", lda = lda_preds == "B",
                  qda = qda_preds == "B", loess = loess_preds == "B",
                  rf = rf_preds == "B", knn = knn_preds == "B",
                  kmeans = kmeans_preds == "B")

ensemble_preds <- ifelse(rowMeans(ensemble) > 0.5, "B", "M")
mean(ensemble_preds == test_y)
## [1] 0.983
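
Finally, the ensemble can be lined up against the individual models; a minimal sketch reusing the acc vector from above:

data.frame(model = c(names(acc), "ensemble"),
           accuracy = c(acc, mean(ensemble_preds == test_y)))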