The brca dataset from the dslabs package contains information about breast cancer diagnosis biopsy samples for tumors that were determined to be either benign (not cancer) or malignant (cancer). The brca object is a list consisting of:
brca$y: a vector of sample classifications ("B" = benign or "M" = malignant)
brca$x: a matrix of numeric features describing properties of the shape and size of cell nuclei extracted from biopsy microscope images

For these exercises, load the data by setting your options and loading the libraries and data as shown in the code here:
options(digits = 3)
library(matrixStats)
library(tidyverse)
library(caret)
library(dslabs)
data(brca)

The exercises in this assessment are split into four parts, all of which use the data described here.
IMPORTANT: Some of these exercises use dslabs datasets that were added in a July 2019 update. Make sure your package is up to date with the command install.packages("dslabs").
options(warn = -1)
options(digits = 3)
library(matrixStats)
library(tidyverse)
## -- Attaching packages ------------------------------------------------------------------------------ tidyverse 1.3.0 --
## v ggplot2 3.3.0 v purrr 0.3.3
## v tibble 3.0.0 v dplyr 0.8.5
## v tidyr 1.0.2 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.4.0
## -- Conflicts --------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::count() masks matrixStats::count()
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
library(dslabs)
data(brca)
1. How many samples are in the dataset? 569
2. How many predictors are in the matrix? 30
3. What proportion of the samples are malignant? 37.3%
4. Which column number has the highest mean? 24
5. Which column number has the lowest standard deviation? 20
# number of samples and predictors
dim(brca[["x"]])
## [1] 569 30
# counts of benign and malignant samples
table(brca[["y"]])
##
## B M
## 357 212
# proportion of malignant samples
mean(brca[["y"]] == "M")
## [1] 0.373
# column with the highest mean
which.max(colMeans(brca$x))
## area_worst
## 24
# column with the lowest standard deviation
which.min(colSds(brca$x))
## [1] 20
Question 2: Scaling the matrix. Use sweep() two times to scale each column: subtract the column mean, then divide by the column standard deviation.
After scaling, what is the standard deviation of the first column? After scaling, what is the median value of the first column?
First, normalization without the sweep() function:
brca_x <- as.data.frame(brca[["x"]])
# standardize a vector: subtract its mean, divide by its standard deviation
standardize <- function(x){
  (x - mean(x)) / sd(x)
}
brca_x1 <- brca_x %>% mutate_all(standardize)
#sd
sd(brca_x1[,1])
## [1] 1
#median
median(brca_x1[,1])
## [1] -0.215
# or the same scaling can be done with the sweep() function:
x_centered <- sweep(brca$x, 2, colMeans(brca$x))
x_scaled <- sweep(x_centered, 2, colSds(brca$x), FUN = "/")
sd(x_scaled[,1])
## [1] 1
median(x_scaled[,1])
## [1] -0.215
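As a quick sanity check (a minimal sketch using colMeans() and matrixStats::colSds(), both already loaded), every column of the scaled matrix should now have mean approximately 0 and standard deviation 1:
# every scaled column should have mean ~0 and sd ~1, up to floating-point error
all(abs(colMeans(x_scaled)) < 1e-8)
all(abs(colSds(x_scaled) - 1) < 1e-8)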
Question 3: Distance. Calculate the distance between all samples using the scaled matrix. What is the average distance between the first sample, which is benign, and other benign samples? What is the average distance between the first sample and malignant samples?
# pairwise distances between all samples
d_samples <- dist(x_scaled)
# distances from sample 1 to the benign samples
dist_BtoB <- as.matrix(d_samples)[1, brca$y == "B"]
# drop the first entry, sample 1's zero distance to itself
mean(dist_BtoB[2:length(dist_BtoB)])
## [1] 4.41
# distances from sample 1 to the malignant samples
dist_BtoM <- as.matrix(d_samples)[1, brca$y == "M"]
mean(dist_BtoM)
## [1] 7.12
Question 4: Heatmap of features. Make a heatmap of the relationship between features using the scaled matrix. Which of these heatmaps is correct? To remove column and row labels like the images below, use labRow = NA and labCol = NA.
# transpose so that distances are computed between features, not samples
d_features <- dist(t(x_scaled))
heatmap(as.matrix(d_features), labRow = NA, labCol = NA)
Question 5: Hierarchical clustering. Perform hierarchical clustering on the 30 features. Cut the tree into 5 groups. All but one of the answer options are in the same group. Which is in a different group?
# hierarchical clustering of the 30 features
h <- hclust(d_features)
groups <- cutree(h, k = 5)
split(names(groups), groups)
## $`1`
## [1] "radius_mean" "perimeter_mean" "area_mean"
## [4] "concavity_mean" "concave_pts_mean" "radius_se"
## [7] "perimeter_se" "area_se" "radius_worst"
## [10] "perimeter_worst" "area_worst" "concave_pts_worst"
##
## $`2`
## [1] "texture_mean" "texture_worst"
##
## $`3`
## [1] "smoothness_mean" "compactness_mean" "symmetry_mean"
## [4] "fractal_dim_mean" "smoothness_worst" "compactness_worst"
## [7] "concavity_worst" "symmetry_worst" "fractal_dim_worst"
##
## $`4`
## [1] "texture_se" "smoothness_se" "symmetry_se"
##
## $`5`
## [1] "compactness_se" "concavity_se" "concave_pts_se" "fractal_dim_se"
Question 6: PCA: proportion of variance. Perform a principal component analysis of the scaled matrix. What proportion of variance is explained by the first principal component? 0.443
pca <- prcomp(x_scaled)
summary(pca)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8
## Standard deviation 3.644 2.386 1.6787 1.407 1.284 1.0988 0.8217 0.6904
## Proportion of Variance 0.443 0.190 0.0939 0.066 0.055 0.0403 0.0225 0.0159
## Cumulative Proportion 0.443 0.632 0.7264 0.792 0.847 0.8876 0.9101 0.9260
## PC9 PC10 PC11 PC12 PC13 PC14 PC15
## Standard deviation 0.6457 0.5922 0.5421 0.51104 0.49128 0.39624 0.30681
## Proportion of Variance 0.0139 0.0117 0.0098 0.00871 0.00805 0.00523 0.00314
## Cumulative Proportion 0.9399 0.9516 0.9614 0.97007 0.97812 0.98335 0.98649
## PC16 PC17 PC18 PC19 PC20 PC21 PC22
## Standard deviation 0.28260 0.24372 0.22939 0.22244 0.17652 0.173 0.16565
## Proportion of Variance 0.00266 0.00198 0.00175 0.00165 0.00104 0.001 0.00091
## Cumulative Proportion 0.98915 0.99113 0.99288 0.99453 0.99557 0.997 0.99749
## PC23 PC24 PC25 PC26 PC27 PC28 PC29
## Standard deviation 0.15602 0.1344 0.12442 0.09043 0.08307 0.03987 0.02736
## Proportion of Variance 0.00081 0.0006 0.00052 0.00027 0.00023 0.00005 0.00002
## Cumulative Proportion 0.99830 0.9989 0.99942 0.99969 0.99992 0.99997 1.00000
## PC30
## Standard deviation 0.0115
## Proportion of Variance 0.0000
## Cumulative Proportion 1.0000
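The cumulative proportion row also shows how many principal components are needed to explain at least 90% of the variance; a one-line check (a sketch computing the cumulative proportion from pca$sdev):
# first PC at which the cumulative proportion of variance reaches 90%
which(cumsum(pca$sdev^2) / sum(pca$sdev^2) >= 0.9)[1]
# returns 7, matching the cumulative proportion of 0.9101 at PC7 above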
Question 7: PCA plotting. Plot the first two principal components with color representing tumor type (benign/malignant).
data.frame(pca$x[,1:2], type = brca$y) %>%
ggplot(aes(PC1, PC2, color = type)) +
geom_point()
Question 8: PCA plotting. Make a boxplot of the first 10 PCs grouped by tumor type.
data.frame(type = brca$y, pca$x[,1:10]) %>%
gather(key = "PC", value = "value", -type) %>%
ggplot(aes(PC, value, fill = type)) +
geom_boxplot()
Part 3. Question 9: Training and test sets. Split the scaled matrix into a 20% test set and an 80% training set, then check that the training and test sets have similar proportions of benign and malignant tumors. What proportion of the training set is benign? 0.628 What proportion of the test set is benign? 0.626
# set.seed(1) # if using R 3.5 or earlier
set.seed(1, sample.kind = "Rounding") # if using R 3.6 or later
test_index <- createDataPartition(brca$y, times = 1, p = 0.2, list = FALSE)
test_x <- x_scaled[test_index,]
test_y <- brca$y[test_index]
train_x <- x_scaled[-test_index,]
train_y <- brca$y[-test_index]
mean(train_y == "B")
## [1] 0.628
mean(test_y == "B")
## [1] 0.626
Question 10a: K-means Clustering. The predict_kmeans() function defined here takes two arguments, a matrix of observations x and a k-means object k, and assigns each row of x to a cluster from k. Set the seed to 3. Perform k-means clustering on the training set with 2 centers and assign the output to k. Then use the predict_kmeans() function to make predictions on the test set.
What is the overall accuracy? 0.922
predict_kmeans <- function(x, k) {
centers <- k$centers # extract cluster centers
# calculate distance to cluster centers
distances <- sapply(1:nrow(x), function(i){
apply(centers, 1, function(y) dist(rbind(x[i,], y)))
})
max.col(-t(distances)) # select cluster with min distance to center
}
set.seed(3, sample.kind = "Rounding") # if using R 3.6 or later
k <- kmeans(train_x, centers = 2)
# cluster 1 corresponds to the benign class here; the high accuracy below confirms the labeling
kmeans_preds <- ifelse(predict_kmeans(test_x, k) == 1, "B", "M")
mean(kmeans_preds == test_y)
## [1] 0.922
table(test_y, kmeans_preds)
## kmeans_preds
## test_y B M
## B 71 1
## M 8 35
Question 10b: K-means Clustering. What proportion of benign tumors are correctly identified? 0.986 What proportion of malignant tumors are correctly identified? 0.814
sensitivity(factor(kmeans_preds), test_y, positive = "B")
## [1] 0.986
sensitivity(factor(kmeans_preds), test_y, positive = "M")
## [1] 0.814
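The same two numbers can also be read off caret's confusionMatrix(), which reports sensitivity and specificity for the chosen positive class in a single call (a sketch; its printed output is omitted here):
# full confusion matrix plus per-class statistics for the "M" class
confusionMatrix(factor(kmeans_preds), test_y, positive = "M")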
Question 11: Logistic regression model. Fit a logistic regression model on the training set using all predictors. Ignore warnings about the algorithm not converging. Make predictions on the test set.
What is the accuracy of the logistic regression model on the test set? 0.957
train_glm <- train(train_x, train_y,
method = "glm")
glm_preds <- predict(train_glm, test_x)
mean(glm_preds == test_y)
## [1] 0.957
Question 12: LDA and QDA models. Train an LDA model and a QDA model on the training set, then report each model's accuracy on the test set.
train_lda <- train(train_x, train_y,
method = "lda")
lda_preds <- predict(train_lda, test_x)
mean(lda_preds == test_y)
## [1] 0.991
train_qda <- train(train_x, train_y,
method = "qda")
qda_preds <- predict(train_qda, test_x)
mean(qda_preds == test_y)
## [1] 0.957
Question 13: Loess model. Fit a loess model (method "gamLoess") on the training set with the caret package; this requires the gam package.
library(gam)
## Loading required package: splines
## Loading required package: foreach
##
## Attaching package: 'foreach'
## The following objects are masked from 'package:purrr':
##
## accumulate, when
## Loaded gam 1.16.1
train_loess <- train(train_x, train_y,
method = "gamLoess")
loess_preds <- predict(train_loess, test_x)
mean(loess_preds == test_y)
## [1] 0.983
Question 14: K-nearest neighbors model. Set the seed to 7, then train a kNN model on the training set, tuning over odd values of k from 3 to 21.
set.seed(7, sample.kind = "Rounding") # if using R 3.6 or later
train_knn <- train(train_x, train_y,
method = "knn",
tuneGrid = data.frame(k = seq(3, 21, 2)))
knn_preds <- predict(train_knn, test_x)
mean(knn_preds == test_y)
## [1] 0.948
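To see which value of k the resampling selected (a quick inspection, analogous to the bestTune check used for the random forest below):
# k chosen by caret's internal cross-validation
train_knn$bestTune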
Question 15: Random forest model. Set the seed to 9, then train a random forest model on the training set, tuning over mtry values of 3, 5, 7 and 9. Which mtry gives the highest accuracy, and what is the most important variable?
set.seed(9, sample.kind = "Rounding") # if using R 3.6 or later
train_rf <- train(train_x, train_y,
method = "rf",
tuneGrid = data.frame(mtry = c(3,5,7,9)))
rf_preds <- predict(train_rf, test_x)
mean(rf_preds == test_y)
## [1] 0.974
train_rf$bestTune
## mtry
## 1 3
varImp(train_rf)
## rf variable importance
##
## only 20 most important variables shown (out of 30)
##
## Overall
## perimeter_worst 100.00
## concave_pts_worst 99.78
## area_worst 90.14
## radius_worst 85.67
## concave_pts_mean 72.50
## area_mean 55.22
## perimeter_mean 54.21
## radius_mean 53.45
## concavity_mean 40.05
## area_se 39.83
## concavity_worst 35.08
## compactness_worst 24.17
## compactness_mean 15.85
## radius_se 15.32
## texture_worst 14.27
## perimeter_se 12.54
## texture_mean 10.68
## smoothness_worst 10.23
## symmetry_worst 8.42
## concavity_se 4.49
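The importance table is already sorted, but the top variable can also be extracted programmatically (a sketch; varImp() on a train object returns a list whose $importance element is a data frame):
# name of the single most important variable
imp <- varImp(train_rf)$importance
rownames(imp)[which.max(imp$Overall)]
# perimeter_worst, per the table above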
Question 16: Ensemble prediction. Collect the test-set predictions of all seven models, compare their individual accuracies, and combine them with a majority vote.
model <- c("kmeans_preds","glm_preds","lda_preds","qda_preds","rf_preds","knn_preds","loess_preds")
# coerce each prediction vector to character so sapply returns a character matrix
# (sapply over factors can silently return their integer codes instead)
pred <- sapply(1:7, function(x){
as.character(get(model[x]))})
dim(pred)
## [1] 115 7
pred <- as.data.frame(pred)
names(pred) <-c("kmeans_preds","glm_preds","lda_preds","qda_preds","rf_preds","knn_preds","loess_preds")
acc <- colMeans(as.matrix(pred)==test_y)
acc
## kmeans_preds glm_preds lda_preds qda_preds rf_preds knn_preds
## 0.922 0.957 0.991 0.957 0.974 0.948
## loess_preds
## 0.983
mean(acc)
## [1] 0.961
# majority vote: predict "B" when more than half of the models predict "B"
ensemble <- cbind(glm = glm_preds == "B", lda = lda_preds == "B",
qda = qda_preds == "B", loess = loess_preds == "B",
rf = rf_preds == "B", knn = knn_preds == "B",
kmeans = kmeans_preds == "B")
ensemble_preds <- ifelse(rowMeans(ensemble) > 0.5, "B", "M")
mean(ensemble_preds == test_y)
## [1] 0.983
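Finally, it is worth checking which individual models, if any, beat the majority vote (a one-line comparison using the acc vector computed above):
# models with higher test accuracy than the ensemble
names(acc)[acc > mean(ensemble_preds == test_y)]
# only lda_preds (0.991) exceeds the ensemble's 0.983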