Pre-loading

rm(list = ls())
set.seed(1)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
column_names <- c("SampleCodeNumber", "ClumpThickness", "UniformityOfCellSize", "UniformityOfCellShape", "MarginalAdhesion", "SingleEpithelialCellSize", "BareNuclei", "BlandChromatin", "NormalNucleoli", "Mitoses", "Class")

cancer_df <- read_csv('./Datasets/Classification/breast-cancer-wisconsin.data',
                      col_names = column_names, show_col_types = FALSE) %>%
  # Convert all columns to numeric
  mutate(across(everything(), as.numeric)) %>%
  # Drop rows with any NA values
  drop_na() %>%
  # Recode Class: 2 (benign) becomes 0, 4 (malignant) becomes 1
  mutate(Class = if_else(Class == 2, 0, 1))
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `across(everything(), as.numeric)`.
## Caused by warning:
## ! NAs introduced by coercion
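The coercion warning comes from the raw file's missing-value marker (the UCI file encodes unknown BareNuclei values as "?"). As an optional check, the sketch below (not part of the original pipeline; it re-reads the raw file) counts how many values per column fail numeric coercion.

raw_df <- read_csv('./Datasets/Classification/breast-cancer-wisconsin.data',
                   col_names = column_names, show_col_types = FALSE)
# Count, per column, the values that become NA when coerced to numeric
raw_df %>%
  summarise(across(everything(), ~ sum(is.na(suppressWarnings(as.numeric(.x))))))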

1

Use the appropriate functions to obtain descriptive information about the variables included in the dataset (paste or include a screenshot with the resulting information).

str(cancer_df)
## tibble [683 × 11] (S3: tbl_df/tbl/data.frame)
##  $ SampleCodeNumber        : num [1:683] 1000025 1002945 1015425 1016277 1017023 ...
##  $ ClumpThickness          : num [1:683] 5 5 3 6 4 8 1 2 2 4 ...
##  $ UniformityOfCellSize    : num [1:683] 1 4 1 8 1 10 1 1 1 2 ...
##  $ UniformityOfCellShape   : num [1:683] 1 4 1 8 1 10 1 2 1 1 ...
##  $ MarginalAdhesion        : num [1:683] 1 5 1 1 3 8 1 1 1 1 ...
##  $ SingleEpithelialCellSize: num [1:683] 2 7 2 3 2 7 2 2 2 2 ...
##  $ BareNuclei              : num [1:683] 1 10 2 4 1 10 10 1 1 1 ...
##  $ BlandChromatin          : num [1:683] 3 3 3 3 3 9 3 3 1 2 ...
##  $ NormalNucleoli          : num [1:683] 1 2 1 7 1 7 1 1 1 1 ...
##  $ Mitoses                 : num [1:683] 1 1 1 1 1 1 1 1 5 1 ...
##  $ Class                   : num [1:683] 0 0 0 0 0 1 0 0 0 0 ...
summary(cancer_df)
##  SampleCodeNumber   ClumpThickness   UniformityOfCellSize UniformityOfCellShape
##  Min.   :   63375   Min.   : 1.000   Min.   : 1.000       Min.   : 1.000       
##  1st Qu.:  877617   1st Qu.: 2.000   1st Qu.: 1.000       1st Qu.: 1.000       
##  Median : 1171795   Median : 4.000   Median : 1.000       Median : 1.000       
##  Mean   : 1076720   Mean   : 4.442   Mean   : 3.151       Mean   : 3.215       
##  3rd Qu.: 1238705   3rd Qu.: 6.000   3rd Qu.: 5.000       3rd Qu.: 5.000       
##  Max.   :13454352   Max.   :10.000   Max.   :10.000       Max.   :10.000       
##  MarginalAdhesion SingleEpithelialCellSize   BareNuclei     BlandChromatin  
##  Min.   : 1.00    Min.   : 1.000           Min.   : 1.000   Min.   : 1.000  
##  1st Qu.: 1.00    1st Qu.: 2.000           1st Qu.: 1.000   1st Qu.: 2.000  
##  Median : 1.00    Median : 2.000           Median : 1.000   Median : 3.000  
##  Mean   : 2.83    Mean   : 3.234           Mean   : 3.545   Mean   : 3.445  
##  3rd Qu.: 4.00    3rd Qu.: 4.000           3rd Qu.: 6.000   3rd Qu.: 5.000  
##  Max.   :10.00    Max.   :10.000           Max.   :10.000   Max.   :10.000  
##  NormalNucleoli     Mitoses           Class       
##  Min.   : 1.00   Min.   : 1.000   Min.   :0.0000  
##  1st Qu.: 1.00   1st Qu.: 1.000   1st Qu.:0.0000  
##  Median : 1.00   Median : 1.000   Median :0.0000  
##  Mean   : 2.87   Mean   : 1.603   Mean   :0.3499  
##  3rd Qu.: 4.00   3rd Qu.: 1.000   3rd Qu.:1.0000  
##  Max.   :10.00   Max.   :10.000   Max.   :1.0000
dim(cancer_df)
## [1] 683  11
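One more descriptive summary that may be worth including (a sketch, not part of the original output): the class balance after recoding, where 0 is benign and 1 is malignant.

# Counts and proportions of benign (0) vs. malignant (1) samples
table(cancer_df$Class)
prop.table(table(cancer_df$Class))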

2

Calculate the correlation between the different attributes (include the figure produced by R in your answer).

cor(cancer_df)
##                          SampleCodeNumber ClumpThickness UniformityOfCellSize
## SampleCodeNumber               1.00000000    -0.05634966          -0.04139605
## ClumpThickness                -0.05634966     1.00000000           0.64248149
## UniformityOfCellSize          -0.04139605     0.64248149           1.00000000
## UniformityOfCellShape         -0.04222123     0.65346999           0.90722823
## MarginalAdhesion              -0.06963009     0.48782872           0.70697695
## SingleEpithelialCellSize      -0.04864387     0.52359604           0.75354402
## BareNuclei                    -0.09924781     0.59309144           0.69170875
## BlandChromatin                -0.06196640     0.55374245           0.75555916
## NormalNucleoli                -0.05069861     0.53406591           0.71934604
## Mitoses                       -0.03797243     0.35095717           0.46075470
## Class                         -0.08470103     0.71478993           0.82080144
##                          UniformityOfCellShape MarginalAdhesion
## SampleCodeNumber                   -0.04222123      -0.06963009
## ClumpThickness                      0.65346999       0.48782872
## UniformityOfCellSize                0.90722823       0.70697695
## UniformityOfCellShape               1.00000000       0.68594806
## MarginalAdhesion                    0.68594806       1.00000000
## SingleEpithelialCellSize            0.72246241       0.59454777
## BareNuclei                          0.71387755       0.67064829
## BlandChromatin                      0.73534350       0.66856706
## NormalNucleoli                      0.71796341       0.60312106
## Mitoses                             0.44125758       0.41889833
## Class                               0.82189095       0.70629414
##                          SingleEpithelialCellSize  BareNuclei BlandChromatin
## SampleCodeNumber                      -0.04864387 -0.09924781     -0.0619664
## ClumpThickness                         0.52359604  0.59309144      0.5537424
## UniformityOfCellSize                   0.75354402  0.69170875      0.7555592
## UniformityOfCellShape                  0.72246241  0.71387755      0.7353435
## MarginalAdhesion                       0.59454777  0.67064829      0.6685671
## SingleEpithelialCellSize               1.00000000  0.58571613      0.6181279
## BareNuclei                             0.58571613  1.00000000      0.6806149
## BlandChromatin                         0.61812790  0.68061486      1.0000000
## NormalNucleoli                         0.62892640  0.58428020      0.6656015
## Mitoses                                0.48058330  0.33921044      0.3460109
## Class                                  0.69095816  0.82269587      0.7582276
##                          NormalNucleoli     Mitoses       Class
## SampleCodeNumber            -0.05069861 -0.03797243 -0.08470103
## ClumpThickness               0.53406591  0.35095717  0.71478993
## UniformityOfCellSize         0.71934604  0.46075470  0.82080144
## UniformityOfCellShape        0.71796341  0.44125758  0.82189095
## MarginalAdhesion             0.60312106  0.41889833  0.70629414
## SingleEpithelialCellSize     0.62892640  0.48058330  0.69095816
## BareNuclei                   0.58428020  0.33921044  0.82269587
## BlandChromatin               0.66560153  0.34601089  0.75822755
## NormalNucleoli               1.00000000  0.43375727  0.71867719
## Mitoses                      0.43375727  1.00000000  0.42344792
## Class                        0.71867719  0.42344792  1.00000000
pairs(cancer_df)
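If a single figure is preferred over the full pairs() matrix, a correlation heatmap is a compact alternative. The sketch below builds one with ggplot2 (already loaded via tidyverse); it is an optional addition, not part of the original answer.

# Reshape the correlation matrix to long format and draw it as a heatmap
cor(cancer_df) %>%
  as.data.frame() %>%
  rownames_to_column("Var1") %>%
  pivot_longer(-Var1, names_to = "Var2", values_to = "Correlation") %>%
  ggplot(aes(x = Var1, y = Var2, fill = Correlation)) +
  geom_tile() +
  scale_fill_gradient2(limits = c(-1, 1)) +
  labs(x = NULL, y = NULL) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))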

3

Divide the input dataset into training and testing. a. Split the dataset using 80% for training and 20% for testing. b. How many examples will be used for training and how many for testing?

# Split data
index <- sample(1:nrow(cancer_df), size = nrow(cancer_df)*0.8)
train <- cancer_df[index,]
test <- cancer_df[-index,]

# Number of training and test examples  
train_num <- nrow(train)
test_num <- nrow(test)

cat("Number of training examples:", train_num, "\n")
## Number of training examples: 546
cat("Number of test examples:", test_num)
## Number of test examples: 137
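As a sanity check on the random split (a sketch; it uses only the objects created above), the class proportions in the training and test sets should be roughly similar to those of the full dataset.

# Proportion of benign (0) and malignant (1) cases in each split
prop.table(table(train$Class))
prop.table(table(test$Class))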

4

Build a logistic regression model including all the input variables. a. How does the model perform? Provide the confusion matrix and the test error. b. Which predictors are statistically significant?

# Build the logistic regression model using all input variables
glm_model <- glm(formula = Class ~ ., data = train, family = binomial)

# Summary to check statistically significant predictors
summary(glm_model)
## 
## Call:
## glm(formula = Class ~ ., family = binomial, data = train)
## 
## Coefficients:
##                            Estimate Std. Error z value Pr(>|z|)    
## (Intercept)              -1.243e+01  2.244e+00  -5.539 3.04e-08 ***
## SampleCodeNumber          4.766e-08  9.949e-07   0.048 0.961793    
## ClumpThickness            6.698e-01  1.834e-01   3.652 0.000260 ***
## UniformityOfCellSize     -2.048e-01  2.542e-01  -0.806 0.420397    
## UniformityOfCellShape     3.652e-01  2.763e-01   1.322 0.186121    
## MarginalAdhesion          4.904e-01  1.564e-01   3.136 0.001710 ** 
## SingleEpithelialCellSize  2.440e-02  1.932e-01   0.126 0.899491    
## BareNuclei                4.782e-01  1.242e-01   3.850 0.000118 ***
## BlandChromatin            5.896e-01  2.249e-01   2.622 0.008744 ** 
## NormalNucleoli            4.095e-01  1.560e-01   2.624 0.008678 ** 
## Mitoses                   9.444e-01  3.267e-01   2.891 0.003839 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 716.293  on 545  degrees of freedom
## Residual deviance:  63.735  on 535  degrees of freedom
## AIC: 85.735
## 
## Number of Fisher Scoring iterations: 9
# Predictions on the test set
test_predictions_prob <- predict(glm_model, newdata = test, type = "response")
test_predictions <- ifelse(test_predictions_prob > 0.5, 1, 0) # Classify as malignant (1) when the predicted probability exceeds 0.5

# Actual outcomes
actual_outcomes <- test$Class

# Confusion matrix
confusion_matrix <- table(Predicted = test_predictions, Actual = actual_outcomes)
print(confusion_matrix)
##          Actual
## Predicted  0  1
##         0 94  3
##         1  3 37
# Calculate test error
test_error <- mean(test_predictions != actual_outcomes)
print(test_error)
## [1] 0.04379562
# Check statistically significant predictors
summary(glm_model)$coefficients
##                               Estimate   Std. Error     z value     Pr(>|z|)
## (Intercept)              -1.242742e+01 2.243607e+00 -5.53903811 3.041375e-08
## SampleCodeNumber          4.766162e-08 9.949435e-07  0.04790384 9.617929e-01
## ClumpThickness            6.697730e-01 1.833768e-01  3.65244090 2.597594e-04
## UniformityOfCellSize     -2.048004e-01 2.541789e-01 -0.80573333 4.203966e-01
## UniformityOfCellShape     3.652422e-01 2.762506e-01  1.32214078 1.861213e-01
## MarginalAdhesion          4.904327e-01 1.563673e-01  3.13641429 1.710274e-03
## SingleEpithelialCellSize  2.440436e-02 1.932187e-01  0.12630436 8.994910e-01
## BareNuclei                4.781580e-01 1.241897e-01  3.85022325 1.180102e-04
## BlandChromatin            5.896159e-01 2.248800e-01  2.62191354 8.743761e-03
## NormalNucleoli            4.094855e-01 1.560242e-01  2.62449980 8.677634e-03
## Mitoses                   9.444334e-01 3.266691e-01  2.89110112 3.838946e-03
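For part (b), the significant predictors can also be pulled out programmatically from the coefficient table above; a small sketch:

# Predictors with p-value below 0.05 (excluding the intercept)
coef_table <- summary(glm_model)$coefficients
setdiff(rownames(coef_table)[coef_table[, "Pr(>|z|)"] < 0.05], "(Intercept)")

In this fit, the predictors significant at the 5% level are ClumpThickness, MarginalAdhesion, BareNuclei, BlandChromatin, NormalNucleoli, and Mitoses.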

5

Build a KNN model testing two different Ks. Start with K equal to the square root of N, where N is the total number of samples (used frequently as a way to initialize K). a. How does the model perform? Provide the confusion matrix and the test error.

library(class)
# Calculate the square root of N (number of training samples) to determine K
K <- round(sqrt(nrow(train)))

# Preparing the data for KNN
# Exclude the outcome variable 'Class' for features
# Assuming 'Class' is the last column in cancer_df
train_X <- train[, -ncol(train)]
test_X <- test[, -ncol(test)]
train_Y <- train$Class
test_Y <- test$Class

# KNN model with K = sqrt(N)
knn_pred_sqrt_N <- knn(train_X, test_X, train_Y, k = K)

# Confusion matrix and test error for K = sqrt(N)
conf_matrix_sqrt_N <- table(Predicted = knn_pred_sqrt_N, Actual = test_Y)
cat("Confusion Matrix for K =", K, ":\n")
## Confusion Matrix for K = 23 :
print(conf_matrix_sqrt_N)
##          Actual
## Predicted  0  1
##         0 81 32
##         1 16  8
test_error_sqrt_N <- mean(knn_pred_sqrt_N != test_Y)
cat("Test error with K =", K, ":", test_error_sqrt_N, "\n")
## Test error with K = 23 : 0.350365
# Try another value of K, for example, K + 5
knn_pred_K_plus_5 <- knn(train_X, test_X, train_Y, k = K + 5)

# Confusion matrix and test error for K = K + 5
conf_matrix_K_plus_5 <- table(Predicted = knn_pred_K_plus_5, Actual = test_Y)
cat("Confusion Matrix for K =", K + 5, ":\n")
## Confusion Matrix for K = 28 :
print(conf_matrix_K_plus_5)
##          Actual
## Predicted  0  1
##         0 84 33
##         1 13  7
test_error_K_plus_5 <- mean(knn_pred_K_plus_5 != test_Y)
cat("Test error with K =", K + 5, ":", test_error_K_plus_5)
## Test error with K = 28 : 0.3357664
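KNN is distance-based, so the unscaled SampleCodeNumber column (values in the millions) dominates the Euclidean distances, which likely explains the poor results above. One possible refinement, shown as a sketch under that assumption, is to drop the ID column and standardize the remaining predictors using the training-set statistics.

# Drop SampleCodeNumber (column 1) and Class (last column), then standardize
train_X_scaled <- scale(train[, 2:(ncol(train) - 1)])
test_X_scaled  <- scale(test[, 2:(ncol(test) - 1)],
                        center = attr(train_X_scaled, "scaled:center"),
                        scale  = attr(train_X_scaled, "scaled:scale"))

# Re-run KNN with the same K on the rescaled features
knn_pred_scaled <- knn(train_X_scaled, test_X_scaled, train_Y, k = K)
mean(knn_pred_scaled != test_Y)  # test error for this run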

6

Build an LDA model including all the input variables. a. How does the model perform? Provide the confusion matrix and the test error. b. Which predictors have more weight on the class? c. Plot the linear discriminants.

library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
# Step 1: Train the LDA model
lda_model <- lda(Class ~ ., data = train)

# Step 2: Model Performance
# Predicting on the test set
test_pred <- predict(lda_model, newdata = test)$class

# Creating a confusion matrix
conf_matrix <- table(Predicted = test_pred, Actual = test$Class)
print(conf_matrix)
##          Actual
## Predicted  0  1
##         0 94  5
##         1  3 35
# Calculating test error
test_error <- mean(test_pred != test$Class)
cat("Test error:", test_error, "\n")
## Test error: 0.05839416
# Step 3: Predictor Weights
# Examining the model coefficients to see predictor importance
cat("Coefficients of linear discriminants:\n")
## Coefficients of linear discriminants:
print(lda_model$scaling)
##                                    LD1
## SampleCodeNumber         -4.875836e-08
## ClumpThickness            1.882809e-01
## UniformityOfCellSize      9.524711e-02
## UniformityOfCellShape     1.296649e-01
## MarginalAdhesion          4.621327e-02
## SingleEpithelialCellSize  4.288567e-02
## BareNuclei                2.734438e-01
## BlandChromatin            9.124872e-02
## NormalNucleoli            1.176545e-01
## Mitoses                   2.920406e-02
# Step 4: Plot Linear Discriminants
# Plot the LDA model
plot(lda_model, dimen = 1, col = c("red", "blue")) # Adjust 'dimen' if needed
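For part (b), one simple way to compare predictor weights is to rank the LD1 coefficients shown above by absolute value (a sketch; the coefficients are on the original 1–10 scales, so this is only a rough ranking):

# Rank predictors by the magnitude of their LD1 coefficient
sort(abs(lda_model$scaling[, "LD1"]), decreasing = TRUE)

By this measure, BareNuclei and ClumpThickness carry the largest weights.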

7

Build a QDA model including all the input variables. a. How does the model perform? Provide the confusion matrix and the test error.

# Step 1: Train the QDA model
qda_model <- qda(Class ~ ., data = train)

# Step 2: Model Evaluation
# Predicting on the test set
test_pred_qda <- predict(qda_model, newdata = test)$class

# Creating a confusion matrix
conf_matrix_qda <- table(Predicted = test_pred_qda, Actual = test$Class)
print("Confusion Matrix:")
## [1] "Confusion Matrix:"
print(conf_matrix_qda)
##          Actual
## Predicted  0  1
##         0 93  2
##         1  4 38
# Calculating test error
test_error_qda <- mean(test_pred_qda != test$Class)
cat("Test error:", test_error_qda, "\n")
## Test error: 0.04379562
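A few additional summaries from the confusion matrix above may help with the comparison in the next question (a sketch; class 1, malignant, is treated as the positive class):

# Accuracy, sensitivity and specificity from the QDA confusion matrix
accuracy    <- sum(diag(conf_matrix_qda)) / sum(conf_matrix_qda)
sensitivity <- conf_matrix_qda["1", "1"] / sum(conf_matrix_qda[, "1"])
specificity <- conf_matrix_qda["0", "0"] / sum(conf_matrix_qda[, "0"])
c(accuracy = accuracy, sensitivity = sensitivity, specificity = specificity)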

8

Compare the performance of the different models built and comment on it.

Somewhat surprisingly, logistic regression and QDA tie for the lowest test error (about 4.4%), so they are the best-performing models here. LDA is close behind with a slightly higher test error (about 5.8%), while KNN is clearly the worst on this task (test errors above 33%), likely because the predictors were used unscaled and the SampleCodeNumber column dominates the distance calculations.
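A compact way to line the models up is a small summary table (a sketch; note that `test_error` was reused for both the logistic regression and LDA models above, so the logistic-regression value from its own output, 0.04379562, is written out explicitly here):

# Collect the test errors computed above into one table
tibble(
  Model     = c("Logistic regression", "KNN (k = 23)", "KNN (k = 28)", "LDA", "QDA"),
  TestError = c(0.04379562, test_error_sqrt_N, test_error_K_plus_5, test_error, test_error_qda)
) %>%
  arrange(TestError)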

The code is also publicly available at https://rpubs.com/AlanHuang/CSC642-R_Assignment2