Question 2
# URL of the CSV file
url <- "https://s3.us-east-2.amazonaws.com/artificium.us/datasets/Prostate_Cancer_Modified.csv"
# Read the CSV file into a data frame. The first line of the file supplies the
# column names, and text columns stay character vectors (no factor conversion).
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned, which would silently change how the file is parsed.
df <- read.csv(url, stringsAsFactors = FALSE, header = TRUE)
# Show the first five rows of df
head(df, 5)
Question 3
# 10% trimmed mean of smoothness, computed from the non-missing values only
smoothness_trimmed_mean <- mean(df$smoothness, na.rm = TRUE, trim = 0.1)
# Fill every missing smoothness value with that trimmed mean
df$smoothness <- replace(
  df$smoothness,
  is.na(df$smoothness),
  smoothness_trimmed_mean
)
Question 4
library(dplyr)
# Standardise a numeric vector: subtract its mean, then divide by its
# standard deviation. Returns a vector of the same length as `x`.
z_score <- function(x) {
  centred <- x - mean(x)
  centred / sd(x)
}
# Feature columns: everything except the identifier and the class label
numeric_columns <- select(df, -id, -diagnosis_result)
# Standardise each feature column with z_score; id and diagnosis_result are
# carried over unchanged
df.norm <- mutate(df, across(all_of(names(numeric_columns)), z_score))
Question 5
library(caret)
# Use caret function createDataPartition to split data into training and test sets
# (approach taken from the kNN practice exercise).
# The partition is drawn at random, so fix the RNG seed first; without it every
# run produces a different split and downstream results cannot be reproduced.
set.seed(12345)
# Sample 80% of the rows for training, balanced on the class label;
# list = FALSE returns the selected row indices as a matrix instead of a list.
indxTrain <- createDataPartition(y = df.norm$diagnosis_result, p = 0.8, list = FALSE)
df.train <- df.norm[indxTrain, ]
# All rows not selected for training form the validation set
df.val <- df.norm[-indxTrain, ]
Question 6
# Tabulate the diagnosis_result column of `data`.
# Returns a data frame with one row per class: the class label (Class), the
# number of rows in that class (Count), and its share of all rows (Proportion).
# Asked ChatGPT for help formatting the output of this function
class_distribution <- function(data) {
  counts <- table(data$diagnosis_result)
  data.frame(
    Class = names(counts),
    Count = as.vector(counts),
    Proportion = as.vector(prop.table(counts))
  )
}
# Summarise the class balance of the full data set and of each partition
original_distribution <- df.norm %>% class_distribution()
train_distribution <- df.train %>% class_distribution()
val_distribution <- df.val %>% class_distribution()
# Show the three summaries (recorded console output follows each call)
print(original_distribution)
## Class Count Proportion
## 1 B 39 0.3823529
## 2 M 63 0.6176471
print(train_distribution)
## Class Count Proportion
## 1 B 32 0.3855422
## 2 M 51 0.6144578
print(val_distribution)
## Class Count Proportion
## 1 B 7 0.3684211
## 2 M 12 0.6315789
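- Class Imbalance:
- The original dataset is moderately imbalanced (about 38% B vs. 62% M), and
the training set (about 39% B vs. 61% M) has a class distribution that is very
similar to the original dataset
- The validation set also retains a similar distribution (about 37% B vs.
63% M), but the absolute counts are lower because it holds only about 20% of
the data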
- Proportionality Compared to the Original Dataset:
- The proportions of Benign and Malignant classifications in both the
training and validation sets are consistent with the original data
- Class imbalance is not a major issue: the roughly 38/62 split is only
moderately skewed, and the partition preserves it
- Addressing Imbalance:
- Given that the proportions are relatively consistent across the
datasets, the need for intervention is low
- If further analysis reveals that model performance is lacking,
oversampling minority class B may yield better results
Question 7:
# Calculate k as the square root of the number of observations in df.train
k <- round(sqrt(nrow(df.train)))
# caret scores the model by bootstrap resampling, which is random; seed the RNG
# so the reported accuracy can be reproduced.
set.seed(12345)
# Train the KNN model using caret.
# `id` is dropped from the training data: it was excluded from the z-score step,
# so it is still on its raw scale and would dominate the kNN distance if the
# `~ .` formula picked it up as a predictor (presumably it is only a row
# identifier - confirm against the data dictionary).
# NOTE(review): output recorded while `id` was still a predictor (9 predictors)
# will no longer match after re-running.
model <- train(
  diagnosis_result ~ .,
  data = df.train[, setdiff(names(df.train), "id"), drop = FALSE],
  method = "knn",
  tuneGrid = expand.grid(k = k)
)
# Output the model summary
print(model)
## k-Nearest Neighbors
##
## 83 samples
## 9 predictor
## 2 classes: 'B', 'M'
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 83, 83, 83, 83, 83, 83, ...
## Resampling results:
##
## Accuracy Kappa
## 0.6482241 0.2884335
##
## Tuning parameter 'k' was held constant at a value of 9
Question 8:
library(gmodels)
# Make predictions on the validation set with the fitted caret model
predictions <- predict(model, newdata = df.val)
# Create and display the confusion matrix: rows are the actual diagnoses (x),
# columns are the predicted classes (y). prop.chisq = FALSE leaves the
# chi-square contribution out of each cell. The argument expressions appear
# as the row/column labels in the printed table, so keep them as written.
CrossTable(x = df.val$diagnosis_result, y = predictions, prop.chisq = FALSE)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 19
##
##
## | predictions
## df.val$diagnosis_result | B | M | Row Total |
## ------------------------|-----------|-----------|-----------|
## B | 2 | 5 | 7 |
## | 0.286 | 0.714 | 0.368 |
## | 0.250 | 0.455 | |
## | 0.105 | 0.263 | |
## ------------------------|-----------|-----------|-----------|
## M | 6 | 6 | 12 |
## | 0.500 | 0.500 | 0.632 |
## | 0.750 | 0.545 | |
## | 0.316 | 0.316 | |
## ------------------------|-----------|-----------|-----------|
## Column Total | 8 | 11 | 19 |
## | 0.421 | 0.579 | |
## ------------------------|-----------|-----------|-----------|
##
##