Select your Excel file manually
survey_file <- file.choose()
# Read the file using the default Q1-Q7 headers
survey_raw <- read_excel(survey_file)
# The first row contains the real variable names, so move it into the header
real_names <- survey_raw %>%
slice(1) %>%
unlist(use.names = FALSE) %>%
as.character()
survey <- survey_raw %>%
slice(-1)
names(survey) <- make_clean_names(real_names)
head(survey)
## # A tibble: 6 × 7
## standing electrontic_paper platform preference other_platform spending
## <chr> <chr> <chr> <chr> <chr> <chr>
## 1 Senior electronic McGraw Hill McGraw Hill N/A $50-$75
## 2 Senior electronic Pearson I don't have a… N/A less th…
## 3 Senior electronic McGraw Hill Pearson n/a less th…
## 4 Senior electronic Pearson I don't have a… <NA> $75-$100
## 5 Senior electronic McGraw Hill McGraw Hill N/A $50-$75
## 6 Junior electronic Pearson I don't have a… <NA> less th…
## # ℹ 1 more variable: expensive <chr>
names(survey)
## [1] "standing" "electrontic_paper" "platform"
## [4] "preference" "other_platform" "spending"
## [7] "expensive"
Prepare data
survey <- survey %>%
rename(
academic_status = standing,
textbook_format = electrontic_paper,
user_friendly = platform,
purchase_pref = preference,
other_platform = other_platform,
spending = spending,
price_perception = expensive
) %>%
mutate(across(everything(), ~trimws(as.character(.)))) %>%
filter(!is.na(academic_status), academic_status != "")
Build clustering variables
cluster_data <- survey %>%
mutate(
pearson_user = ifelse(user_friendly == "Pearson", 1, 0),
mcgraw_user = ifelse(user_friendly == "McGraw Hill", 1, 0),
pearson_pref = ifelse(purchase_pref == "Pearson", 1, 0),
mcgraw_pref = ifelse(purchase_pref == "McGraw Hill", 1, 0),
no_pref = ifelse(purchase_pref == "I don't have a preference", 1, 0),
different_platform = ifelse(purchase_pref == "I prefer a different platform", 1, 0),
pearson_expensive = ifelse(price_perception == "Pearson is more expensive", 1, 0),
mcgraw_expensive = ifelse(price_perception == "McGraw Hill is more expensive", 1, 0),
same_price = ifelse(price_perception == "They are about the same", 1, 0),
no_price_attention = ifelse(price_perception == "I don't pay attention to textbook prices", 1, 0),
prefers_electronic = ifelse(textbook_format == "electronic", 1, 0),
prefers_paper = ifelse(textbook_format == "paper", 1, 0),
no_format_pref = ifelse(textbook_format == "I have no preference", 1, 0)
) %>%
select(
pearson_user, mcgraw_user,
pearson_pref, mcgraw_pref, no_pref, different_platform,
pearson_expensive, mcgraw_expensive, same_price, no_price_attention,
prefers_electronic, prefers_paper, no_format_pref
)
cluster_data
## # A tibble: 27 × 13
## pearson_user mcgraw_user pearson_pref mcgraw_pref no_pref different_platform
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0 1 0 1 0 0
## 2 1 0 0 0 1 0
## 3 0 1 1 0 0 0
## 4 1 0 0 0 1 0
## 5 0 1 0 1 0 0
## 6 1 0 0 0 1 0
## 7 1 0 0 0 1 0
## 8 0 1 0 1 0 0
## 9 1 0 0 0 0 1
## 10 0 1 0 0 1 0
## # ℹ 17 more rows
## # ℹ 7 more variables: pearson_expensive <dbl>, mcgraw_expensive <dbl>,
## # same_price <dbl>, no_price_attention <dbl>, prefers_electronic <dbl>,
## # prefers_paper <dbl>, no_format_pref <dbl>
Elbow method
fviz_nbclust(cluster_data, kmeans, method = "wss") +
ggtitle("Elbow Method for Choosing Number of Clusters")

Run clustering
set.seed(123)
k3 <- kmeans(cluster_data, centers = 3, nstart = 25)
k3
## K-means clustering with 3 clusters of sizes 12, 6, 9
##
## Cluster means:
## pearson_user mcgraw_user pearson_pref mcgraw_pref no_pref
## 1 0 1 0.08333333 0.4166667 0.3333333
## 2 1 0 0.00000000 0.0000000 1.0000000
## 3 1 0 0.77777778 0.0000000 0.0000000
## different_platform pearson_expensive mcgraw_expensive same_price
## 1 0.1666667 0.1666667 0.08333333 0.5833333
## 2 0.0000000 0.1666667 0.16666667 0.3333333
## 3 0.2222222 0.0000000 0.33333333 0.6666667
## no_price_attention prefers_electronic prefers_paper no_format_pref
## 1 0.1666667 0.5833333 0.2500000 0.1666667
## 2 0.3333333 1.0000000 0.0000000 0.0000000
## 3 0.0000000 0.4444444 0.3333333 0.2222222
##
## Clustering vector:
## [1] 1 2 1 2 1 2 2 1 3 1 1 3 1 3 3 1 3 3 3 2 1 3 1 3 1 1 2
##
## Within cluster sum of squares by cluster:
## [1] 22.166667 4.333333 12.888889
## (between_SS / total_SS = 38.4 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
Visualize clusters
fviz_cluster(
k3,
data = cluster_data,
geom = "point",
ellipse.type = "convex",
palette = "jco",
ggtheme = theme_minimal(),
main = "Cluster Analysis of Student Survey Responses"
)

Cluster sizes
survey$cluster <- k3$cluster
survey %>%
count(cluster) %>%
ggplot(aes(x = factor(cluster), y = n, fill = factor(cluster))) +
geom_col(show.legend = FALSE) +
geom_text(aes(label = n), vjust = -0.3) +
labs(title = "Number of Respondents in Each Cluster",
x = "Cluster",
y = "Count") +
theme_minimal()

Cluster profiles
survey %>%
group_by(cluster) %>%
summarise(
Pearson_User = mean(user_friendly == "Pearson"),
McGraw_User = mean(user_friendly == "McGraw Hill"),
Pearson_Pref = mean(purchase_pref == "Pearson"),
McGraw_Pref = mean(purchase_pref == "McGraw Hill"),
No_Pref = mean(purchase_pref == "I don't have a preference"),
Different_Platform = mean(purchase_pref == "I prefer a different platform"),
Pearson_Expensive = mean(price_perception == "Pearson is more expensive"),
McGraw_Expensive = mean(price_perception == "McGraw Hill is more expensive"),
Same_Price = mean(price_perception == "They are about the same"),
Electronic = mean(textbook_format == "electronic"),
Paper = mean(textbook_format == "paper"),
Count = n()
)
## # A tibble: 3 × 13
## cluster Pearson_User McGraw_User Pearson_Pref McGraw_Pref No_Pref
## <int> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 0 1 0.0833 0.417 0.333
## 2 2 1 0 0 0 1
## 3 3 1 0 0.778 0 0
## # ℹ 7 more variables: Different_Platform <dbl>, Pearson_Expensive <dbl>,
## # McGraw_Expensive <dbl>, Same_Price <dbl>, Electronic <dbl>, Paper <dbl>,
## # Count <int>