Select your Excel file manually

survey_file <- file.choose()

# Read the file using the default Q1-Q7 headers
survey_raw <- read_excel(survey_file)

# The first row contains the real variable names, so move it into the header
real_names <- survey_raw %>%
  slice(1) %>%
  unlist(use.names = FALSE) %>%
  as.character()

survey <- survey_raw %>%
  slice(-1)

names(survey) <- make_clean_names(real_names)

head(survey)
## # A tibble: 6 × 7
##   standing electrontic_paper platform    preference      other_platform spending
##   <chr>    <chr>             <chr>       <chr>           <chr>          <chr>   
## 1 Senior   electronic        McGraw Hill McGraw Hill     N/A            $50-$75 
## 2 Senior   electronic        Pearson     I don't have a… N/A            less th…
## 3 Senior   electronic        McGraw Hill Pearson         n/a            less th…
## 4 Senior   electronic        Pearson     I don't have a… <NA>           $75-$100
## 5 Senior   electronic        McGraw Hill McGraw Hill     N/A            $50-$75 
## 6 Junior   electronic        Pearson     I don't have a… <NA>           less th…
## # ℹ 1 more variable: expensive <chr>
names(survey)
## [1] "standing"          "electrontic_paper" "platform"         
## [4] "preference"        "other_platform"    "spending"         
## [7] "expensive"

Prepare data

survey <- survey %>%
  rename(
    academic_status = standing,
    textbook_format = electrontic_paper,
    user_friendly = platform,
    purchase_pref = preference,
    other_platform = other_platform,
    spending = spending,
    price_perception = expensive
  ) %>%
  mutate(across(everything(), ~trimws(as.character(.)))) %>%
  filter(!is.na(academic_status), academic_status != "")

Build clustering variables

cluster_data <- survey %>%
  mutate(
    pearson_user = ifelse(user_friendly == "Pearson", 1, 0),
    mcgraw_user = ifelse(user_friendly == "McGraw Hill", 1, 0),
    pearson_pref = ifelse(purchase_pref == "Pearson", 1, 0),
    mcgraw_pref = ifelse(purchase_pref == "McGraw Hill", 1, 0),
    no_pref = ifelse(purchase_pref == "I don't have a preference", 1, 0),
    different_platform = ifelse(purchase_pref == "I prefer a different platform", 1, 0),
    pearson_expensive = ifelse(price_perception == "Pearson is more expensive", 1, 0),
    mcgraw_expensive = ifelse(price_perception == "McGraw Hill is more expensive", 1, 0),
    same_price = ifelse(price_perception == "They are about the same", 1, 0),
    no_price_attention = ifelse(price_perception == "I don't pay attention to textbook prices", 1, 0),
    prefers_electronic = ifelse(textbook_format == "electronic", 1, 0),
    prefers_paper = ifelse(textbook_format == "paper", 1, 0),
    no_format_pref = ifelse(textbook_format == "I have no preference", 1, 0)
  ) %>%
  select(
    pearson_user, mcgraw_user,
    pearson_pref, mcgraw_pref, no_pref, different_platform,
    pearson_expensive, mcgraw_expensive, same_price, no_price_attention,
    prefers_electronic, prefers_paper, no_format_pref
  )

cluster_data
## # A tibble: 27 × 13
##    pearson_user mcgraw_user pearson_pref mcgraw_pref no_pref different_platform
##           <dbl>       <dbl>        <dbl>       <dbl>   <dbl>              <dbl>
##  1            0           1            0           1       0                  0
##  2            1           0            0           0       1                  0
##  3            0           1            1           0       0                  0
##  4            1           0            0           0       1                  0
##  5            0           1            0           1       0                  0
##  6            1           0            0           0       1                  0
##  7            1           0            0           0       1                  0
##  8            0           1            0           1       0                  0
##  9            1           0            0           0       0                  1
## 10            0           1            0           0       1                  0
## # ℹ 17 more rows
## # ℹ 7 more variables: pearson_expensive <dbl>, mcgraw_expensive <dbl>,
## #   same_price <dbl>, no_price_attention <dbl>, prefers_electronic <dbl>,
## #   prefers_paper <dbl>, no_format_pref <dbl>

Elbow method

fviz_nbclust(cluster_data, kmeans, method = "wss") +
  ggtitle("Elbow Method for Choosing Number of Clusters")

Run clustering

set.seed(123)
k3 <- kmeans(cluster_data, centers = 3, nstart = 25)
k3
## K-means clustering with 3 clusters of sizes 12, 6, 9
## 
## Cluster means:
##   pearson_user mcgraw_user pearson_pref mcgraw_pref   no_pref
## 1            0           1   0.08333333   0.4166667 0.3333333
## 2            1           0   0.00000000   0.0000000 1.0000000
## 3            1           0   0.77777778   0.0000000 0.0000000
##   different_platform pearson_expensive mcgraw_expensive same_price
## 1          0.1666667         0.1666667       0.08333333  0.5833333
## 2          0.0000000         0.1666667       0.16666667  0.3333333
## 3          0.2222222         0.0000000       0.33333333  0.6666667
##   no_price_attention prefers_electronic prefers_paper no_format_pref
## 1          0.1666667          0.5833333     0.2500000      0.1666667
## 2          0.3333333          1.0000000     0.0000000      0.0000000
## 3          0.0000000          0.4444444     0.3333333      0.2222222
## 
## Clustering vector:
##  [1] 1 2 1 2 1 2 2 1 3 1 1 3 1 3 3 1 3 3 3 2 1 3 1 3 1 1 2
## 
## Within cluster sum of squares by cluster:
## [1] 22.166667  4.333333 12.888889
##  (between_SS / total_SS =  38.4 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"

Visualize clusters

fviz_cluster(
  k3,
  data = cluster_data,
  geom = "point",
  ellipse.type = "convex",
  palette = "jco",
  ggtheme = theme_minimal(),
  main = "Cluster Analysis of Student Survey Responses"
)

Cluster sizes

survey$cluster <- k3$cluster

survey %>%
  count(cluster) %>%
  ggplot(aes(x = factor(cluster), y = n, fill = factor(cluster))) +
  geom_col(show.legend = FALSE) +
  geom_text(aes(label = n), vjust = -0.3) +
  labs(title = "Number of Respondents in Each Cluster",
       x = "Cluster",
       y = "Count") +
  theme_minimal()

Cluster profiles

survey %>%
  group_by(cluster) %>%
  summarise(
    Pearson_User = mean(user_friendly == "Pearson"),
    McGraw_User = mean(user_friendly == "McGraw Hill"),
    Pearson_Pref = mean(purchase_pref == "Pearson"),
    McGraw_Pref = mean(purchase_pref == "McGraw Hill"),
    No_Pref = mean(purchase_pref == "I don't have a preference"),
    Different_Platform = mean(purchase_pref == "I prefer a different platform"),
    Pearson_Expensive = mean(price_perception == "Pearson is more expensive"),
    McGraw_Expensive = mean(price_perception == "McGraw Hill is more expensive"),
    Same_Price = mean(price_perception == "They are about the same"),
    Electronic = mean(textbook_format == "electronic"),
    Paper = mean(textbook_format == "paper"),
    Count = n()
  )
## # A tibble: 3 × 13
##   cluster Pearson_User McGraw_User Pearson_Pref McGraw_Pref No_Pref
##     <int>        <dbl>       <dbl>        <dbl>       <dbl>   <dbl>
## 1       1            0           1       0.0833       0.417   0.333
## 2       2            1           0       0            0       1    
## 3       3            1           0       0.778        0       0    
## # ℹ 7 more variables: Different_Platform <dbl>, Pearson_Expensive <dbl>,
## #   McGraw_Expensive <dbl>, Same_Price <dbl>, Electronic <dbl>, Paper <dbl>,
## #   Count <int>