#Clear the environment
rm(list = ls())
# Load necessary libraries
library(openxlsx)
## Warning: package 'openxlsx' was built under R version 4.3.3
library(nnet)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(cluster)
library(tidyr)
# Load the dataset
file_path <- "~/Desktop/Assignment Data - Ford Ka.xlsx"
sheet_names <- getSheetNames(file_path)
sheet_names
## [1] "Demographic Data" "Demographics Code"
## [3] "MDS Data" "Psychographic Data"
## [5] "Psychographic questionnaire"
# Read each sheet:
demographics_data <- read.xlsx(
file_path,
sheet = "Demographic Data",
colNames = TRUE,
startRow = 6
)
psychographic_data <- read.xlsx(
file_path,
sheet = "Psychographic Data",
colNames = TRUE,
startRow = 6
)
names(demographics_data)
## [1] "Respondent.Number" "Preference.Group" "Gender"
## [4] "Age" "Marital.Status" "Number.of.Children"
## [7] "1st.Time.Purchase" "Age.Category" "Children.Category"
## [10] "Income.Category"
names(psychographic_data)
## [1] "Respondent.Number" "Q1" "Q2"
## [4] "Q3" "Q4" "Q5"
## [7] "Q6" "Q7" "Q8"
## [10] "Q9" "Q10" "Q11"
## [13] "Q12" "Q13" "Q14"
## [16] "Q15" "Q16" "Q17"
## [19] "Q18" "Q19" "Q20"
## [22] "Q21" "Q22" "Q23"
## [25] "Q24" "Q25" "Q26"
## [28] "Q27" "Q28" "Q29"
## [31] "Q30" "Q31" "Q32"
## [34] "Q33" "Q34" "Q35"
## [37] "Q36" "Q37" "Q38"
## [40] "Q39" "Q40" "Q41"
## [43] "Q42" "Q43" "Q44"
## [46] "Q45" "Q46" "Q47"
## [49] "Q48" "Q49" "Q50"
## [52] "Q51" "Q52" "Q53"
## [55] "Q54" "Q55" "Q56"
## [58] "Q57" "Q58" "Q59"
## [61] "Q60" "Q61" "Q62"
colnames(demographics_data) <- c(
"RespondentNumber", "PreferenceGroup", "Gender", "Age", "MaritalStatus",
"NumberOfChildren", "FirstTimePurchase", "AgeCategory", "ChildrenCategory", "IncomeCategory"
)
head(demographics_data)
## RespondentNumber PreferenceGroup Gender Age MaritalStatus NumberOfChildren
## 1 1 1 2 44 3 0
## 2 2 3 1 24 2 1
## 3 3 2 2 34 3 1
## 4 4 3 1 44 3 0
## 5 5 1 2 41 1 2
## 6 6 1 2 26 1 1
## FirstTimePurchase AgeCategory ChildrenCategory IncomeCategory
## 1 2 5 0 6
## 2 1 1 1 3
## 3 2 3 1 1
## 4 2 5 0 3
## 5 1 5 2 4
## 6 1 2 1 4
head(psychographic_data)
## Respondent.Number Q1 Q2 Q3 Q4 Q5 Q6 Q7 Q8 Q9 Q10 Q11 Q12 Q13 Q14 Q15 Q16 Q17
## 1 1 6 2 4 3 1 5 5 3 4 4 4 5 4 7 6 7 6
## 2 2 7 7 7 5 4 4 5 4 5 5 4 4 4 2 3 4 4
## 3 3 5 4 6 5 7 5 3 5 4 5 5 5 6 3 3 4 2
## 4 4 4 2 5 4 2 4 5 4 3 4 4 4 6 5 6 7 6
## 5 5 5 5 7 6 7 3 4 5 4 2 5 4 4 5 4 3 2
## 6 6 6 6 4 4 5 3 5 2 5 3 4 4 5 1 2 4 4
## Q18 Q19 Q20 Q21 Q22 Q23 Q24 Q25 Q26 Q27 Q28 Q29 Q30 Q31 Q32 Q33 Q34 Q35 Q36
## 1 5 5 6 7 5 2 1 2 3 1 1 2 2 4 4 5 4 3 4
## 2 3 4 2 4 4 7 1 4 3 4 5 7 4 1 5 5 5 3 3
## 3 4 3 4 2 3 3 4 2 4 5 3 3 4 7 5 7 5 7 6
## 4 5 5 5 6 6 1 1 1 2 2 2 3 1 2 5 4 5 4 4
## 5 5 4 5 5 5 3 4 5 2 2 3 1 4 6 7 7 7 5 7
## 6 5 4 1 3 5 7 1 5 3 5 4 5 3 2 6 3 4 4 6
## Q37 Q38 Q39 Q40 Q41 Q42 Q43 Q44 Q45 Q46 Q47 Q48 Q49 Q50 Q51 Q52 Q53 Q54 Q55
## 1 4 3 5 3 5 5 4 3 4 4 4 5 4 4 5 4 2 4 5
## 2 4 7 4 3 7 6 4 7 6 6 7 6 6 7 1 1 1 1 1
## 3 5 7 3 2 1 1 1 1 4 3 4 4 3 2 4 4 3 5 6
## 4 3 3 6 2 4 5 4 2 4 5 4 3 3 4 5 2 3 5 4
## 5 7 7 2 1 3 2 2 3 4 5 5 2 5 3 4 4 6 4 5
## 6 4 2 3 3 6 4 3 7 6 6 7 7 6 7 2 2 1 2 1
## Q56 Q57 Q58 Q59 Q60 Q61 Q62
## 1 4 5 3 4 4 4 2
## 2 1 5 4 3 5 4 5
## 3 3 4 4 5 3 4 4
## 4 4 4 2 5 5 5 3
## 5 5 4 5 4 3 4 5
## 6 2 5 4 4 4 4 4
summary(demographics_data)
## RespondentNumber PreferenceGroup Gender Age
## Min. : 1.00 Min. :1.000 Min. :1.00 Min. :20.00
## 1st Qu.: 63.25 1st Qu.:1.000 1st Qu.:1.00 1st Qu.:29.00
## Median :125.50 Median :2.000 Median :1.00 Median :36.00
## Mean :125.50 Mean :1.784 Mean :1.48 Mean :36.36
## 3rd Qu.:187.75 3rd Qu.:2.000 3rd Qu.:2.00 3rd Qu.:43.00
## Max. :250.00 Max. :3.000 Max. :2.00 Max. :58.00
## MaritalStatus NumberOfChildren FirstTimePurchase AgeCategory
## Min. :1.000 Min. :0.000 Min. :1.000 Min. :1.000
## 1st Qu.:1.000 1st Qu.:0.000 1st Qu.:2.000 1st Qu.:2.000
## Median :1.000 Median :0.000 Median :2.000 Median :4.000
## Mean :1.872 Mean :0.728 Mean :1.852 Mean :3.768
## 3rd Qu.:3.000 3rd Qu.:1.000 3rd Qu.:2.000 3rd Qu.:5.000
## Max. :3.000 Max. :4.000 Max. :2.000 Max. :6.000
## ChildrenCategory IncomeCategory
## Min. :0.000 Min. :1.00
## 1st Qu.:0.000 1st Qu.:2.00
## Median :0.000 Median :4.00
## Mean :0.624 Mean :3.68
## 3rd Qu.:1.000 3rd Qu.:5.00
## Max. :2.000 Max. :6.00
summary(psychographic_data)
## Respondent.Number Q1 Q2 Q3 Q4
## Min. : 1.00 Min. :1.0 Min. :1.00 Min. :1.000 Min. :1.000
## 1st Qu.: 63.25 1st Qu.:4.0 1st Qu.:2.00 1st Qu.:4.000 1st Qu.:3.000
## Median :125.50 Median :5.0 Median :4.00 Median :4.000 Median :4.000
## Mean :125.50 Mean :5.1 Mean :4.06 Mean :4.444 Mean :4.236
## 3rd Qu.:187.75 3rd Qu.:6.0 3rd Qu.:6.00 3rd Qu.:5.000 3rd Qu.:5.000
## Max. :250.00 Max. :7.0 Max. :7.00 Max. :7.000 Max. :7.000
## Q5 Q6 Q7 Q8 Q9
## Min. :1.000 Min. :1.000 Min. :2.00 Min. :1.000 Min. :1.000
## 1st Qu.:2.000 1st Qu.:3.000 1st Qu.:3.00 1st Qu.:3.000 1st Qu.:3.000
## Median :4.000 Median :4.000 Median :4.00 Median :4.000 Median :4.000
## Mean :3.848 Mean :3.992 Mean :3.88 Mean :3.916 Mean :3.904
## 3rd Qu.:5.000 3rd Qu.:5.000 3rd Qu.:5.00 3rd Qu.:5.000 3rd Qu.:5.000
## Max. :7.000 Max. :7.000 Max. :6.00 Max. :7.000 Max. :7.000
## Q10 Q11 Q12 Q13
## Min. :1.000 Min. :2.000 Min. :1.000 Min. :1.000
## 1st Qu.:3.000 1st Qu.:3.000 1st Qu.:3.000 1st Qu.:3.000
## Median :4.000 Median :4.000 Median :4.000 Median :4.000
## Mean :3.916 Mean :3.984 Mean :4.072 Mean :3.988
## 3rd Qu.:5.000 3rd Qu.:5.000 3rd Qu.:5.000 3rd Qu.:5.000
## Max. :7.000 Max. :7.000 Max. :7.000 Max. :6.000
## Q14 Q15 Q16 Q17
## Min. :1.000 Min. :2.000 Min. :1.000 Min. :1.000
## 1st Qu.:2.000 1st Qu.:4.000 1st Qu.:3.000 1st Qu.:3.000
## Median :5.000 Median :5.000 Median :5.000 Median :5.000
## Mean :4.132 Mean :4.972 Mean :4.512 Mean :4.444
## 3rd Qu.:6.000 3rd Qu.:6.000 3rd Qu.:6.000 3rd Qu.:6.000
## Max. :7.000 Max. :7.000 Max. :7.000 Max. :7.000
## Q18 Q19 Q20 Q21
## Min. :1.000 Min. :1.000 Min. :1.000 Min. :2.000
## 1st Qu.:4.000 1st Qu.:4.000 1st Qu.:2.000 1st Qu.:4.000
## Median :5.000 Median :5.000 Median :4.000 Median :5.000
## Mean :4.532 Mean :4.688 Mean :3.832 Mean :4.912
## 3rd Qu.:5.750 3rd Qu.:6.000 3rd Qu.:5.000 3rd Qu.:6.000
## Max. :7.000 Max. :7.000 Max. :7.000 Max. :7.000
## Q22 Q23 Q24 Q25 Q26
## Min. :1.000 Min. :1.00 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:4.000 1st Qu.:3.00 1st Qu.:1.000 1st Qu.:2.000 1st Qu.:2.000
## Median :5.000 Median :4.00 Median :2.000 Median :3.000 Median :3.000
## Mean :4.992 Mean :4.12 Mean :2.376 Mean :3.148 Mean :3.012
## 3rd Qu.:6.000 3rd Qu.:6.00 3rd Qu.:3.000 3rd Qu.:4.000 3rd Qu.:4.000
## Max. :7.000 Max. :7.00 Max. :6.000 Max. :7.000 Max. :7.000
## Q27 Q28 Q29 Q30 Q31
## Min. :1.00 Min. :1.00 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:2.00 1st Qu.:2.00 1st Qu.:3.000 1st Qu.:2.000 1st Qu.:2.000
## Median :4.00 Median :3.00 Median :3.000 Median :3.000 Median :4.000
## Mean :3.46 Mean :3.12 Mean :3.448 Mean :3.344 Mean :4.056
## 3rd Qu.:4.00 3rd Qu.:4.00 3rd Qu.:4.000 3rd Qu.:4.000 3rd Qu.:6.000
## Max. :7.00 Max. :7.00 Max. :7.000 Max. :6.000 Max. :7.000
## Q32 Q33 Q34 Q35
## Min. :2.000 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:4.000 1st Qu.:4.000 1st Qu.:4.000 1st Qu.:4.000
## Median :5.000 Median :5.000 Median :5.000 Median :5.000
## Mean :4.604 Mean :4.564 Mean :4.496 Mean :4.584
## 3rd Qu.:6.000 3rd Qu.:6.000 3rd Qu.:5.000 3rd Qu.:6.000
## Max. :7.000 Max. :7.000 Max. :7.000 Max. :7.000
## Q36 Q37 Q38 Q39
## Min. :2.000 Min. :1.000 Min. :2.000 Min. :1.000
## 1st Qu.:4.000 1st Qu.:4.000 1st Qu.:4.000 1st Qu.:2.000
## Median :4.000 Median :5.000 Median :5.000 Median :4.000
## Mean :4.452 Mean :4.836 Mean :4.616 Mean :3.444
## 3rd Qu.:5.000 3rd Qu.:6.000 3rd Qu.:6.000 3rd Qu.:4.000
## Max. :7.000 Max. :7.000 Max. :7.000 Max. :7.000
## Q40 Q41 Q42 Q43 Q44
## Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.00
## 1st Qu.:2.250 1st Qu.:2.000 1st Qu.:2.000 1st Qu.:2.000 1st Qu.:3.00
## Median :3.000 Median :4.000 Median :3.000 Median :3.000 Median :4.00
## Mean :3.368 Mean :3.912 Mean :3.148 Mean :3.392 Mean :4.26
## 3rd Qu.:4.000 3rd Qu.:6.000 3rd Qu.:4.000 3rd Qu.:4.000 3rd Qu.:6.00
## Max. :7.000 Max. :7.000 Max. :7.000 Max. :7.000 Max. :7.00
## Q45 Q46 Q47 Q48
## Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:4.000 1st Qu.:4.000 1st Qu.:3.250 1st Qu.:4.000
## Median :5.000 Median :5.000 Median :5.000 Median :5.000
## Mean :4.744 Mean :4.752 Mean :4.768 Mean :4.776
## 3rd Qu.:6.000 3rd Qu.:6.000 3rd Qu.:6.000 3rd Qu.:6.000
## Max. :7.000 Max. :7.000 Max. :7.000 Max. :7.000
## Q49 Q50 Q51 Q52
## Min. :2.000 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:4.000 1st Qu.:4.000 1st Qu.:2.000 1st Qu.:2.000
## Median :5.000 Median :5.000 Median :4.000 Median :4.000
## Mean :4.776 Mean :4.812 Mean :3.308 Mean :3.532
## 3rd Qu.:6.000 3rd Qu.:6.000 3rd Qu.:4.000 3rd Qu.:5.000
## Max. :7.000 Max. :7.000 Max. :7.000 Max. :7.000
## Q53 Q54 Q55 Q56 Q57
## Min. :1.000 Min. :1.00 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:2.000 1st Qu.:2.00 1st Qu.:2.000 1st Qu.:2.000 1st Qu.:3.000
## Median :4.000 Median :3.00 Median :3.000 Median :3.000 Median :4.000
## Mean :3.616 Mean :3.16 Mean :3.136 Mean :3.148 Mean :4.316
## 3rd Qu.:5.000 3rd Qu.:4.00 3rd Qu.:4.000 3rd Qu.:4.000 3rd Qu.:5.000
## Max. :7.000 Max. :7.00 Max. :6.000 Max. :7.000 Max. :7.000
## Q58 Q59 Q60 Q61 Q62
## Min. :2.000 Min. :1.00 Min. :1.000 Min. :1.00 Min. :1.000
## 1st Qu.:4.000 1st Qu.:3.00 1st Qu.:3.000 1st Qu.:3.00 1st Qu.:3.000
## Median :4.000 Median :4.00 Median :4.000 Median :4.00 Median :4.000
## Mean :4.384 Mean :4.32 Mean :3.772 Mean :3.68 Mean :3.672
## 3rd Qu.:5.000 3rd Qu.:5.00 3rd Qu.:5.000 3rd Qu.:5.00 3rd Qu.:5.000
## Max. :7.000 Max. :7.00 Max. :7.000 Max. :7.00 Max. :7.000
demographic_vars <- c("Age", "Gender", "MaritalStatus", "NumberOfChildren", "FirstTimePurchase", "IncomeCategory")
for (var in demographic_vars) {
freq_table <- table(demographics_data[[var]], demographics_data$PreferenceGroup)
print(freq_table)
print(chisq.test(freq_table))
}
##
## 1 2 3
## 20 3 2 3
## 21 1 0 1
## 22 2 1 1
## 23 2 0 2
## 24 2 0 4
## 25 0 0 4
## 26 5 5 3
## 27 5 3 2
## 28 2 3 2
## 29 6 2 1
## 30 5 3 3
## 31 1 3 4
## 32 8 2 1
## 33 3 0 3
## 34 6 4 1
## 35 2 3 2
## 36 2 2 1
## 37 3 2 2
## 38 1 1 1
## 39 3 3 3
## 40 5 7 2
## 41 9 1 3
## 42 6 1 5
## 43 9 4 0
## 44 7 2 2
## 45 1 2 0
## 46 2 1 0
## 47 2 1 2
## 48 3 3 2
## 49 1 3 0
## 50 1 1 0
## 51 2 1 1
## 52 2 0 0
## 54 1 1 0
## 55 2 1 1
## 56 1 1 0
## 57 0 1 0
## 58 0 2 0
## Warning in chisq.test(freq_table): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: freq_table
## X-squared = 76.904, df = 74, p-value = 0.3858
##
##
## 1 2 3
## 1 54 36 40
## 2 62 36 22
##
## Pearson's Chi-squared test
##
## data: freq_table
## X-squared = 5.3861, df = 2, p-value = 0.06767
##
##
## 1 2 3
## 1 66 34 27
## 2 14 6 8
## 3 36 32 27
##
## Pearson's Chi-squared test
##
## data: freq_table
## X-squared = 5.2093, df = 4, p-value = 0.2665
##
##
## 1 2 3
## 0 62 45 41
## 1 29 12 7
## 2 15 7 9
## 3 8 8 4
## 4 2 0 1
## Warning in chisq.test(freq_table): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: freq_table
## X-squared = 8.6946, df = 8, p-value = 0.3687
##
##
## 1 2 3
## 1 13 8 16
## 2 103 64 46
##
## Pearson's Chi-squared test
##
## data: freq_table
## X-squared = 7.9211, df = 2, p-value = 0.01905
##
##
## 1 2 3
## 1 11 5 7
## 2 19 15 12
## 3 18 16 12
## 4 19 16 11
## 5 28 12 11
## 6 21 8 9
##
## Pearson's Chi-squared test
##
## data: freq_table
## X-squared = 6.148, df = 10, p-value = 0.8027
Age vs. PreferenceGroup
p ≈ 0.3858
No significant relationship between age groups and Ka preference at the 5 percent level.
Gender vs. PreferenceGroup
p ≈ 0.0677
Slightly above the usual 0.05 cutoff, indicating no strong evidence that gender differs across Ka Choosers, Non-Choosers, or Middle. However, it is borderline, suggesting a possible weak association.
MaritalStatus vs. PreferenceGroup
p ≈ 0.2665
Not significant. Marital status does not appear to be linked strongly to preference group.
NumberOfChildren vs. PreferenceGroup
p ≈ 0.3687
Not significant. Different family sizes do not strongly differentiate preference.
FirstTimePurchase vs. PreferenceGroup
p ≈ 0.0190
Significant. First-time purchase status shows a statistically significant association with Ka preference.
IncomeCategory vs. PreferenceGroup
p ≈ 0.8027
Not significant. Income does not appear to meaningfully separate Ka Choosers from others.
age_table <- table(demographics_data$Age, demographics_data$PreferenceGroup)
age_table
##
## 1 2 3
## 20 3 2 3
## 21 1 0 1
## 22 2 1 1
## 23 2 0 2
## 24 2 0 4
## 25 0 0 4
## 26 5 5 3
## 27 5 3 2
## 28 2 3 2
## 29 6 2 1
## 30 5 3 3
## 31 1 3 4
## 32 8 2 1
## 33 3 0 3
## 34 6 4 1
## 35 2 3 2
## 36 2 2 1
## 37 3 2 2
## 38 1 1 1
## 39 3 3 3
## 40 5 7 2
## 41 9 1 3
## 42 6 1 5
## 43 9 4 0
## 44 7 2 2
## 45 1 2 0
## 46 2 1 0
## 47 2 1 2
## 48 3 3 2
## 49 1 3 0
## 50 1 1 0
## 51 2 1 1
## 52 2 0 0
## 54 1 1 0
## 55 2 1 1
## 56 1 1 0
## 57 0 1 0
## 58 0 2 0
gender_table <- table(demographics_data$Gender, demographics_data$PreferenceGroup)
gender_table
##
## 1 2 3
## 1 54 36 40
## 2 62 36 22
ms_table <- table(demographics_data$MaritalStatus, demographics_data$PreferenceGroup)
ms_table
##
## 1 2 3
## 1 66 34 27
## 2 14 6 8
## 3 36 32 27
noc_table <- table(demographics_data$NumberOfChildren, demographics_data$PreferenceGroup)
noc_table
##
## 1 2 3
## 0 62 45 41
## 1 29 12 7
## 2 15 7 9
## 3 8 8 4
## 4 2 0 1
ftp_table <- table(demographics_data$FirstTimePurchase, demographics_data$PreferenceGroup)
ftp_table
##
## 1 2 3
## 1 13 8 16
## 2 103 64 46
ic_table <- table(demographics_data$IncomeCategory, demographics_data$PreferenceGroup)
ic_table
##
## 1 2 3
## 1 11 5 7
## 2 19 15 12
## 3 18 16 12
## 4 19 16 11
## 5 28 12 11
## 6 21 8 9
With the exception of FirstTimePurchase, all of the demographic variables in this data set do not significantly differentiate Ka Choosers from Non-Choosers or the Middle group. Only FirstTimePurchase appears to be a strong demographic predictor, which implies that first-time car buyers are more apt (or less apt) to choose a Ka than repeat buyers. No compelling evidence was found for the other variables—Age, Gender, MaritalStatus, NumberOfChildren, and IncomeCategory—to show differences between the three preference groups.
In practice, this would not be an effective method for targeting or segmenting Ka buyers using these demographics alone. It might be necessary for marketers and analysts to combine FirstTimePurchase with other data, such as attitudinal or psychographic data, in an attempt to create a better picture of who would be a Ka Chooser versus a Non-Chooser.
A multinomial logit model allows researchers to model how multiple demographic variables predict the choice among three or more categories (in this case, Ka Choosers vs. Ka Non-Choosers vs. Middle). The primary advantages are:
Handling Multiple Categories at Once: Instead of fitting separate models to each pair of groups, the multinomial logit compares all of the preference groups to a given baseline group in a single comparison.
Controlling for Other Variables: The contribution of each predictor is estimated while holding the others constant, providing a clearer picture of which demographics are truly significant.
Detailed Coefficients and Probabilities: The model gives separate coefficients (log-odds) for each group relative to the baseline, which is easy to interpret in terms of how each variable alters the probability of being in one group versus another.
demographics_data$PreferenceGroup <- factor(
demographics_data$PreferenceGroup,
levels = c(1, 2, 3),
labels = c("KaChooser", "KaNonChooser", "Middle")
)
mlogit_model <- multinom(
PreferenceGroup ~ Age + Gender + MaritalStatus + NumberOfChildren + FirstTimePurchase + IncomeCategory,
data = demographics_data
)
## # weights: 24 (14 variable)
## initial value 274.653072
## iter 10 value 252.948633
## final value 251.403856
## converged
summary(mlogit_model)
## Call:
## multinom(formula = PreferenceGroup ~ Age + Gender + MaritalStatus +
## NumberOfChildren + FirstTimePurchase + IncomeCategory, data = demographics_data)
##
## Coefficients:
## (Intercept) Age Gender MaritalStatus NumberOfChildren
## KaNonChooser -0.653224 0.0148776 -0.1080422 0.2387384 -0.06096890
## Middle 3.643868 -0.0381628 -0.8698026 0.2689317 -0.03132285
## FirstTimePurchase IncomeCategory
## KaNonChooser -0.07855675 -0.12653824
## Middle -1.04669453 -0.06479152
##
## Std. Errors:
## (Intercept) Age Gender MaritalStatus NumberOfChildren
## KaNonChooser 1.275940 0.01725463 0.3101613 0.1642739 0.1472353
## Middle 1.266737 0.01935286 0.3468027 0.1788484 0.1640463
## FirstTimePurchase IncomeCategory
## KaNonChooser 0.4939414 0.09881107
## Middle 0.4501743 0.10565048
##
## Residual Deviance: 502.8077
## AIC: 530.8077
From the reported model:
Age: Negative coefficient (-0.038) for Middle vs. KaChooser.
A negative log-odds typically implies that as Age increases, the probability of being Middle decreases relative to KaChooser (once other variables are fixed). The marginal effect for older respondents would thus be negative for the Middle category and positive for KaChooser, though small in magnitude.
Gender: Negative coefficient for Middle vs. KaChooser (-0.87).
If Gender=1 for male, men have a lower probability of being in the Middle group, shifting probability somewhat toward KaChooser (or KaNonChooser). The marginal effect would show how that difference translates into actual probability changes.
FirstTimePurchase: Substantial negative effect for Middle vs. KaChooser (-1.05).
Being a first-time buyer reduces the likelihood of being Middle, boosting the probability of KaChooser. The marginal effect captures how a one-unit shift (from 0 to 1) in first-time purchase status alters the distribution across categories.
KaNonChooser:
Most coefficients for KaNonChooser vs. KaChooser were small and not statistically significant, so the marginal effects would be near zero, indicating minimal change in the probability of KaNonChooser vs. KaChooser for each demographic variable.
Age: Marginal effects would likely show a drop in probability of being Middle (and rise in KaChooser) as age increases.
Gender: If = male=1, then the marginal effect shows men are less likely to be Middle vs. KaChooser, removing a few points from being Middle.
FirstTimePurchase: Extremely large negative log-odds for Middle means a large increase in KaChooser probability for first-time purchasers.
Even though cross-tabs are only indicating that FirstTimePurchase is significant, marginal effects from the multinomial model establish that with other influences controlled, Age and Gender also show substantial probability changes—though mainly for Middle versus KaChooser differentiation.Therefore, Age, Gender, and FirstTimePurchase all reveal substantial marginal effects on Middle affiliation, but none of them predict KaNonChooser versus KaChooser in this sample with certainty.
# K-means
set.seed(123)
# Store models for k=3,4,5
k_values <- 3:5
k_models <- list()
for (k in k_values) {
km <- kmeans(psychographic_data, centers = k, nstart = 25)
k_models[[as.character(k)]] <- km
cat("\nNumber of clusters =", k, "\n")
print(km$size) # how many respondents in each cluster
}
##
## Number of clusters = 3
## [1] 83 83 84
##
## Number of clusters = 4
## [1] 62 62 63 63
##
## Number of clusters = 5
## [1] 49 51 49 52 49
#Elbow
wss <- sapply(1:8, function(k) {
kmeans(psychographic_data, centers = k, nstart = 25)$tot.withinss
})
plot(1:8, wss, type = "b", xlab = "Number of Clusters", ylab = "Total Within-SS")
Managerial decision: I will choose k = 4 for better interpretability and better distinction.
No. of respondents: 62 62 63 63
C1: Name: "Eco & Value Seekers" - Adjectives: eco-conscious, feature-oriented, time-saving
C2: Name: "Trendy but Balanced" - Adjectives: style-focused, moderately practical, balanced priorities
C3. Name: "All-Features & Performance’ - Adjectives:
feature-hungry, performance-loving, versatile
C4. Name: "Fashion-Focused" - Adjectives: style-centric, brand/image
conscious, trend-aware
wcss_values <- sapply(2:5, function(k){
kmeans(psychographic_data, centers = k, nstart=25)$tot.withinss
})
wcss_values
## [1] 358553.50 177509.74 114028.16 84750.62
The move from k=2 to k=3 reduces WCSS by about 181,000, a massive improvement. An increase from k=3 to k=4 decreases WCSS by about 63,000—a notable but smaller improvement. Cutting back from k=4 to k=5 brings it down by just 29,000, indicating decreasing returns.
set.seed(123)
final_kmeans <- kmeans(psychographic_data, centers = 4, nstart = 25)
demographics_data$cluster <- final_kmeans$cluster
clust_pref <- demographics_data %>%
group_by(cluster, PreferenceGroup) %>%
summarise(count = n(), .groups = "drop") %>%
pivot_wider(names_from = PreferenceGroup, values_from = count, values_fill = 0)
# 2) Print the cross-tab table
print(clust_pref)
## # A tibble: 4 × 4
## cluster KaChooser KaNonChooser Middle
## <int> <int> <int> <int>
## 1 1 29 22 12
## 2 2 31 16 15
## 3 3 32 16 14
## 4 4 24 18 21
# 3) Bar chart: distribution of preference groups within each cluster
ggplot(demographics_data, aes(x = factor(cluster), fill = factor(PreferenceGroup))) +
geom_bar(position = "dodge") +
labs(title = "Attitudinal Segments by Preference Group",
x = "Cluster",
y = "Count",
fill = "Preference Group") +
theme_minimal()
Most Ka-Friendly Clusters Clusters 1 and 3 are represented well by KaChooser, but for opposite reasons. Cluster 3 also has a large Non-Chooser segment, which means Cluster 3 attitudes are more polarized.
Balanced vs. Polarized Cluster 2 is reasonably balanced between Choosers and Non-Choosers, while Cluster 3 splits sharply between Ka fans and rejectors.
Potential “Fence-Sitters” Cluster 4 has a bigger Middle segment, i.e., many respondents here are open to persuasion. Targeted promotion may persuade this group if critical issues (e.g., styling, features, or performance) are addressed.
Marketing Implications
Cluster 1: Tap the Ka’s already established strengths (e.g., practicality, reliability) since this group is already extremely open. Cluster 2: Position messages to emphasize why the Ka may perform better or be more appealing than competitors, given its more balanced views. Cluster 3: Identify what specific element polarizes the consumers—some are drawn towards just what the Ka offers, and others are turned away by it. Cluster 4: Address the issues that cause so many to remain undecided. A slight nudge in the correct direction could convert a substantial amount of Middle respondents into Ka Choosers.
demographic_vars <- c("Gender", "MaritalStatus", "NumberOfChildren",
"FirstTimePurchase", "IncomeCategory", "Age")
cat("\nCross-tabulations of demographic variables by cluster:\n")
##
## Cross-tabulations of demographic variables by cluster:
for (dv in demographic_vars) {
cat("\n-----------------------------------\n")
cat("Cross-tab for:", dv, "vs. Cluster\n")
freq_table <- table(demographics_data[[dv]], demographics_data$cluster)
print(freq_table)
}
##
## -----------------------------------
## Cross-tab for: Gender vs. Cluster
##
## 1 2 3 4
## 1 33 28 32 37
## 2 30 34 30 26
##
## -----------------------------------
## Cross-tab for: MaritalStatus vs. Cluster
##
## 1 2 3 4
## 1 33 32 31 31
## 2 11 3 9 5
## 3 19 27 22 27
##
## -----------------------------------
## Cross-tab for: NumberOfChildren vs. Cluster
##
## 1 2 3 4
## 0 42 29 38 39
## 1 12 12 13 11
## 2 4 12 6 9
## 3 5 7 4 4
## 4 0 2 1 0
##
## -----------------------------------
## Cross-tab for: FirstTimePurchase vs. Cluster
##
## 1 2 3 4
## 1 11 10 5 11
## 2 52 52 57 52
##
## -----------------------------------
## Cross-tab for: IncomeCategory vs. Cluster
##
## 1 2 3 4
## 1 4 5 5 9
## 2 14 10 12 10
## 3 9 14 10 13
## 4 11 9 13 13
## 5 15 15 13 8
## 6 10 9 9 10
##
## -----------------------------------
## Cross-tab for: Age vs. Cluster
##
## 1 2 3 4
## 20 3 1 2 2
## 21 0 0 0 2
## 22 3 0 0 1
## 23 1 0 2 1
## 24 2 1 2 1
## 25 1 0 2 1
## 26 2 3 4 4
## 27 1 3 3 3
## 28 2 4 0 1
## 29 4 2 0 3
## 30 2 3 3 3
## 31 2 1 1 4
## 32 2 3 5 1
## 33 1 3 2 0
## 34 3 3 3 2
## 35 2 2 2 1
## 36 3 1 1 0
## 37 0 2 1 4
## 38 0 1 0 2
## 39 4 1 0 4
## 40 3 4 2 5
## 41 2 6 5 0
## 42 2 1 5 4
## 43 7 0 3 3
## 44 0 5 4 2
## 45 0 1 1 1
## 46 0 3 0 0
## 47 1 1 2 1
## 48 1 1 3 3
## 49 4 0 0 0
## 50 1 1 0 0
## 51 0 2 0 2
## 52 2 0 0 0
## 54 0 0 2 0
## 55 0 1 1 2
## 56 1 0 1 0
## 57 0 1 0 0
## 58 1 1 0 0
Cluster 1
Gender: Balanced (33 vs. 30).
Marital Status: Mostly status=1 or 3.
Children: Primarily 0 or 1.
FirstTimePurchase: 11 yes, 52 no → Some first-timers, but majority are repeat buyers.
Income: Spread across categories, slightly more in mid/upper categories than the very bottom or top.
Age: Ranges from early 20s to 40s, with a few small clusters in the lower 20s and mid 30s.
Cluster 2
Gender: Balanced (28 vs. 34).
Marital Status: Fewer in category=2.
Children: Noticeably more 2-children households than Cluster 1.
FirstTimePurchase: 10 vs. 52 → also mostly repeat buyers.
Income: Fairly even distribution, no strong skew.
Age: Possibly leaning slightly older (some higher counts at ages 40, 41).
Cluster 3
Gender: Balanced (32 vs. 30).
Marital Status: Similar to cluster 1's distribution, with category=3 also prominent.
Children: 0 children is dominant (38), but 2 children is 6, 3 children is 4.
FirstTimePurchase: Only 5 yes → lowest proportion of first-timers among the clusters.
Income: Spread out, no glaring pattern.
Age: Also wide-ranging, though not heavily skewed to any single bracket.
Cluster 4
Gender: Balanced (37 vs. 26), with a slight tilt toward gender=1.
Marital Status: 31 of status=1, 27 of status=3.
Children: Mostly 0 children. A moderate number have 2 or 3 kids.
FirstTimePurchase: 11 yes, 52 no.
Income: A fair representation across categories 1–6, slightly more in category=1 than some clusters.
Age: Also quite mixed, with pockets in the upper 30s to mid 40s.
On the basis of cross-tabulation results and multinomial logit results, attitudinal (psychographic) segmentation is the superior method in separating Ford Ka’s target buyers. The following are the reasons why:
Demographics Show Weak Differentiation No sharp pattern is detectable for Ka preference from both the chi-square tests and within each cluster demographic profile for either age, gender, or income to uniquely predict Ka preference. Demographic quantifications were scattered relatively evenly such that only narrow differences between Ka Choosers and Non-Choosers were obtained.
Attitudinal Segments Identify Real Motivations K-means clustering of psychographic characteristics revealed cleaner edges across groups prioritizing attributes of reliability, eco-friendliness, style, or performance. These psychographics explained more accurately which Ka respondents were likely to be (e.g., the eco-friendly or price-conscious ones) than demographics did.
Other Actionable Targeting By addressing attitudes—like “wanting a smaller footprint,” “appreciating style,” or “emphasizing reliability”—Ford is able to more precisely customize its advertising campaigns. Rather than addressing a broad demographic (e.g., young adults under 30), they are able to address “eco and value seekers,” “performance enthusiasts,” or other attitude-based segments that most closely align with the Ka’s capabilities.
Increased Predictive Power When applied to a model predicting Ka preference, attitudinal clusters greatly improved the fit compared to using demographics alone. This suggests that lifestyle and psychographic characteristics have a greater impact on the purchase decision for a Ford Ka.
Therefore, whereas demographics provide broad context, attitudinal segmentation yields more predictive, richer information about who is most likely to buy the Ka and why.
Non-Response Bias
If certain types of respondents (e.g., older adults, higher-income individuals) systematically declined participation, the sample may not reflect the true population's attitudes or demographics.
Coverage Error
Depending on how participants were recruited (online, phone, in-person), some groups (those without reliable internet, for instance) might be underrepresented, skewing results.
Missing Data
If any questions had a high rate of unanswered items, the final sample for analyses could be smaller or biased toward those who completed all questions.
Small or Imbalanced Subsamples
Some demographic categories (e.g., certain income brackets or older age groups) might have very few respondents, undermining the reliability of chi-square and multinomial logit conclusions.
Row Alignment
Merging psychographic and demographic data requires a consistent ID or row order. Any mismatch (e.g., a respondent's cluster assignment not lining up with the correct demographic record) can introduce errors.
Self-Report Bias
Attitudinal questions rely on self-reported preferences, which may not always match real-world behavior.
Time/Context Sensitivity
Consumer attitudes can shift quickly with market changes. Results from a specific period might not generalize beyond that timeframe if market conditions evolve (e.g., new competitors, economic changes).