Environment setup

#Clear the environment
rm(list = ls())

# Load necessary libraries
library(openxlsx)

## Warning: package 'openxlsx' was built under R version 4.3.3

library(nnet)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2)
library(cluster)
library(tidyr)

Load Dataset

# Load the dataset
file_path <- "~/Desktop/Assignment Data - Ford Ka.xlsx"
sheet_names <- getSheetNames(file_path)
sheet_names

## [1] "Demographic Data"            "Demographics Code"          
## [3] "MDS Data"                    "Psychographic Data"         
## [5] "Psychographic questionnaire"

# Read each sheet:
demographics_data <- read.xlsx(
  file_path,
  sheet = "Demographic Data",
  colNames = TRUE,
  startRow = 6
)

psychographic_data <- read.xlsx(
  file_path,
  sheet = "Psychographic Data",
  colNames = TRUE,
  startRow = 6
)

names(demographics_data)

##  [1] "Respondent.Number"  "Preference.Group"   "Gender"            
##  [4] "Age"                "Marital.Status"     "Number.of.Children"
##  [7] "1st.Time.Purchase"  "Age.Category"       "Children.Category" 
## [10] "Income.Category"

names(psychographic_data)

##  [1] "Respondent.Number" "Q1"                "Q2"               
##  [4] "Q3"                "Q4"                "Q5"               
##  [7] "Q6"                "Q7"                "Q8"               
## [10] "Q9"                "Q10"               "Q11"              
## [13] "Q12"               "Q13"               "Q14"              
## [16] "Q15"               "Q16"               "Q17"              
## [19] "Q18"               "Q19"               "Q20"              
## [22] "Q21"               "Q22"               "Q23"              
## [25] "Q24"               "Q25"               "Q26"              
## [28] "Q27"               "Q28"               "Q29"              
## [31] "Q30"               "Q31"               "Q32"              
## [34] "Q33"               "Q34"               "Q35"              
## [37] "Q36"               "Q37"               "Q38"              
## [40] "Q39"               "Q40"               "Q41"              
## [43] "Q42"               "Q43"               "Q44"              
## [46] "Q45"               "Q46"               "Q47"              
## [49] "Q48"               "Q49"               "Q50"              
## [52] "Q51"               "Q52"               "Q53"              
## [55] "Q54"               "Q55"               "Q56"              
## [58] "Q57"               "Q58"               "Q59"              
## [61] "Q60"               "Q61"               "Q62"

Walk through dataset

colnames(demographics_data) <- c(
  "RespondentNumber", "PreferenceGroup", "Gender", "Age", "MaritalStatus", 
  "NumberOfChildren", "FirstTimePurchase", "AgeCategory", "ChildrenCategory", "IncomeCategory"
)

head(demographics_data)

##   RespondentNumber PreferenceGroup Gender Age MaritalStatus NumberOfChildren
## 1                1               1      2  44             3                0
## 2                2               3      1  24             2                1
## 3                3               2      2  34             3                1
## 4                4               3      1  44             3                0
## 5                5               1      2  41             1                2
## 6                6               1      2  26             1                1
##   FirstTimePurchase AgeCategory ChildrenCategory IncomeCategory
## 1                 2           5                0              6
## 2                 1           1                1              3
## 3                 2           3                1              1
## 4                 2           5                0              3
## 5                 1           5                2              4
## 6                 1           2                1              4

head(psychographic_data)

##   Respondent.Number Q1 Q2 Q3 Q4 Q5 Q6 Q7 Q8 Q9 Q10 Q11 Q12 Q13 Q14 Q15 Q16 Q17
## 1                 1  6  2  4  3  1  5  5  3  4   4   4   5   4   7   6   7   6
## 2                 2  7  7  7  5  4  4  5  4  5   5   4   4   4   2   3   4   4
## 3                 3  5  4  6  5  7  5  3  5  4   5   5   5   6   3   3   4   2
## 4                 4  4  2  5  4  2  4  5  4  3   4   4   4   6   5   6   7   6
## 5                 5  5  5  7  6  7  3  4  5  4   2   5   4   4   5   4   3   2
## 6                 6  6  6  4  4  5  3  5  2  5   3   4   4   5   1   2   4   4
##   Q18 Q19 Q20 Q21 Q22 Q23 Q24 Q25 Q26 Q27 Q28 Q29 Q30 Q31 Q32 Q33 Q34 Q35 Q36
## 1   5   5   6   7   5   2   1   2   3   1   1   2   2   4   4   5   4   3   4
## 2   3   4   2   4   4   7   1   4   3   4   5   7   4   1   5   5   5   3   3
## 3   4   3   4   2   3   3   4   2   4   5   3   3   4   7   5   7   5   7   6
## 4   5   5   5   6   6   1   1   1   2   2   2   3   1   2   5   4   5   4   4
## 5   5   4   5   5   5   3   4   5   2   2   3   1   4   6   7   7   7   5   7
## 6   5   4   1   3   5   7   1   5   3   5   4   5   3   2   6   3   4   4   6
##   Q37 Q38 Q39 Q40 Q41 Q42 Q43 Q44 Q45 Q46 Q47 Q48 Q49 Q50 Q51 Q52 Q53 Q54 Q55
## 1   4   3   5   3   5   5   4   3   4   4   4   5   4   4   5   4   2   4   5
## 2   4   7   4   3   7   6   4   7   6   6   7   6   6   7   1   1   1   1   1
## 3   5   7   3   2   1   1   1   1   4   3   4   4   3   2   4   4   3   5   6
## 4   3   3   6   2   4   5   4   2   4   5   4   3   3   4   5   2   3   5   4
## 5   7   7   2   1   3   2   2   3   4   5   5   2   5   3   4   4   6   4   5
## 6   4   2   3   3   6   4   3   7   6   6   7   7   6   7   2   2   1   2   1
##   Q56 Q57 Q58 Q59 Q60 Q61 Q62
## 1   4   5   3   4   4   4   2
## 2   1   5   4   3   5   4   5
## 3   3   4   4   5   3   4   4
## 4   4   4   2   5   5   5   3
## 5   5   4   5   4   3   4   5
## 6   2   5   4   4   4   4   4

summary(demographics_data)

##  RespondentNumber PreferenceGroup     Gender          Age       
##  Min.   :  1.00   Min.   :1.000   Min.   :1.00   Min.   :20.00  
##  1st Qu.: 63.25   1st Qu.:1.000   1st Qu.:1.00   1st Qu.:29.00  
##  Median :125.50   Median :2.000   Median :1.00   Median :36.00  
##  Mean   :125.50   Mean   :1.784   Mean   :1.48   Mean   :36.36  
##  3rd Qu.:187.75   3rd Qu.:2.000   3rd Qu.:2.00   3rd Qu.:43.00  
##  Max.   :250.00   Max.   :3.000   Max.   :2.00   Max.   :58.00  
##  MaritalStatus   NumberOfChildren FirstTimePurchase  AgeCategory   
##  Min.   :1.000   Min.   :0.000    Min.   :1.000     Min.   :1.000  
##  1st Qu.:1.000   1st Qu.:0.000    1st Qu.:2.000     1st Qu.:2.000  
##  Median :1.000   Median :0.000    Median :2.000     Median :4.000  
##  Mean   :1.872   Mean   :0.728    Mean   :1.852     Mean   :3.768  
##  3rd Qu.:3.000   3rd Qu.:1.000    3rd Qu.:2.000     3rd Qu.:5.000  
##  Max.   :3.000   Max.   :4.000    Max.   :2.000     Max.   :6.000  
##  ChildrenCategory IncomeCategory
##  Min.   :0.000    Min.   :1.00  
##  1st Qu.:0.000    1st Qu.:2.00  
##  Median :0.000    Median :4.00  
##  Mean   :0.624    Mean   :3.68  
##  3rd Qu.:1.000    3rd Qu.:5.00  
##  Max.   :2.000    Max.   :6.00

summary(psychographic_data)

##  Respondent.Number       Q1            Q2             Q3              Q4       
##  Min.   :  1.00    Min.   :1.0   Min.   :1.00   Min.   :1.000   Min.   :1.000  
##  1st Qu.: 63.25    1st Qu.:4.0   1st Qu.:2.00   1st Qu.:4.000   1st Qu.:3.000  
##  Median :125.50    Median :5.0   Median :4.00   Median :4.000   Median :4.000  
##  Mean   :125.50    Mean   :5.1   Mean   :4.06   Mean   :4.444   Mean   :4.236  
##  3rd Qu.:187.75    3rd Qu.:6.0   3rd Qu.:6.00   3rd Qu.:5.000   3rd Qu.:5.000  
##  Max.   :250.00    Max.   :7.0   Max.   :7.00   Max.   :7.000   Max.   :7.000  
##        Q5              Q6              Q7             Q8              Q9       
##  Min.   :1.000   Min.   :1.000   Min.   :2.00   Min.   :1.000   Min.   :1.000  
##  1st Qu.:2.000   1st Qu.:3.000   1st Qu.:3.00   1st Qu.:3.000   1st Qu.:3.000  
##  Median :4.000   Median :4.000   Median :4.00   Median :4.000   Median :4.000  
##  Mean   :3.848   Mean   :3.992   Mean   :3.88   Mean   :3.916   Mean   :3.904  
##  3rd Qu.:5.000   3rd Qu.:5.000   3rd Qu.:5.00   3rd Qu.:5.000   3rd Qu.:5.000  
##  Max.   :7.000   Max.   :7.000   Max.   :6.00   Max.   :7.000   Max.   :7.000  
##       Q10             Q11             Q12             Q13       
##  Min.   :1.000   Min.   :2.000   Min.   :1.000   Min.   :1.000  
##  1st Qu.:3.000   1st Qu.:3.000   1st Qu.:3.000   1st Qu.:3.000  
##  Median :4.000   Median :4.000   Median :4.000   Median :4.000  
##  Mean   :3.916   Mean   :3.984   Mean   :4.072   Mean   :3.988  
##  3rd Qu.:5.000   3rd Qu.:5.000   3rd Qu.:5.000   3rd Qu.:5.000  
##  Max.   :7.000   Max.   :7.000   Max.   :7.000   Max.   :6.000  
##       Q14             Q15             Q16             Q17       
##  Min.   :1.000   Min.   :2.000   Min.   :1.000   Min.   :1.000  
##  1st Qu.:2.000   1st Qu.:4.000   1st Qu.:3.000   1st Qu.:3.000  
##  Median :5.000   Median :5.000   Median :5.000   Median :5.000  
##  Mean   :4.132   Mean   :4.972   Mean   :4.512   Mean   :4.444  
##  3rd Qu.:6.000   3rd Qu.:6.000   3rd Qu.:6.000   3rd Qu.:6.000  
##  Max.   :7.000   Max.   :7.000   Max.   :7.000   Max.   :7.000  
##       Q18             Q19             Q20             Q21       
##  Min.   :1.000   Min.   :1.000   Min.   :1.000   Min.   :2.000  
##  1st Qu.:4.000   1st Qu.:4.000   1st Qu.:2.000   1st Qu.:4.000  
##  Median :5.000   Median :5.000   Median :4.000   Median :5.000  
##  Mean   :4.532   Mean   :4.688   Mean   :3.832   Mean   :4.912  
##  3rd Qu.:5.750   3rd Qu.:6.000   3rd Qu.:5.000   3rd Qu.:6.000  
##  Max.   :7.000   Max.   :7.000   Max.   :7.000   Max.   :7.000  
##       Q22             Q23            Q24             Q25             Q26       
##  Min.   :1.000   Min.   :1.00   Min.   :1.000   Min.   :1.000   Min.   :1.000  
##  1st Qu.:4.000   1st Qu.:3.00   1st Qu.:1.000   1st Qu.:2.000   1st Qu.:2.000  
##  Median :5.000   Median :4.00   Median :2.000   Median :3.000   Median :3.000  
##  Mean   :4.992   Mean   :4.12   Mean   :2.376   Mean   :3.148   Mean   :3.012  
##  3rd Qu.:6.000   3rd Qu.:6.00   3rd Qu.:3.000   3rd Qu.:4.000   3rd Qu.:4.000  
##  Max.   :7.000   Max.   :7.00   Max.   :6.000   Max.   :7.000   Max.   :7.000  
##       Q27            Q28            Q29             Q30             Q31       
##  Min.   :1.00   Min.   :1.00   Min.   :1.000   Min.   :1.000   Min.   :1.000  
##  1st Qu.:2.00   1st Qu.:2.00   1st Qu.:3.000   1st Qu.:2.000   1st Qu.:2.000  
##  Median :4.00   Median :3.00   Median :3.000   Median :3.000   Median :4.000  
##  Mean   :3.46   Mean   :3.12   Mean   :3.448   Mean   :3.344   Mean   :4.056  
##  3rd Qu.:4.00   3rd Qu.:4.00   3rd Qu.:4.000   3rd Qu.:4.000   3rd Qu.:6.000  
##  Max.   :7.00   Max.   :7.00   Max.   :7.000   Max.   :6.000   Max.   :7.000  
##       Q32             Q33             Q34             Q35       
##  Min.   :2.000   Min.   :1.000   Min.   :1.000   Min.   :1.000  
##  1st Qu.:4.000   1st Qu.:4.000   1st Qu.:4.000   1st Qu.:4.000  
##  Median :5.000   Median :5.000   Median :5.000   Median :5.000  
##  Mean   :4.604   Mean   :4.564   Mean   :4.496   Mean   :4.584  
##  3rd Qu.:6.000   3rd Qu.:6.000   3rd Qu.:5.000   3rd Qu.:6.000  
##  Max.   :7.000   Max.   :7.000   Max.   :7.000   Max.   :7.000  
##       Q36             Q37             Q38             Q39       
##  Min.   :2.000   Min.   :1.000   Min.   :2.000   Min.   :1.000  
##  1st Qu.:4.000   1st Qu.:4.000   1st Qu.:4.000   1st Qu.:2.000  
##  Median :4.000   Median :5.000   Median :5.000   Median :4.000  
##  Mean   :4.452   Mean   :4.836   Mean   :4.616   Mean   :3.444  
##  3rd Qu.:5.000   3rd Qu.:6.000   3rd Qu.:6.000   3rd Qu.:4.000  
##  Max.   :7.000   Max.   :7.000   Max.   :7.000   Max.   :7.000  
##       Q40             Q41             Q42             Q43             Q44      
##  Min.   :1.000   Min.   :1.000   Min.   :1.000   Min.   :1.000   Min.   :1.00  
##  1st Qu.:2.250   1st Qu.:2.000   1st Qu.:2.000   1st Qu.:2.000   1st Qu.:3.00  
##  Median :3.000   Median :4.000   Median :3.000   Median :3.000   Median :4.00  
##  Mean   :3.368   Mean   :3.912   Mean   :3.148   Mean   :3.392   Mean   :4.26  
##  3rd Qu.:4.000   3rd Qu.:6.000   3rd Qu.:4.000   3rd Qu.:4.000   3rd Qu.:6.00  
##  Max.   :7.000   Max.   :7.000   Max.   :7.000   Max.   :7.000   Max.   :7.00  
##       Q45             Q46             Q47             Q48       
##  Min.   :1.000   Min.   :1.000   Min.   :1.000   Min.   :1.000  
##  1st Qu.:4.000   1st Qu.:4.000   1st Qu.:3.250   1st Qu.:4.000  
##  Median :5.000   Median :5.000   Median :5.000   Median :5.000  
##  Mean   :4.744   Mean   :4.752   Mean   :4.768   Mean   :4.776  
##  3rd Qu.:6.000   3rd Qu.:6.000   3rd Qu.:6.000   3rd Qu.:6.000  
##  Max.   :7.000   Max.   :7.000   Max.   :7.000   Max.   :7.000  
##       Q49             Q50             Q51             Q52       
##  Min.   :2.000   Min.   :1.000   Min.   :1.000   Min.   :1.000  
##  1st Qu.:4.000   1st Qu.:4.000   1st Qu.:2.000   1st Qu.:2.000  
##  Median :5.000   Median :5.000   Median :4.000   Median :4.000  
##  Mean   :4.776   Mean   :4.812   Mean   :3.308   Mean   :3.532  
##  3rd Qu.:6.000   3rd Qu.:6.000   3rd Qu.:4.000   3rd Qu.:5.000  
##  Max.   :7.000   Max.   :7.000   Max.   :7.000   Max.   :7.000  
##       Q53             Q54            Q55             Q56             Q57       
##  Min.   :1.000   Min.   :1.00   Min.   :1.000   Min.   :1.000   Min.   :1.000  
##  1st Qu.:2.000   1st Qu.:2.00   1st Qu.:2.000   1st Qu.:2.000   1st Qu.:3.000  
##  Median :4.000   Median :3.00   Median :3.000   Median :3.000   Median :4.000  
##  Mean   :3.616   Mean   :3.16   Mean   :3.136   Mean   :3.148   Mean   :4.316  
##  3rd Qu.:5.000   3rd Qu.:4.00   3rd Qu.:4.000   3rd Qu.:4.000   3rd Qu.:5.000  
##  Max.   :7.000   Max.   :7.00   Max.   :6.000   Max.   :7.000   Max.   :7.000  
##       Q58             Q59            Q60             Q61            Q62       
##  Min.   :2.000   Min.   :1.00   Min.   :1.000   Min.   :1.00   Min.   :1.000  
##  1st Qu.:4.000   1st Qu.:3.00   1st Qu.:3.000   1st Qu.:3.00   1st Qu.:3.000  
##  Median :4.000   Median :4.00   Median :4.000   Median :4.00   Median :4.000  
##  Mean   :4.384   Mean   :4.32   Mean   :3.772   Mean   :3.68   Mean   :3.672  
##  3rd Qu.:5.000   3rd Qu.:5.00   3rd Qu.:5.000   3rd Qu.:5.00   3rd Qu.:5.000  
##  Max.   :7.000   Max.   :7.00   Max.   :7.000   Max.   :7.00   Max.   :7.000

6. Cross Tabulation

demographic_vars <- c("Age", "Gender", "MaritalStatus", "NumberOfChildren", "FirstTimePurchase", "IncomeCategory")
for (var in demographic_vars) {
  freq_table <- table(demographics_data[[var]], demographics_data$PreferenceGroup)
  print(freq_table)
  print(chisq.test(freq_table))
}

##     
##      1 2 3
##   20 3 2 3
##   21 1 0 1
##   22 2 1 1
##   23 2 0 2
##   24 2 0 4
##   25 0 0 4
##   26 5 5 3
##   27 5 3 2
##   28 2 3 2
##   29 6 2 1
##   30 5 3 3
##   31 1 3 4
##   32 8 2 1
##   33 3 0 3
##   34 6 4 1
##   35 2 3 2
##   36 2 2 1
##   37 3 2 2
##   38 1 1 1
##   39 3 3 3
##   40 5 7 2
##   41 9 1 3
##   42 6 1 5
##   43 9 4 0
##   44 7 2 2
##   45 1 2 0
##   46 2 1 0
##   47 2 1 2
##   48 3 3 2
##   49 1 3 0
##   50 1 1 0
##   51 2 1 1
##   52 2 0 0
##   54 1 1 0
##   55 2 1 1
##   56 1 1 0
##   57 0 1 0
##   58 0 2 0

## Warning in chisq.test(freq_table): Chi-squared approximation may be incorrect

## 
##  Pearson's Chi-squared test
## 
## data:  freq_table
## X-squared = 76.904, df = 74, p-value = 0.3858
## 
##    
##      1  2  3
##   1 54 36 40
##   2 62 36 22
## 
##  Pearson's Chi-squared test
## 
## data:  freq_table
## X-squared = 5.3861, df = 2, p-value = 0.06767
## 
##    
##      1  2  3
##   1 66 34 27
##   2 14  6  8
##   3 36 32 27
## 
##  Pearson's Chi-squared test
## 
## data:  freq_table
## X-squared = 5.2093, df = 4, p-value = 0.2665
## 
##    
##      1  2  3
##   0 62 45 41
##   1 29 12  7
##   2 15  7  9
##   3  8  8  4
##   4  2  0  1

## Warning in chisq.test(freq_table): Chi-squared approximation may be incorrect

## 
##  Pearson's Chi-squared test
## 
## data:  freq_table
## X-squared = 8.6946, df = 8, p-value = 0.3687
## 
##    
##       1   2   3
##   1  13   8  16
##   2 103  64  46
## 
##  Pearson's Chi-squared test
## 
## data:  freq_table
## X-squared = 7.9211, df = 2, p-value = 0.01905
## 
##    
##      1  2  3
##   1 11  5  7
##   2 19 15 12
##   3 18 16 12
##   4 19 16 11
##   5 28 12 11
##   6 21  8  9
## 
##  Pearson's Chi-squared test
## 
## data:  freq_table
## X-squared = 6.148, df = 10, p-value = 0.8027

b. Report chi-squared tests and what they indicate

Age vs. PreferenceGroup

p ≈ 0.3858

No significant relationship between age groups and Ka preference at the 5 percent level.

Gender vs. PreferenceGroup

p ≈ 0.0677

Slightly above the usual 0.05 cutoff, indicating no strong evidence that gender differs across Ka Choosers, Non-Choosers, or Middle. However, it is borderline, suggesting a possible weak association.

MaritalStatus vs. PreferenceGroup

p ≈ 0.2665

Not significant. Marital status does not appear to be linked strongly to preference group.

NumberOfChildren vs. PreferenceGroup

p ≈ 0.3687

Not significant. Different family sizes do not strongly differentiate preference.

FirstTimePurchase vs. PreferenceGroup

p ≈ 0.0190

Significant. First-time purchase status shows a statistically significant association with Ka preference.

IncomeCategory vs. PreferenceGroup

p ≈ 0.8027

Not significant. Income does not appear to meaningfully separate Ka Choosers from others.

a. Graphs shown separately

Age vs. PreferenceGroup

age_table <- table(demographics_data$Age, demographics_data$PreferenceGroup)
age_table

##     
##      1 2 3
##   20 3 2 3
##   21 1 0 1
##   22 2 1 1
##   23 2 0 2
##   24 2 0 4
##   25 0 0 4
##   26 5 5 3
##   27 5 3 2
##   28 2 3 2
##   29 6 2 1
##   30 5 3 3
##   31 1 3 4
##   32 8 2 1
##   33 3 0 3
##   34 6 4 1
##   35 2 3 2
##   36 2 2 1
##   37 3 2 2
##   38 1 1 1
##   39 3 3 3
##   40 5 7 2
##   41 9 1 3
##   42 6 1 5
##   43 9 4 0
##   44 7 2 2
##   45 1 2 0
##   46 2 1 0
##   47 2 1 2
##   48 3 3 2
##   49 1 3 0
##   50 1 1 0
##   51 2 1 1
##   52 2 0 0
##   54 1 1 0
##   55 2 1 1
##   56 1 1 0
##   57 0 1 0
##   58 0 2 0

Gender vs. PreferenceGroup

gender_table <- table(demographics_data$Gender, demographics_data$PreferenceGroup)
gender_table

##    
##      1  2  3
##   1 54 36 40
##   2 62 36 22

MaritalStatus vs. PreferenceGroup

ms_table <- table(demographics_data$MaritalStatus, demographics_data$PreferenceGroup)
ms_table

##    
##      1  2  3
##   1 66 34 27
##   2 14  6  8
##   3 36 32 27

NumberOfChildren vs. PreferenceGroup

noc_table <- table(demographics_data$NumberOfChildren, demographics_data$PreferenceGroup)
noc_table

##    
##      1  2  3
##   0 62 45 41
##   1 29 12  7
##   2 15  7  9
##   3  8  8  4
##   4  2  0  1

FirstTimePurchase vs. PreferenceGroup

ftp_table <- table(demographics_data$FirstTimePurchase, demographics_data$PreferenceGroup)
ftp_table

##    
##       1   2   3
##   1  13   8  16
##   2 103  64  46

IncomeCategory vs. PreferenceGroup

ic_table <- table(demographics_data$IncomeCategory, demographics_data$PreferenceGroup)
ic_table

##    
##      1  2  3
##   1 11  5  7
##   2 19 15 12
##   3 18 16 12
##   4 19 16 11
##   5 28 12 11
##   6 21  8  9

c. Explain whether demographic variables can be useful to separate out "Ka Choosers" and "Ka Non-Choosers?

With the exception of FirstTimePurchase, all of the demographic variables in this data set do not significantly differentiate Ka Choosers from Non-Choosers or the Middle group. Only FirstTimePurchase appears to be a strong demographic predictor, which implies that first-time car buyers are more apt (or less apt) to choose a Ka than repeat buyers. No compelling evidence was found for the other variables—Age, Gender, MaritalStatus, NumberOfChildren, and IncomeCategory—to show differences between the three preference groups.

In practice, this would not be an effective method for targeting or segmenting Ka buyers using these demographics alone. It might be necessary for marketers and analysts to combine FirstTimePurchase with other data, such as attitudinal or psychographic data, in an attempt to create a better picture of who would be a Ka Chooser versus a Non-Chooser.

7. Multinomial logit analysis

a. Advantages of Conducting a Multinomial Logit Analysis

A multinomial logit model allows researchers to model how multiple demographic variables predict the choice among three or more categories (in this case, Ka Choosers vs. Ka Non-Choosers vs. Middle). The primary advantages are:

Handling Multiple Categories at Once: Instead of fitting separate models to each pair of groups, the multinomial logit compares all of the preference groups to a given baseline group in a single comparison.
Controlling for Other Variables: The contribution of each predictor is estimated while holding the others constant, providing a clearer picture of which demographics are truly significant.
Detailed Coefficients and Probabilities: The model gives separate coefficients (log-odds) for each group relative to the baseline, which is easy to interpret in terms of how each variable alters the probability of being in one group versus another.

demographics_data$PreferenceGroup <- factor(
  demographics_data$PreferenceGroup,
  levels = c(1, 2, 3),
  labels = c("KaChooser", "KaNonChooser", "Middle")
)

mlogit_model <- multinom(
  PreferenceGroup ~ Age + Gender + MaritalStatus + NumberOfChildren + FirstTimePurchase + IncomeCategory,
  data = demographics_data
)

## # weights:  24 (14 variable)
## initial  value 274.653072 
## iter  10 value 252.948633
## final  value 251.403856 
## converged

summary(mlogit_model)

## Call:
## multinom(formula = PreferenceGroup ~ Age + Gender + MaritalStatus + 
##     NumberOfChildren + FirstTimePurchase + IncomeCategory, data = demographics_data)
## 
## Coefficients:
##              (Intercept)        Age     Gender MaritalStatus NumberOfChildren
## KaNonChooser   -0.653224  0.0148776 -0.1080422     0.2387384      -0.06096890
## Middle          3.643868 -0.0381628 -0.8698026     0.2689317      -0.03132285
##              FirstTimePurchase IncomeCategory
## KaNonChooser       -0.07855675    -0.12653824
## Middle             -1.04669453    -0.06479152
## 
## Std. Errors:
##              (Intercept)        Age    Gender MaritalStatus NumberOfChildren
## KaNonChooser    1.275940 0.01725463 0.3101613     0.1642739        0.1472353
## Middle          1.266737 0.01935286 0.3468027     0.1788484        0.1640463
##              FirstTimePurchase IncomeCategory
## KaNonChooser         0.4939414     0.09881107
## Middle               0.4501743     0.10565048
## 
## Residual Deviance: 502.8077 
## AIC: 530.8077

b. Interpretation

From the reported model:

Age: Negative coefficient (-0.038) for Middle vs. KaChooser.

A negative log-odds typically implies that as Age increases, the probability of being Middle decreases relative to KaChooser (once other variables are fixed). The marginal effect for older respondents would thus be negative for the Middle category and positive for KaChooser, though small in magnitude.

Gender: Negative coefficient for Middle vs. KaChooser (-0.87).

If Gender=1 for male, men have a lower probability of being in the Middle group, shifting probability somewhat toward KaChooser (or KaNonChooser). The marginal effect would show how that difference translates into actual probability changes.

FirstTimePurchase: Substantial negative effect for Middle vs. KaChooser (-1.05).

Being a first-time buyer reduces the likelihood of being Middle, boosting the probability of KaChooser. The marginal effect captures how a one-unit shift (from 0 to 1) in first-time purchase status alters the distribution across categories.

KaNonChooser:

Most coefficients for KaNonChooser vs. KaChooser were small and not statistically significant, so the marginal effects would be near zero, indicating minimal change in the probability of KaNonChooser vs. KaChooser for each demographic variable.

c. Summary

Age: Marginal effects would likely show a drop in probability of being Middle (and rise in KaChooser) as age increases.
Gender: If = male=1, then the marginal effect shows men are less likely to be Middle vs. KaChooser, removing a few points from being Middle.
FirstTimePurchase: Extremely large negative log-odds for Middle means a large increase in KaChooser probability for first-time purchasers.

Even though cross-tabs are only indicating that FirstTimePurchase is significant, marginal effects from the multinomial model establish that with other influences controlled, Age and Gender also show substantial probability changes—though mainly for Middle versus KaChooser differentiation.Therefore, Age, Gender, and FirstTimePurchase all reveal substantial marginal effects on Middle affiliation, but none of them predict KaNonChooser versus KaChooser in this sample with certainty.

8. Clustering

a. Cluster solutions

# K-means
set.seed(123)

# Store models for k=3,4,5
k_values <- 3:5
k_models <- list()

for (k in k_values) {
  km <- kmeans(psychographic_data, centers = k, nstart = 25)
  k_models[[as.character(k)]] <- km
  cat("\nNumber of clusters =", k, "\n")
  print(km$size)  # how many respondents in each cluster
}

## 
## Number of clusters = 3 
## [1] 83 83 84
## 
## Number of clusters = 4 
## [1] 62 62 63 63
## 
## Number of clusters = 5 
## [1] 49 51 49 52 49

#Elbow
wss <- sapply(1:8, function(k) {
  kmeans(psychographic_data, centers = k, nstart = 25)$tot.withinss
})
plot(1:8, wss, type = "b", xlab = "Number of Clusters", ylab = "Total Within-SS")

Managerial decision: I will choose k = 4 for better interpretability and better distinction.

No. of respondents: 62 62 63 63

C1: Name: "Eco & Value Seekers" - Adjectives: eco-conscious, feature-oriented, time-saving

C2: Name: "Trendy but Balanced" - Adjectives: style-focused, moderately practical, balanced priorities

C3. Name: "All-Features & Performance’ - Adjectives: feature-hungry, performance-loving, versatile
C4. Name: "Fashion-Focused" - Adjectives: style-centric, brand/image conscious, trend-aware

b. Test the statistical significance

wcss_values <- sapply(2:5, function(k){
  kmeans(psychographic_data, centers = k, nstart=25)$tot.withinss
})
wcss_values

## [1] 358553.50 177509.74 114028.16  84750.62

The move from k=2 to k=3 reduces WCSS by about 181,000, a massive improvement. An increase from k=3 to k=4 decreases WCSS by about 63,000—a notable but smaller improvement. Cutting back from k=4 to k=5 brings it down by just 29,000, indicating decreasing returns.

set.seed(123)
final_kmeans <- kmeans(psychographic_data, centers = 4, nstart = 25)

demographics_data$cluster <- final_kmeans$cluster

c. Explain key takeaways from this analysis

clust_pref <- demographics_data %>%
  group_by(cluster, PreferenceGroup) %>%
  summarise(count = n(), .groups = "drop") %>%
  pivot_wider(names_from = PreferenceGroup, values_from = count, values_fill = 0)

# 2) Print the cross-tab table
print(clust_pref)

## # A tibble: 4 × 4
##   cluster KaChooser KaNonChooser Middle
##     <int>     <int>        <int>  <int>
## 1       1        29           22     12
## 2       2        31           16     15
## 3       3        32           16     14
## 4       4        24           18     21

# 3) Bar chart: distribution of preference groups within each cluster
ggplot(demographics_data, aes(x = factor(cluster), fill = factor(PreferenceGroup))) +
  geom_bar(position = "dodge") +
  labs(title = "Attitudinal Segments by Preference Group",
       x = "Cluster",
       y = "Count",
       fill = "Preference Group") +
  theme_minimal()

Most Ka-Friendly Clusters Clusters 1 and 3 are represented well by KaChooser, but for opposite reasons. Cluster 3 also has a large Non-Chooser segment, which means Cluster 3 attitudes are more polarized.
Balanced vs. Polarized Cluster 2 is reasonably balanced between Choosers and Non-Choosers, while Cluster 3 splits sharply between Ka fans and rejectors.
Potential “Fence-Sitters” Cluster 4 has a bigger Middle segment, i.e., many respondents here are open to persuasion. Targeted promotion may persuade this group if critical issues (e.g., styling, features, or performance) are addressed.

Marketing Implications

Cluster 1: Tap the Ka’s already established strengths (e.g., practicality, reliability) since this group is already extremely open. Cluster 2: Position messages to emphasize why the Ka may perform better or be more appealing than competitors, given its more balanced views. Cluster 3: Identify what specific element polarizes the consumers—some are drawn towards just what the Ka offers, and others are turned away by it. Cluster 4: Address the issues that cause so many to remain undecided. A slight nudge in the correct direction could convert a substantial amount of Middle respondents into Ka Choosers.

9. Demographic profile of clusters

demographic_vars <- c("Gender", "MaritalStatus", "NumberOfChildren",
                      "FirstTimePurchase", "IncomeCategory", "Age")

cat("\nCross-tabulations of demographic variables by cluster:\n")

## 
## Cross-tabulations of demographic variables by cluster:

for (dv in demographic_vars) {
  cat("\n-----------------------------------\n")
  cat("Cross-tab for:", dv, "vs. Cluster\n")
  freq_table <- table(demographics_data[[dv]], demographics_data$cluster)
  print(freq_table)
}

## 
## -----------------------------------
## Cross-tab for: Gender vs. Cluster
##    
##      1  2  3  4
##   1 33 28 32 37
##   2 30 34 30 26
## 
## -----------------------------------
## Cross-tab for: MaritalStatus vs. Cluster
##    
##      1  2  3  4
##   1 33 32 31 31
##   2 11  3  9  5
##   3 19 27 22 27
## 
## -----------------------------------
## Cross-tab for: NumberOfChildren vs. Cluster
##    
##      1  2  3  4
##   0 42 29 38 39
##   1 12 12 13 11
##   2  4 12  6  9
##   3  5  7  4  4
##   4  0  2  1  0
## 
## -----------------------------------
## Cross-tab for: FirstTimePurchase vs. Cluster
##    
##      1  2  3  4
##   1 11 10  5 11
##   2 52 52 57 52
## 
## -----------------------------------
## Cross-tab for: IncomeCategory vs. Cluster
##    
##      1  2  3  4
##   1  4  5  5  9
##   2 14 10 12 10
##   3  9 14 10 13
##   4 11  9 13 13
##   5 15 15 13  8
##   6 10  9  9 10
## 
## -----------------------------------
## Cross-tab for: Age vs. Cluster
##     
##      1 2 3 4
##   20 3 1 2 2
##   21 0 0 0 2
##   22 3 0 0 1
##   23 1 0 2 1
##   24 2 1 2 1
##   25 1 0 2 1
##   26 2 3 4 4
##   27 1 3 3 3
##   28 2 4 0 1
##   29 4 2 0 3
##   30 2 3 3 3
##   31 2 1 1 4
##   32 2 3 5 1
##   33 1 3 2 0
##   34 3 3 3 2
##   35 2 2 2 1
##   36 3 1 1 0
##   37 0 2 1 4
##   38 0 1 0 2
##   39 4 1 0 4
##   40 3 4 2 5
##   41 2 6 5 0
##   42 2 1 5 4
##   43 7 0 3 3
##   44 0 5 4 2
##   45 0 1 1 1
##   46 0 3 0 0
##   47 1 1 2 1
##   48 1 1 3 3
##   49 4 0 0 0
##   50 1 1 0 0
##   51 0 2 0 2
##   52 2 0 0 0
##   54 0 0 2 0
##   55 0 1 1 2
##   56 1 0 1 0
##   57 0 1 0 0
##   58 1 1 0 0

Cluster 1

Gender: Balanced (33 vs. 30).
Marital Status: Mostly status=1 or 3.
Children: Primarily 0 or 1.
FirstTimePurchase: 11 yes, 52 no → Some first-timers, but majority are repeat buyers.
Income: Spread across categories, slightly more in mid/upper categories than the very bottom or top.
Age: Ranges from early 20s to 40s, with a few small clusters in the lower 20s and mid 30s.

Cluster 2

Gender: Balanced (28 vs. 34).
Marital Status: Fewer in category=2.
Children: Noticeably more 2-children households than Cluster 1.
FirstTimePurchase: 10 vs. 52 → also mostly repeat buyers.
Income: Fairly even distribution, no strong skew.
Age: Possibly leaning slightly older (some higher counts at ages 40, 41).

Cluster 3

Gender: Balanced (32 vs. 30).
Marital Status: Similar to cluster 1's distribution, with category=3 also prominent.
Children: 0 children is dominant (38), but 2 children is 6, 3 children is 4.
FirstTimePurchase: Only 5 yes → lowest proportion of first-timers among the clusters.
Income: Spread out, no glaring pattern.
Age: Also wide-ranging, though not heavily skewed to any single bracket.

Cluster 4

Gender: Balanced (37 vs. 26), with a slight tilt toward gender=1.
Marital Status: 31 of status=1, 27 of status=3.
Children: Mostly 0 children. A moderate number have 2 or 3 kids.
FirstTimePurchase: 11 yes, 52 no.
Income: A fair representation across categories 1–6, slightly more in category=1 than some clusters.
Age: Also quite mixed, with pockets in the upper 30s to mid 40s.

10. Demographics vs. Attitudinal segmentation

On the basis of cross-tabulation results and multinomial logit results, attitudinal (psychographic) segmentation is the superior method in separating Ford Ka’s target buyers. The following are the reasons why:

Demographics Show Weak Differentiation No sharp pattern is detectable for Ka preference from both the chi-square tests and within each cluster demographic profile for either age, gender, or income to uniquely predict Ka preference. Demographic quantifications were scattered relatively evenly such that only narrow differences between Ka Choosers and Non-Choosers were obtained.
Attitudinal Segments Identify Real Motivations K-means clustering of psychographic characteristics revealed cleaner edges across groups prioritizing attributes of reliability, eco-friendliness, style, or performance. These psychographics explained more accurately which Ka respondents were likely to be (e.g., the eco-friendly or price-conscious ones) than demographics did.
Other Actionable Targeting By addressing attitudes—like “wanting a smaller footprint,” “appreciating style,” or “emphasizing reliability”—Ford is able to more precisely customize its advertising campaigns. Rather than addressing a broad demographic (e.g., young adults under 30), they are able to address “eco and value seekers,” “performance enthusiasts,” or other attitude-based segments that most closely align with the Ka’s capabilities.
Increased Predictive Power When applied to a model predicting Ka preference, attitudinal clusters greatly improved the fit compared to using demographics alone. This suggests that lifestyle and psychographic characteristics have a greater impact on the purchase decision for a Ford Ka.

Therefore, whereas demographics provide broad context, attitudinal segmentation yields more predictive, richer information about who is most likely to buy the Ka and why.

11. Potential problems with the data

Non-Response Bias

If certain types of respondents (e.g., older adults, higher-income individuals) systematically declined participation, the sample may not reflect the true population's attitudes or demographics.

Coverage Error

Depending on how participants were recruited (online, phone, in-person), some groups (those without reliable internet, for instance) might be underrepresented, skewing results.

Missing Data

If any questions had a high rate of unanswered items, the final sample for analyses could be smaller or biased toward those who completed all questions.

Small or Imbalanced Subsamples

Some demographic categories (e.g., certain income brackets or older age groups) might have very few respondents, undermining the reliability of chi-square and multinomial logit conclusions.

Row Alignment

Merging psychographic and demographic data requires a consistent ID or row order. Any mismatch (e.g., a respondent's cluster assignment not lining up with the correct demographic record) can introduce errors.

Self-Report Bias

Attitudinal questions rely on self-reported preferences, which may not always match real-world behavior.

Time/Context Sensitivity

Consumer attitudes can shift quickly with market changes. Results from a specific period might not generalize beyond that timeframe if market conditions evolve (e.g., new competitors, economic changes).

MA&R Midsem

Aritra Ray

2025-03-19