install.packages("tidyverse", repos = "https://cloud.r-project.org")
##
## The downloaded binary packages are in
## /var/folders/5_/389qrkvs1sd7nkp792bslx5r0000gn/T//RtmpP0hCgJ/downloaded_packages
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.1 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
install.packages("dyplyr", repos = "https://cloud.r-project.org")
## Warning: package 'dyplyr' is not available for this version of R
##
## A version of this package for your version of R might be available elsewhere,
## see the ideas at
## https://cran.r-project.org/doc/manuals/r-patched/R-admin.html#Installing-packages
library(dplyr)
install.packages("psych", repos = "https://cloud.r-project.org")
##
## The downloaded binary packages are in
## /var/folders/5_/389qrkvs1sd7nkp792bslx5r0000gn/T//RtmpP0hCgJ/downloaded_packages
library(psych)
## Warning: package 'psych' was built under R version 4.2.3
##
## Attaching package: 'psych'
##
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
install.packages("factoextra", repos = "https://cloud.r-project.org")
##
## The downloaded binary packages are in
## /var/folders/5_/389qrkvs1sd7nkp792bslx5r0000gn/T//RtmpP0hCgJ/downloaded_packages
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
install.packages("NbClust", repos = "https://cloud.r-project.org")
##
## The downloaded binary packages are in
## /var/folders/5_/389qrkvs1sd7nkp792bslx5r0000gn/T//RtmpP0hCgJ/downloaded_packages
library(NbClust)
library(ggplot2)
install.packages("ggfortify", repos = "https://cloud.r-project.org")
##
## The downloaded binary packages are in
## /var/folders/5_/389qrkvs1sd7nkp792bslx5r0000gn/T//RtmpP0hCgJ/downloaded_packages
library(ggfortify)
library(stats)
library(corrplot)
## corrplot 0.92 loaded
library(knitr)
library(kableExtra)
##
## Attaching package: 'kableExtra'
##
## The following object is masked from 'package:dplyr':
##
## group_rows
library(readxl)
Read the data for the Demographic data
dt1 <- read_csv("MKTMT2.csv")
## New names:
## Rows: 256 Columns: 10
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (10): Demographic Data, ...2, ...3, ...4, ...5, ...6, ...7, ...8, ...9, ...
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...2`
## • `` -> `...3`
## • `` -> `...4`
## • `` -> `...5`
## • `` -> `...6`
## • `` -> `...7`
## • `` -> `...8`
## • `` -> `...9`
## • `` -> `...10`
Read the Data For Psychographic data
dt3 <- read_csv("MKTMT.csv")
## New names:
## Rows: 256 Columns: 89
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (63): Psychographic Data, ...2, ...3, ...4, ...5, ...6, ...7, ...8, ...9... lgl
## (26): ...64, ...65, ...66, ...67, ...68, ...69, ...70, ...71, ...72, ......
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...2`
## • `` -> `...3`
## • `` -> `...4`
## • `` -> `...5`
## • `` -> `...6`
## • `` -> `...7`
## • `` -> `...8`
## • `` -> `...9`
## • `` -> `...10`
## • `` -> `...11`
## • `` -> `...12`
## • `` -> `...13`
## • `` -> `...14`
## • `` -> `...15`
## • `` -> `...16`
## • `` -> `...17`
## • `` -> `...18`
## • `` -> `...19`
## • `` -> `...20`
## • `` -> `...21`
## • `` -> `...22`
## • `` -> `...23`
## • `` -> `...24`
## • `` -> `...25`
## • `` -> `...26`
## • `` -> `...27`
## • `` -> `...28`
## • `` -> `...29`
## • `` -> `...30`
## • `` -> `...31`
## • `` -> `...32`
## • `` -> `...33`
## • `` -> `...34`
## • `` -> `...35`
## • `` -> `...36`
## • `` -> `...37`
## • `` -> `...38`
## • `` -> `...39`
## • `` -> `...40`
## • `` -> `...41`
## • `` -> `...42`
## • `` -> `...43`
## • `` -> `...44`
## • `` -> `...45`
## • `` -> `...46`
## • `` -> `...47`
## • `` -> `...48`
## • `` -> `...49`
## • `` -> `...50`
## • `` -> `...51`
## • `` -> `...52`
## • `` -> `...53`
## • `` -> `...54`
## • `` -> `...55`
## • `` -> `...56`
## • `` -> `...57`
## • `` -> `...58`
## • `` -> `...59`
## • `` -> `...60`
## • `` -> `...61`
## • `` -> `...62`
## • `` -> `...63`
## • `` -> `...64`
## • `` -> `...65`
## • `` -> `...66`
## • `` -> `...67`
## • `` -> `...68`
## • `` -> `...69`
## • `` -> `...70`
## • `` -> `...71`
## • `` -> `...72`
## • `` -> `...73`
## • `` -> `...74`
## • `` -> `...75`
## • `` -> `...76`
## • `` -> `...77`
## • `` -> `...78`
## • `` -> `...79`
## • `` -> `...80`
## • `` -> `...81`
## • `` -> `...82`
## • `` -> `...83`
## • `` -> `...84`
## • `` -> `...85`
## • `` -> `...86`
## • `` -> `...87`
## • `` -> `...88`
## • `` -> `...89`
Cleaning the data
Demodata <- dt1[-1:-5,]
Psydata <- dt3[-1:-5,-64:-89]
Fixing the issue with labels
names(Demodata) <- as.character(unlist(Demodata[1, ]))
Demodata <- Demodata[-1,]
names(Psydata) <- as.character(unlist(Psydata[1,]))
Psydata <- Psydata[-1,]
Question 6 Conduct cross tabulation analysis
# Cross-tabulation for gender
table_gender <- table(Demodata$`Preference Group`, Demodata$Gender)
Chisq_gender <- chisq.test(table_gender)
# Output the cross-tabulation
print("Cross-Tabulation for Gender:")
## [1] "Cross-Tabulation for Gender:"
print(table_gender)
##
## 1 2
## 1 54 62
## 2 36 36
## 3 40 22
# Output chi-squared test result for gender
print("Chi-Squared Test Result for Gender:")
## [1] "Chi-Squared Test Result for Gender:"
print(Chisq_gender)
##
## Pearson's Chi-squared test
##
## data: table_gender
## X-squared = 5.3861, df = 2, p-value = 0.06767
Demodata$`Age Category` <- as.factor(Demodata$`Age Category`)
Demodata$Gender <- as.factor(Demodata$Gender)
Demodata$`Marital Status` <- as.factor(Demodata$`Marital Status`)
Demodata$`1st Time Purchase` <- as.factor(Demodata$`1st Time Purchase`)
Demodata$`Children Category` <- as.factor(Demodata$`Children Category`)
Demodata$`Income Category` <- as.factor(Demodata$`Income Category`)
# Cross-tabulation for Age
table_age <- table(Demodata$`Preference Group`, Demodata$`Age Category`)
Chisq_age <- chisq.test(table_age)
# Output the cross-tabulation
print("Cross-Tabulation for Age Category:")
## [1] "Cross-Tabulation for Age Category:"
print(table_age)
##
## 1 2 3 4 5 6
## 1 10 18 23 11 36 18
## 2 3 13 12 11 15 18
## 3 11 12 12 9 12 6
# Output chi-squared test result for Age
print("Chi-Squared Test Result for Age Category:")
## [1] "Chi-Squared Test Result for Age Category:"
print(Chisq_age)
##
## Pearson's Chi-squared test
##
## data: table_age
## X-squared = 16.571, df = 10, p-value = 0.08442
# Cross-tabulation for
table_MS <- table(Demodata$`Preference Group`, Demodata$`Marital Status`)
Chisq_MS <- chisq.test(table_MS)
# Output the cross-tabulation
print("Cross-Tabulation for Marital Status:")
## [1] "Cross-Tabulation for Marital Status:"
print(table_MS)
##
## 1 2 3
## 1 66 14 36
## 2 34 6 32
## 3 27 8 27
# Output chi-squared test result for MS
print("Chi-Squared Test Result for Marital Status:")
## [1] "Chi-Squared Test Result for Marital Status:"
print(Chisq_MS)
##
## Pearson's Chi-squared test
##
## data: table_MS
## X-squared = 5.2093, df = 4, p-value = 0.2665
# Cross-tabulation for Number of Children category
table_NC <- table(Demodata$`Preference Group`, Demodata$`Children Category`)
Chisq_NC <- chisq.test(table_NC)
# Output the cross-tabulation
print("Cross-Tabulation for Children Category:")
## [1] "Cross-Tabulation for Children Category:"
print(table_NC)
##
## 0 1 2
## 1 62 29 25
## 2 45 12 15
## 3 41 7 14
# Output chi-squared test result for NC
print("Chi-Squared Test Result for Children Category:")
## [1] "Chi-Squared Test Result for Children Category:"
print(Chisq_NC)
##
## Pearson's Chi-squared test
##
## data: table_NC
## X-squared = 5.6242, df = 4, p-value = 0.229
# Cross-tabulation for first time purchase
table_ftp <- table(Demodata$`Preference Group`, Demodata$`1st Time Purchase`)
Chisq_ftp <- chisq.test(table_ftp)
# Output the cross-tabulation
print("Cross-Tabulation for first time purchase:")
## [1] "Cross-Tabulation for first time purchase:"
print(table_ftp)
##
## 1 2
## 1 13 103
## 2 8 64
## 3 16 46
# Output chi-squared test result for first time purchase
print("Chi-Squared Test Result for first time purchase:")
## [1] "Chi-Squared Test Result for first time purchase:"
print(Chisq_ftp)
##
## Pearson's Chi-squared test
##
## data: table_ftp
## X-squared = 7.9211, df = 2, p-value = 0.01905
# Cross-tabulation for Income category
table_ic <- table(Demodata$`Preference Group`, Demodata$`Income Category`)
Chisq_ic <- chisq.test(table_ic)
# Output the cross-tabulation
print("Cross-Tabulation for Income category:")
## [1] "Cross-Tabulation for Income category:"
print(table_ic)
##
## 1 2 3 4 5 6
## 1 11 19 18 19 28 21
## 2 5 15 16 16 12 8
## 3 7 12 12 11 11 9
# Output chi-squared test result for gender
print("Chi-Squared Test Result for Income category:")
## [1] "Chi-Squared Test Result for Income category:"
print(Chisq_ic)
##
## Pearson's Chi-squared test
##
## data: table_ic
## X-squared = 6.148, df = 10, p-value = 0.8027
library(nnet)
Multinomial Logit analysis:
model <- multinom(`Preference Group` ~ Gender + `Marital Status` + `1st Time Purchase` + `Age Category` + `Children Category` + `Income Category`, data = Demodata)
## # weights: 54 (34 variable)
## initial value 274.653072
## iter 10 value 242.348533
## iter 20 value 242.121883
## final value 242.120989
## converged
summary(model)
## Call:
## multinom(formula = `Preference Group` ~ Gender + `Marital Status` +
## `1st Time Purchase` + `Age Category` + `Children Category` +
## `Income Category`, data = Demodata)
##
## Coefficients:
## (Intercept) Gender2 `Marital Status`2 `Marital Status`3
## 2 -1.206737 -0.1832794 -0.3994818 0.4340006
## 3 1.010198 -0.9089321 0.1019688 0.4504932
## `1st Time Purchase`2 `Age Category`2 `Age Category`3 `Age Category`4
## 2 -0.3207994 0.9932451 0.7085791 1.31314357
## 3 -1.2400532 -0.1803708 -0.4817254 -0.02764787
## `Age Category`5 `Age Category`6 `Children Category`1 `Children Category`2
## 2 0.2918722 1.2510204 -0.5615958 -0.16765720
## 3 -1.0987485 -0.9952638 -1.0295955 0.08703588
## `Income Category`2 `Income Category`3 `Income Category`4 `Income Category`5
## 2 0.6915027 0.8019001 0.6306727 -0.002483459
## 3 0.6386279 0.4871914 0.5047717 0.039813363
## `Income Category`6
## 2 -0.36476058
## 3 0.04378515
##
## Std. Errors:
## (Intercept) Gender2 `Marital Status`2 `Marital Status`3
## 2 0.9314591 0.3266593 0.5606070 0.3408875
## 3 0.7762766 0.3598778 0.5370417 0.3722750
## `1st Time Purchase`2 `Age Category`2 `Age Category`3 `Age Category`4
## 2 0.5138924 0.7711673 0.7729856 0.8116159
## 3 0.4763903 0.6253002 0.6233675 0.6832566
## `Age Category`5 `Age Category`6 `Children Category`1 `Children Category`2
## 2 0.7513390 0.7734935 0.4180463 0.4064363
## 3 0.6063919 0.7057009 0.5046900 0.4296165
## `Income Category`2 `Income Category`3 `Income Category`4 `Income Category`5
## 2 0.6766709 0.6756445 0.6732419 0.6768921
## 3 0.6883135 0.6908293 0.6917375 0.6855149
## `Income Category`6
## 2 0.7164009
## 3 0.6986548
##
## Residual Deviance: 484.242
## AIC: 552.242
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:psych':
##
## logit
## The following object is masked from 'package:dplyr':
##
## recode
## The following object is masked from 'package:purrr':
##
## some
Anova(model)
## Analysis of Deviance Table (Type II tests)
##
## Response: Preference Group
## LR Chisq Df Pr(>Chisq)
## Gender 6.8374 2 0.03275 *
## `Marital Status` 3.5588 4 0.46900
## `1st Time Purchase` 7.1211 2 0.02842 *
## `Age Category` 14.7059 10 0.14316
## `Children Category` 5.7687 4 0.21710
## `Income Category` 7.8519 10 0.64330
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Cluster analysis:
library(cluster)
# Try out different numbers of clusters
for(k in 2:5) {
print(paste("K-means with", k, "clusters"))
model2 <- kmeans(Psydata, centers = k, nstart = 25)
print(model2$size)
}
## [1] "K-means with 2 clusters"
## [1] 125 125
## [1] "K-means with 3 clusters"
## [1] 84 83 83
## [1] "K-means with 4 clusters"
## [1] 62 62 63 63
## [1] "K-means with 5 clusters"
## [1] 49 51 49 52 49
sil_width <- silhouette(model2$cluster, dist(Psydata))
summary(sil_width)
## Silhouette of 250 units in 5 clusters from silhouette.default(x = model2$cluster, dist = dist(Psydata)) :
## Cluster sizes and average silhouette widths:
## 49 51 49 52 49
## 0.4973565 0.3817432 0.3811023 0.5026281 0.3769831
## Individual silhouette widths:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.0004548 0.2989258 0.4910947 0.4284889 0.5816374 0.6487175
# K-means for 3 clusters
model_3 <- kmeans(Psydata, centers = 3, nstart = 25)
# Compute distance matrix
distance_matrix <- dist(Psydata)
# Silhouette analysis for 3 clusters
silhouette_3 <- silhouette(model_3$cluster, distance_matrix)
# Summary for 3 clusters
summary(silhouette_3)
## Silhouette of 250 units in 3 clusters from silhouette.default(x = model_3$cluster, dist = distance_matrix) :
## Cluster sizes and average silhouette widths:
## 83 83 84
## 0.5651368 0.5646827 0.4397129
## Individual silhouette widths:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.005791 0.408340 0.615206 0.522844 0.670733 0.706537
# K-means for 4 clusters
model_4 <- kmeans(Psydata, centers = 4, nstart = 25)
# Silhouette analysis for 4 clusters
silhouette_4 <- silhouette(model_4$cluster, distance_matrix)
# Summary for 4 clusters
summary(silhouette_4)
## Silhouette of 250 units in 4 clusters from silhouette.default(x = model_4$cluster, dist = distance_matrix) :
## Cluster sizes and average silhouette widths:
## 63 62 62 63
## 0.5336025 0.4146281 0.5326654 0.4064599
## Individual silhouette widths:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.004457 0.339388 0.541374 0.471824 0.627521 0.677918
# K-means for 2 clusters
model_6 <- kmeans(Psydata, centers = 2, nstart = 25)
silhouette_6 <- silhouette(model_6$cluster, distance_matrix)
summary(silhouette_6)
## Silhouette of 250 units in 2 clusters from silhouette.default(x = model_6$cluster, dist = distance_matrix) :
## Cluster sizes and average silhouette widths:
## 125 125
## 0.5904591 0.5899188
## Individual silhouette widths:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00581 0.53618 0.67637 0.59019 0.71517 0.73088
Psydata$cluster <- model_6$cluster
Demodata$cluster <- model_6$cluster
# Cross-tabulation for Age
table_genderc <- table(Demodata$cluster, Demodata$Gender)
# Output the cross-tabulation
print("Cross-Tabulation for Gender + Cluster:")
## [1] "Cross-Tabulation for Gender + Cluster:"
print(table_genderc)
##
## 1 2
## 1 61 64
## 2 69 56
# Cross-tabulation for Age
table_MSC <- table(Demodata$cluster, Demodata$`Marital Status`)
# Output the cross-tabulation
print("Cross-Tabulation for Marital Status + Cluster:")
## [1] "Cross-Tabulation for Marital Status + Cluster:"
print(table_MSC)
##
## 1 2 3
## 1 63 12 50
## 2 64 16 45
# Cross-tabulation for Age
table_FTPC <- table(Demodata$cluster, Demodata$`1st Time Purchase`)
# Output the cross-tabulation
print("Cross-Tabulation for First time purchase + Cluster:")
## [1] "Cross-Tabulation for First time purchase + Cluster:"
print(table_FTPC)
##
## 1 2
## 1 16 109
## 2 21 104
# Cross-tabulation for Age
table_Agec <- table(Demodata$cluster, Demodata$`Age Category`)
# Output the cross-tabulation
print("Cross-Tabulation for Age category + Cluster:")
## [1] "Cross-Tabulation for Age category + Cluster:"
print(table_Agec)
##
## 1 2 3 4 5 6
## 1 9 21 27 11 35 22
## 2 15 22 20 20 28 20
# Cross-tabulation for Age
table_CCC <- table(Demodata$cluster, Demodata$`Children Category`)
# Output the cross-tabulation
print("Cross-Tabulation for Children category + Cluster:")
## [1] "Cross-Tabulation for Children category + Cluster:"
print(table_CCC)
##
## 0 1 2
## 1 68 25 32
## 2 80 23 22
# Cross-tabulation for Age
table_ICC <- table(Demodata$cluster, Demodata$`Income Category`)
# Output the cross-tabulation
print("Cross-Tabulation for Income category + Cluster:")
## [1] "Cross-Tabulation for Income category + Cluster:"
print(table_ICC)
##
## 1 2 3 4 5 6
## 1 11 22 24 22 28 18
## 2 12 24 22 24 23 20