install.packages("tidyverse", repos = "https://cloud.r-project.org")

## 
## The downloaded binary packages are in
##  /var/folders/5_/389qrkvs1sd7nkp792bslx5r0000gn/T//RtmpP0hCgJ/downloaded_packages

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.1     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

install.packages("dyplyr", repos = "https://cloud.r-project.org")

## Warning: package 'dyplyr' is not available for this version of R
## 
## A version of this package for your version of R might be available elsewhere,
## see the ideas at
## https://cran.r-project.org/doc/manuals/r-patched/R-admin.html#Installing-packages

library(dplyr)
install.packages("psych", repos = "https://cloud.r-project.org")

## 
## The downloaded binary packages are in
##  /var/folders/5_/389qrkvs1sd7nkp792bslx5r0000gn/T//RtmpP0hCgJ/downloaded_packages

library(psych)

## Warning: package 'psych' was built under R version 4.2.3

## 
## Attaching package: 'psych'
## 
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha

install.packages("factoextra", repos = "https://cloud.r-project.org")

## 
## The downloaded binary packages are in
##  /var/folders/5_/389qrkvs1sd7nkp792bslx5r0000gn/T//RtmpP0hCgJ/downloaded_packages

library(factoextra)

## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa

install.packages("NbClust", repos = "https://cloud.r-project.org")

## 
## The downloaded binary packages are in
##  /var/folders/5_/389qrkvs1sd7nkp792bslx5r0000gn/T//RtmpP0hCgJ/downloaded_packages

library(NbClust)
library(ggplot2)
install.packages("ggfortify", repos = "https://cloud.r-project.org")

## 
## The downloaded binary packages are in
##  /var/folders/5_/389qrkvs1sd7nkp792bslx5r0000gn/T//RtmpP0hCgJ/downloaded_packages

library(ggfortify)
library(stats)
library(corrplot)

## corrplot 0.92 loaded

library(knitr)
library(kableExtra)

## 
## Attaching package: 'kableExtra'
## 
## The following object is masked from 'package:dplyr':
## 
##     group_rows

library(readxl)

Read the data for the Demographic data

dt1 <- read_csv("MKTMT2.csv")

## New names:
## Rows: 256 Columns: 10
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (10): Demographic Data, ...2, ...3, ...4, ...5, ...6, ...7, ...8, ...9, ...
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...2`
## • `` -> `...3`
## • `` -> `...4`
## • `` -> `...5`
## • `` -> `...6`
## • `` -> `...7`
## • `` -> `...8`
## • `` -> `...9`
## • `` -> `...10`

Read the Data For Psychographic data

dt3 <- read_csv("MKTMT.csv")

## New names:
## Rows: 256 Columns: 89
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (63): Psychographic Data, ...2, ...3, ...4, ...5, ...6, ...7, ...8, ...9... lgl
## (26): ...64, ...65, ...66, ...67, ...68, ...69, ...70, ...71, ...72, ......
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...2`
## • `` -> `...3`
## • `` -> `...4`
## • `` -> `...5`
## • `` -> `...6`
## • `` -> `...7`
## • `` -> `...8`
## • `` -> `...9`
## • `` -> `...10`
## • `` -> `...11`
## • `` -> `...12`
## • `` -> `...13`
## • `` -> `...14`
## • `` -> `...15`
## • `` -> `...16`
## • `` -> `...17`
## • `` -> `...18`
## • `` -> `...19`
## • `` -> `...20`
## • `` -> `...21`
## • `` -> `...22`
## • `` -> `...23`
## • `` -> `...24`
## • `` -> `...25`
## • `` -> `...26`
## • `` -> `...27`
## • `` -> `...28`
## • `` -> `...29`
## • `` -> `...30`
## • `` -> `...31`
## • `` -> `...32`
## • `` -> `...33`
## • `` -> `...34`
## • `` -> `...35`
## • `` -> `...36`
## • `` -> `...37`
## • `` -> `...38`
## • `` -> `...39`
## • `` -> `...40`
## • `` -> `...41`
## • `` -> `...42`
## • `` -> `...43`
## • `` -> `...44`
## • `` -> `...45`
## • `` -> `...46`
## • `` -> `...47`
## • `` -> `...48`
## • `` -> `...49`
## • `` -> `...50`
## • `` -> `...51`
## • `` -> `...52`
## • `` -> `...53`
## • `` -> `...54`
## • `` -> `...55`
## • `` -> `...56`
## • `` -> `...57`
## • `` -> `...58`
## • `` -> `...59`
## • `` -> `...60`
## • `` -> `...61`
## • `` -> `...62`
## • `` -> `...63`
## • `` -> `...64`
## • `` -> `...65`
## • `` -> `...66`
## • `` -> `...67`
## • `` -> `...68`
## • `` -> `...69`
## • `` -> `...70`
## • `` -> `...71`
## • `` -> `...72`
## • `` -> `...73`
## • `` -> `...74`
## • `` -> `...75`
## • `` -> `...76`
## • `` -> `...77`
## • `` -> `...78`
## • `` -> `...79`
## • `` -> `...80`
## • `` -> `...81`
## • `` -> `...82`
## • `` -> `...83`
## • `` -> `...84`
## • `` -> `...85`
## • `` -> `...86`
## • `` -> `...87`
## • `` -> `...88`
## • `` -> `...89`

Cleaning the data

Demodata <- dt1[-1:-5,]

Psydata <- dt3[-1:-5,-64:-89]

Fixing the issue with labels

names(Demodata) <- as.character(unlist(Demodata[1, ]))

Demodata <- Demodata[-1,]

names(Psydata) <- as.character(unlist(Psydata[1,]))
Psydata <- Psydata[-1,]

Question 6 Conduct cross tabulation analysis

# Cross-tabulation for gender 
table_gender <- table(Demodata$`Preference Group`, Demodata$Gender)

Chisq_gender <- chisq.test(table_gender)

# Output the cross-tabulation
print("Cross-Tabulation for Gender:")

## [1] "Cross-Tabulation for Gender:"

print(table_gender)

##    
##      1  2
##   1 54 62
##   2 36 36
##   3 40 22

# Output chi-squared test result for gender
print("Chi-Squared Test Result for Gender:")

## [1] "Chi-Squared Test Result for Gender:"

print(Chisq_gender)

## 
##  Pearson's Chi-squared test
## 
## data:  table_gender
## X-squared = 5.3861, df = 2, p-value = 0.06767

Demodata$`Age Category` <- as.factor(Demodata$`Age Category`)
Demodata$Gender <- as.factor(Demodata$Gender)
Demodata$`Marital Status` <- as.factor(Demodata$`Marital Status`)
Demodata$`1st Time Purchase` <- as.factor(Demodata$`1st Time Purchase`)
Demodata$`Children Category` <- as.factor(Demodata$`Children Category`)
Demodata$`Income Category` <- as.factor(Demodata$`Income Category`)

# Cross-tabulation for Age 
table_age <- table(Demodata$`Preference Group`, Demodata$`Age Category`)

Chisq_age <- chisq.test(table_age)

# Output the cross-tabulation
print("Cross-Tabulation for Age Category:")

## [1] "Cross-Tabulation for Age Category:"

print(table_age)

##    
##      1  2  3  4  5  6
##   1 10 18 23 11 36 18
##   2  3 13 12 11 15 18
##   3 11 12 12  9 12  6

# Output chi-squared test result for Age
print("Chi-Squared Test Result for Age Category:")

## [1] "Chi-Squared Test Result for Age Category:"

print(Chisq_age)

## 
##  Pearson's Chi-squared test
## 
## data:  table_age
## X-squared = 16.571, df = 10, p-value = 0.08442

# Cross-tabulation for  
table_MS <- table(Demodata$`Preference Group`, Demodata$`Marital Status`)

Chisq_MS <- chisq.test(table_MS)

# Output the cross-tabulation
print("Cross-Tabulation for Marital Status:")

## [1] "Cross-Tabulation for Marital Status:"

print(table_MS)

##    
##      1  2  3
##   1 66 14 36
##   2 34  6 32
##   3 27  8 27

# Output chi-squared test result for MS
print("Chi-Squared Test Result for Marital Status:")

## [1] "Chi-Squared Test Result for Marital Status:"

print(Chisq_MS)

## 
##  Pearson's Chi-squared test
## 
## data:  table_MS
## X-squared = 5.2093, df = 4, p-value = 0.2665

# Cross-tabulation for Number of Children category 
table_NC <- table(Demodata$`Preference Group`, Demodata$`Children Category`)

Chisq_NC <- chisq.test(table_NC)

# Output the cross-tabulation
print("Cross-Tabulation for Children Category:")

## [1] "Cross-Tabulation for Children Category:"

print(table_NC)

##    
##      0  1  2
##   1 62 29 25
##   2 45 12 15
##   3 41  7 14

# Output chi-squared test result for NC
print("Chi-Squared Test Result for Children Category:")

## [1] "Chi-Squared Test Result for Children Category:"

print(Chisq_NC)

## 
##  Pearson's Chi-squared test
## 
## data:  table_NC
## X-squared = 5.6242, df = 4, p-value = 0.229

# Cross-tabulation for first time purchase 
table_ftp <- table(Demodata$`Preference Group`, Demodata$`1st Time Purchase`)

Chisq_ftp <- chisq.test(table_ftp)

# Output the cross-tabulation
print("Cross-Tabulation for first time purchase:")

## [1] "Cross-Tabulation for first time purchase:"

print(table_ftp)

##    
##       1   2
##   1  13 103
##   2   8  64
##   3  16  46

# Output chi-squared test result for first time purchase 
print("Chi-Squared Test Result for first time purchase:")

## [1] "Chi-Squared Test Result for first time purchase:"

print(Chisq_ftp)

## 
##  Pearson's Chi-squared test
## 
## data:  table_ftp
## X-squared = 7.9211, df = 2, p-value = 0.01905

# Cross-tabulation for Income category
table_ic <- table(Demodata$`Preference Group`, Demodata$`Income Category`)

Chisq_ic <- chisq.test(table_ic)

# Output the cross-tabulation
print("Cross-Tabulation for Income category:")

## [1] "Cross-Tabulation for Income category:"

print(table_ic)

##    
##      1  2  3  4  5  6
##   1 11 19 18 19 28 21
##   2  5 15 16 16 12  8
##   3  7 12 12 11 11  9

# Output chi-squared test result for gender
print("Chi-Squared Test Result for Income category:")

## [1] "Chi-Squared Test Result for Income category:"

print(Chisq_ic)

## 
##  Pearson's Chi-squared test
## 
## data:  table_ic
## X-squared = 6.148, df = 10, p-value = 0.8027

library(nnet)

Multinomial Logit analysis:

model <- multinom(`Preference Group` ~ Gender + `Marital Status` + `1st Time Purchase` + `Age Category` + `Children Category` + `Income Category`, data = Demodata)

## # weights:  54 (34 variable)
## initial  value 274.653072 
## iter  10 value 242.348533
## iter  20 value 242.121883
## final  value 242.120989 
## converged

summary(model)

## Call:
## multinom(formula = `Preference Group` ~ Gender + `Marital Status` + 
##     `1st Time Purchase` + `Age Category` + `Children Category` + 
##     `Income Category`, data = Demodata)
## 
## Coefficients:
##   (Intercept)    Gender2 `Marital Status`2 `Marital Status`3
## 2   -1.206737 -0.1832794        -0.3994818         0.4340006
## 3    1.010198 -0.9089321         0.1019688         0.4504932
##   `1st Time Purchase`2 `Age Category`2 `Age Category`3 `Age Category`4
## 2           -0.3207994       0.9932451       0.7085791      1.31314357
## 3           -1.2400532      -0.1803708      -0.4817254     -0.02764787
##   `Age Category`5 `Age Category`6 `Children Category`1 `Children Category`2
## 2       0.2918722       1.2510204           -0.5615958          -0.16765720
## 3      -1.0987485      -0.9952638           -1.0295955           0.08703588
##   `Income Category`2 `Income Category`3 `Income Category`4 `Income Category`5
## 2          0.6915027          0.8019001          0.6306727       -0.002483459
## 3          0.6386279          0.4871914          0.5047717        0.039813363
##   `Income Category`6
## 2        -0.36476058
## 3         0.04378515
## 
## Std. Errors:
##   (Intercept)   Gender2 `Marital Status`2 `Marital Status`3
## 2   0.9314591 0.3266593         0.5606070         0.3408875
## 3   0.7762766 0.3598778         0.5370417         0.3722750
##   `1st Time Purchase`2 `Age Category`2 `Age Category`3 `Age Category`4
## 2            0.5138924       0.7711673       0.7729856       0.8116159
## 3            0.4763903       0.6253002       0.6233675       0.6832566
##   `Age Category`5 `Age Category`6 `Children Category`1 `Children Category`2
## 2       0.7513390       0.7734935            0.4180463            0.4064363
## 3       0.6063919       0.7057009            0.5046900            0.4296165
##   `Income Category`2 `Income Category`3 `Income Category`4 `Income Category`5
## 2          0.6766709          0.6756445          0.6732419          0.6768921
## 3          0.6883135          0.6908293          0.6917375          0.6855149
##   `Income Category`6
## 2          0.7164009
## 3          0.6986548
## 
## Residual Deviance: 484.242 
## AIC: 552.242

library(car)

## Loading required package: carData

## 
## Attaching package: 'car'

## The following object is masked from 'package:psych':
## 
##     logit

## The following object is masked from 'package:dplyr':
## 
##     recode

## The following object is masked from 'package:purrr':
## 
##     some

Anova(model)

## Analysis of Deviance Table (Type II tests)
## 
## Response: Preference Group
##                     LR Chisq Df Pr(>Chisq)  
## Gender                6.8374  2    0.03275 *
## `Marital Status`      3.5588  4    0.46900  
## `1st Time Purchase`   7.1211  2    0.02842 *
## `Age Category`       14.7059 10    0.14316  
## `Children Category`   5.7687  4    0.21710  
## `Income Category`     7.8519 10    0.64330  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Cluster analysis:

library(cluster)

# Try out different numbers of clusters
for(k in 2:5) {
  print(paste("K-means with", k, "clusters"))
  model2 <- kmeans(Psydata, centers = k, nstart = 25)
  print(model2$size) 
}

## [1] "K-means with 2 clusters"
## [1] 125 125
## [1] "K-means with 3 clusters"
## [1] 84 83 83
## [1] "K-means with 4 clusters"
## [1] 62 62 63 63
## [1] "K-means with 5 clusters"
## [1] 49 51 49 52 49

sil_width <- silhouette(model2$cluster, dist(Psydata))
summary(sil_width)

## Silhouette of 250 units in 5 clusters from silhouette.default(x = model2$cluster, dist = dist(Psydata)) :
##  Cluster sizes and average silhouette widths:
##        49        51        49        52        49 
## 0.4973565 0.3817432 0.3811023 0.5026281 0.3769831 
## Individual silhouette widths:
##       Min.    1st Qu.     Median       Mean    3rd Qu.       Max. 
## -0.0004548  0.2989258  0.4910947  0.4284889  0.5816374  0.6487175

# K-means for 3 clusters
model_3 <- kmeans(Psydata, centers = 3, nstart = 25)

# Compute distance matrix
distance_matrix <- dist(Psydata)

# Silhouette analysis for 3 clusters
silhouette_3 <- silhouette(model_3$cluster, distance_matrix)

# Summary for 3 clusters
summary(silhouette_3)

## Silhouette of 250 units in 3 clusters from silhouette.default(x = model_3$cluster, dist = distance_matrix) :
##  Cluster sizes and average silhouette widths:
##        83        83        84 
## 0.5651368 0.5646827 0.4397129 
## Individual silhouette widths:
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## 0.005791 0.408340 0.615206 0.522844 0.670733 0.706537

# K-means for 4 clusters
model_4 <- kmeans(Psydata, centers = 4, nstart = 25)

# Silhouette analysis for 4 clusters
silhouette_4 <- silhouette(model_4$cluster, distance_matrix)

# Summary for 4 clusters
summary(silhouette_4)

## Silhouette of 250 units in 4 clusters from silhouette.default(x = model_4$cluster, dist = distance_matrix) :
##  Cluster sizes and average silhouette widths:
##        63        62        62        63 
## 0.5336025 0.4146281 0.5326654 0.4064599 
## Individual silhouette widths:
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## 0.004457 0.339388 0.541374 0.471824 0.627521 0.677918

# K-means for 2 clusters
model_6 <- kmeans(Psydata, centers = 2, nstart = 25)

silhouette_6 <- silhouette(model_6$cluster, distance_matrix)

summary(silhouette_6)

## Silhouette of 250 units in 2 clusters from silhouette.default(x = model_6$cluster, dist = distance_matrix) :
##  Cluster sizes and average silhouette widths:
##       125       125 
## 0.5904591 0.5899188 
## Individual silhouette widths:
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.00581 0.53618 0.67637 0.59019 0.71517 0.73088

Psydata$cluster <- model_6$cluster

Demodata$cluster <- model_6$cluster

# Cross-tabulation for Age 
table_genderc <- table(Demodata$cluster, Demodata$Gender)
# Output the cross-tabulation
print("Cross-Tabulation for Gender + Cluster:")

## [1] "Cross-Tabulation for Gender + Cluster:"

print(table_genderc)

##    
##      1  2
##   1 61 64
##   2 69 56

# Cross-tabulation for Age 
table_MSC <- table(Demodata$cluster, Demodata$`Marital Status`)
# Output the cross-tabulation
print("Cross-Tabulation for Marital Status + Cluster:")

## [1] "Cross-Tabulation for Marital Status + Cluster:"

print(table_MSC)

##    
##      1  2  3
##   1 63 12 50
##   2 64 16 45

# Cross-tabulation for Age 
table_FTPC <- table(Demodata$cluster, Demodata$`1st Time Purchase`)
# Output the cross-tabulation
print("Cross-Tabulation for First time purchase + Cluster:")

## [1] "Cross-Tabulation for First time purchase + Cluster:"

print(table_FTPC)

##    
##       1   2
##   1  16 109
##   2  21 104

# Cross-tabulation for Age 
table_Agec <- table(Demodata$cluster, Demodata$`Age Category`)
# Output the cross-tabulation
print("Cross-Tabulation for Age category + Cluster:")

## [1] "Cross-Tabulation for Age category + Cluster:"

print(table_Agec)

##    
##      1  2  3  4  5  6
##   1  9 21 27 11 35 22
##   2 15 22 20 20 28 20

# Cross-tabulation for Age 
table_CCC <- table(Demodata$cluster, Demodata$`Children Category`)
# Output the cross-tabulation
print("Cross-Tabulation for Children category + Cluster:")

## [1] "Cross-Tabulation for Children category + Cluster:"

print(table_CCC)

##    
##      0  1  2
##   1 68 25 32
##   2 80 23 22

# Cross-tabulation for Age 
table_ICC <- table(Demodata$cluster, Demodata$`Income Category`)
# Output the cross-tabulation
print("Cross-Tabulation for Income category + Cluster:")

## [1] "Cross-Tabulation for Income category + Cluster:"

print(table_ICC)

##    
##      1  2  3  4  5  6
##   1 11 22 24 22 28 18
##   2 12 24 22 24 23 20

Midterm Market Research

Diego De Armas

2024-03-26

Read the data for the Demographic data

Read the Data For Psychographic data

Cleaning the data

Fixing the issue with labels

Question 6 Conduct cross tabulation analysis

Multinomial Logit analysis:

Cluster analysis: