Install/Load PackAges
# Load the packages needed for data cleaning, analysis, and visualization
library(tidyverse) # Core data wrangling and plotting tools (dplyr, ggplot2, etc.)
library(janitor) # Helpful tools for cleaning data and summarizing missingness
library(psych) # Descriptive statistics and scale-related functions
library(patchwork) # Combine multiple ggplots into one figure
library(naniar) # Missing data summaries and visualization
library(stringr) # cleaning messy text data
library(forcats) # working with categorical variables
library(purrr) # doing things repeatedly without loops
library(zipcodeR) # turn zip codes into locations
library(maps) # draw the U.S. map
library(Hmisc) # for correlations and significance tests
Load the dataset
# Set working directory
setwd("/Users/maya/Desktop")
# Read the CSV (prevents unwanted factor conversion)
df <- read.csv("MKTG 4450.csv")
head(df)
names(df)
[1] "Start.Date" "EndDate" "Status"
[4] "IPAddress" "Progress" "Duration..in.seconds."
[7] "Finished" "RecordedDate" "ResponseId"
[10] "UserLanguage" "purchase.intention" "brand.attitude_NPS_GROUP"
[13] "brand.attitude" "seller.trust" "fairness_1"
[16] "fairness_2" "quality.perception" "perceived.benefits"
[19] "seller.control" "Attention.check" "Year.of.Birth"
[22] "EDU" "Ethnicity" "Ethnicity1"
[25] "Ethnicity2" "Ethnicity3" "Ethnicity4"
[28] "Ethnicity5" "Ethnicity6" "Ethnicity7"
[31] "Gender" "Income" "Zip.code"
[34] "open.comments" "Condition" "IV1_price_format"
[37] "IV2_Surcharge_type"
# Check variable names (the name of each column in your cleaned excel dataset)
names(df)
[1] "Start.Date" "EndDate" "Status"
[4] "IPAddress" "Progress" "Duration..in.seconds."
[7] "Finished" "RecordedDate" "ResponseId"
[10] "UserLanguage" "purchase.intention" "brand.attitude_NPS_GROUP"
[13] "brand.attitude" "seller.trust" "fairness_1"
[16] "fairness_2" "quality.perception" "perceived.benefits"
[19] "seller.control" "Attention.check" "Year.of.Birth"
[22] "EDU" "Ethnicity" "Ethnicity1"
[25] "Ethnicity2" "Ethnicity3" "Ethnicity4"
[28] "Ethnicity5" "Ethnicity6" "Ethnicity7"
[31] "Gender" "Income" "Zip.code"
[34] "open.comments" "Condition" "IV1_price_format"
[37] "IV2_Surcharge_type"
Check for missing values (nice to better understand the data, not
necessary since we’ve cleaned up the dataset)
# Missing data overview
colSums(is.na(df)) # Count how many missing values for each variable
Start.Date EndDate Status
0 0 0
IPAddress Progress Duration..in.seconds.
0 0 0
Finished RecordedDate ResponseId
0 0 0
UserLanguage purchase.intention brand.attitude_NPS_GROUP
0 0 0
brand.attitude seller.trust fairness_1
0 0 0
fairness_2 quality.perception perceived.benefits
0 0 0
seller.control Attention.check Year.of.Birth
0 0 0
EDU Ethnicity Ethnicity1
0 0 0
Ethnicity2 Ethnicity3 Ethnicity4
0 0 0
Ethnicity5 Ethnicity6 Ethnicity7
0 0 0
Gender Income Zip.code
0 0 0
open.comments Condition IV1_price_format
12 0 0
IV2_Surcharge_type
0
miss_var_summary(df) # A cleaner summary of missing values by variable
Recorde variaboes, if necessary
# fairness_1 and fairness_2 are supposed to measure the same idea,
# but fairness_2 is worded/scaled in the opposite direction.
# fairness_1: 1 = very unfairly, 7 = very fairly; fairness_2: 1 = extremely fairly, 7= extremely unfairly; This is why we need to reverse code fairness_2 to make the scale consistent with fairness_1
df$fairness_2_reverse <- 8 - df$fairness_2
df$fairness <- (df$fairness_1 + df$fairness_2_reverse)/2 # Create a composite fairness score by averaging the two fairness items
# Age: We measured year of birth, and we want to turn it into participants' age for easier interpretation
df$Year.of.Birth <- as.numeric(df$Year.of.Birth) # Convert Year.of.Birth to numeric in case it was imported as text
df$Age <- 2026 - df$Year.of.Birth
Prepare variables
Goal: Make sure your IVs are categorical and DVs are numerical (the
most common type of DVs; for linear regression) or categorical (for chi
square test or logistic regression)
# Make sure IVs are categorical factors. as.factor() tells R that these are categorical variables
df$IV1_price_format <- as.factor(df$IV1_price_format) # Categorical
df$IV2_surcharge_type <- as.factor(df$IV2_surcharge_type) # Categorical
# Make sure (most of the) DVs and covariates are numerical values
#DVs
df$purchase.intention <- as.numeric(df$purchase.intention) # Numeric
df$brand.attitude_NPS_GROUP <- as.factor(df$brand.attitude_NPS_GROUP) # Categorical
df$brand.attitude <- as.numeric(df$brand.attitude) # Numeric
df$seller.trust <- as.numeric(df$seller.trust) # Numeric
df$fairness <- as.numeric(df$fairness) # Numeric
# Covariates
df$quality.perception <- as.numeric(df$quality.perception) # Numeric
df$perceived.benefits <- as.numeric(df$perceived.benefits) # Numeric
df$seller.control <- as.numeric(df$seller.control) # Numeric
df$surcharge.novelty <- as.numeric(df$surcharge.novelty) # Numeric
a) Explore demographics
Goal: To understand who is in our data (their demographics)
# Make sure Age is numeric
df$Age <- as.numeric(df$Age)
# Age: summarize # of valid cases, mean, SD, min, median, and max (na.rm = TRUE: NAs are removed before calculation)
df %>%
summarise(
n_nonmissing = sum(!is.na(Age)),
mean = mean(Age, na.rm = TRUE),
sd = sd(Age, na.rm = TRUE),
min = min(Age, na.rm = TRUE),
median = median(Age, na.rm = TRUE),
max = max(Age, na.rm = TRUE)
)
# Age histogram
ggplot(df, aes(x = Age)) +
geom_histogram(
binwidth = 5, # Each bar covers a 5-year range
fill = "#7A0019", # Bar color
color = "white" # White border around bars
) +
labs(
title = "Age Distribution",
subtitle = "Sample Characteristics",
x = "Age (years)",
y = "Number of Participants"
) +
theme_minimal(base_size = 14) +
theme(
plot.title = element_text(hjust = 0.5, face = "bold"),
plot.subtitle = element_text(hjust = 0.5),
panel.grid.major.x = element_blank(),
panel.grid.minor = element_blank()
)
# Gender distribution: frequency table and proportions (since Gender is a categorical variable)
df %>%
count(Gender) %>%
mutate(prop = n / sum(n))
# Bar chart for gender counts
df %>%
count(Gender) %>%
ggplot(aes(x = fct_reorder(Gender, n), y = n)) +
geom_col() +
coord_flip() +
labs(
title = "Gender",
x = NULL,
y = "Count"
) +
theme_minimal()
# Education distribution: counts and proportions
df %>%
count(EDU) %>%
mutate(prop = n / sum(n)) %>%
arrange(desc(n))
# Bar chart for education counts
df %>%
count(EDU) %>%
ggplot(aes(x = fct_reorder(EDU, n), y = n)) +
geom_col() +
coord_flip() +
labs(
title = "Education",
x = NULL,
y = "Count"
) +
theme_minimal()
# Income distribution: counts and percentages
df %>%
count(Income) %>%
mutate(percent = round(100 * n / sum(n), 1))
# Bar chart of income categories
df %>%
count(Income) %>%
ggplot(aes(x = Income, y = n)) +
geom_col() +
coord_flip() +
labs(
title = "Income",
x = NULL,
y = "Count"
) +
theme_minimal()
# Zip code: count how many zip codes are present and the number of unique zip codes
df %>%
summarise(
n_zip_nonmissing = sum(!is.na(Zip.code) & Zip.code != ""),
n_unique_zip = n_distinct(Zip.code[!is.na(Zip.code) & Zip.code != ""])
)
# Show the 15 most common zip codes in the dataset
df %>%
count(Zip.code, sort = TRUE) %>%
filter(!is.na(Zip.code), Zip.code != "") %>%
slice_head(n = 15)
# Plot zip codes
# 1) Clean zip codes
zip_counts <- df %>%
filter(!is.na(Zip.code), Zip.code != "") %>% # Keep only non-missing zip codes
mutate(
Zip.code = str_extract(as.character(Zip.code), "\\d{5}"), # Exact 5-digit zip code
Zip.code = str_pad(Zip.code, width = 5, side = "left", pad = "0") # Add leading zeros if needed
) %>%
filter(!is.na(Zip.code)) %>%
count(Zip.code, sort = TRUE) # Count participants per zip code
# 2) Look up zip metadata (lat/lng) with reverse_zipcode()
zip_lookup <- map_dfr(zip_counts$Zip.code, ~{
out <- tryCatch(
reverse_zipcode(.x),
error = function(e) NULL
)
out
})
# 3) Keep only the columns needed for mapping
zip_lookup <- zip_lookup %>%
select(zipcode, lat, lng, state, major_city)
# 4) Merge counts with coordinates
df_map <- zip_counts %>%
left_join(zip_lookup, by = c("Zip.code" = "zipcode"))
# 5) Optional: check zips that did not match
df_map %>%
filter(is.na(lat) | is.na(lng))
# 6) US base map
us_map <- map_data("state")
# 7) Plot
ggplot() +
geom_polygon(
data = us_map,
aes(x = long, y = lat, group = group),
fill = "gray95",
color = "white",
linewidth = 0.2
) +
geom_point(
data = df_map,
aes(x = lng, y = lat, size = n, color = n),
alpha = 0.75
) +
scale_size(range = c(2, 8)) +
scale_color_viridis_c() +
coord_fixed(1.3) +
labs(
title = "Participant ZIP Codes Across the United States",
x = NULL,
y = NULL,
size = "Count",
color = "Count"
) +
theme_minimal()
b) Distributions of covariates
Goal: To understand who is in our data (their consumption
tendencies)
# Overall descriptives for covariates
covariates <- df %>% # Create a new data frame with only four covariate variables
select(
quality.perception,
perceived.benefits,
seller.control,
surcharge.novelty,
)
# Get descriptive stats for these covariates
describe(covariates)
b) Distributions of covariates (Visualizing the distributions)
Goal: Plot histograms and explore variables individually
# Histogram: quality.perception
p6 <- ggplot(df, aes(x = quality.perception)) + # initiate ggplot using dataset "df" and use quality.perception as the x-axis
geom_histogram(
binwidth = 1, # each bar represents a width of 1 unit (this fits interval/ratio scales)
boundary = 0.5, # make bars centered on integers
fill = "#7A0019", # set bar fill color
color = "white" # set border color of bars
) +
geom_vline( # add a vertical line
xintercept = mean(df$quality.perception, na.rm = TRUE), # position line at the mean
linetype = "dashed",
linewidth = 1
) +
labs(
title = "Distribution of Quality Perception", # add labels and titles
subtitle = "Dashed line = Mean",
x = "PQuality Perception (1–7 scale)",
y = "Number of Participants"
) +
theme_minimal(base_size = 14) + # apply a clean minimal theme with base front size 14
theme(
plot.title = element_text(hjust = 0.5, face = "bold"), # center and bold the title
plot.subtitle = element_text(hjust = 0.5), # center the subtitle
panel.grid.major.x = element_blank(), # remove vertical major grid lines
panel.grid.minor = element_blank() # remove all minor grid lines
)
# Histogram: perceived benefits
p7 <- ggplot(df, aes(x = perceived.benefits)) +
geom_histogram(
binwidth = 1,
boundary = 0.5,
fill = "#7A0019",
color = "white"
) +
geom_vline(
xintercept = mean(df$perceived.benefits, na.rm = TRUE),
linetype = "dashed",
linewidth = 1
) +
labs(
title = "Distribution of Perceived Benefits",
subtitle = "Dashed line = Mean",
x = "Perceived benefits (1–5 scale)",
y = "Number of Participants"
) +
theme_minimal(base_size = 14) +
theme(
plot.title = element_text(hjust = 0.5, face = "bold"),
plot.subtitle = element_text(hjust = 0.5),
panel.grid.major.x = element_blank(),
panel.grid.minor = element_blank()
)
# Histogram: seller control
p8 <- ggplot(df, aes(x = seller.control)) +
geom_histogram(
binwidth = 1,
boundary = 0.5,
fill = "#7A0019",
color = "white"
) +
geom_vline(
xintercept = mean(df$seller.control, na.rm = TRUE),
linetype = "dashed",
linewidth = 1
) +
labs(
title = "Distribution of Seller Control",
subtitle = "Dashed line = Mean",
x = "Seller Control (1–7 scale)",
y = "Number of Participants"
) +
theme_minimal(base_size = 14) +
theme(
plot.title = element_text(hjust = 0.5, face = "bold"),
plot.subtitle = element_text(hjust = 0.5),
panel.grid.major.x = element_blank(),
panel.grid.minor = element_blank()
)
# Histogram: surcharge novelty
p9 <- ggplot(df, aes(x = surcharge.novelty)) +
geom_histogram(
binwidth = 1,
boundary = 0.5,
fill = "#7A0019",
color = "white"
) +
geom_vline(
xintercept = mean(df$surcharge.novelty, na.rm = TRUE),
linetype = "dashed",
linewidth = 1
) +
labs(
title = "Distribution of Surcharge Novelty",
subtitle = "Dashed line = Mean",
x = "Surcharge Novelty (1–7 scale)",
y = "Number of Participants"
) +
theme_minimal(base_size = 14) +
theme(
plot.title = element_text(hjust = 0.5, face = "bold"),
plot.subtitle = element_text(hjust = 0.5),
panel.grid.major.x = element_blank(),
panel.grid.minor = element_blank()
)
p6 + p7 + p8 + p9 # Option#1: Combine the four plots into one figure
p6 # Option#2: Plot the four plots on separate pages
p7
p7
p9
Overall descriptive stats
Goal: Check if the randomization is successful (the number of
participants should be largely the same across conditions)
# Count number of participants in each 2 x 2 experimental cell
df %>%
count(IV1_price_format, IV2_surcharge_type)
Variable distributions
# Overall descriptives for DVs
DVs <- df %>%
select(
purchase.intention,
brand.attitude,
seller.trust,
fairness
)
describe(DVs)
Distributions of DVs (Visualizing the distributions)
Goal: Plot histograms and explore variables individually
# Histogram: purchase intention
p1 <- ggplot(df, aes(x = purchase.intention)) +
geom_histogram(
binwidth = 1,
boundary = 0.5,
fill = "#7A0019",
color = "white"
) +
geom_vline(
xintercept = mean(df$purchase.intention, na.rm = TRUE),
linetype = "dashed",
linewidth = 1
) +
labs(
title = "Distribution of Purchase Intention",
subtitle = "Dashed line = Mean",
x = "Purchase Intention (1–7 scale)",
y = "Number of Participants"
) +
theme_minimal(base_size = 14) +
theme(
plot.title = element_text(hjust = 0.5, face = "bold"),
plot.subtitle = element_text(hjust = 0.5),
panel.grid.major.x = element_blank(),
panel.grid.minor = element_blank()
)
# Histogram: brand attitude
p2 <- ggplot(df, aes(x = brand.attitude)) +
geom_histogram(
binwidth = 1,
boundary = 0.5,
fill = "#7A0019",
color = "white"
) +
geom_vline(
xintercept = mean(df$brand.attitude, na.rm = TRUE),
linetype = "dashed",
linewidth = 1
) +
labs(
title = "Distribution of Brand Attitude",
subtitle = "Dashed line = Mean",
x = "Brand Attitude (1–10 scale)",
y = "Number of Participants"
) +
theme_minimal(base_size = 14) +
theme(
plot.title = element_text(hjust = 0.5, face = "bold"),
plot.subtitle = element_text(hjust = 0.5),
panel.grid.major.x = element_blank(),
panel.grid.minor = element_blank()
)
# Bar chart, brand attitude (NPS)
p3 <- ggplot(df, aes(x = brand.attitude_NPS_GROUP)) +
geom_bar(fill = "#7A0019") +
labs(
title = "Distribution of NPS Groups",
x = "NPS Group",
y = "Number of Participants"
) +
theme_minimal(base_size = 14) +
theme(
plot.title = element_text(hjust = 0.5, face = "bold"),
panel.grid.major.x = element_blank(),
panel.grid.minor = element_blank()
)
# Histogram: seller trust
p4 <- ggplot(df, aes(x = seller.trust)) +
geom_histogram(
binwidth = 1,
boundary = 0.5,
fill = "#7A0019",
color = "white"
) +
geom_vline(
xintercept = mean(df$seller.trust, na.rm = TRUE),
linetype = "dashed",
linewidth = 1
) +
labs(
title = "Distribution of Seller Trust",
subtitle = "Dashed line = Mean",
x = "Seller trust (1–7 scale)",
y = "Number of Participants"
) +
theme_minimal(base_size = 14) +
theme(
plot.title = element_text(hjust = 0.5, face = "bold"),
plot.subtitle = element_text(hjust = 0.5),
panel.grid.major.x = element_blank(),
panel.grid.minor = element_blank()
)
# Histogram: fairness
p5 <- ggplot(df, aes(x = fairness)) +
geom_histogram(
binwidth = 1,
boundary = 0.5,
fill = "#7A0019",
color = "white"
) +
geom_vline(
xintercept = mean(df$fairness, na.rm = TRUE),
linetype = "dashed",
linewidth = 1
) +
labs(
title = "Distribution of Fairness",
subtitle = "Dashed line = Mean",
x = "Fairness (1–7 scale)",
y = "Number of Participants"
) +
theme_minimal(base_size = 14) +
theme(
plot.title = element_text(hjust = 0.5, face = "bold"),
plot.subtitle = element_text(hjust = 0.5),
panel.grid.major.x = element_blank(),
panel.grid.minor = element_blank()
)
p1
p2
p3
p4
p5
Scatterplots for relationships among DVs
Goal: Visually assess many variable relationships at once
# Plot 1: brand attitude by purchase intention
ggplot(df, aes(x = brand.attitude, y = purchase.intention)) + # aes(...) = aesthetics (tell RStudio what goes on axes). Here we are plotting brand attitude on the x-axis and purchase intention on the y-axis
geom_smooth(method = "lm", se = TRUE) + # We are using a linear regression model ("lm") to draw a best-fitting line between brand attitude and purchase intention
geom_jitter(width = 0.2, height = 0.2) + # this function helps us plot each participant as a dot
theme_minimal() + # Make the plot cleaner and easier to read
labs(title = "Brand Attitude and Purchase Intention",
x = "Brand Attitude", y = "Purchase Intention") # Add labels to the figure, x-axis, and y-axis
# Plot 2: seller trust by purchase intention
ggplot(df, aes(x = seller.trust, y = purchase.intention)) +
geom_smooth(method = "lm", se = TRUE) +
geom_jitter(width = 0.2, height = 0.2) +
theme_minimal() +
labs(title = "Seller Trust and Purchase Intention",
x = "Seller Trust", y = "Purchase Intention")
# Plot 3: fairness by purchase intention
ggplot(df, aes(x = fairness, y = purchase.intention)) +
geom_smooth(method = "lm", se = TRUE) +
geom_jitter(width = 0.2, height = 0.2) +
theme_minimal() +
labs(title = "Fairness and Purchase Intention",
x = "Perceived Fairness", y = "Purchase Intention")
# Plot 4: seller trust by brand attitude
ggplot(df, aes(x = seller.trust, y = brand.attitude)) +
geom_smooth(method = "lm", se = TRUE) +
geom_jitter(width = 0.2, height = 0.2) +
theme_minimal() +
labs(title = "Seller Trust and Brand Attitude",
x = "Seller Trust", y = "Brand Attitude")
# Plot 5: fairness by brand attitude
ggplot(df, aes(x = fairness, y = brand.attitude)) +
geom_smooth(method = "lm", se = TRUE) +
geom_jitter(width = 0.2, height = 0.2) +
theme_minimal() +
labs(title = "Fairness and Brand Attitude",
x = "Perceived Fairness", y = "Brand Attitude")
# Plot 6: fairness by seller trust
ggplot(df, aes(x = fairness, y = seller.trust)) +
geom_smooth(method = "lm", se = TRUE) +
geom_jitter(width = 0.2, height = 0.2) +
theme_minimal() +
labs(title = "Fairness and Seller Trust",
x = "Perceived Fairness", y = "Seller Trust")
Correlation matrix
# Select the variables to include in the correlation matrix
corr_vars <- df%>%
select(
purchase.intention,
brand.attitude,
seller.trust,
fairness,
quality.perception,
perceived.benefits,
seller.control,
surcharge.novelty
)
# Compute correlations and p-values
cor_results <- rcorr(as.matrix(corr_vars))
# Correlation matrix
cor_matrix <- cor_results$r
# P-value matrix
p_matrix <- cor_results$P
# View correlation matrix
round(cor_matrix, 2)
# Convert correlation matrix to long format
cor_df <- as.data.frame(cor_matrix) %>%
rownames_to_column("var1") %>%
pivot_longer(-var1, names_to = "var2", values_to = "correlation")
# Convert p-value matrix to long format
p_df <- as.data.frame(p_matrix) %>%
rownames_to_column("var1") %>%
pivot_longer(-var1, names_to = "var2", values_to = "p_value")
# Merge correlations and p-values
cor_df <- cor_df %>%
left_join(p_df, by = c("var1", "var2")) %>%
mutate(
sig = case_when(
p_value < .001 ~ "***",
p_value < .01 ~ "**",
p_value < .05 ~ "*",
TRUE ~ ""
)
)
# Plot the correlation matrix
ggplot(cor_df, aes(x = var1, y = var2)) +
geom_point(aes(size = abs(correlation), color = correlation)) +
geom_text(aes(label = paste0(round(correlation, 2), sig)), size = 3) +
scale_size(range = c(2, 8)) +
scale_color_gradient2(
low = "#B2182B",
mid = "white",
high = "#2166AC",
midpoint = 0
) +
coord_equal() +
labs(
title = "Correlation Matrix (with Significance)",
x = "",
y = "",
color = "Correlation",
size = "|r|"
) +
theme_minimal(base_size = 14) +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
panel.grid = element_blank(),
plot.title = element_text(hjust = 0.5, face = "bold")
)
# Your R code here
summary(cars)
plot(cars)
---
title: "Pricing project demo; data exploration; Spring 2026"
output: html_notebook
---

############################ 

# Install/Load PackAges

############################ 

```{r}

# Load the packages needed for data cleaning, analysis, and visualization
library(tidyverse) # Core data wrangling and plotting tools (dplyr, ggplot2, etc.)
library(janitor) # Helpful tools for cleaning data and summarizing missingness
library(psych) # Descriptive statistics and scale-related functions
library(patchwork) # Combine multiple ggplots into one figure
library(naniar)  # Missing data summaries and visualization

library(stringr) # cleaning messy text data
library(forcats) # working with categorical variables

library(purrr) # doing things repeatedly without loops
library(zipcodeR) # turn zip codes into locations
library(maps) # draw the U.S. map

library(Hmisc) # for correlations and significance tests
```

############################ 

# Load the dataset

############################ 

```{r}
# Set working directory
setwd("/Users/maya/Desktop")

# Read the CSV (prevents unwanted factor conversion)
df <- read.csv("MKTG 4450.csv")
head(df)
names(df)
```

```{r}
# Check variable names (the name of each column in your cleaned excel dataset)
names(df)
```

############################ 

# Prepare the data

############################ 

# Check for missing values (nice to better understand the data, not necessary since we've cleaned up the dataset)

```{r}
# Missing data overview
colSums(is.na(df)) # Count how many missing values for each variable
miss_var_summary(df) # A cleaner summary of missing values by variable
```

# Recorde variaboes, if necessary

```{r}
# fairness_1 and fairness_2 are supposed to measure the same idea,
# but fairness_2 is worded/scaled in the opposite direction.
# fairness_1: 1 = very unfairly, 7 = very fairly; fairness_2: 1 = extremely fairly, 7= extremely unfairly; This is why we need to reverse code fairness_2 to make the scale consistent with fairness_1
df$fairness_2_reverse <- 8 - df$fairness_2
df$fairness <- (df$fairness_1 + df$fairness_2_reverse)/2 # Create a composite fairness score by averaging the two fairness items

# Age: We measured year of birth, and we want to turn it into participants' age for easier interpretation
df$Year.of.Birth <- as.numeric(df$Year.of.Birth) # Convert Year.of.Birth to numeric in case it was imported as text
df$Age <- 2026 - df$Year.of.Birth
```

# Prepare variables

# Goal: Make sure your IVs are categorical and DVs are numerical (the most common type of DVs; for linear regression) or categorical (for chi square test or logistic regression)

```{r}
# Make sure IVs are categorical factors. as.factor() tells R that these are categorical variables
df$IV1_price_format <- as.factor(df$IV1_price_format) # Categorical
df$IV2_surcharge_type <- as.factor(df$IV2_surcharge_type) # Categorical
```

```{r}
# Make sure (most of the) DVs and covariates are numerical values
#DVs
df$purchase.intention <- as.numeric(df$purchase.intention) # Numeric
df$brand.attitude_NPS_GROUP <- as.factor(df$brand.attitude_NPS_GROUP) # Categorical
df$brand.attitude <- as.numeric(df$brand.attitude) # Numeric
df$seller.trust <- as.numeric(df$seller.trust) # Numeric
df$fairness <- as.numeric(df$fairness) # Numeric

# Covariates
df$quality.perception <- as.numeric(df$quality.perception) # Numeric
df$perceived.benefits <- as.numeric(df$perceived.benefits) # Numeric
df$seller.control <- as.numeric(df$seller.control) # Numeric
df$surcharge.novelty <- as.numeric(df$surcharge.novelty) # Numeric
```

############################ 

# Assess the sample

############################ 

# a) Explore demographics

# Goal: To understand who is in our data (their demographics)

```{r}
# Make sure Age is numeric
df$Age <- as.numeric(df$Age)

# Age: summarize # of valid cases, mean, SD, min, median, and max (na.rm = TRUE: NAs are removed before calculation)
df %>%
  summarise(
    n_nonmissing = sum(!is.na(Age)),
    mean = mean(Age, na.rm = TRUE),
    sd = sd(Age, na.rm = TRUE),
    min = min(Age, na.rm = TRUE),
    median = median(Age, na.rm = TRUE),
    max = max(Age, na.rm = TRUE)
  )

# Age histogram
ggplot(df, aes(x = Age)) +
  geom_histogram(
    binwidth = 5,       # Each bar covers a 5-year range
    fill = "#7A0019",   # Bar color
    color = "white"    # White border around bars
  ) +
  labs(
    title = "Age Distribution",
    subtitle = "Sample Characteristics",
    x = "Age (years)",
    y = "Number of Participants"
  ) +
  theme_minimal(base_size = 14) +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold"),
    plot.subtitle = element_text(hjust = 0.5),
    panel.grid.major.x = element_blank(),
    panel.grid.minor = element_blank()
  )
```

```{r}
# Gender distribution: frequency table and proportions (since Gender is a categorical variable)
df %>%
  count(Gender) %>%
  mutate(prop = n / sum(n))

# Bar chart for gender counts
df %>%
  count(Gender) %>%
  ggplot(aes(x = fct_reorder(Gender, n), y = n)) +
  geom_col() +
  coord_flip() +
  labs(
    title = "Gender",
    x = NULL,
    y = "Count"
  ) +
  theme_minimal()
```

```{r}
# Education distribution: counts and proportions
df %>%
  count(EDU) %>%
  mutate(prop = n / sum(n)) %>%
  arrange(desc(n))

# Bar chart for education counts
df %>%
  count(EDU) %>%
  ggplot(aes(x = fct_reorder(EDU, n), y = n)) +
  geom_col() +
  coord_flip() +
  labs(
    title = "Education",
    x = NULL,
    y = "Count"
  ) +
  theme_minimal()
```

```{r}
# Income distribution: counts and percentages
df %>%
  count(Income) %>%
  mutate(percent = round(100 * n / sum(n), 1))

# Bar chart of income categories
df %>%
  count(Income) %>%
  ggplot(aes(x = Income, y = n)) +
  geom_col() +
  coord_flip() +
  labs(
    title = "Income",
    x = NULL,
    y = "Count"
  ) +
  theme_minimal()
```

```{r}
# Zip code: count how many zip codes are present and the number of unique zip codes 
df %>%
  summarise(
    n_zip_nonmissing = sum(!is.na(Zip.code) & Zip.code != ""),
    n_unique_zip = n_distinct(Zip.code[!is.na(Zip.code) & Zip.code != ""])
  )

# Show the 15 most common zip codes in the dataset
df %>%
  count(Zip.code, sort = TRUE) %>%
  filter(!is.na(Zip.code), Zip.code != "") %>%
  slice_head(n = 15)
```

```{r}
# Plot zip codes
# 1) Clean zip codes
zip_counts <- df %>%
  filter(!is.na(Zip.code), Zip.code != "") %>%   # Keep only non-missing zip codes
  mutate(
    Zip.code = str_extract(as.character(Zip.code), "\\d{5}"),  # Exact 5-digit zip code
    Zip.code = str_pad(Zip.code, width = 5, side = "left", pad = "0") # Add leading zeros if needed
  ) %>% 
  filter(!is.na(Zip.code)) %>%
  count(Zip.code, sort = TRUE) # Count participants per zip code

# 2) Look up zip metadata (lat/lng) with reverse_zipcode()
zip_lookup <- map_dfr(zip_counts$Zip.code, ~{
  out <- tryCatch(
    reverse_zipcode(.x),
    error = function(e) NULL
  )
  out
})

# 3) Keep only the columns needed for mapping
zip_lookup <- zip_lookup %>%
  select(zipcode, lat, lng, state, major_city)

# 4) Merge counts with coordinates
df_map <- zip_counts %>%
  left_join(zip_lookup, by = c("Zip.code" = "zipcode"))

# 5) Optional: check zips that did not match
df_map %>%
  filter(is.na(lat) | is.na(lng))

# 6) US base map
us_map <- map_data("state")

# 7) Plot
ggplot() +
  geom_polygon(
    data = us_map,
    aes(x = long, y = lat, group = group),
    fill = "gray95",
    color = "white",
    linewidth = 0.2
  ) +
  geom_point(
    data = df_map,
    aes(x = lng, y = lat, size = n, color = n),
    alpha = 0.75
  ) +
  scale_size(range = c(2, 8)) +
  scale_color_viridis_c() +
  coord_fixed(1.3) +
  labs(
    title = "Participant ZIP Codes Across the United States",
    x = NULL,
    y = NULL,
    size = "Count",
    color = "Count"
  ) +
  theme_minimal()
```

# b) Distributions of covariates

# Goal: To understand who is in our data (their consumption tendencies)

```{r}
# Overall descriptives for covariates
covariates <- df %>% # Create a new data frame with only four covariate variables
  select(
    quality.perception,
    perceived.benefits,
    seller.control,
    surcharge.novelty,
  )

# Get descriptive stats for these covariates
describe(covariates)
```

# b) Distributions of covariates (Visualizing the distributions)

# Goal: Plot histograms and explore variables individually

```{r}
# Histogram: quality.perception
p6 <- ggplot(df, aes(x = quality.perception)) + # initiate ggplot using dataset "df" and use quality.perception as the x-axis
  geom_histogram(
    binwidth = 1,    # each bar represents a width of 1 unit (this fits interval/ratio scales)
    boundary = 0.5,  # make bars centered on integers
    fill = "#7A0019", # set bar fill color
    color = "white"   # set border color of bars
  ) +
  geom_vline(  # add a vertical line
    xintercept = mean(df$quality.perception, na.rm = TRUE), # position line at the mean
    linetype = "dashed",
    linewidth = 1
  ) +
  labs(
    title = "Distribution of Quality Perception", # add labels and titles
    subtitle = "Dashed line = Mean",
    x = "PQuality Perception (1–7 scale)",
    y = "Number of Participants"
  ) +
  theme_minimal(base_size = 14) + # apply a clean minimal theme with base front size 14
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold"), # center and bold the title
    plot.subtitle = element_text(hjust = 0.5), # center the subtitle
    panel.grid.major.x = element_blank(), # remove vertical major grid lines
    panel.grid.minor = element_blank() # remove all minor grid lines
  )

# Histogram: perceived benefits
p7 <- ggplot(df, aes(x = perceived.benefits)) +
  geom_histogram(
    binwidth = 1, 
    boundary = 0.5,
    fill = "#7A0019",
    color = "white"
  ) +
  geom_vline(
    xintercept = mean(df$perceived.benefits, na.rm = TRUE),
    linetype = "dashed",
    linewidth = 1
  ) +
  labs(
    title = "Distribution of Perceived Benefits",
    subtitle = "Dashed line = Mean",
    x = "Perceived benefits (1–5 scale)",
    y = "Number of Participants"
  ) +
  theme_minimal(base_size = 14) +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold"),
    plot.subtitle = element_text(hjust = 0.5),
    panel.grid.major.x = element_blank(),
    panel.grid.minor = element_blank()
  )

# Histogram: seller control
p8 <- ggplot(df, aes(x = seller.control)) +
  geom_histogram(
    binwidth = 1, 
    boundary = 0.5,
    fill = "#7A0019",
    color = "white"
  ) +
  geom_vline(
    xintercept = mean(df$seller.control, na.rm = TRUE),
    linetype = "dashed",
    linewidth = 1
  ) +
  labs(
    title = "Distribution of Seller Control",
    subtitle = "Dashed line = Mean",
    x = "Seller Control (1–7 scale)",
    y = "Number of Participants"
  ) +
  theme_minimal(base_size = 14) +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold"),
    plot.subtitle = element_text(hjust = 0.5),
    panel.grid.major.x = element_blank(),
    panel.grid.minor = element_blank()
  )

# Histogram: surcharge novelty
p9 <- ggplot(df, aes(x = surcharge.novelty)) +
  geom_histogram(
    binwidth = 1, 
    boundary = 0.5,
    fill = "#7A0019",
    color = "white"
  ) +
  geom_vline(
    xintercept = mean(df$surcharge.novelty, na.rm = TRUE),
    linetype = "dashed",
    linewidth = 1
  ) +
  labs(
    title = "Distribution of Surcharge Novelty",
    subtitle = "Dashed line = Mean",
    x = "Surcharge Novelty (1–7 scale)",
    y = "Number of Participants"
  ) +
  theme_minimal(base_size = 14) +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold"),
    plot.subtitle = element_text(hjust = 0.5),
    panel.grid.major.x = element_blank(),
    panel.grid.minor = element_blank()
  )

p6 + p7 + p8 + p9 # Option#1: Combine the four plots into one figure
p6 # Option#2: Plot the four plots on separate pages
p7
p7
p9
```

############################ 

# Assess randomization

############################ 

# Overall descriptive stats

# Goal: Check if the randomization is successful (the number of participants should be largely the same across conditions)

```{r}
# Count number of participants in each 2 x 2 experimental cell
df %>%
  count(IV1_price_format, IV2_surcharge_type)
```

############################ 

# Variable distributions

############################ 

```{r}
# Overall descriptives for DVs
DVs <- df %>%
  select(
    purchase.intention,
    brand.attitude,
    seller.trust,
    fairness
  )

describe(DVs)
```

# Distributions of DVs (Visualizing the distributions)

# Goal: Plot histograms and explore variables individually

```{r}
# Histogram: purchase intention
p1 <- ggplot(df, aes(x = purchase.intention)) +
  geom_histogram(
    binwidth = 1, 
    boundary = 0.5,
    fill = "#7A0019",
    color = "white"
  ) +
  geom_vline(
    xintercept = mean(df$purchase.intention, na.rm = TRUE),
    linetype = "dashed",
    linewidth = 1
  ) +
  labs(
    title = "Distribution of Purchase Intention",
    subtitle = "Dashed line = Mean",
    x = "Purchase Intention (1–7 scale)",
    y = "Number of Participants"
  ) +
  theme_minimal(base_size = 14) +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold"),
    plot.subtitle = element_text(hjust = 0.5),
    panel.grid.major.x = element_blank(),
    panel.grid.minor = element_blank()
  )

# Histogram: brand attitude
p2 <- ggplot(df, aes(x = brand.attitude)) +
  geom_histogram(
    binwidth = 1, 
    boundary = 0.5,
    fill = "#7A0019",
    color = "white"
  ) +
  geom_vline(
    xintercept = mean(df$brand.attitude, na.rm = TRUE),
    linetype = "dashed",
    linewidth = 1
  ) +
  labs(
    title = "Distribution of Brand Attitude",
    subtitle = "Dashed line = Mean",
    x = "Brand Attitude (1–10 scale)",
    y = "Number of Participants"
  ) +
  theme_minimal(base_size = 14) +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold"),
    plot.subtitle = element_text(hjust = 0.5),
    panel.grid.major.x = element_blank(),
    panel.grid.minor = element_blank()
  )

# Bar chart, brand attitude (NPS)
p3 <- ggplot(df, aes(x = brand.attitude_NPS_GROUP)) +
  geom_bar(fill = "#7A0019") +
  labs(
    title = "Distribution of NPS Groups",
    x = "NPS Group",
    y = "Number of Participants"
  ) +
  theme_minimal(base_size = 14) +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold"),
    panel.grid.major.x = element_blank(),
    panel.grid.minor = element_blank()
  )

# Histogram: seller trust
p4 <- ggplot(df, aes(x = seller.trust)) +
  geom_histogram(
    binwidth = 1, 
    boundary = 0.5,
    fill = "#7A0019",
    color = "white"
  ) +
  geom_vline(
    xintercept = mean(df$seller.trust, na.rm = TRUE),
    linetype = "dashed",
    linewidth = 1
  ) +
  labs(
    title = "Distribution of Seller Trust",
    subtitle = "Dashed line = Mean",
    x = "Seller trust (1–7 scale)",
    y = "Number of Participants"
  ) +
  theme_minimal(base_size = 14) +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold"),
    plot.subtitle = element_text(hjust = 0.5),
    panel.grid.major.x = element_blank(),
    panel.grid.minor = element_blank()
  )

# Histogram: fairness
p5 <- ggplot(df, aes(x = fairness)) +
  geom_histogram(
    binwidth = 1, 
    boundary = 0.5,
    fill = "#7A0019",
    color = "white"
  ) +
  geom_vline(
    xintercept = mean(df$fairness, na.rm = TRUE),
    linetype = "dashed",
    linewidth = 1
  ) +
  labs(
    title = "Distribution of Fairness",
    subtitle = "Dashed line = Mean",
    x = "Fairness (1–7 scale)",
    y = "Number of Participants"
  ) +
  theme_minimal(base_size = 14) +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold"),
    plot.subtitle = element_text(hjust = 0.5),
    panel.grid.major.x = element_blank(),
    panel.grid.minor = element_blank()
  )

p1 
p2 
p3 
p4 
p5
```

############################ 

# Variable relationships

############################ 

# Scatterplots for relationships among DVs

# Goal: Visually assess many variable relationships at once

```{r}
# Plot 1: brand attitude by purchase intention
ggplot(df, aes(x = brand.attitude, y = purchase.intention)) + # aes(...) = aesthetics (tell RStudio what goes on axes). Here we are plotting brand attitude on the x-axis and purchase intention on the y-axis
  geom_smooth(method = "lm", se = TRUE) + # We are using a linear regression model ("lm") to draw a best-fitting line between brand attitude and purchase intention 
  geom_jitter(width = 0.2, height = 0.2) + # this function helps us plot each participant as a dot
  theme_minimal() + # Make the plot cleaner and easier to read
  labs(title = "Brand Attitude and Purchase Intention", 
       x = "Brand Attitude", y = "Purchase Intention") # Add labels to the figure, x-axis, and y-axis

# Plot 2: seller trust by purchase intention
ggplot(df, aes(x = seller.trust, y = purchase.intention)) +
  geom_smooth(method = "lm", se = TRUE) +
  geom_jitter(width = 0.2, height = 0.2) +
  theme_minimal() +
  labs(title = "Seller Trust and Purchase Intention",
       x = "Seller Trust", y = "Purchase Intention")

# Plot 3: fairness by purchase intention
ggplot(df, aes(x = fairness, y = purchase.intention)) +
  geom_smooth(method = "lm", se = TRUE) +
  geom_jitter(width = 0.2, height = 0.2) +
  theme_minimal() +
  labs(title = "Fairness and Purchase Intention",
       x = "Perceived Fairness", y = "Purchase Intention")

# Plot 4: seller trust by brand attitude
ggplot(df, aes(x = seller.trust, y = brand.attitude)) +
  geom_smooth(method = "lm", se = TRUE) +
  geom_jitter(width = 0.2, height = 0.2) +
  theme_minimal() +
  labs(title = "Seller Trust and Brand Attitude",
       x = "Seller Trust", y = "Brand Attitude")

# Plot 5: fairness by brand attitude
ggplot(df, aes(x = fairness, y = brand.attitude)) +
  geom_smooth(method = "lm", se = TRUE) +
  geom_jitter(width = 0.2, height = 0.2) +
  theme_minimal() +
  labs(title = "Fairness and Brand Attitude",
       x = "Perceived Fairness", y = "Brand Attitude")

# Plot 6: fairness by seller trust
ggplot(df, aes(x = fairness, y = seller.trust)) +
  geom_smooth(method = "lm", se = TRUE) +
  geom_jitter(width = 0.2, height = 0.2) +
  theme_minimal() +
  labs(title = "Fairness and Seller Trust",
       x = "Perceived Fairness", y = "Seller Trust")
```

# Correlation matrix

```{r}
# Select the variables to include in the correlation matrix
corr_vars <- df%>%
  select(
    purchase.intention,
    brand.attitude,
    seller.trust,
    fairness,
    quality.perception,
    perceived.benefits,
    seller.control,
    surcharge.novelty
  )

# Compute correlations and p-values
cor_results <- rcorr(as.matrix(corr_vars))

# Correlation matrix
cor_matrix <- cor_results$r

# P-value matrix
p_matrix <- cor_results$P

# View correlation matrix
round(cor_matrix, 2)
```

```{r}
# Convert correlation matrix to long format
cor_df <- as.data.frame(cor_matrix) %>%
  rownames_to_column("var1") %>%
  pivot_longer(-var1, names_to = "var2", values_to = "correlation")

# Convert p-value matrix to long format
p_df <- as.data.frame(p_matrix) %>%
  rownames_to_column("var1") %>%
  pivot_longer(-var1, names_to = "var2", values_to = "p_value")

# Merge correlations and p-values
cor_df <- cor_df %>%
  left_join(p_df, by = c("var1", "var2")) %>%
  mutate(
    sig = case_when(
      p_value < .001 ~ "***",
      p_value < .01  ~ "**",
      p_value < .05  ~ "*",
      TRUE ~ ""
    )
  )

# Plot the correlation matrix
ggplot(cor_df, aes(x = var1, y = var2)) +
  geom_point(aes(size = abs(correlation), color = correlation)) +
  geom_text(aes(label = paste0(round(correlation, 2), sig)), size = 3) +
  scale_size(range = c(2, 8)) +
  scale_color_gradient2(
    low = "#B2182B",
    mid = "white",
    high = "#2166AC",
    midpoint = 0
  ) +
  coord_equal() +
  labs(
    title = "Correlation Matrix (with Significance)",
    x = "",
    y = "",
    color = "Correlation",
    size = "|r|"
  ) +
  theme_minimal(base_size = 14) +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    panel.grid = element_blank(),
    plot.title = element_text(hjust = 0.5, face = "bold")
  )
```

```{r}
# Your R code here
summary(cars)
plot(cars)
```
