Install/Load PackAges


# Load the packages needed for data cleaning, analysis, and visualization
library(tidyverse) # Core data wrangling and plotting tools (dplyr, ggplot2, etc.)
library(janitor) # Helpful tools for cleaning data and summarizing missingness
library(psych) # Descriptive statistics and scale-related functions
library(patchwork) # Combine multiple ggplots into one figure
library(naniar)  # Missing data summaries and visualization

library(stringr) # cleaning messy text data
library(forcats) # working with categorical variables

library(purrr) # doing things repeatedly without loops
library(zipcodeR) # turn zip codes into locations
library(maps) # draw the U.S. map

library(Hmisc) # for correlations and significance tests

Load the dataset

# Set working directory
setwd("/Users/maya/Desktop")

# Read the CSV (prevents unwanted factor conversion)
df <- read.csv("MKTG 4450.csv")
head(df)
names(df)
 [1] "Start.Date"               "EndDate"                  "Status"                  
 [4] "IPAddress"                "Progress"                 "Duration..in.seconds."   
 [7] "Finished"                 "RecordedDate"             "ResponseId"              
[10] "UserLanguage"             "purchase.intention"       "brand.attitude_NPS_GROUP"
[13] "brand.attitude"           "seller.trust"             "fairness_1"              
[16] "fairness_2"               "quality.perception"       "perceived.benefits"      
[19] "seller.control"           "Attention.check"          "Year.of.Birth"           
[22] "EDU"                      "Ethnicity"                "Ethnicity1"              
[25] "Ethnicity2"               "Ethnicity3"               "Ethnicity4"              
[28] "Ethnicity5"               "Ethnicity6"               "Ethnicity7"              
[31] "Gender"                   "Income"                   "Zip.code"                
[34] "open.comments"            "Condition"                "IV1_price_format"        
[37] "IV2_Surcharge_type"      
# Check variable names (the name of each column in your cleaned excel dataset)
names(df)
 [1] "Start.Date"               "EndDate"                  "Status"                  
 [4] "IPAddress"                "Progress"                 "Duration..in.seconds."   
 [7] "Finished"                 "RecordedDate"             "ResponseId"              
[10] "UserLanguage"             "purchase.intention"       "brand.attitude_NPS_GROUP"
[13] "brand.attitude"           "seller.trust"             "fairness_1"              
[16] "fairness_2"               "quality.perception"       "perceived.benefits"      
[19] "seller.control"           "Attention.check"          "Year.of.Birth"           
[22] "EDU"                      "Ethnicity"                "Ethnicity1"              
[25] "Ethnicity2"               "Ethnicity3"               "Ethnicity4"              
[28] "Ethnicity5"               "Ethnicity6"               "Ethnicity7"              
[31] "Gender"                   "Income"                   "Zip.code"                
[34] "open.comments"            "Condition"                "IV1_price_format"        
[37] "IV2_Surcharge_type"      

Prepare the data

Check for missing values (nice to better understand the data, not necessary since we’ve cleaned up the dataset)

# Missing data overview
colSums(is.na(df)) # Count how many missing values for each variable
              Start.Date                  EndDate                   Status 
                       0                        0                        0 
               IPAddress                 Progress    Duration..in.seconds. 
                       0                        0                        0 
                Finished             RecordedDate               ResponseId 
                       0                        0                        0 
            UserLanguage       purchase.intention brand.attitude_NPS_GROUP 
                       0                        0                        0 
          brand.attitude             seller.trust               fairness_1 
                       0                        0                        0 
              fairness_2       quality.perception       perceived.benefits 
                       0                        0                        0 
          seller.control          Attention.check            Year.of.Birth 
                       0                        0                        0 
                     EDU                Ethnicity               Ethnicity1 
                       0                        0                        0 
              Ethnicity2               Ethnicity3               Ethnicity4 
                       0                        0                        0 
              Ethnicity5               Ethnicity6               Ethnicity7 
                       0                        0                        0 
                  Gender                   Income                 Zip.code 
                       0                        0                        0 
           open.comments                Condition         IV1_price_format 
                      12                        0                        0 
      IV2_Surcharge_type 
                       0 
miss_var_summary(df) # A cleaner summary of missing values by variable

Recorde variaboes, if necessary

# fairness_1 and fairness_2 are supposed to measure the same idea,
# but fairness_2 is worded/scaled in the opposite direction.
# fairness_1: 1 = very unfairly, 7 = very fairly; fairness_2: 1 = extremely fairly, 7= extremely unfairly; This is why we need to reverse code fairness_2 to make the scale consistent with fairness_1
df$fairness_2_reverse <- 8 - df$fairness_2
df$fairness <- (df$fairness_1 + df$fairness_2_reverse)/2 # Create a composite fairness score by averaging the two fairness items

# Age: We measured year of birth, and we want to turn it into participants' age for easier interpretation
df$Year.of.Birth <- as.numeric(df$Year.of.Birth) # Convert Year.of.Birth to numeric in case it was imported as text
df$Age <- 2026 - df$Year.of.Birth

Prepare variables

Goal: Make sure your IVs are categorical and DVs are numerical (the most common type of DVs; for linear regression) or categorical (for chi square test or logistic regression)

# Make sure IVs are categorical factors. as.factor() tells R that these are categorical variables
df$IV1_price_format <- as.factor(df$IV1_price_format) # Categorical
df$IV2_surcharge_type <- as.factor(df$IV2_surcharge_type) # Categorical
# Make sure (most of the) DVs and covariates are numerical values
#DVs
df$purchase.intention <- as.numeric(df$purchase.intention) # Numeric
df$brand.attitude_NPS_GROUP <- as.factor(df$brand.attitude_NPS_GROUP) # Categorical
df$brand.attitude <- as.numeric(df$brand.attitude) # Numeric
df$seller.trust <- as.numeric(df$seller.trust) # Numeric
df$fairness <- as.numeric(df$fairness) # Numeric

# Covariates
df$quality.perception <- as.numeric(df$quality.perception) # Numeric
df$perceived.benefits <- as.numeric(df$perceived.benefits) # Numeric
df$seller.control <- as.numeric(df$seller.control) # Numeric
df$surcharge.novelty <- as.numeric(df$surcharge.novelty) # Numeric

Assess the sample

a) Explore demographics

Goal: To understand who is in our data (their demographics)

# Make sure Age is numeric
df$Age <- as.numeric(df$Age)

# Age: summarize # of valid cases, mean, SD, min, median, and max (na.rm = TRUE: NAs are removed before calculation)
df %>%
  summarise(
    n_nonmissing = sum(!is.na(Age)),
    mean = mean(Age, na.rm = TRUE),
    sd = sd(Age, na.rm = TRUE),
    min = min(Age, na.rm = TRUE),
    median = median(Age, na.rm = TRUE),
    max = max(Age, na.rm = TRUE)
  )

# Age histogram
ggplot(df, aes(x = Age)) +
  geom_histogram(
    binwidth = 5,       # Each bar covers a 5-year range
    fill = "#7A0019",   # Bar color
    color = "white"    # White border around bars
  ) +
  labs(
    title = "Age Distribution",
    subtitle = "Sample Characteristics",
    x = "Age (years)",
    y = "Number of Participants"
  ) +
  theme_minimal(base_size = 14) +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold"),
    plot.subtitle = element_text(hjust = 0.5),
    panel.grid.major.x = element_blank(),
    panel.grid.minor = element_blank()
  )
# Gender distribution: frequency table and proportions (since Gender is a categorical variable)
df %>%
  count(Gender) %>%
  mutate(prop = n / sum(n))

# Bar chart for gender counts
df %>%
  count(Gender) %>%
  ggplot(aes(x = fct_reorder(Gender, n), y = n)) +
  geom_col() +
  coord_flip() +
  labs(
    title = "Gender",
    x = NULL,
    y = "Count"
  ) +
  theme_minimal()
# Education distribution: counts and proportions
df %>%
  count(EDU) %>%
  mutate(prop = n / sum(n)) %>%
  arrange(desc(n))

# Bar chart for education counts
df %>%
  count(EDU) %>%
  ggplot(aes(x = fct_reorder(EDU, n), y = n)) +
  geom_col() +
  coord_flip() +
  labs(
    title = "Education",
    x = NULL,
    y = "Count"
  ) +
  theme_minimal()
# Income distribution: counts and percentages
df %>%
  count(Income) %>%
  mutate(percent = round(100 * n / sum(n), 1))

# Bar chart of income categories
df %>%
  count(Income) %>%
  ggplot(aes(x = Income, y = n)) +
  geom_col() +
  coord_flip() +
  labs(
    title = "Income",
    x = NULL,
    y = "Count"
  ) +
  theme_minimal()
# Zip code: count how many zip codes are present and the number of unique zip codes 
df %>%
  summarise(
    n_zip_nonmissing = sum(!is.na(Zip.code) & Zip.code != ""),
    n_unique_zip = n_distinct(Zip.code[!is.na(Zip.code) & Zip.code != ""])
  )

# Show the 15 most common zip codes in the dataset
df %>%
  count(Zip.code, sort = TRUE) %>%
  filter(!is.na(Zip.code), Zip.code != "") %>%
  slice_head(n = 15)
# Plot zip codes
# 1) Clean zip codes
zip_counts <- df %>%
  filter(!is.na(Zip.code), Zip.code != "") %>%   # Keep only non-missing zip codes
  mutate(
    Zip.code = str_extract(as.character(Zip.code), "\\d{5}"),  # Exact 5-digit zip code
    Zip.code = str_pad(Zip.code, width = 5, side = "left", pad = "0") # Add leading zeros if needed
  ) %>% 
  filter(!is.na(Zip.code)) %>%
  count(Zip.code, sort = TRUE) # Count participants per zip code

# 2) Look up zip metadata (lat/lng) with reverse_zipcode()
zip_lookup <- map_dfr(zip_counts$Zip.code, ~{
  out <- tryCatch(
    reverse_zipcode(.x),
    error = function(e) NULL
  )
  out
})

# 3) Keep only the columns needed for mapping
zip_lookup <- zip_lookup %>%
  select(zipcode, lat, lng, state, major_city)

# 4) Merge counts with coordinates
df_map <- zip_counts %>%
  left_join(zip_lookup, by = c("Zip.code" = "zipcode"))

# 5) Optional: check zips that did not match
df_map %>%
  filter(is.na(lat) | is.na(lng))

# 6) US base map
us_map <- map_data("state")

# 7) Plot
ggplot() +
  geom_polygon(
    data = us_map,
    aes(x = long, y = lat, group = group),
    fill = "gray95",
    color = "white",
    linewidth = 0.2
  ) +
  geom_point(
    data = df_map,
    aes(x = lng, y = lat, size = n, color = n),
    alpha = 0.75
  ) +
  scale_size(range = c(2, 8)) +
  scale_color_viridis_c() +
  coord_fixed(1.3) +
  labs(
    title = "Participant ZIP Codes Across the United States",
    x = NULL,
    y = NULL,
    size = "Count",
    color = "Count"
  ) +
  theme_minimal()

b) Distributions of covariates

Goal: To understand who is in our data (their consumption tendencies)

# Overall descriptives for covariates
covariates <- df %>% # Create a new data frame with only four covariate variables
  select(
    quality.perception,
    perceived.benefits,
    seller.control,
    surcharge.novelty,
  )

# Get descriptive stats for these covariates
describe(covariates)

b) Distributions of covariates (Visualizing the distributions)

Goal: Plot histograms and explore variables individually

# Histogram: quality.perception
p6 <- ggplot(df, aes(x = quality.perception)) + # initiate ggplot using dataset "df" and use quality.perception as the x-axis
  geom_histogram(
    binwidth = 1,    # each bar represents a width of 1 unit (this fits interval/ratio scales)
    boundary = 0.5,  # make bars centered on integers
    fill = "#7A0019", # set bar fill color
    color = "white"   # set border color of bars
  ) +
  geom_vline(  # add a vertical line
    xintercept = mean(df$quality.perception, na.rm = TRUE), # position line at the mean
    linetype = "dashed",
    linewidth = 1
  ) +
  labs(
    title = "Distribution of Quality Perception", # add labels and titles
    subtitle = "Dashed line = Mean",
    x = "PQuality Perception (1–7 scale)",
    y = "Number of Participants"
  ) +
  theme_minimal(base_size = 14) + # apply a clean minimal theme with base front size 14
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold"), # center and bold the title
    plot.subtitle = element_text(hjust = 0.5), # center the subtitle
    panel.grid.major.x = element_blank(), # remove vertical major grid lines
    panel.grid.minor = element_blank() # remove all minor grid lines
  )

# Histogram: perceived benefits
p7 <- ggplot(df, aes(x = perceived.benefits)) +
  geom_histogram(
    binwidth = 1, 
    boundary = 0.5,
    fill = "#7A0019",
    color = "white"
  ) +
  geom_vline(
    xintercept = mean(df$perceived.benefits, na.rm = TRUE),
    linetype = "dashed",
    linewidth = 1
  ) +
  labs(
    title = "Distribution of Perceived Benefits",
    subtitle = "Dashed line = Mean",
    x = "Perceived benefits (1–5 scale)",
    y = "Number of Participants"
  ) +
  theme_minimal(base_size = 14) +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold"),
    plot.subtitle = element_text(hjust = 0.5),
    panel.grid.major.x = element_blank(),
    panel.grid.minor = element_blank()
  )

# Histogram: seller control
p8 <- ggplot(df, aes(x = seller.control)) +
  geom_histogram(
    binwidth = 1, 
    boundary = 0.5,
    fill = "#7A0019",
    color = "white"
  ) +
  geom_vline(
    xintercept = mean(df$seller.control, na.rm = TRUE),
    linetype = "dashed",
    linewidth = 1
  ) +
  labs(
    title = "Distribution of Seller Control",
    subtitle = "Dashed line = Mean",
    x = "Seller Control (1–7 scale)",
    y = "Number of Participants"
  ) +
  theme_minimal(base_size = 14) +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold"),
    plot.subtitle = element_text(hjust = 0.5),
    panel.grid.major.x = element_blank(),
    panel.grid.minor = element_blank()
  )

# Histogram: surcharge novelty
p9 <- ggplot(df, aes(x = surcharge.novelty)) +
  geom_histogram(
    binwidth = 1, 
    boundary = 0.5,
    fill = "#7A0019",
    color = "white"
  ) +
  geom_vline(
    xintercept = mean(df$surcharge.novelty, na.rm = TRUE),
    linetype = "dashed",
    linewidth = 1
  ) +
  labs(
    title = "Distribution of Surcharge Novelty",
    subtitle = "Dashed line = Mean",
    x = "Surcharge Novelty (1–7 scale)",
    y = "Number of Participants"
  ) +
  theme_minimal(base_size = 14) +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold"),
    plot.subtitle = element_text(hjust = 0.5),
    panel.grid.major.x = element_blank(),
    panel.grid.minor = element_blank()
  )

p6 + p7 + p8 + p9 # Option#1: Combine the four plots into one figure
p6 # Option#2: Plot the four plots on separate pages
p7
p7
p9

Assess randomization

Overall descriptive stats

Goal: Check if the randomization is successful (the number of participants should be largely the same across conditions)

# Count number of participants in each 2 x 2 experimental cell
df %>%
  count(IV1_price_format, IV2_surcharge_type)

Variable distributions

# Overall descriptives for DVs
DVs <- df %>%
  select(
    purchase.intention,
    brand.attitude,
    seller.trust,
    fairness
  )

describe(DVs)

Distributions of DVs (Visualizing the distributions)

Goal: Plot histograms and explore variables individually

# Histogram: purchase intention
p1 <- ggplot(df, aes(x = purchase.intention)) +
  geom_histogram(
    binwidth = 1, 
    boundary = 0.5,
    fill = "#7A0019",
    color = "white"
  ) +
  geom_vline(
    xintercept = mean(df$purchase.intention, na.rm = TRUE),
    linetype = "dashed",
    linewidth = 1
  ) +
  labs(
    title = "Distribution of Purchase Intention",
    subtitle = "Dashed line = Mean",
    x = "Purchase Intention (1–7 scale)",
    y = "Number of Participants"
  ) +
  theme_minimal(base_size = 14) +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold"),
    plot.subtitle = element_text(hjust = 0.5),
    panel.grid.major.x = element_blank(),
    panel.grid.minor = element_blank()
  )

# Histogram: brand attitude
p2 <- ggplot(df, aes(x = brand.attitude)) +
  geom_histogram(
    binwidth = 1, 
    boundary = 0.5,
    fill = "#7A0019",
    color = "white"
  ) +
  geom_vline(
    xintercept = mean(df$brand.attitude, na.rm = TRUE),
    linetype = "dashed",
    linewidth = 1
  ) +
  labs(
    title = "Distribution of Brand Attitude",
    subtitle = "Dashed line = Mean",
    x = "Brand Attitude (1–10 scale)",
    y = "Number of Participants"
  ) +
  theme_minimal(base_size = 14) +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold"),
    plot.subtitle = element_text(hjust = 0.5),
    panel.grid.major.x = element_blank(),
    panel.grid.minor = element_blank()
  )

# Bar chart, brand attitude (NPS)
p3 <- ggplot(df, aes(x = brand.attitude_NPS_GROUP)) +
  geom_bar(fill = "#7A0019") +
  labs(
    title = "Distribution of NPS Groups",
    x = "NPS Group",
    y = "Number of Participants"
  ) +
  theme_minimal(base_size = 14) +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold"),
    panel.grid.major.x = element_blank(),
    panel.grid.minor = element_blank()
  )

# Histogram: seller trust
p4 <- ggplot(df, aes(x = seller.trust)) +
  geom_histogram(
    binwidth = 1, 
    boundary = 0.5,
    fill = "#7A0019",
    color = "white"
  ) +
  geom_vline(
    xintercept = mean(df$seller.trust, na.rm = TRUE),
    linetype = "dashed",
    linewidth = 1
  ) +
  labs(
    title = "Distribution of Seller Trust",
    subtitle = "Dashed line = Mean",
    x = "Seller trust (1–7 scale)",
    y = "Number of Participants"
  ) +
  theme_minimal(base_size = 14) +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold"),
    plot.subtitle = element_text(hjust = 0.5),
    panel.grid.major.x = element_blank(),
    panel.grid.minor = element_blank()
  )

# Histogram: fairness
p5 <- ggplot(df, aes(x = fairness)) +
  geom_histogram(
    binwidth = 1, 
    boundary = 0.5,
    fill = "#7A0019",
    color = "white"
  ) +
  geom_vline(
    xintercept = mean(df$fairness, na.rm = TRUE),
    linetype = "dashed",
    linewidth = 1
  ) +
  labs(
    title = "Distribution of Fairness",
    subtitle = "Dashed line = Mean",
    x = "Fairness (1–7 scale)",
    y = "Number of Participants"
  ) +
  theme_minimal(base_size = 14) +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold"),
    plot.subtitle = element_text(hjust = 0.5),
    panel.grid.major.x = element_blank(),
    panel.grid.minor = element_blank()
  )

p1 
p2 
p3 
p4 
p5

Variable relationships

Scatterplots for relationships among DVs

Goal: Visually assess many variable relationships at once

# Plot 1: brand attitude by purchase intention
ggplot(df, aes(x = brand.attitude, y = purchase.intention)) + # aes(...) = aesthetics (tell RStudio what goes on axes). Here we are plotting brand attitude on the x-axis and purchase intention on the y-axis
  geom_smooth(method = "lm", se = TRUE) + # We are using a linear regression model ("lm") to draw a best-fitting line between brand attitude and purchase intention 
  geom_jitter(width = 0.2, height = 0.2) + # this function helps us plot each participant as a dot
  theme_minimal() + # Make the plot cleaner and easier to read
  labs(title = "Brand Attitude and Purchase Intention", 
       x = "Brand Attitude", y = "Purchase Intention") # Add labels to the figure, x-axis, and y-axis

# Plot 2: seller trust by purchase intention
ggplot(df, aes(x = seller.trust, y = purchase.intention)) +
  geom_smooth(method = "lm", se = TRUE) +
  geom_jitter(width = 0.2, height = 0.2) +
  theme_minimal() +
  labs(title = "Seller Trust and Purchase Intention",
       x = "Seller Trust", y = "Purchase Intention")

# Plot 3: fairness by purchase intention
ggplot(df, aes(x = fairness, y = purchase.intention)) +
  geom_smooth(method = "lm", se = TRUE) +
  geom_jitter(width = 0.2, height = 0.2) +
  theme_minimal() +
  labs(title = "Fairness and Purchase Intention",
       x = "Perceived Fairness", y = "Purchase Intention")

# Plot 4: seller trust by brand attitude
ggplot(df, aes(x = seller.trust, y = brand.attitude)) +
  geom_smooth(method = "lm", se = TRUE) +
  geom_jitter(width = 0.2, height = 0.2) +
  theme_minimal() +
  labs(title = "Seller Trust and Brand Attitude",
       x = "Seller Trust", y = "Brand Attitude")

# Plot 5: fairness by brand attitude
ggplot(df, aes(x = fairness, y = brand.attitude)) +
  geom_smooth(method = "lm", se = TRUE) +
  geom_jitter(width = 0.2, height = 0.2) +
  theme_minimal() +
  labs(title = "Fairness and Brand Attitude",
       x = "Perceived Fairness", y = "Brand Attitude")

# Plot 6: fairness by seller trust
ggplot(df, aes(x = fairness, y = seller.trust)) +
  geom_smooth(method = "lm", se = TRUE) +
  geom_jitter(width = 0.2, height = 0.2) +
  theme_minimal() +
  labs(title = "Fairness and Seller Trust",
       x = "Perceived Fairness", y = "Seller Trust")

Correlation matrix

# Select the variables to include in the correlation matrix
corr_vars <- df%>%
  select(
    purchase.intention,
    brand.attitude,
    seller.trust,
    fairness,
    quality.perception,
    perceived.benefits,
    seller.control,
    surcharge.novelty
  )

# Compute correlations and p-values
cor_results <- rcorr(as.matrix(corr_vars))

# Correlation matrix
cor_matrix <- cor_results$r

# P-value matrix
p_matrix <- cor_results$P

# View correlation matrix
round(cor_matrix, 2)
# Convert correlation matrix to long format
cor_df <- as.data.frame(cor_matrix) %>%
  rownames_to_column("var1") %>%
  pivot_longer(-var1, names_to = "var2", values_to = "correlation")

# Convert p-value matrix to long format
p_df <- as.data.frame(p_matrix) %>%
  rownames_to_column("var1") %>%
  pivot_longer(-var1, names_to = "var2", values_to = "p_value")

# Merge correlations and p-values
cor_df <- cor_df %>%
  left_join(p_df, by = c("var1", "var2")) %>%
  mutate(
    sig = case_when(
      p_value < .001 ~ "***",
      p_value < .01  ~ "**",
      p_value < .05  ~ "*",
      TRUE ~ ""
    )
  )

# Plot the correlation matrix
ggplot(cor_df, aes(x = var1, y = var2)) +
  geom_point(aes(size = abs(correlation), color = correlation)) +
  geom_text(aes(label = paste0(round(correlation, 2), sig)), size = 3) +
  scale_size(range = c(2, 8)) +
  scale_color_gradient2(
    low = "#B2182B",
    mid = "white",
    high = "#2166AC",
    midpoint = 0
  ) +
  coord_equal() +
  labs(
    title = "Correlation Matrix (with Significance)",
    x = "",
    y = "",
    color = "Correlation",
    size = "|r|"
  ) +
  theme_minimal(base_size = 14) +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    panel.grid = element_blank(),
    plot.title = element_text(hjust = 0.5, face = "bold")
  )
# Your R code here
summary(cars)
plot(cars)
---
title: "Pricing project demo; data exploration; Spring 2026"
output: html_notebook
---

############################ 

# Install/Load PackAges

############################ 

```{r}

# Load the packages needed for data cleaning, analysis, and visualization
library(tidyverse) # Core data wrangling and plotting tools (dplyr, ggplot2, etc.)
library(janitor) # Helpful tools for cleaning data and summarizing missingness
library(psych) # Descriptive statistics and scale-related functions
library(patchwork) # Combine multiple ggplots into one figure
library(naniar)  # Missing data summaries and visualization

library(stringr) # cleaning messy text data
library(forcats) # working with categorical variables

library(purrr) # doing things repeatedly without loops
library(zipcodeR) # turn zip codes into locations
library(maps) # draw the U.S. map

library(Hmisc) # for correlations and significance tests
```

############################ 

# Load the dataset

############################ 

```{r}
# Set working directory
setwd("/Users/maya/Desktop")

# Read the CSV (prevents unwanted factor conversion)
df <- read.csv("MKTG 4450.csv")
head(df)
names(df)
```

```{r}
# Check variable names (the name of each column in your cleaned excel dataset)
names(df)
```

############################ 

# Prepare the data

############################ 

# Check for missing values (nice to better understand the data, not necessary since we've cleaned up the dataset)

```{r}
# Missing data overview
colSums(is.na(df)) # Count how many missing values for each variable
miss_var_summary(df) # A cleaner summary of missing values by variable
```

# Recorde variaboes, if necessary

```{r}
# fairness_1 and fairness_2 are supposed to measure the same idea,
# but fairness_2 is worded/scaled in the opposite direction.
# fairness_1: 1 = very unfairly, 7 = very fairly; fairness_2: 1 = extremely fairly, 7= extremely unfairly; This is why we need to reverse code fairness_2 to make the scale consistent with fairness_1
df$fairness_2_reverse <- 8 - df$fairness_2
df$fairness <- (df$fairness_1 + df$fairness_2_reverse)/2 # Create a composite fairness score by averaging the two fairness items

# Age: We measured year of birth, and we want to turn it into participants' age for easier interpretation
df$Year.of.Birth <- as.numeric(df$Year.of.Birth) # Convert Year.of.Birth to numeric in case it was imported as text
df$Age <- 2026 - df$Year.of.Birth
```

# Prepare variables

# Goal: Make sure your IVs are categorical and DVs are numerical (the most common type of DVs; for linear regression) or categorical (for chi square test or logistic regression)

```{r}
# Make sure IVs are categorical factors. as.factor() tells R that these are categorical variables
df$IV1_price_format <- as.factor(df$IV1_price_format) # Categorical
df$IV2_surcharge_type <- as.factor(df$IV2_surcharge_type) # Categorical
```

```{r}
# Make sure (most of the) DVs and covariates are numerical values
#DVs
df$purchase.intention <- as.numeric(df$purchase.intention) # Numeric
df$brand.attitude_NPS_GROUP <- as.factor(df$brand.attitude_NPS_GROUP) # Categorical
df$brand.attitude <- as.numeric(df$brand.attitude) # Numeric
df$seller.trust <- as.numeric(df$seller.trust) # Numeric
df$fairness <- as.numeric(df$fairness) # Numeric

# Covariates
df$quality.perception <- as.numeric(df$quality.perception) # Numeric
df$perceived.benefits <- as.numeric(df$perceived.benefits) # Numeric
df$seller.control <- as.numeric(df$seller.control) # Numeric
df$surcharge.novelty <- as.numeric(df$surcharge.novelty) # Numeric
```

############################ 

# Assess the sample

############################ 

# a) Explore demographics

# Goal: To understand who is in our data (their demographics)

```{r}
# Make sure Age is numeric
df$Age <- as.numeric(df$Age)

# Age: summarize # of valid cases, mean, SD, min, median, and max (na.rm = TRUE: NAs are removed before calculation)
df %>%
  summarise(
    n_nonmissing = sum(!is.na(Age)),
    mean = mean(Age, na.rm = TRUE),
    sd = sd(Age, na.rm = TRUE),
    min = min(Age, na.rm = TRUE),
    median = median(Age, na.rm = TRUE),
    max = max(Age, na.rm = TRUE)
  )

# Age histogram
ggplot(df, aes(x = Age)) +
  geom_histogram(
    binwidth = 5,       # Each bar covers a 5-year range
    fill = "#7A0019",   # Bar color
    color = "white"    # White border around bars
  ) +
  labs(
    title = "Age Distribution",
    subtitle = "Sample Characteristics",
    x = "Age (years)",
    y = "Number of Participants"
  ) +
  theme_minimal(base_size = 14) +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold"),
    plot.subtitle = element_text(hjust = 0.5),
    panel.grid.major.x = element_blank(),
    panel.grid.minor = element_blank()
  )
```

```{r}
# Gender distribution: frequency table and proportions (since Gender is a categorical variable)
df %>%
  count(Gender) %>%
  mutate(prop = n / sum(n))

# Bar chart for gender counts
df %>%
  count(Gender) %>%
  ggplot(aes(x = fct_reorder(Gender, n), y = n)) +
  geom_col() +
  coord_flip() +
  labs(
    title = "Gender",
    x = NULL,
    y = "Count"
  ) +
  theme_minimal()
```

```{r}
# Education distribution: counts and proportions
df %>%
  count(EDU) %>%
  mutate(prop = n / sum(n)) %>%
  arrange(desc(n))

# Bar chart for education counts
df %>%
  count(EDU) %>%
  ggplot(aes(x = fct_reorder(EDU, n), y = n)) +
  geom_col() +
  coord_flip() +
  labs(
    title = "Education",
    x = NULL,
    y = "Count"
  ) +
  theme_minimal()
```

```{r}
# Income distribution: counts and percentages
df %>%
  count(Income) %>%
  mutate(percent = round(100 * n / sum(n), 1))

# Bar chart of income categories
df %>%
  count(Income) %>%
  ggplot(aes(x = Income, y = n)) +
  geom_col() +
  coord_flip() +
  labs(
    title = "Income",
    x = NULL,
    y = "Count"
  ) +
  theme_minimal()
```

```{r}
# Zip code: count how many zip codes are present and the number of unique zip codes 
df %>%
  summarise(
    n_zip_nonmissing = sum(!is.na(Zip.code) & Zip.code != ""),
    n_unique_zip = n_distinct(Zip.code[!is.na(Zip.code) & Zip.code != ""])
  )

# Show the 15 most common zip codes in the dataset
df %>%
  count(Zip.code, sort = TRUE) %>%
  filter(!is.na(Zip.code), Zip.code != "") %>%
  slice_head(n = 15)
```

```{r}
# Plot zip codes
# 1) Clean zip codes
zip_counts <- df %>%
  filter(!is.na(Zip.code), Zip.code != "") %>%   # Keep only non-missing zip codes
  mutate(
    Zip.code = str_extract(as.character(Zip.code), "\\d{5}"),  # Exact 5-digit zip code
    Zip.code = str_pad(Zip.code, width = 5, side = "left", pad = "0") # Add leading zeros if needed
  ) %>% 
  filter(!is.na(Zip.code)) %>%
  count(Zip.code, sort = TRUE) # Count participants per zip code

# 2) Look up zip metadata (lat/lng) with reverse_zipcode()
zip_lookup <- map_dfr(zip_counts$Zip.code, ~{
  out <- tryCatch(
    reverse_zipcode(.x),
    error = function(e) NULL
  )
  out
})

# 3) Keep only the columns needed for mapping
zip_lookup <- zip_lookup %>%
  select(zipcode, lat, lng, state, major_city)

# 4) Merge counts with coordinates
df_map <- zip_counts %>%
  left_join(zip_lookup, by = c("Zip.code" = "zipcode"))

# 5) Optional: check zips that did not match
df_map %>%
  filter(is.na(lat) | is.na(lng))

# 6) US base map
us_map <- map_data("state")

# 7) Plot
ggplot() +
  geom_polygon(
    data = us_map,
    aes(x = long, y = lat, group = group),
    fill = "gray95",
    color = "white",
    linewidth = 0.2
  ) +
  geom_point(
    data = df_map,
    aes(x = lng, y = lat, size = n, color = n),
    alpha = 0.75
  ) +
  scale_size(range = c(2, 8)) +
  scale_color_viridis_c() +
  coord_fixed(1.3) +
  labs(
    title = "Participant ZIP Codes Across the United States",
    x = NULL,
    y = NULL,
    size = "Count",
    color = "Count"
  ) +
  theme_minimal()
```

# b) Distributions of covariates

# Goal: To understand who is in our data (their consumption tendencies)

```{r}
# Overall descriptives for covariates
covariates <- df %>% # Create a new data frame with only four covariate variables
  select(
    quality.perception,
    perceived.benefits,
    seller.control,
    surcharge.novelty,
  )

# Get descriptive stats for these covariates
describe(covariates)
```

# b) Distributions of covariates (Visualizing the distributions)

# Goal: Plot histograms and explore variables individually

```{r}
# Histogram: quality.perception
p6 <- ggplot(df, aes(x = quality.perception)) + # initiate ggplot using dataset "df" and use quality.perception as the x-axis
  geom_histogram(
    binwidth = 1,    # each bar represents a width of 1 unit (this fits interval/ratio scales)
    boundary = 0.5,  # make bars centered on integers
    fill = "#7A0019", # set bar fill color
    color = "white"   # set border color of bars
  ) +
  geom_vline(  # add a vertical line
    xintercept = mean(df$quality.perception, na.rm = TRUE), # position line at the mean
    linetype = "dashed",
    linewidth = 1
  ) +
  labs(
    title = "Distribution of Quality Perception", # add labels and titles
    subtitle = "Dashed line = Mean",
    x = "PQuality Perception (1–7 scale)",
    y = "Number of Participants"
  ) +
  theme_minimal(base_size = 14) + # apply a clean minimal theme with base front size 14
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold"), # center and bold the title
    plot.subtitle = element_text(hjust = 0.5), # center the subtitle
    panel.grid.major.x = element_blank(), # remove vertical major grid lines
    panel.grid.minor = element_blank() # remove all minor grid lines
  )

# Histogram: perceived benefits
p7 <- ggplot(df, aes(x = perceived.benefits)) +
  geom_histogram(
    binwidth = 1, 
    boundary = 0.5,
    fill = "#7A0019",
    color = "white"
  ) +
  geom_vline(
    xintercept = mean(df$perceived.benefits, na.rm = TRUE),
    linetype = "dashed",
    linewidth = 1
  ) +
  labs(
    title = "Distribution of Perceived Benefits",
    subtitle = "Dashed line = Mean",
    x = "Perceived benefits (1–5 scale)",
    y = "Number of Participants"
  ) +
  theme_minimal(base_size = 14) +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold"),
    plot.subtitle = element_text(hjust = 0.5),
    panel.grid.major.x = element_blank(),
    panel.grid.minor = element_blank()
  )

# Histogram: seller control
p8 <- ggplot(df, aes(x = seller.control)) +
  geom_histogram(
    binwidth = 1, 
    boundary = 0.5,
    fill = "#7A0019",
    color = "white"
  ) +
  geom_vline(
    xintercept = mean(df$seller.control, na.rm = TRUE),
    linetype = "dashed",
    linewidth = 1
  ) +
  labs(
    title = "Distribution of Seller Control",
    subtitle = "Dashed line = Mean",
    x = "Seller Control (1–7 scale)",
    y = "Number of Participants"
  ) +
  theme_minimal(base_size = 14) +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold"),
    plot.subtitle = element_text(hjust = 0.5),
    panel.grid.major.x = element_blank(),
    panel.grid.minor = element_blank()
  )

# Histogram: surcharge novelty
p9 <- ggplot(df, aes(x = surcharge.novelty)) +
  geom_histogram(
    binwidth = 1, 
    boundary = 0.5,
    fill = "#7A0019",
    color = "white"
  ) +
  geom_vline(
    xintercept = mean(df$surcharge.novelty, na.rm = TRUE),
    linetype = "dashed",
    linewidth = 1
  ) +
  labs(
    title = "Distribution of Surcharge Novelty",
    subtitle = "Dashed line = Mean",
    x = "Surcharge Novelty (1–7 scale)",
    y = "Number of Participants"
  ) +
  theme_minimal(base_size = 14) +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold"),
    plot.subtitle = element_text(hjust = 0.5),
    panel.grid.major.x = element_blank(),
    panel.grid.minor = element_blank()
  )

p6 + p7 + p8 + p9 # Option#1: Combine the four plots into one figure
p6 # Option#2: Plot the four plots on separate pages
p7
p7
p9
```

############################ 

# Assess randomization

############################ 

# Overall descriptive stats

# Goal: Check if the randomization is successful (the number of participants should be largely the same across conditions)

```{r}
# Count number of participants in each 2 x 2 experimental cell
df %>%
  count(IV1_price_format, IV2_surcharge_type)
```

############################ 

# Variable distributions

############################ 

```{r}
# Overall descriptives for DVs
DVs <- df %>%
  select(
    purchase.intention,
    brand.attitude,
    seller.trust,
    fairness
  )

describe(DVs)
```

# Distributions of DVs (Visualizing the distributions)

# Goal: Plot histograms and explore variables individually

```{r}
# Histogram: purchase intention
p1 <- ggplot(df, aes(x = purchase.intention)) +
  geom_histogram(
    binwidth = 1, 
    boundary = 0.5,
    fill = "#7A0019",
    color = "white"
  ) +
  geom_vline(
    xintercept = mean(df$purchase.intention, na.rm = TRUE),
    linetype = "dashed",
    linewidth = 1
  ) +
  labs(
    title = "Distribution of Purchase Intention",
    subtitle = "Dashed line = Mean",
    x = "Purchase Intention (1–7 scale)",
    y = "Number of Participants"
  ) +
  theme_minimal(base_size = 14) +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold"),
    plot.subtitle = element_text(hjust = 0.5),
    panel.grid.major.x = element_blank(),
    panel.grid.minor = element_blank()
  )

# Histogram: brand attitude
p2 <- ggplot(df, aes(x = brand.attitude)) +
  geom_histogram(
    binwidth = 1, 
    boundary = 0.5,
    fill = "#7A0019",
    color = "white"
  ) +
  geom_vline(
    xintercept = mean(df$brand.attitude, na.rm = TRUE),
    linetype = "dashed",
    linewidth = 1
  ) +
  labs(
    title = "Distribution of Brand Attitude",
    subtitle = "Dashed line = Mean",
    x = "Brand Attitude (1–10 scale)",
    y = "Number of Participants"
  ) +
  theme_minimal(base_size = 14) +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold"),
    plot.subtitle = element_text(hjust = 0.5),
    panel.grid.major.x = element_blank(),
    panel.grid.minor = element_blank()
  )

# Bar chart, brand attitude (NPS)
p3 <- ggplot(df, aes(x = brand.attitude_NPS_GROUP)) +
  geom_bar(fill = "#7A0019") +
  labs(
    title = "Distribution of NPS Groups",
    x = "NPS Group",
    y = "Number of Participants"
  ) +
  theme_minimal(base_size = 14) +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold"),
    panel.grid.major.x = element_blank(),
    panel.grid.minor = element_blank()
  )

# Histogram: seller trust
p4 <- ggplot(df, aes(x = seller.trust)) +
  geom_histogram(
    binwidth = 1, 
    boundary = 0.5,
    fill = "#7A0019",
    color = "white"
  ) +
  geom_vline(
    xintercept = mean(df$seller.trust, na.rm = TRUE),
    linetype = "dashed",
    linewidth = 1
  ) +
  labs(
    title = "Distribution of Seller Trust",
    subtitle = "Dashed line = Mean",
    x = "Seller trust (1–7 scale)",
    y = "Number of Participants"
  ) +
  theme_minimal(base_size = 14) +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold"),
    plot.subtitle = element_text(hjust = 0.5),
    panel.grid.major.x = element_blank(),
    panel.grid.minor = element_blank()
  )

# Histogram: fairness
p5 <- ggplot(df, aes(x = fairness)) +
  geom_histogram(
    binwidth = 1, 
    boundary = 0.5,
    fill = "#7A0019",
    color = "white"
  ) +
  geom_vline(
    xintercept = mean(df$fairness, na.rm = TRUE),
    linetype = "dashed",
    linewidth = 1
  ) +
  labs(
    title = "Distribution of Fairness",
    subtitle = "Dashed line = Mean",
    x = "Fairness (1–7 scale)",
    y = "Number of Participants"
  ) +
  theme_minimal(base_size = 14) +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold"),
    plot.subtitle = element_text(hjust = 0.5),
    panel.grid.major.x = element_blank(),
    panel.grid.minor = element_blank()
  )

p1 
p2 
p3 
p4 
p5
```

############################ 

# Variable relationships

############################ 

# Scatterplots for relationships among DVs

# Goal: Visually assess many variable relationships at once

```{r}
# Plot 1: brand attitude by purchase intention
ggplot(df, aes(x = brand.attitude, y = purchase.intention)) + # aes(...) = aesthetics (tell RStudio what goes on axes). Here we are plotting brand attitude on the x-axis and purchase intention on the y-axis
  geom_smooth(method = "lm", se = TRUE) + # We are using a linear regression model ("lm") to draw a best-fitting line between brand attitude and purchase intention 
  geom_jitter(width = 0.2, height = 0.2) + # this function helps us plot each participant as a dot
  theme_minimal() + # Make the plot cleaner and easier to read
  labs(title = "Brand Attitude and Purchase Intention", 
       x = "Brand Attitude", y = "Purchase Intention") # Add labels to the figure, x-axis, and y-axis

# Plot 2: seller trust by purchase intention
ggplot(df, aes(x = seller.trust, y = purchase.intention)) +
  geom_smooth(method = "lm", se = TRUE) +
  geom_jitter(width = 0.2, height = 0.2) +
  theme_minimal() +
  labs(title = "Seller Trust and Purchase Intention",
       x = "Seller Trust", y = "Purchase Intention")

# Plot 3: fairness by purchase intention
ggplot(df, aes(x = fairness, y = purchase.intention)) +
  geom_smooth(method = "lm", se = TRUE) +
  geom_jitter(width = 0.2, height = 0.2) +
  theme_minimal() +
  labs(title = "Fairness and Purchase Intention",
       x = "Perceived Fairness", y = "Purchase Intention")

# Plot 4: seller trust by brand attitude
ggplot(df, aes(x = seller.trust, y = brand.attitude)) +
  geom_smooth(method = "lm", se = TRUE) +
  geom_jitter(width = 0.2, height = 0.2) +
  theme_minimal() +
  labs(title = "Seller Trust and Brand Attitude",
       x = "Seller Trust", y = "Brand Attitude")

# Plot 5: fairness by brand attitude
ggplot(df, aes(x = fairness, y = brand.attitude)) +
  geom_smooth(method = "lm", se = TRUE) +
  geom_jitter(width = 0.2, height = 0.2) +
  theme_minimal() +
  labs(title = "Fairness and Brand Attitude",
       x = "Perceived Fairness", y = "Brand Attitude")

# Plot 6: fairness by seller trust
ggplot(df, aes(x = fairness, y = seller.trust)) +
  geom_smooth(method = "lm", se = TRUE) +
  geom_jitter(width = 0.2, height = 0.2) +
  theme_minimal() +
  labs(title = "Fairness and Seller Trust",
       x = "Perceived Fairness", y = "Seller Trust")
```

# Correlation matrix

```{r}
# Select the variables to include in the correlation matrix
corr_vars <- df%>%
  select(
    purchase.intention,
    brand.attitude,
    seller.trust,
    fairness,
    quality.perception,
    perceived.benefits,
    seller.control,
    surcharge.novelty
  )

# Compute correlations and p-values
cor_results <- rcorr(as.matrix(corr_vars))

# Correlation matrix
cor_matrix <- cor_results$r

# P-value matrix
p_matrix <- cor_results$P

# View correlation matrix
round(cor_matrix, 2)
```

```{r}
# Convert correlation matrix to long format
cor_df <- as.data.frame(cor_matrix) %>%
  rownames_to_column("var1") %>%
  pivot_longer(-var1, names_to = "var2", values_to = "correlation")

# Convert p-value matrix to long format
p_df <- as.data.frame(p_matrix) %>%
  rownames_to_column("var1") %>%
  pivot_longer(-var1, names_to = "var2", values_to = "p_value")

# Merge correlations and p-values
cor_df <- cor_df %>%
  left_join(p_df, by = c("var1", "var2")) %>%
  mutate(
    sig = case_when(
      p_value < .001 ~ "***",
      p_value < .01  ~ "**",
      p_value < .05  ~ "*",
      TRUE ~ ""
    )
  )

# Plot the correlation matrix
ggplot(cor_df, aes(x = var1, y = var2)) +
  geom_point(aes(size = abs(correlation), color = correlation)) +
  geom_text(aes(label = paste0(round(correlation, 2), sig)), size = 3) +
  scale_size(range = c(2, 8)) +
  scale_color_gradient2(
    low = "#B2182B",
    mid = "white",
    high = "#2166AC",
    midpoint = 0
  ) +
  coord_equal() +
  labs(
    title = "Correlation Matrix (with Significance)",
    x = "",
    y = "",
    color = "Correlation",
    size = "|r|"
  ) +
  theme_minimal(base_size = 14) +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    panel.grid = element_blank(),
    plot.title = element_text(hjust = 0.5, face = "bold")
  )
```

```{r}
# Your R code here
summary(cars)
plot(cars)
```
