Data Loading and Preparation

# ========================================
# CHANGE THIS LINE TO YOUR FILE LOCATION
# ========================================

# Option 1: File in same folder as this .Rmd file (just filename)
file_path <- "Marketing Analytics Survey_October 29, 2025_18.57.csv"

# Option 2: File in a specific folder (uncomment and edit one of these)
# file_path <- "~/Downloads/Marketing Analytics Survey_October 29, 2025_18.57.csv"  # Mac/Linux
# file_path <- "C:/Users/YourName/Downloads/Marketing Analytics Survey_October 29, 2025_18.57.csv"  # Windows

# Option 3: Let R find it (uncomment this to search for any matching CSV)
# csv_files <- list.files(pattern = "Marketing.*Survey.*\\.csv$", full.names = TRUE, recursive = FALSE)
# if(length(csv_files) > 0) { file_path <- csv_files[1] }

# ========================================

# Check if file exists
if(!file.exists(file_path)) {
  cat("ERROR: Cannot find file!\n")
  cat("Looking for:", file_path, "\n")
  cat("Current directory:", getwd(), "\n")
  cat("\nCSV files in current directory:\n")
  csv_list <- list.files(pattern = "\\.csv$")
  if(length(csv_list) > 0) {
    for(i in seq_along(csv_list)) {
      cat("  ", i, ". ", csv_list[i], "\n", sep="")
    }
    cat("\nUpdate line 22 with one of these filenames.\n")
  } else {
    cat("  No CSV files found!\n")
    cat("\nPossible solutions:\n")
    cat("  1. Move your CSV to:", getwd(), "\n")
    cat("  2. Or use full path in line 22\n")
  }
  stop("File not found - see messages above")
}

# Read the CSV - first check how many columns we actually have
data_test <- read.csv(file_path, nrows = 1, stringsAsFactors = FALSE)
actual_cols <- ncol(data_test)
cat("Actual columns in file:", actual_cols, "\n")
## Actual columns in file: 27
# Read the CSV properly - skip the first 2 header rows
data_raw <- read.csv(file_path, skip = 2, stringsAsFactors = FALSE, 
                     na.strings = c("", "NA"), header = FALSE)
cat("✓ File loaded:", file_path, "\n")
## ✓ File loaded: Marketing Analytics Survey_October 29, 2025_18.57.csv
cat("  Rows:", nrow(data_raw), "\n")
##   Rows: 27
cat("  Columns:", ncol(data_raw), "\n\n")
##   Columns: 29
# Assign column names based on actual number of columns
col_names <- c("StartDate", "EndDate", "Status", "IPAddress", "Progress", 
               "Duration", "Finished", "RecordedDate", "ResponseId", 
               "RecipientLastName", "RecipientFirstName", "RecipientEmail", 
               "ExternalReference", "LocationLatitude", "LocationLongitude", 
               "DistributionChannel", "UserLanguage", "AgeGroup", 
               "DiscouragingFactors", "AIvsProfessionalTrust", 
               "AdviceSource", "WhyThisSource", "AIExperience", 
               "RecognizeRisks", "FactCheckConfidence", "AlgorithmInfluence", 
               "ConnectionSuggestions")

# Adjust if there are extra columns
if(ncol(data_raw) > length(col_names)) {
  extra_cols <- ncol(data_raw) - length(col_names)
  col_names <- c(col_names, paste0("Extra", 1:extra_cols))
  cat("Note: Found", extra_cols, "extra columns\n")
} else if(ncol(data_raw) < length(col_names)) {
  col_names <- col_names[1:ncol(data_raw)]
  cat("Note: Using first", ncol(data_raw), "column names\n")
}
## Note: Found 2 extra columns
colnames(data_raw) <- col_names
cat("✓ Column names assigned\n")
## ✓ Column names assigned
# Clean the data - remove empty age responses
data_clean <- data_raw %>%
  filter(!is.na(AgeGroup), AgeGroup != "", AgeGroup != "NA")

cat("✓ Clean responses:", nrow(data_clean), "\n\n")
## ✓ Clean responses: 25
# Preview
cat("Sample data:\n")
## Sample data:
print(head(data_clean %>% select(AgeGroup, AdviceSource), 3))
##               AgeGroup              AdviceSource
## 1 {"ImportId":"QID12"}  {"ImportId":"QID3_TEXT"}
## 2                14-25 NerdWallet or CreditKarma
## 3                14-25              The internet

Demographics

Age Distribution

age_counts <- data_clean %>%
  count(AgeGroup) %>%
  mutate(Percentage = n / sum(n) * 100)

ggplot(age_counts, aes(x = reorder(AgeGroup, -n), y = n, fill = AgeGroup)) +
  geom_bar(stat = "identity", color = "black", alpha = 0.8) +
  geom_text(aes(label = paste0(n, "\n(", round(Percentage, 1), "%)")), 
            vjust = -0.5, size = 4, fontface = "bold") +
  labs(title = "Survey Respondents by Age Group",
       x = "Age Group",
       y = "Number of Responses") +
  theme_minimal(base_size = 14) +
  theme(legend.position = "none",
        plot.title = element_text(face = "bold", size = 16, hjust = 0.5))

kable(age_counts, col.names = c("Age Group", "Count", "Percentage"))
Age Group Count Percentage
14-25 17 68
26-35 6 24
36-45 1 4
{“ImportId”:“QID12”} 1 4

Trust in Financial Advice Sources

# Categorize trust responses
data_clean <- data_clean %>%
  mutate(TrustCategory = case_when(
    grepl("don't trust|do not trust|not at all", AIvsProfessionalTrust, ignore.case = TRUE) ~ "Don't Trust AI/Influencers",
    grepl("trust AI|trust.*more|AI.*accurate|trust a lot", AIvsProfessionalTrust, ignore.case = TRUE) ~ "Trust AI More",
    grepl("same|equal|depends|50", AIvsProfessionalTrust, ignore.case = TRUE) ~ "Equal/Depends",
    grepl("somewhat|kind of|a bit", AIvsProfessionalTrust, ignore.case = TRUE) ~ "Somewhat Trust AI",
    !is.na(AIvsProfessionalTrust) & AIvsProfessionalTrust != "" ~ "Other/Unclear",
    TRUE ~ NA_character_
  ))

trust_summary <- data_clean %>%
  filter(!is.na(TrustCategory)) %>%
  count(TrustCategory) %>%
  mutate(Percentage = n / sum(n) * 100) %>%
  arrange(desc(n))

ggplot(trust_summary, aes(x = reorder(TrustCategory, n), y = n, fill = TrustCategory)) +
  geom_bar(stat = "identity", color = "black", alpha = 0.8) +
  geom_text(aes(label = paste0(n, " (", round(Percentage, 1), "%)")), 
            hjust = -0.1, size = 4) +
  coord_flip() +
  labs(title = "Trust in AI/Influencer vs Professional Financial Advice",
       x = "",
       y = "Number of Responses") +
  scale_fill_brewer(palette = "Set2") +
  theme_minimal(base_size = 14) +
  theme(legend.position = "none",
        plot.title = element_text(face = "bold", size = 14))


Primary Sources for Financial Advice

# Categorize advice sources
data_clean <- data_clean %>%
  mutate(AdviceCategory = case_when(
    grepl("family|friend|trusted|partner|sister|dad|stepmom|husband|loved", AdviceSource, ignore.case = TRUE) ~ "Family/Friends",
    grepl("AI|chat|GPT|gemini", AdviceSource, ignore.case = TRUE) ~ "AI Tools",
    grepl("bank|financial advisor", AdviceSource, ignore.case = TRUE) ~ "Bank/Financial Advisor",
    grepl("social|tik tok|youtube|internet|influencer", AdviceSource, ignore.case = TRUE) ~ "Social Media/Internet",
    grepl("myself|me|no one|don't", AdviceSource, ignore.case = TRUE) ~ "Self/No One",
    grepl("NerdWallet|CreditKarma", AdviceSource, ignore.case = TRUE) ~ "Financial Websites",
    !is.na(AdviceSource) & AdviceSource != "" ~ "Other",
    TRUE ~ NA_character_
  ))

source_summary <- data_clean %>%
  filter(!is.na(AdviceCategory)) %>%
  count(AdviceCategory) %>%
  mutate(Percentage = n / sum(n) * 100) %>%
  arrange(desc(n))

ggplot(source_summary, aes(x = reorder(AdviceCategory, n), y = n, fill = AdviceCategory)) +
  geom_bar(stat = "identity", color = "black", alpha = 0.8) +
  geom_text(aes(label = paste0(n, "\n", round(Percentage, 1), "%")), 
            hjust = -0.1, size = 4) +
  coord_flip() +
  labs(title = "Where People Go First for Financial Advice",
       x = "",
       y = "Number of Responses") +
  scale_fill_brewer(palette = "Set3") +
  theme_minimal(base_size = 14) +
  theme(legend.position = "none",
        plot.title = element_text(face = "bold", size = 14))


Barriers to Consulting Professional Advisors

data_clean <- data_clean %>%
  mutate(BarrierCategory = case_when(
    grepl("money|cost|fee|free|financial|spending", DiscouragingFactors, ignore.case = TRUE) ~ "Cost/Money",
    grepl("time|availability|busy", DiscouragingFactors, ignore.case = TRUE) ~ "Time/Availability",
    grepl("trust|judgment|dismissed|sell|scam|young|stranger", DiscouragingFactors, ignore.case = TRUE) ~ "Trust/Judgment Issues",
    grepl("don't|don’t|not.*help", DiscouragingFactors, ignore.case = TRUE) ~ "Not Helpful",
    !is.na(DiscouragingFactors) & DiscouragingFactors != "" ~ "Other",
    TRUE ~ NA_character_
  ))

barrier_summary <- data_clean %>%
  filter(!is.na(BarrierCategory)) %>%
  count(BarrierCategory) %>%
  arrange(desc(n))

ggplot(barrier_summary, aes(x = reorder(BarrierCategory, n), y = n, fill = BarrierCategory)) +
  geom_bar(stat = "identity", color = "black", alpha = 0.8) +
  geom_text(aes(label = n), hjust = -0.2, size = 5, fontface = "bold") +
  coord_flip() +
  labs(title = "Key Barriers to Consulting Professional Financial Advisors",
       x = "",
       y = "Number of Mentions") +
  scale_fill_brewer(palette = "Set1") +
  theme_minimal(base_size = 14) +
  theme(legend.position = "none",
        plot.title = element_text(face = "bold", size = 14))


AI Experience Summary

ai_exp <- data_clean %>%
  filter(!is.na(AIExperience) & AIExperience != "") %>%
  mutate(HasUsedAI = case_when(
    grepl("never|no|not|haven't|have not|nope", AIExperience, ignore.case = TRUE) ~ "Never Used AI",
    grepl("helpful|accurate|well|good|useful|better", AIExperience, ignore.case = TRUE) ~ "Positive Experience",
    TRUE ~ "Has Used AI"
  ))

ai_summary <- ai_exp %>%
  count(HasUsedAI) %>%
  mutate(Percentage = n / sum(n) * 100)

ggplot(ai_summary, aes(x = "", y = n, fill = HasUsedAI)) +
  geom_bar(stat = "identity", width = 1, color = "white") +
  coord_polar("y", start = 0) +
  geom_text(aes(label = paste0(HasUsedAI, "\n", n, " (", round(Percentage, 1), "%)")), 
            position = position_stack(vjust = 0.5), size = 4) +
  labs(title = "AI Usage for Financial Advice") +
  scale_fill_brewer(palette = "Pastel1") +
  theme_void() +
  theme(plot.title = element_text(face = "bold", hjust = 0.5, size = 16),
        legend.position = "none")


Misinformation Awareness

# Risk recognition
risk_data <- data_clean %>%
  filter(!is.na(RecognizeRisks) & RecognizeRisks != "") %>%
  mutate(RiskAwareness = case_when(
    grepl("^yes|^i believe so|recognize|aware", RecognizeRisks, ignore.case = TRUE) ~ "Yes",
    grepl("^no|don’t|do not|not recogni", RecognizeRisks, ignore.case = TRUE) ~ "No",
    grepl("maybe|sometimes|some might|some", RecognizeRisks, ignore.case = TRUE) ~ "Mixed/Uncertain",
    TRUE ~ "Other"
  ))

risk_summary <- risk_data %>%
  count(RiskAwareness) %>%
  mutate(Percentage = n / sum(n) * 100)

# Fact-checking confidence
factcheck_data <- data_clean %>%
  filter(!is.na(FactCheckConfidence) & FactCheckConfidence != "") %>%
  mutate(FactCheckAbility = case_when(
    grepl("^yes|confident|better|^i believe so", FactCheckConfidence, ignore.case = TRUE) ~ "Yes/Confident",
    grepl("^no|not confident|not often|don’t", FactCheckConfidence, ignore.case = TRUE) ~ "No/Not Confident",
    grepl("maybe|some|not sure|probably|depends", FactCheckConfidence, ignore.case = TRUE) ~ "Mixed/Uncertain",
    TRUE ~ "Other"
  ))

factcheck_summary <- factcheck_data %>%
  count(FactCheckAbility) %>%
  mutate(Percentage = n / sum(n) * 100)

# Combined bar chart
combined_data <- bind_rows(
  risk_summary %>% mutate(Question = "Recognize Risks?", Response = RiskAwareness),
  factcheck_summary %>% mutate(Question = "Can Fact-Check?", Response = FactCheckAbility)
) %>% select(Question, Response, n, Percentage)

ggplot(combined_data, aes(x = Response, y = n, fill = Question)) +
  geom_bar(stat = "identity", position = "dodge", color = "black", alpha = 0.8) +
  geom_text(aes(label = paste0(n, "\n", round(Percentage, 1), "%")),
            position = position_dodge(width = 0.9), vjust = -0.5, size = 3.5) +
  labs(title = "Misinformation Awareness & Fact-Checking Confidence",
       x = "",
       y = "Number of Responses",
       fill = "") +
  scale_fill_manual(values = c("#E74C3C", "#3498DB")) +
  theme_minimal(base_size = 14) +
  theme(plot.title = element_text(face = "bold", size = 14, hjust = 0.5),
        legend.position = "top")


Key Recommendations from Respondents

# Extract keywords from suggestions
suggestions <- data_clean %>%
  filter(!is.na(ConnectionSuggestions) & ConnectionSuggestions != "") %>%
  pull(ConnectionSuggestions)

# Common themes
themes <- data.frame(
  Theme = c("Social Media", "Relatable/Simple", "Trust/Professional", "Free/Accessible", "Education/Understanding"),
  Keywords = c("social.*media|tik.*tok|online|instagram",
               "relatable|simple|easy|digest|understand|fun|appeal",
               "trust|professional|genuine|proof|reliable",
               "free|consult|accessible|available",
               "educat|learn|explain|information|terms")
)

theme_counts <- sapply(1:nrow(themes), function(i) {
  sum(grepl(themes$Keywords[i], suggestions, ignore.case = TRUE))
})

theme_df <- data.frame(
  Theme = themes$Theme,
  Mentions = theme_counts
) %>%
  filter(Mentions > 0) %>%
  arrange(desc(Mentions))

ggplot(theme_df, aes(x = reorder(Theme, Mentions), y = Mentions, fill = Theme)) +
  geom_bar(stat = "identity", color = "black", alpha = 0.8) +
  geom_text(aes(label = Mentions), hjust = -0.2, size = 5, fontface = "bold") +
  coord_flip() +
  labs(title = "Key Themes in Recommendations for Financial Companies",
       x = "",
       y = "Number of Mentions") +
  scale_fill_brewer(palette = "Dark2") +
  theme_minimal(base_size = 14) +
  theme(legend.position = "none",
        plot.title = element_text(face = "bold", size = 14))


Sample Recommendations

recommendations <- data_clean %>%
  filter(!is.na(ConnectionSuggestions) & ConnectionSuggestions != "") %>%
  select(AgeGroup, ConnectionSuggestions) %>%
  head(10)

kable(recommendations, 
      col.names = c("Age Group", "Suggestions for Companies"),
      caption = "Sample Respondent Suggestions")
Sample Respondent Suggestions
Age Group Suggestions for Companies
{“ImportId”:“QID12”} {“ImportId”:“QID11_TEXT”}
14-25 Be more relatable on social media.
14-25 To be more appealing and make learning about finance fun, not boring.
26-35 Social Media Marketing
14-25
36-45 social media like jumping on Tik Tok
14-25 target audience
14-25 Make financial issues and terms easier to digest and understand
26-35 Yes
14-25 Market more on social media

Summary Statistics

cat("## Survey Overview\n\n")
## ## Survey Overview
cat("**Total Valid Responses:**", nrow(data_clean), "\n\n")
## **Total Valid Responses:** 25
cat("**Age Groups:**", paste(unique(data_clean$AgeGroup), collapse = ", "), "\n\n")
## **Age Groups:** {"ImportId":"QID12"}, 14-25, 26-35, 36-45
cat("**Most Common Advice Source:**", source_summary$AdviceCategory[1], 
    "(", source_summary$n[1], "responses )\n\n")
## **Most Common Advice Source:** Family/Friends ( 9 responses )
cat("**Top Barrier:**", barrier_summary$BarrierCategory[1], 
    "(", barrier_summary$n[1], "mentions )\n\n")
## **Top Barrier:** Cost/Money ( 11 mentions )
cat("**Most Common Trust Sentiment:**", trust_summary$TrustCategory[1], 
    "(", trust_summary$n[1], "responses )\n\n")
## **Most Common Trust Sentiment:** Other/Unclear ( 7 responses )