#Packages

library(tidyverse)
library(readxl)
library(dplyr)
library(ggplot2)
library(ggalluvial)
library(naniar)
library(knitr)
library(forcats)
library(kableExtra)
library(webshot2)
library(scales)
library(sf)
library(tigris)
library(viridis)
library(stringr)
library(sandwich)
library(ggeffects)
library(lmtest)
#Functions for graphs
plot_demo_bar <- function(data, var, xlab) {
  ggplot(data, aes(x = {{ var }},
                   y = after_stat(..count.. / sum(..count..)),
                   fill = {{ var }})) +
    geom_bar(color = "black") +
    scale_y_continuous(
      expand = c(0, 0),
      name = "Percent",
      labels = scales::percent_format(accuracy = 0.1)
    ) +
    scale_fill_brewer(palette = "Pastel4") +
    labs(x = xlab) +
    theme_minimal(base_size = 12) +
    theme(axis.text.x = element_text(angle = 30, hjust = 1))
}

#Importing the Data

Pesticide<- read_excel("Pesticide data.xlsx")
## New names:
## • `Q32...205` -> `Q32...190`
## • `Q32...207` -> `Q32...192`
##Columns that are completely deleted due to privacy or not needed
Pesticide$IPAddress<- NULL
Pesticide$Status<- NULL
Pesticide$RecipientLastName<- NULL
Pesticide$RecipientFirstName<- NULL
Pesticide$RecipientEmail<- NULL
Pesticide$ExternalReference<- NULL
Pesticide$LocationLatitude<- NULL
Pesticide$LocationLongitude<- NULL
Pesticide$StartDate<- NULL
Pesticide$EndDate<- NULL
Pesticide$Progress<- NULL
Pesticide$`Duration (in seconds)`<- NULL
Pesticide$Finished<- NULL
Pesticide$RecordedDate<- NULL
Pesticide_Clean<- read.csv("cleaned_Pesticide.csv")
Pesticide_Clean<- Pesticide_Clean[-1,]

#Qustion 1

Pesticide_License_Type<- Pesticide_Clean$Q2
Pesticide_License_Type_NA<- ifelse(is.na(Pesticide_License_Type), "Missing", as.character(Pesticide_License_Type))

recode for bar chart

Pesticide_License_Type <- as.data.frame(Pesticide_License_Type)
Pesticide_License_Type_recode <- Pesticide_License_Type %>%
    mutate(
    Applicator = dplyr::recode(Pesticide_License_Type,
                    "1" = "Private Applicator",
                    "2" = "Commerical Applicator",
                    "3" = "Commerical Applicator with Contractor License")
    )
Pesticide_License_Type_recode <- Pesticide_License_Type_recode %>% drop_na()
applicator <- ggplot(data = Pesticide_License_Type_recode, aes(x = Applicator, fill = Applicator)) +
  geom_bar() +
  geom_text(stat = "count", aes(label = after_stat(count)), 
          vjust = -0.3, size = 4)+
  scale_fill_manual(
    values = c("steelblue", "darkorchid", "deeppink2", "darkred")
    ) + 
  labs(x = "License Type", y = "Count", fill = "License Type", title = "Counts of Type of Pesticide Applicator") +
  theme_void()+
  theme(axis.text.x = element_blank(),
      axis.ticks.x = element_blank())

applicator

ggsave("~/Desktop/UGA/Fall 2025/STAT 5010W/Project/Charts/applicator_piechart.png", applicator)
## Saving 7 x 5 in image
Pesticide_License_Type <- Pesticide_License_Type %>% drop_na()

applicator <- plot_demo_bar(data=Pesticide_License_Type_recode, Applicator, "Type of Pesticide Applicator")
## Warning: Unknown palette: "Pastel4"
applicator

ggsave("~/Desktop/UGA/Fall 2025/STAT 5010W/Project/Charts/applicator_barchart.png", applicator)
## Saving 7 x 5 in image
length(Pesticide_License_Type) #Total Responses
## [1] 1
table(Pesticide_License_Type, useNA = "ifany")
## Pesticide_License_Type
##   1   2   3 
## 556  38   3
prop.table(table(Pesticide_License_Type))*100 #Without Missing Data
## Pesticide_License_Type
##          1          2          3 
## 93.1323283  6.3651591  0.5025126
prop.table(table(Pesticide_License_Type, useNA = "ifany"))*100 #With Missing Data
## Pesticide_License_Type
##          1          2          3 
## 93.1323283  6.3651591  0.5025126

#Question 2

counts_Q2<- colSums(Pesticide_Clean[, c("Q3_1", "Q3_2", "Q3_3", "Q3_4", "Q3_5", "Q3_6", "Q3_7", "Q3_8", "Q3_9")] == 1, na.rm = TRUE)
table_Q2<- data.frame(
  option = names(counts_Q2),
  count = counts_Q2, 
  percent = round(counts_Q2/ sum(counts_Q2) *100, 1)
)
table_Q2
sum(rowSums(is.na(Pesticide_Clean[, c("Q3_1", "Q3_2", "Q3_3", "Q3_4", "Q3_5", "Q3_6", "Q3_7", "Q3_8", "Q3_9")])) == 9)
## [1] 36
(36/633) *100 # Percent Missing
## [1] 5.687204

#Question 3

Q2_reg<- Pesticide_Clean[, c("Q3_1", "Q3_2", "Q3_3", "Q3_4", "Q3_5", "Q3_6", "Q3_7", "Q3_8", "Q3_9")]

Q3_reg<- Pesticide_Clean[, c("Q4_1", "Q4_2", "Q4_3", "Q4_4", "Q4_5", "Q4_6", "Q4_7", "Q4_8", "Q4_9", "Q4_10", "Q4_11")]

Q2<- Pesticide_Clean[, c("Q3_1", "Q3_2", "Q3_3", "Q3_4", "Q3_5", "Q3_6", "Q3_7", "Q3_8", "Q3_9")]

Q3<- Pesticide_Clean[, c("Q4_1", "Q4_2", "Q4_3", "Q4_4", "Q4_5", "Q4_6", "Q4_7", "Q4_8", "Q4_9", "Q4_10", "Q4_11")]
Q2[is.na(Q2)] <- 0

Q3[is.na(Q3)] <- 0

skipLogic_Q3<- list(
  Q4_1 = Q2$Q3_1 == 1 | Q2$Q3_2 == 1, 
  Q4_2 = Q2$Q3_4 == 1,
  Q4_3 = Q2$Q3_5 == 1,
  Q4_4 = Q2$Q3_3 == 1,
  Q4_5 = Q2$Q3_2 == 1, 
  Q4_6 = Q2$Q3_6 == 1,
  Q4_7 = Q2$Q3_7 == 1, 
  Q4_8 = Q2$Q3_1 == 1 | Q2$Q3_2 == 1,
  Q4_9 = Q2$Q3_8 == 1,
  Q4_10 = Q2$Q3_1 == 1 | Q2$Q3_2 == 1,
  Q4_11 = Q2$Q3_8 == 1
)

Q4_table_title<- c(Q4_1 = "Plant Ag",
  Q4_2 = "Animal Ag",
  Q4_3 = "Forestry",
  Q4_4 = "Ornamental/Turf",
  Q4_5 = "Seed Treatment",
  Q4_6 = "Aquatic",
  Q4_7 = "ROW",
  Q4_8 = "Aerial Equipment",
  Q4_9 = "Pest Control",
  Q4_10 = "Ag Commodity Fumigation",
  Q4_11 = "Mosquito Control")

result_Q3<- data.frame(
  Option = names(skipLogic_Q3),
  Eligible = sapply(skipLogic_Q3, sum),
  Selected = sapply(names(skipLogic_Q3), function(col) {
    skip<- skipLogic_Q3[[col]]
    sum(Q3[skip, col] == 1, na.rm = TRUE)
  }
    )
)

result_Q3$Percentage<- round(result_Q3$Selected / result_Q3$Eligible *100, 1)

result_Q3$Option<- Q4_table_title[result_Q3$Option]
results_Q3_table_ready<- result_Q3 %>%
  rename(
    "Pesticide License" = Option,
    "Eligible Participants" = Eligible,
    "Selected Participants" = Selected,
    "Percentage Selected (%)" = Percentage
  )

kable(results_Q3_table_ready, 
      caption = "Survey Question 3 Eligibility and Selection Summary",
      row.names = FALSE)
Survey Question 3 Eligibility and Selection Summary
Pesticide License Eligible Participants Selected Participants Percentage Selected (%)
Plant Ag 360 331 91.9
Animal Ag 189 138 73.0
Forestry 136 101 74.3
Ornamental/Turf 87 67 77.0
Seed Treatment 211 26 12.3
Aquatic 24 14 58.3
ROW 26 15 57.7
Aerial Equipment 360 4 1.1
Pest Control 109 14 12.8
Ag Commodity Fumigation 360 18 5.0
Mosquito Control 109 17 15.6

#Q2 & Q3 Alluvial Plot:

Q2_Q3_Skip_Labels <- c(
  Q3_1 = "Fruits/Vegetables",
  Q3_2 = "Row Crops",
  Q3_3 = "Green Industry",
  Q3_4 = "Livestock",
  Q3_5 = "Forestry",
  Q3_6 = "Aquatic",
  Q3_7 = "Utility/ROW",
  Q3_8 = "Pest Management",
  Q3_9 = "Other Industry",

  Q4_1 = "Plant Ag",
  Q4_2 = "Animal Ag",
  Q4_3 = "Forestry",
  Q4_4 = "Ornamental/Turf",
  Q4_5 = "Seed Treatment",
  Q4_6 = "Aquatic",
  Q4_7 = "ROW",
  Q4_8 = "Aerial Equipment",
  Q4_9 = "Pest Control",
  Q4_10 = "Ag Commodity Fumigation",
  Q4_11 = "Mosquito Control"
)
Q2_long<- Q2 %>%
  mutate(ID = 1:n()) %>%
  pivot_longer(
    cols = starts_with("Q3_"),
    names_to = ("Q2_option"),
    values_to = "selected"
  ) %>%
  filter(selected == 1)

Q3_long<- Q3 %>%
  mutate(ID = 1:n()) %>%
  pivot_longer(
    cols = starts_with("Q4_"),
    names_to = ("Q3_option"),
    values_to = "selected"
  ) %>%
  filter(selected == 1)
long_format_merge_2_3<- merge(Q2_long[, c("ID", "Q2_option")],
                              Q3_long[, c("ID", "Q3_option")],
                              by = "ID")
long_format_merge_2_3$Q2_option<- Q2_Q3_Skip_Labels[long_format_merge_2_3$Q2_option]
long_format_merge_2_3$Q3_option<- Q2_Q3_Skip_Labels[long_format_merge_2_3$Q3_option]
Q2_colors<- c(
  "Fruits/Vegetables" = "#8DD3C7",
  "Row Crops" = "gold",
  "Green Industry" ="#BEBADA",
  "Livestock" ="#FB8072",
  "Forestry" ="#80B1D3",
  "Aquatic" ="#FBD462",
  "Utility/ROW" ="#B3DE69",
  "Pest Management" ="#FCCDE5",
  "Other Industry" ="#D9D9D9"
)

alluvial <- ggplot(long_format_merge_2_3, aes(axis1 = Q2_option, axis2 = Q3_option)) + 
  geom_alluvium(aes(fill = Q2_option), alpha = 0.9, width = 1/15) +
  geom_stratum(width = 1/8, fill= "grey95", color="black") +
  geom_label(stat = "stratum", aes(label = after_stat(stratum)), size = 3) +
  scale_fill_manual(values = Q2_colors,
                    name = "Industry Options") +
  scale_x_discrete(limits = c("Survey Q2", "Survey Q3")) + 
  theme_void(base_size = (12)) 

alluvial

ggsave("~/Desktop/UGA/Fall 2025/STAT 5010W/Project/Charts/alluvial.png", alluvial)
## Saving 7 x 5 in image

#Question 4

Question4= Pesticide_Clean %>%
  select(County_Code = Q24_1)

Question4= Question4 %>%
  mutate(
    County = dplyr::recode(as.character(County_Code),
      "1"="Appling","2"="Atkinson","3"="Bacon","4"="Baker","5"="Baldwin",
      "6"="Banks","7"="Barrow","8"="Bartow","9"="Ben Hill","10"="Berrien",
      "11"="Bibb","12"="Bleckley","13"="Brantley","14"="Brooks","15"="Bryan",
      "16"="Bulloch","17"="Burke","18"="Butts","19"="Calhoun","20"="Camden",
      "21"="Candler","22"="Carroll","23"="Catoosa","24"="Charlton","25"="Chatham",
      "26"="Chattahoochee","27"="Chattooga","28"="Cherokee","29"="Clarke","30"="Clay",
      "31"="Clayton","32"="Clinch","33"="Cobb","34"="Coffee","35"="Colquitt",
      "36"="Columbia","37"="Cook","38"="Coweta","39"="Crawford","40"="Crisp",
      "41"="Dade","42"="Dawson","43"="Decatur","44"="DeKalb","45"="Dodge",
      "46"="Dooly","47"="Dougherty","48"="Douglas","49"="Early","50"="Echols",
      "51"="Effingham","52"="Elbert","53"="Emanuel","54"="Evans","55"="Fannin",
      "56"="Fayette","57"="Floyd","58"="Forsyth","59"="Franklin","60"="Fulton",
      "61"="Gilmer","62"="Glascock","63"="Glynn","64"="Gordon","65"="Grady",
      "66"="Greene","67"="Gwinnett","68"="Habersham","69"="Hall","70"="Hancock",
      "71"="Haralson","72"="Harris","73"="Hart","74"="Heard","75"="Henry",
      "76"="Houston","77"="Irwin","78"="Jackson","79"="Jasper","80"="Jeff Davis",
      "81"="Jefferson","82"="Jenkins","83"="Johnson","84"="Jones","85"="Lamar",
      "86"="Lanier","87"="Laurens","88"="Lee","89"="Liberty","90"="Lincoln",
      "91"="Long","92"="Lowndes","93"="Lumpkin","94"="McDuffie","95"="McIntosh",
      "96"="Macon","97"="Madison","98"="Marion","99"="Meriwether","100"="Miller",
      "101"="Mitchell","102"="Monroe","103"="Montgomery","104"="Morgan","105"="Murray",
      "106"="Muscogee","107"="Newton","108"="Oconee","109"="Oglethorpe","110"="Paulding",
      "111"="Peach","112"="Pickens","113"="Pierce","114"="Pike","115"="Polk",
      "116"="Pulaski","117"="Putnam","118"="Quitman","119"="Rabun","120"="Randolph",
      "121"="Richmond","122"="Rockdale","123"="Schley","124"="Screven","125"="Seminole",
      "126"="Spalding","127"="Stephens","128"="Stewart","129"="Sumter","130"="Talbot",
      "131"="Taliaferro","132"="Tattnall","133"="Taylor","134"="Telfair","135"="Terrell",
      "136"="Thomas","137"="Tift","138"="Toombs","139"="Towns","140"="Treutlen",
      "141"="Troup","142"="Turner","143"="Twiggs","144"="Union","145"="Upson",
      "146"="Walker","147"="Walton","148"="Ware","149"="Warren","150"="Washington",
      "151"="Wayne","152"="Webster","153"="Wheeler","154"="White","155"="Whitfield",
      "156"="Wilcox","157"="Wilkes","158"="Wilkinson","159"="Worth"))

#Checking Missingness

data.frame(
  Total = nrow(Question4),
  Missing = sum(is.na(Question4$County)),
  Percent_Missing = mean(is.na(Question4$County))*100
)

Heamap

County_Counts= Question4%>%
  count(County, name = "responses")

options(tigris_use_cache = TRUE)

ga_map= counties(state = "GA", class = "sf")
## Retrieving data for the year 2024
ga_map$County= gsub(" County", "", ga_map$NAME)

map_data= ga_map %>%
  left_join(County_Counts, by = "County")

map_data$responses[is.na(map_data$responses)] <- 0

ggplot(map_data) +
  geom_sf(aes(fill = responses), color = "white", size = 0.15) +
  scale_fill_viridis(option = "plasma", direction = -1) +
  labs(
    title = "Survey Respondents by Georgia County",
    subtitle = "Question 4: County Location",
    fill = "Responses") +
  theme_void() +
  theme(
    plot.title = element_text(face = "bold", size = 14),
    legend.position = "right")

#Question 5

Pesticide_Clean$Q6<- factor(
  Pesticide_Clean$Q6, 
  levels = c(1, 2),
  labels = c("Yes", "No")
)

table(Pesticide_Clean$Q6, useNA = "ifany")
## 
##  Yes   No <NA> 
##  107  456   70
#Need more info to continue

#Question 6

Q6_data <- as.data.frame(Pesticide_Clean[,186])

colnames(Q6_data) <- c("Employees")

Q6_data$Employees <- as.character(Q6_data$Employees)

Q6_data <- Q6_data %>%
  mutate(
    Employees = dplyr::recode(as.character(Employees),
      "1" = "Sole operator",
      "2" = "Between 1 and 10 employees",
      "3" = "Between 11 and 24 employees",
      "4" = "Between 25 and 50 employees",
      "5" = "Between 51 and 75 employees",
      "6" = "Between 76 and 100 employees",
      "7" = "More than 100 employees"
    ),
    Employees = factor(Employees, levels = c(
      "Sole operator",
      "Between 1 and 10 employees",
      "Between 11 and 24 employees",
      "Between 25 and 50 employees",
      "Between 51 and 75 employees",
      "Between 76 and 100 employees",
      "More than 100 employees"
    ))
  ) %>%
  filter(!is.na(Employees))
Q6_table <- Q6_data %>%
  count(Employees, name = "Count") %>%
  arrange(Employees) %>%   # optional but nice
  mutate(
    Percent_num = 100 * Count / sum(Count),
    Percent = paste0(round(Percent_num, 1), "%")
  ) %>%
  select(-Percent_num)

Q6_table %>%
  kable(
    caption = "Survey Question 6: Number of Employees",
    col.names = c("Number of Employees", "Count", "Percent"),
    align = c("l", "c", "c"),
    booktabs = TRUE
  ) %>%
  kable_styling(
    full_width = FALSE,
    bootstrap_options = c("striped", "hover", "condensed")
  )
Survey Question 6: Number of Employees
Number of Employees Count Percent
Sole operator 332 59%
Between 1 and 10 employees 170 30.2%
Between 11 and 24 employees 30 5.3%
Between 25 and 50 employees 11 2%
Between 51 and 75 employees 5 0.9%
Between 76 and 100 employees 2 0.4%
More than 100 employees 13 2.3%

#Question 7

Q7_data <- as.data.frame(Pesticide_Clean[,187])
colnames(Q7_data) <- c("Years")

Q7_data <- Q7_data %>%
  mutate(
    Years = dplyr::recode(as.character(Years),
      "1" = "Less than 1 year",
      "2" = "Between 1 to 5 years",
      "3" = "Between 6 to 10 years",
      "4" = "Between 11 to 15 years",
      "5" = "Between 16 to 20 years",
      "6" = "More than 20 years"
    ),
    Years = factor(Years, levels = c(
      "Less than 1 year",
      "Between 1 to 5 years",
      "Between 6 to 10 years",
      "Between 11 to 15 years",
      "Between 16 to 20 years",
      "More than 20 years"
    ))
  ) %>%
  filter(!is.na(Years))
Q7_table <- Q7_data %>%
  count(Years, name = "Count") %>%
  arrange(Years) %>%   # optional but explicit
  mutate(
    Percent_num = 100 * Count / sum(Count),
    Percent = paste0(round(Percent_num, 1), "%")
  ) %>%
  select(-Percent_num)

Q7_table %>%
  kable(
    caption = "Survey Question 7: How long have you held a license?",
    col.names = c("Years Licensed", "Count", "Percent"),
    align = c("l", "c", "c"),
    booktabs = TRUE
  ) %>%
  kable_styling(
    full_width = FALSE,
    bootstrap_options = c("striped", "hover", "condensed")
  )
Survey Question 7: How long have you held a license?
Years Licensed Count Percent
Less than 1 year 14 2.6%
Between 1 to 5 years 203 37.7%
Between 6 to 10 years 112 20.8%
Between 11 to 15 years 54 10%
Between 16 to 20 years 50 9.3%
More than 20 years 106 19.7%

#Question 8

Pesticide_Clean$Q10 <- as.numeric(as.character(Pesticide_Clean$Q10))

Pesticide_Clean$Q10<- factor(Pesticide_Clean$Q10, 
                            levels = c(1, 2),
                            labels = c(
                              "Initial Certification",
                              "Recertification"
                            )
                           )

Pesticide_Clean$Q10<- fct_explicit_na(
  Pesticide_Clean$Q10,
  na_level = "Missing"
)
## Warning: `fct_explicit_na()` was deprecated in forcats 1.0.0.
## ℹ Please use `fct_na_value_to_level()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
counts_Q8<- table(Pesticide_Clean$Q10, useNA = "ifany")
counts_Q8
## 
## Initial Certification       Recertification               Missing 
##                   222                   317                    94
percent_Q8<- round(prop.table(counts_Q8)*100, 1)
percent_Q8
## 
## Initial Certification       Recertification               Missing 
##                  35.1                  50.1                  14.8
table_Q8<- data.frame(
  Status = names(counts_Q8), 
  Count = as.numeric(counts_Q8),
  Percent = percent_Q8
)

table_Q8$Percent.Var1<- NULL

table_Q8

##Certification Graph

#Fixing Data for Q9 and Q13

Q13<- Pesticide_Clean[, c("Q12_1", "Q12_2", "Q12_3", "Q12_4", "Q12_5", "Q12_6", "Q12_7")]

colnames(Q13)<- c(
  "No training offered when I needed them",
  "Training sessions were not held at a convenient time",
  "The training costs too much money",
  "A different trainer was closer",
  "I attended a full day training workshop to receive multiple licenses",
  "The instructor at my class was from UGA but a different company sponsored it",
  "Other"
)
Q13_freq<- colSums(Q13 == 1, na.rm = TRUE)
Q13_freq
##                                       No training offered when I needed them 
##                                                                           16 
##                         Training sessions were not held at a convenient time 
##                                                                           13 
##                                            The training costs too much money 
##                                                                            1 
##                                               A different trainer was closer 
##                                                                            7 
##         I attended a full day training workshop to receive multiple licenses 
##                                                                            3 
## The instructor at my class was from UGA but a different company sponsored it 
##                                                                            5 
##                                                                        Other 
##                                                                           29
Pesticide_Clean$Q12_7_TEXT<- str_trim(tolower(Pesticide_Clean$Q12_7_TEXT))

Pesticide_Clean$Q12_7_TEXT<- gsub("extion", "extension", Pesticide_Clean$Q12_7_TEXT)

Pesticide_Clean$Q12_7_TEXT<- gsub("4", "extension", Pesticide_Clean$Q12_7_TEXT)
fix_skip_logic_Q9_Q13<- Pesticide_Clean$Q11_2 == 1 & Pesticide_Clean$Q12_7 == 1 & grepl("extension", Pesticide_Clean$Q12_7_TEXT, ignore.case = TRUE)

sum(fix_skip_logic_Q9_Q13)
## [1] 7
Pesticide_Clean$Q11_1[fix_skip_logic_Q9_Q13]<- 1
Pesticide_Clean$Q11_2[fix_skip_logic_Q9_Q13]<- NA
Q13_columns<- c("Q12_1", "Q12_2", "Q12_3", "Q12_4", "Q12_5", "Q12_6", "Q12_7")

Pesticide_Clean[fix_skip_logic_Q9_Q13, Q13_columns]<- NA
Pesticide_Clean$Q12_7_TEXT[fix_skip_logic_Q9_Q13]<- NA

#Question 9

counts_Q9<- colSums(Pesticide_Clean[, c("Q11_1", "Q11_2")] == 1, na.rm =TRUE)
names(counts_Q9)<- c("UGA", "Other")
eligibible_Q9<- sum(Pesticide_Clean$Q10 == "Recertification", na.rm = TRUE)
eligibible_Q9
## [1] 317
percent_Q9<- round(counts_Q9 / eligibible_Q9 * 100, 1)
percent_Q9
##   UGA Other 
##  85.8  17.0
table_Q9<- data.frame(
  Training = names(counts_Q9),
  Selected = as.numeric(counts_Q9),
  Percent = percent_Q9
)

table_Q9

#Question 10

unique(Pesticide_Clean$Q32...205)
##  [1] "0"                                         
##  [2] "can't remember"                            
##  [3] "$20 "                                      
##  [4] NA                                          
##  [5] "$10 "                                      
##  [6] "$25 "                                      
##  [7] "$25.00 "                                   
##  [8] "15"                                        
##  [9] "25"                                        
## [10] "$5 "                                       
## [11] "$30 "                                      
## [12] "10"                                        
## [13] "$15 "                                      
## [14] "Nothing"                                   
## [15] "35"                                        
## [16] "20"                                        
## [17] "I0"                                        
## [18] "i am not sure . may be 25.00"              
## [19] "don't remember"                            
## [20] "I don’t remember"                          
## [21] "It was online---do not remember exact cost"
## [22] "50-100"                                    
## [23] "not sure"                                  
## [24] "$35 "                                      
## [25] "50"                                        
## [26] "$20.00 Application Fee"                    
## [27] "100"                                       
## [28] "Not sure"                                  
## [29] "5"                                         
## [30] "don't remember - maybe $50"                
## [31] "O"                                         
## [32] "7.50 per hour"                             
## [33] "40"                                        
## [34] "Don't remember"                            
## [35] "$5 per credit"                             
## [36] "0\n"                                       
## [37] "Don’t remember"                            
## [38] "$12.50 "                                   
## [39] "30?"                                       
## [40] "I don't remember"                          
## [41] "?"                                         
## [42] "110"                                       
## [43] "$75 "                                      
## [44] "$35.00 "                                   
## [45] "N/a"                                       
## [46] "$50.00 "                                   
## [47] "Unsure"                                    
## [48] "8"
Q10_unfiltered<- str_trim(tolower(Pesticide_Clean$Q32...205))

Q10_unfiltered<- gsub("\\$", "", Q10_unfiltered)

Q10_unfiltered<- gsub("\\?", "", Q10_unfiltered)

Q10_unfiltered<- gsub("application fee", "", Q10_unfiltered)

Q10_unfiltered<- gsub("per hour", "", Q10_unfiltered)

Q10_unfiltered<- gsub("i", "", Q10_unfiltered)

Q10_unfiltered[Q10_unfiltered == "nothng"]<- 0
Q10_clean<- Q10_unfiltered

Q10_miss_NA<- rep("Valid", length(Q10_clean))

excluded_values<- c(
  "can't remember", " am not sure . may be 25.00", "don't remember", "don't remember", "t was onlne---do not remember exact cost", "not sure", "don't remember - maybe 50", "5 per credt", " don't remember", "?", "unsure", "o", "", "n/a", "don’t remember", " don’t remember", "50-100"
)

Q10_miss_NA[Q10_clean %in% excluded_values]<- "Excluded"
Q10_clean[Q10_clean %in% excluded_values]<- NA

Q10_miss_NA[is.na(Pesticide_Clean$Q32...205)]<- "Missing"
unique(Q10_clean)
##  [1] "0"      NA       "20"     "10"     "25"     "25.00"  "15"     "5"     
##  [9] "30"     "35"     "50"     "20.00 " "100"    "7.50 "  "40"     "12.50" 
## [17] "110"    "75"     "35.00"  "50.00"  "8"
Q10_numeric<- as.numeric(gsub("[^0-9\\.]", "", Q10_clean))
Q10_ranges<- cut(
  Q10_numeric,
  breaks = c(-1, 0, 10, 30, 50, Inf),
  labels = c("$0", "$1-10", "$11-30", "$31-50", "$51+"),
  right = TRUE
)
counts_Q10<- table(Q10_ranges, useNA = "ifany")
counts_Q10
## Q10_ranges
##     $0  $1-10 $11-30 $31-50   $51+   <NA> 
##    459     15     27     10      3    119
percent_Q10<- round(prop.table(counts_Q10)*100, 1)
percent_Q10
## Q10_ranges
##     $0  $1-10 $11-30 $31-50   $51+   <NA> 
##   72.5    2.4    4.3    1.6    0.5   18.8
table_Q10<- data.frame(
  Status = names(counts_Q10), 
  Count = as.numeric(counts_Q10),
  Percent = percent_Q10
)

table_Q10$Percent.Q10_ranges<- NULL

table_Q10
kable(
  table_Q10 %>% filter(!is.na(table_Q10$Status)),
  col.names = c("Cost Range ($)", "Count", "Percent(%)")
)
Cost Range ($) Count Percent(%)
$0 459 72.5
$1-10 15 2.4
$11-30 27 4.3
$31-50 10 1.6
$51+ 3 0.5

#Bar Chart including the NA

#In this question NA is missing data and the values that were imput that were not appropriate for the question.
Q10_data_fram<- data.frame(Q10_ranges = fct_explicit_na(Q10_ranges, na_level = "Missing"), Status = Q10_miss_NA)

Q10_data_fram$Q10_ranges[Q10_data_fram$Status == "Excluded"]<- "Excluded"
## Warning in `[<-.factor`(`*tmp*`, Q10_data_fram$Status == "Excluded", value =
## structure(c(1L, : invalid factor level, NA generated
Q10_data_fram$Q10_ranges<- factor(Q10_data_fram$Q10_ranges)

ggplot(Q10_data_fram, aes(x = Q10_ranges)) +
  geom_bar(fill = "coral") +
  geom_text(stat = "count", aes(label = ..count..)) +
  labs(
    title = "Cost of Virtual Training per Hour",
    x = "Cost Range ($ per hour)",
    y = "Number of responses"
  ) +
  theme_minimal()
## Warning: The dot-dot notation (`..count..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(count)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

#Question 11

unique(Pesticide_Clean$Q31)
##  [1] "25"             "0"              "can't remember" "45"            
##  [5] "50"             NA               "30"             "$75 "          
##  [9] "150"            "$30 in person"  "$25.00 "        "60"            
## [13] "Online classes" "$30 "           "10"             "40"            
## [17] "75"             "?"              "20"             "0\n"           
## [21] "don't remember" "I don’t recall" "Less than $50"  "100 (I think)" 
## [25] "100"            "200"            "Not sure"       "dont remember" 
## [29] "Don't remember" "O"              "Don’t remember" "2"             
## [33] "15"             "29"             "N/a"
Q11_unfiltered<- str_trim(tolower(Pesticide_Clean$Q31))

Q11_unfiltered<- gsub("\\$", "", Q11_unfiltered)

Q11_unfiltered<- gsub("in person", "", Q11_unfiltered)

Q11_unfiltered<- gsub("i think", "", Q11_unfiltered)

Q11_unfiltered<- gsub("\\()", "", Q11_unfiltered)
Q11_clean<- Q11_unfiltered

Q11_miss_NA<- rep("Valid", length(Q11_clean))

excluded_values<- c( "?", "i don’t recall", "don't remember", "n/a", "o", "can't remember", "NA", "don't remember", "not sure","don't remember", "dont remember", "online classes",  "don’t remember", "less than 50"
)

Q11_miss_NA[Q11_clean %in% excluded_values]<- "Excluded"
Q11_clean[Q11_clean %in% excluded_values]<- NA

Q11_miss_NA[is.na(Pesticide_Clean$Q31)]<- "Missing"
unique(Q11_clean)
##  [1] "25"    "0"     NA      "45"    "50"    "30"    "75"    "150"   "30 "  
## [10] "25.00" "60"    "10"    "40"    "20"    "100 "  "100"   "200"   "2"    
## [19] "15"    "29"
Q11_numeric<- as.numeric(gsub("[^0-9\\.]", "", Q11_clean))
Q11_ranges<- cut(
  Q11_numeric,
  breaks = c(-1, 0, 10, 30, 50, Inf),
  labels = c("0", "1-10", "11-30", "31-50", "51+"),
  right = TRUE
)
counts_Q11<- table(Q11_ranges, useNA = "ifany")
counts_Q11
## Q11_ranges
##     0  1-10 11-30 31-50   51+  <NA> 
##   489     3    17     6     7   111
percent_Q11<- round(prop.table(counts_Q11)*100, 1)
percent_Q11
## Q11_ranges
##     0  1-10 11-30 31-50   51+  <NA> 
##  77.3   0.5   2.7   0.9   1.1  17.5
table_Q11<- data.frame(
  Status = names(counts_Q11), 
  Count = as.numeric(counts_Q11),
  Percent = percent_Q11
)

table_Q11$Percent.Q11_ranges<- NULL

table_Q11
kable(
  table_Q11 %>% filter(!is.na(table_Q11$Status)),
  col.names = c("Cost Range ($)", "Count", "Percent(%)")
)
Cost Range ($) Count Percent(%)
0 489 77.3
1-10 3 0.5
11-30 17 2.7
31-50 6 0.9
51+ 7 1.1
#In this question NA is missing data and the values that were imput that were not appropriate for the question.
Q11_data_fram<- data.frame(Q11_ranges = fct_explicit_na(Q11_ranges, na_level = "Missing"), Status = Q11_miss_NA)

Q11_data_fram$Q11_ranges[Q11_data_fram$Status == "Excluded"]<- "Excluded"
## Warning in `[<-.factor`(`*tmp*`, Q11_data_fram$Status == "Excluded", value =
## structure(c(3L, : invalid factor level, NA generated
Q11_data_fram$Q11_ranges<- factor(Q11_data_fram$Q11_ranges)

ggplot(Q11_data_fram, aes(x = Q11_ranges)) +
  geom_bar(fill = "steelblue") +
  geom_text(stat = "count", aes(label = ..count..)) +
  labs(
    title = "Cost of Virtual Training per Half Day",
    x = "Cost Range ($ per hour)",
    y = "Number of responses"
  ) +
  theme_minimal()

#Question 12

unique(Pesticide_Clean$Q32...207)
##  [1] "50"                                    
##  [2] "0"                                     
##  [3] "can't remember"                        
##  [4] "90"                                    
##  [5] "75"                                    
##  [6] NA                                      
##  [7] "70"                                    
##  [8] "125"                                   
##  [9] "$125 "                                 
## [10] "200"                                   
## [11] "50?"                                   
## [12] "$40-50 in person"                      
## [13] "35"                                    
## [14] "$50.00 "                               
## [15] "120"                                   
## [16] "Online classes"                        
## [17] "20"                                    
## [18] "25"                                    
## [19] "40"                                    
## [20] "85"                                    
## [21] "60"                                    
## [22] "0\n"                                   
## [23] "don't remember"                        
## [24] "I don’t recalll.    Approximately 250$"
## [25] "o"                                     
## [26] "Less than $50"                         
## [27] "50-100"                                
## [28] "100"                                   
## [29] "300"                                   
## [30] "Not sure"                              
## [31] "dont remember"                         
## [32] "10"                                    
## [33] "Don't remember"                        
## [34] "Don’t remember"                        
## [35] "O"                                     
## [36] "15"                                    
## [37] "N/a"                                   
## [38] "Maybe $50 don't remember"              
## [39] "\n0"
Q12_unfiltered<- str_trim(tolower(Pesticide_Clean$Q32...207))

Q12_unfiltered<- gsub("\\$", "", Q12_unfiltered)

Q12_unfiltered<- gsub("\\?", "", Q12_unfiltered)

Q12_unfiltered<- gsub("in person", "", Q12_unfiltered)
Q12_clean<- Q12_unfiltered

Q12_miss_NA<- rep("Valid", length(Q12_clean))

excluded_values<- c("can't remember", "online classes", "don't remember", "less than 50", "o", "not sure", "dont remember", "don’t remember", "maybe 50 don't remember"
)

Q12_miss_NA[Q12_clean %in% excluded_values]<- "Excluded"
Q12_clean[Q12_clean %in% excluded_values]<- NA

Q12_miss_NA[is.na(Pesticide_Clean$Q32...207)]<- "Missing"
Q12_clean[Q12_clean %in% c("40-50 ")]<- 45

Q12_clean[Q12_clean %in% c("50-100")]<- 75

Q12_clean[Q12_clean %in% c("i don’t recalll.    approximately 250")]<- 250
unique(Q12_clean)
##  [1] "50"    "0"     NA      "90"    "75"    "70"    "125"   "200"   "45"   
## [10] "35"    "50.00" "120"   "20"    "25"    "40"    "85"    "60"    "250"  
## [19] "100"   "300"   "10"    "15"    "n/a"
Q12_numeric<- as.numeric(gsub("[^0-9\\.]", "", Q12_clean))
Q12_ranges<- cut(
  Q12_numeric,
  breaks = c(-1, 0, 10, 30, 50, Inf),
  labels = c("0", "1-10", "11-30", "31-50", "51+"),
  right = TRUE
)
counts_Q12<- table(Q12_ranges, useNA = "ifany")
counts_Q12
## Q12_ranges
##     0  1-10 11-30 31-50   51+  <NA> 
##   489     1     8    11    15   109
percent_Q12<- round(prop.table(counts_Q12)*100, 1)
percent_Q12
## Q12_ranges
##     0  1-10 11-30 31-50   51+  <NA> 
##  77.3   0.2   1.3   1.7   2.4  17.2
table_Q12<- data.frame(
  Status = names(counts_Q12), 
  Count = as.numeric(counts_Q12),
  Percent = percent_Q12
)

table_Q12$Percent.Q12_ranges<- NULL

table_Q12
kable(
  table_Q12 %>% filter (!is.na(table_Q12$Status)),
  col.names = c("Cost Range ($)", "Count", "Percent(%)")
)
Cost Range ($) Count Percent(%)
0 489 77.3
1-10 1 0.2
11-30 8 1.3
31-50 11 1.7
51+ 15 2.4
#In this question NA is missing data and the values that were imput that were not appropriate for the question.
Q12_data_fram<- data.frame(Q12_ranges = fct_explicit_na(Q12_ranges, na_level = "Missing"), Status = Q12_miss_NA)

Q12_data_fram$Q12_ranges[Q12_data_fram$Status == "Excluded"]<- "Excluded"
## Warning in `[<-.factor`(`*tmp*`, Q12_data_fram$Status == "Excluded", value =
## structure(c(4L, : invalid factor level, NA generated
Q12_data_fram$Q12_ranges<- factor(Q12_data_fram$Q12_ranges)

ggplot(Q12_data_fram, aes(x = Q12_ranges)) +
  geom_bar(fill = "violet") +
  geom_text(stat = "count", aes(label = ..count..)) +
  labs(
    title = "Cost of A Full Day of Non UGA Training",
    x = "Cost Range ($ per hour)",
    y = "Number of responses"
  ) +
  theme_minimal()

#Code to help create table

eligible_Q12 <- Pesticide_Clean$Q10 == "Recertification"
eligible_Q12[is.na(eligible_Q12)] <- FALSE

has_valid_Q12 <- !is.na(Q12_numeric)

Q12_missing_status <- dplyr::case_when(
  !eligible_Q12 ~ "Not Eligible (Skip-Logic)",
  eligible_Q12 & has_valid_Q12 ~ "Eligible, Answered",
  eligible_Q12 & !has_valid_Q12 ~ "Eligible, No Response"
)

table(Q12_missing_status)
## Q12_missing_status
##        Eligible, Answered     Eligible, No Response Not Eligible (Skip-Logic) 
##                       308                         9                       316

#Table for Q8 and Q12

N_total <- nrow(Pesticide_Clean)

# -------------------------
# Q8 (Certification Status)
# -------------------------
q8_missing  <- sum(Pesticide_Clean$Q10 == "Missing" | is.na(Pesticide_Clean$Q10), na.rm = TRUE)
q8_answered <- N_total - q8_missing
q8_resp_rate <- round(100 * q8_answered / N_total, 1)

# % nonresponse for Q8 (out of total)
q8_nonresp_pct <- round(100 * q8_missing / N_total, 1)

# -------------------------
# Q12 (Training Cost) — shown only if Recertification
# -------------------------
eligible_q12 <- Pesticide_Clean$Q10 == "Recertification"
eligible_q12[is.na(eligible_q12)] <- FALSE

N_eligible_q12 <- sum(eligible_q12)
N_not_eligible_q12 <- N_total - N_eligible_q12

answered_q12 <- eligible_q12 & !is.na(Q12_numeric)
N_answered_q12 <- sum(answered_q12, na.rm = TRUE)

N_nonresponse_q12 <- N_eligible_q12 - N_answered_q12

q12_resp_rate_eligible <- ifelse(
  N_eligible_q12 > 0,
  round(100 * N_answered_q12 / N_eligible_q12, 1),
  NA_real_
)

# % nonresponse for Q12 (out of eligible)
q12_nonresp_pct <- ifelse(
  N_eligible_q12 > 0,
  round(100 * N_nonresponse_q12 / N_eligible_q12, 1),
  NA_real_
)

# % skip-logic for Q12 (out of total)
q12_skip_pct <- round(100 * N_not_eligible_q12 / N_total, 1)

# -------------------------
# Build table (your preferred column order + 2 new % columns)
# -------------------------
missingness_tbl <- tibble(
  Question = c(
    "Q8 (Certification Status)",
    "Q12 (Training Cost; shown only if Recertification)"
  ),
  `Total N` = c(N_total, N_total),
  Answered = c(q8_answered, N_answered_q12),
  `Missing (Item Nonresponse)` = c(q8_missing, N_nonresponse_q12),
  `Eligible (Saw Question)` = c(NA_integer_, N_eligible_q12),
  `Not Eligible (Skip-Logic)` = c(NA_integer_, N_not_eligible_q12),
  `Response Rate (Eligible)` = c(
    paste0(q8_resp_rate, "%"),
    paste0(q12_resp_rate_eligible, "%")
  ),
  `Nonresponse Missingness %` = c(
    paste0(q8_nonresp_pct, "%"),
    paste0(q12_nonresp_pct, "%")
  ),
  `Skip-Logic Missingness %` = c(
    NA_character_,
    paste0(q12_skip_pct, "%")
  )
)

missingness_tbl %>%
  kable(
    caption = "Item Nonresponse and Skip-Logic Missingness for Questions 8 and 12",
    align = "lcccccccc",
    booktabs = TRUE
  ) %>%
  kable_styling(
    full_width = FALSE,
    bootstrap_options = c("striped", "hover", "condensed")
  )
Item Nonresponse and Skip-Logic Missingness for Questions 8 and 12
Question Total N Answered Missing (Item Nonresponse) Eligible (Saw Question) Not Eligible (Skip-Logic) Response Rate (Eligible) Nonresponse Missingness % Skip-Logic Missingness %
Q8 (Certification Status) 633 539 94 NA NA 85.2% 14.8% NA
Q12 (Training Cost; shown only if Recertification) 633 308 9 317 316 97.2% 2.8% 49.9%

#Question 13

Q13<- Pesticide_Clean[, c("Q12_1", "Q12_2", "Q12_3", "Q12_4", "Q12_5", "Q12_6", "Q12_7")]

colnames(Q13)<- c(
  "No training offered when I needed them",
  "Training sessions were not held at a convenient time",
  "The training costs too much money",
  "A different trainer was closer",
  "I attended a full day training workshop to receive multiple licenses",
  "The instructor at my class was from UGA but a different company sponsored it",
  "Other"
)
Q13_freq<- colSums(Q13 == 1, na.rm = TRUE)
Q13_freq
##                                       No training offered when I needed them 
##                                                                           14 
##                         Training sessions were not held at a convenient time 
##                                                                           11 
##                                            The training costs too much money 
##                                                                            1 
##                                               A different trainer was closer 
##                                                                            7 
##         I attended a full day training workshop to receive multiple licenses 
##                                                                            3 
## The instructor at my class was from UGA but a different company sponsored it 
##                                                                            5 
##                                                                        Other 
##                                                                           22

#Additional Breakdown of Other Reasons for Question 13

unique(Pesticide_Clean$Q12_7_TEXT)
##  [1] NA                                                                                                                                                                                                                                                                                                                                                                                              
##  [2] "reciprocation agreement with neighboring state"                                                                                                                                                                                                                                                                                                                                                
##  [3] "i just studied the manuals and went to take the initial test. i can’t even remember where. i get all my updated training at ggcsa conferences."                                                                                                                                                                                                                                                
##  [4] "i trained myself. i read the book"                                                                                                                                                                                                                                                                                                                                                             
##  [5] "i study the books myself"                                                                                                                                                                                                                                                                                                                                                                      
##  [6] "i learned what i needed through the handbook and classes at abac."                                                                                                                                                                                                                                                                                                                             
##  [7] "i attend the annual training in pecan production meeting"                                                                                                                                                                                                                                                                                                                                      
##  [8] "i was not aware of training."                                                                                                                                                                                                                                                                                                                                                                  
##  [9] "i found a company that provides the training and i could set the time and duration.  also in the previous questions,  i was asked about how much it cost based on criteria that did not represent the \"time spent\" for recertification.  i can tell you that the training cost me roughly $70 for all three courses.  i think you should add another choice to fully explain each situation."
## [10] "as a quality control professional in commercial & military construction i attended several different training classes over my career including the us army coe in ft stewart, robins afb as well as  classes on soil erosion & control measures on jobsites & roads & highways in georgia."                                                                                                    
## [11] "offered free training locally."                                                                                                                                                                                                                                                                                                                                                                
## [12] "usually get credits thru in house training"                                                                                                                                                                                                                                                                                                                                                    
## [13] "i took the course online"                                                                                                                                                                                                                                                                                                                                                                      
## [14] "the topics were not relevant to my operation"                                                                                                                                                                                                                                                                                                                                                  
## [15] "former ohio pco who moved 10 year ago. completed pa in ga online for my small private green setup"                                                                                                                                                                                                                                                                                             
## [16] "got application at county agents office"                                                                                                                                                                                                                                                                                                                                                       
## [17] "just took test at local spot"                                                                                                                                                                                                                                                                                                                                                                  
## [18] "went on line"                                                                                                                                                                                                                                                                                                                                                                                  
## [19] "i have the private applicators license, so i just watched the videos and took a test at the end."                                                                                                                                                                                                                                                                                              
## [20] "didn't know you offered"                                                                                                                                                                                                                                                                                                                                                                       
## [21] "online training."                                                                                                                                                                                                                                                                                                                                                                              
## [22] "on the job training"                                                                                                                                                                                                                                                                                                                                                                           
## [23] "took online"

#Demographics

demo_data <- Pesticide_Clean[,231:235]

colnames(demo_data) = c("Gender", "Race", "Ethnicity", "Age", "Education_Level")

demo_data <- demo_data %>% mutate(across(everything(), as.character))

demo_data <- demo_data %>% 
  mutate(
    Gender = dplyr::recode(as.character(Gender),
                    "1" = "Male",
                    "2" = "Female",
                    "3" = "Prefer not to answer"),
    Race = dplyr::recode(as.character(Race),
                  "1" = "American Indian or Alaska Native",
                  "2" = "Asian",
                  "3" = "Black/African American",
                  "4" = "Native Hawaiian or Other Pacific Islander",
                  "5" = "White",
                  "6" = "More than one race",
                  "7" = "Other"),
    Ethnicity = dplyr::recode(as.character(Ethnicity),
                       "1" = "Hispanic/Latino",
                       "2" = "Not Hispanic/Latino"),
    Age = dplyr::recode(as.character(Age),
                 "9" = "Under 18",
                 "10" = "18–24",
                 "11" = "25–34",
                 "12" = "35–44",
                 "13" = "45–54",
                 "14" = "55–64",
                 "15" = "65–74",
                 "16" = "75–84",
                 "17" = "85+"),
    Education_Level = dplyr::recode(as.character(Education_Level),
                             "1" = "Less than 9th grade",
                             "2" = "9th to 12th grade, no diploma",
                             "3" = "High school graduate or GED",
                             "4" = "Some college, no degree",
                             "5" = "Associate’s degree",
                             "6" = "Bachelor’s degree",
                             "7" = "Graduate or professional degree"),
  )


#Filtering Data
demo_gender <- demo_data[, 1, drop = FALSE] %>%
  mutate(
    Gender = factor(Gender, levels = c(
      "Male",
      "Female",
      "Prefer not to answer"
    ))
  ) %>%
  filter(!is.na(Gender))

demo_race <- demo_data[, 2, drop = FALSE] %>%
  mutate(
    Race = factor(Race, levels = c(
      "American Indian or Alaska Native",
      "Asian",
      "Black/African American",
      "Native Hawaiian or Other Pacific Islander",
      "White",
      "More than one race",
      "Other"
    ))
  ) %>%
  filter(!is.na(Race), Race != "")

demo_ethnicity <- demo_data[, 3, drop = FALSE] %>%
  mutate(
    Ethnicity = factor(Ethnicity, levels = c(
      "Hispanic/Latino",
      "Not Hispanic/Latino"
    ))
  ) %>%
  filter(!is.na(Ethnicity), Ethnicity != "")

demo_age <- demo_data[, 4, drop = FALSE] %>%
  mutate(
    Age = factor(Age, levels = c(
      "Under 18",
      "18–24",
      "25–34",
      "35–44",
      "45–54",
      "55–64",
      "65–74",
      "75–84",
      "85+"
    ))
  ) %>%
  filter(!is.na(Age), Age != "")

demo_edu <- demo_data[, 5, drop = FALSE] %>%
  mutate(
    Education_Level = factor(Education_Level, levels = c(
      "Less than 9th grade",
      "9th to 12th grade, no diploma",
      "High school graduate or GED",
      "Some college, no degree",
      "Associate’s degree",
      "Bachelor’s degree",
      "Graduate or professional degree"
    ))
  ) %>%
  filter(!is.na(Education_Level), Education_Level != "")
gender_summary <- demo_gender %>%
  count(Gender) %>%
  mutate(percent = n / sum(n) * 100)

gender_summary

#Demmographics with NA Values

# Gender
plot_demo_bar(demo_data, Gender, "Gender")
## Warning: Unknown palette: "Pastel4"

# Race
plot_demo_bar(demo_data, Race, "Distribution of Race")
## Warning: Unknown palette: "Pastel4"

# Ethnicity
plot_demo_bar(demo_data, Ethnicity, "Ethnicity")
## Warning: Unknown palette: "Pastel4"

# Age (categories)
plot_demo_bar(demo_data, Age, "Distribution of Age")
## Warning: Unknown palette: "Pastel4"

# Education
plot_demo_bar(demo_data, Education_Level, "Distribution of Education Level")
## Warning: Unknown palette: "Pastel4"

#Taking out NA

# Gender
gender <- plot_demo_bar(demo_gender, Gender, "Gender")
## Warning: Unknown palette: "Pastel4"
gender

ggsave("~/Desktop/UGA/Fall 2025/STAT 5010W/Project/Charts/gender_barchart.png", gender)
## Saving 7 x 5 in image
# Race
race <- plot_demo_bar(demo_race, Race, "Distribution of Race")
## Warning: Unknown palette: "Pastel4"
race

ggsave("~/Desktop/UGA/Fall 2025/STAT 5010W/Project/Charts/race_barchart.png", race)
## Saving 7 x 5 in image
# Ethnicity
ethnicity <- plot_demo_bar(demo_ethnicity, Ethnicity, "Ethnicity")
## Warning: Unknown palette: "Pastel4"
ethnicity

ggsave("~/Desktop/UGA/Fall 2025/STAT 5010W/Project/Charts/ethnicity_barchart.png", ethnicity)
## Saving 7 x 5 in image
# Age (categories)
age <- plot_demo_bar(demo_age, Age, "Distribution of Age")
## Warning: Unknown palette: "Pastel4"
age

ggsave("~/Desktop/UGA/Fall 2025/STAT 5010W/Project/Charts/age_barchart.png", age)
## Saving 7 x 5 in image
# Education
education <- plot_demo_bar(demo_edu, Education_Level, "Distribution of Education Level")
## Warning: Unknown palette: "Pastel4"
education

ggsave("~/Desktop/UGA/Fall 2025/STAT 5010W/Project/Charts/education_barchart.png", education)
## Saving 7 x 5 in image

#Section 2 Midterm Outcomes: Behavior Change

#Creating Data Frame

safety_data <- Pesticide_Clean[,187:236]
safety_data <- safety_data[,16:23]
colnames(safety_data) = c("Gloves", "Long-Sleeved Shirts", "Long Pants",
                          "Closed Toed Shoes", "Wash Hands", "Wash Face",
                          "Take a Shower", "Separating Clothing")

safety_data <- safety_data %>%
  mutate(
    Gloves = dplyr::recode(as.character(Gloves),
                    "1" = "Always",
                    "2" = "Sometimes",
                    "3" = "Never"),
    `Long-Sleeved Shirts` = dplyr::recode(as.character(`Long-Sleeved Shirts`),
                    "1" = "Always",
                    "2" = "Sometimes",
                    "3" = "Never"),
    `Long Pants` = dplyr::recode(as.character(`Long Pants`),
                    "1" = "Always",
                    "2" = "Sometimes",
                    "3" = "Never"),
    `Closed Toed Shoes` = dplyr::recode(as.character(`Closed Toed Shoes`),
                    "1" = "Always",
                    "2" = "Sometimes",
                    "3" = "Never"),
    `Wash Hands` = dplyr::recode(as.character(`Wash Hands`),
                    "1" = "Always",
                    "2" = "Sometimes",
                    "3" = "Never"),
    `Wash Face` = dplyr::recode(as.character(`Wash Face`),
                    "1" = "Always",
                    "2" = "Sometimes",
                    "3" = "Never"),
    `Take a Shower` = dplyr::recode(as.character(`Take a Shower`),
                    "1" = "Always",
                    "2" = "Sometimes",
                    "3" = "Never"),
    `Separating Clothing` = dplyr::recode(as.character(`Separating Clothing`),
                    "1" = "Always",
                    "2" = "Sometimes",
                    "3" = "Never")
  )

#Checking Missingness for Each Column of Safety

safety_summary <- safety_data %>%
  pivot_longer(
    cols = everything(),
    names_to = "Safety Measure",
    values_to = "Response"
  ) %>%
  group_by(`Safety Measure`) %>%
  summarise(
    Total_Responses = sum(!is.na(Response)),
    NA_Count = sum(is.na(Response)),
    Percent_Responded = round(
      100 * Total_Responses / (Total_Responses + NA_Count), 1
    ),
    .groups = "drop"
  )

kable(
  safety_summary,
  col.names = c(
    "Safety Measure",
    "Total Responses",
    "NA Count",
    "Responded (%)"
  ),
  align = c("l", "c", "c", "c"),
  caption = "Response Rates for Safety Practice Questions"
)
Response Rates for Safety Practice Questions
Safety Measure Total Responses NA Count Responded (%)
Closed Toed Shoes 526 107 83.1
Gloves 526 107 83.1
Long Pants 526 107 83.1
Long-Sleeved Shirts 526 107 83.1
Separating Clothing 526 107 83.1
Take a Shower 526 107 83.1
Wash Face 526 107 83.1
Wash Hands 526 107 83.1

#Checking that NA is is not partially missing

all_missing <- safety_data %>%
  mutate(all_na = if_all(everything(), is.na)) %>%
  pull(all_na)

table(all_missing)
## all_missing
## FALSE  TRUE 
##   526   107

#Removing Missigness for Data Visualization

safety_data_complete <- safety_data %>%
  drop_na()

#Start Visualizaton

safety_pct <- safety_data_complete %>%
  pivot_longer(
    cols = everything(),
    names_to = "Safety_Measure",
    values_to = "Response"
  ) %>%
  count(Safety_Measure, Response) %>%
  group_by(Safety_Measure) %>%
  mutate(
    Percent = 100 * n / sum(n)
  ) %>%
  ungroup() %>%
  mutate(
    Response = factor(Response, levels = c("Never", "Sometimes", "Always")),
    Safety_Measure = factor(Safety_Measure, levels = colnames(safety_data))
  )

safety <- ggplot(safety_pct, aes(x = Safety_Measure, y = Percent, fill = Response)) +
  geom_col(position = "dodge") +
  labs(
    x = "Safety Measure",
    y = "Percent of Respondents",
    fill = "Response"
  ) +
  scale_y_continuous(
    limits = c(0, 100),
    labels = scales::percent_format(scale = 1)) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 35, hjust = 1))
safety

ggsave("~/Desktop/UGA/Fall 2025/STAT 5010W/Project/Charts/safety_bar.png", safety)
## Saving 7 x 5 in image

#Creating groups (pre and post application)

safety_grouped <- safety_data_complete %>%
  pivot_longer(
    cols = everything(),
    names_to = "Safety_Measure",
    values_to = "Response"
  ) %>%
  mutate(
    Group = case_when(
      Safety_Measure %in% c(
        "Gloves",
        "Long-Sleeved Shirts",
        "Long Pants",
        "Closed Toed Shoes"
      ) ~ "Personal Protection",
      Safety_Measure %in% c(
        "Separating Clothing",
        "Wash Hands",
        "Wash Face",
        "Take a Shower"
      ) ~ "Post-Application Hygiene"
    )
  )

#Visualization of Groups

safety_long_pct <- safety_data %>%
  pivot_longer(
    cols = everything(),
    names_to = "Safety_Measure",
    values_to = "Response"
  ) %>%
  mutate(
    Group = case_when(
      Safety_Measure %in% c("Gloves", "Long-Sleeved Shirts", "Long Pants",
                            "Closed Toed Shoes") ~ "Personal Protection",
      Safety_Measure %in% c("Separating Clothing", "Wash Hands", "Wash Face", 
                            "Take a Shower") ~ "Post-Application Hygiene",
      TRUE ~ NA_character_
    )
  ) %>%
  filter(!is.na(Response)) %>%
  count(Group, Safety_Measure, Response) %>%
  group_by(Safety_Measure) %>%
  mutate(Percent = 100 * n / sum(n)) %>%
  ungroup() %>%
  mutate(
    Response = factor(Response, levels = c("Never", "Sometimes", "Always")),
    Safety_Measure = factor(Safety_Measure, levels = colnames(safety_data))
  )

# 3 bars per variable (dodged), percent on y-axis
ggplot(safety_long_pct, aes(x = Safety_Measure, y = Percent, fill = Response)) +
  geom_col(position = position_dodge(width = 0.8), width = 0.7) +
  facet_wrap(~ Group, scales = "free_x") +
  labs(x = "Safety Measure", y = "Percent of Respondents", fill = "Response") +
  scale_y_continuous(
    limits = c(0, 100),
    labels = scales::percent_format(scale = 1)) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 35, hjust = 1),
    legend.position = "top",
    panel.grid.major.x = element_blank()
  )

#Redoing grouping

safety_grouped_counts <- safety_data_complete %>%
  pivot_longer(
    cols = everything(),
    names_to = "Safety_Measure",
    values_to = "Response"
  ) %>%
  mutate(
    Response = factor(Response, levels = c("Always", "Sometimes", "Never")),
    Group = case_when(
      Safety_Measure %in% c("Gloves", "Long-Sleeved Shirts", "Long Pants", "Closed Toed Shoes") ~
        "Personal Protection",
      Safety_Measure %in% c("Separating Clothing", "Wash Hands", "Wash Face", "Take a Shower") ~
        "Post-Application Hygiene",
      TRUE ~ NA_character_
    )
  ) %>%
  filter(!is.na(Group), !is.na(Response), Response != "") %>%
  count(Group, Response, name = "Count")

safety_grouped_counts <- safety_grouped_counts %>%
  group_by(Group) %>%
  mutate(
    Percent = 100 * Count / sum(Count)
  ) %>%
  ungroup()

#Plotting

ggplot(safety_grouped_counts, aes(x = Group, y = Percent, fill = Response)) +
  geom_col(position = position_dodge(width = 0.8), color = "black") +
  scale_y_continuous(labels = function(x) paste0(round(x, 1), "%"), expand = c(0, 0)) +
  labs(x = NULL, y = "Percent", fill = "Response") +
  theme_minimal(base_size = 12)

#Question 2 in Part 2

safety_data_q2 <- Pesticide_Clean$Q14

safety_data_q2 <- data.frame(Q2 = safety_data_q2)

safety_data_q2 <- safety_data_q2 %>%
  mutate(
    Q2 = dplyr::recode(as.character(Q2),
                "1" = "Extremely helpful",
                "2" = "Very helpful",
                "3" = "Somewhat helpful",
                "4" = "Slightly helpful",
                "5" = "Not at all helpful"),
    Q2 = factor(Q2, levels = c(
      "Extremely helpful",
      "Very helpful",
      "Somewhat helpful",
      "Slightly helpful",
      "Not at all helpful"
    ), ordered = TRUE)
  )

safetyQ2 <- plot_demo_bar(
  data = safety_data_q2 %>% filter(!is.na(Q2)),
  var = Q2,
  xlab = "Helpfulness of Pesticide Applicator Training"
)
## Warning: Unknown palette: "Pastel4"
safetyQ2

ggsave("~/Desktop/UGA/Fall 2025/STAT 5010W/Project/Charts/safetyQ2.png", safetyQ2)
## Saving 7 x 5 in image

#Analysis of Safety with Demographics

demo_data <- demo_data %>%
  mutate(Respondent_ID = row_number())

safety_data <- safety_data %>%
  mutate(Respondent_ID = row_number())

base_data <- demo_data %>%
  left_join(safety_data, by = "Respondent_ID")

base_data <- base_data %>%
  mutate(
    Q2_raw = Pesticide_Clean$Q14,                        
    Q2_numeric = as.numeric(as.character(Q2_raw)),        
    Q2_helpfulness = 6 - Q2_numeric                       
  )

#Creating Data Frame in Order of Respondent

safety_long <- base_data %>%
  pivot_longer(
    cols = c(
      Gloves, `Long-Sleeved Shirts`, `Long Pants`, `Closed Toed Shoes`,
      `Separating Clothing`, `Wash Hands`, `Wash Face`, `Take a Shower`
    ),
    names_to  = "Safety_Measure",
    values_to = "Response"
  ) %>%
  mutate(
    Group = case_when(
      Safety_Measure %in% c(
        "Gloves", "Long-Sleeved Shirts", "Long Pants", "Closed Toed Shoes"
      ) ~ "Personal Protection",
      Safety_Measure %in% c(
        "Separating Clothing", "Wash Hands", "Wash Face", "Take a Shower"
      ) ~ "Post-Application Hygiene"
    )
  )

#Scoring

safety_scored <- safety_long %>%
  mutate(
    Response_score = case_when(
      Response == "Always"    ~ 2,
      Response == "Sometimes" ~ 1,
      Response == "Never"     ~ 0,
      TRUE ~ NA_real_
    )
  )

#Creating Safety Index

safety_index <- safety_scored %>%
  group_by(Respondent_ID) %>%
  summarise(
    Avg_Compliance = mean(Response_score, na.rm = TRUE),
    n_items = sum(!is.na(Response_score)),
    .groups = "drop"
  )

n_distinct(safety_index$Respondent_ID)
## [1] 633
safety_index <- safety_scored %>%
  group_by(Respondent_ID) %>%
  summarise(
    Avg_Compliance = mean(Response_score, na.rm = TRUE),
    n_items = sum(!is.na(Response_score)),
    .groups = "drop"
  ) %>%
  mutate(
    Compliance_prop = Avg_Compliance / 2
  )

#Starting Analysis

analysis_data <- safety_index %>%
  filter(n_items > 0) %>%
  left_join(base_data, by = "Respondent_ID")

nrow(analysis_data)
## [1] 526

#Checking dropped people

n_distinct(analysis_data$Respondent_ID) 
## [1] 526
safety_index %>%
  summarise(
    total = n_distinct(Respondent_ID),
    excluded = sum(n_items == 0)
  ) #excluded in 214 

#Full Model with all Predictors

# Rebuild employee / years datasets with IDs

Q6_model <- as.data.frame(Pesticide_Clean[,186])
colnames(Q6_model) <- c("Employees")
Q6_model <- Q6_model %>%
  mutate(
    Respondent_ID = row_number(),
    Employees_num = dplyr::recode(as.character(Employees),
      "1" = "1",
      "2" = "2",
      "3" = "3",
      "4" = "4",
      "5" = "5",
      "6" = "6",
      "7" = "7"
    ),
    Employees_num = as.numeric(Employees_num)
  )

Q7_model <- as.data.frame(Pesticide_Clean[,187])
colnames(Q7_model) <- c("Years")
Q7_model <- Q7_model %>%
  mutate(
    Respondent_ID = row_number(),
    Years_num = dplyr::recode(as.character(Years),
      "1" = "1",
      "2" = "2",
      "3" = "3",
      "4" = "4",
      "5" = "5",
      "6" = "6"
    ),
    Years_num = as.numeric(Years_num)
  )


# Create numeric versions of ordinal demographics

base_model <- base_data %>%
  mutate(
    Age_num = case_when(
      Age == "Under 18" ~ 1,
      Age == "18–24" ~ 2,
      Age == "25–34" ~ 3,
      Age == "35–44" ~ 4,
      Age == "45–54" ~ 5,
      Age == "55–64" ~ 6,
      Age == "65–74" ~ 7,
      Age == "75–84" ~ 8,
      Age == "85+" ~ 9,
      TRUE ~ NA_real_
    ),
    Education_num = case_when(
      Education_Level == "Less than 9th grade" ~ 1,
      Education_Level == "9th to 12th grade, no diploma" ~ 2,
      Education_Level == "High school graduate or GED" ~ 3,
      Education_Level == "Some college, no degree" ~ 4,
      Education_Level == "Associate’s degree" ~ 5,
      Education_Level == "Bachelor’s degree" ~ 6,
      Education_Level == "Graduate or professional degree" ~ 7,
      TRUE ~ NA_real_
    )
  )

# Build final modeling dataset

model_data <- safety_index %>%
  filter(n_items > 0) %>%
  left_join(base_model, by = "Respondent_ID") %>%
  left_join(Q6_model %>% select(Respondent_ID, Employees_num), by = "Respondent_ID") %>%
  left_join(Q7_model %>% select(Respondent_ID, Years_num), by = "Respondent_ID")

# Make categorical predictors factors

model_data <- model_data %>%
  mutate(
    Gender = factor(Gender),
    Race_binary = ifelse(Race == "White", "White", "Non-White"),
    Race_binary = factor(Race_binary),
    Ethnicity = factor(Ethnicity)
  )

Question4 <- Question4 %>%
  mutate(Respondent_ID = row_number()) %>%
  mutate(
    Region = case_when(

      # NORTHEAST
      County %in% c(
        "Athens-Clarke","Baldwin","Banks","Barrow","Butts","Columbia",
        "Dawson","Elbert","Fannin","Franklin","Gilmer","Glascock",
        "Greene","Habersham","Hall","Hancock","Hart","Jackson",
        "Jasper","Jones","Lincoln","Lumpkin","Madison","McDuffie",
        "Monroe","Morgan","Oconee","Oglethorpe","Pickens","Putnam",
        "Rabun","Richmond","Stephens","Taliaferro","Towns","Union",
        "Walton","Warren","White","Wilkes"
      ) ~ "North East",

      # NORTHWEST
      County %in% c(
        "Bartow","Bibb","Carroll","Catoosa","Chattahoochee","Chattooga",
        "Cherokee","Clayton","Cobb","Coweta","Crawford","Dade",
        "DeKalb","Douglas","Fayette","Floyd","Forsyth","Fulton",
        "Gordon","Gwinnett","Haralson","Harris","Heard","Henry",
        "Lamar","Meriwether","Muscogee","Murray","Newton","Paulding",
        "Pike","Polk","Rockdale","Spalding","Talbot","Troup",
        "Upson","Walker","Whitfield"
      ) ~ "North West",

      # SOUTHEAST
      County %in% c(
        "Appling","Atkinson","Bacon","Bleckley","Brantley","Bryan",
        "Bulloch","Burke","Camden","Candler","Charlton","Chatham",
        "Coffee","Dodge","Effingham","Emanuel","Evans","Glynn",
        "Jeff Davis","Jefferson","Jenkins","Johnson","Laurens",
        "Liberty","Long","McIntosh","Montgomery","Pierce","Screven",
        "Tattnall","Telfair","Toombs","Treutlen","Twiggs",
        "Ware","Washington","Wayne","Wheeler","Wilkinson"
      ) ~ "South East",

      # SOUTHWEST
      County %in% c(
        "Baker","Ben Hill","Berrien","Brooks","Calhoun","Clay",
        "Clinch","Colquitt","Cook","Crisp","Decatur","Dooly",
        "Dougherty","Early","Echols","Grady","Houston","Irwin",
        "Lanier","Lee","Lowndes","Macon","Marion","Miller",
        "Mitchell","Peach","Pulaski","Quitman","Randolph",
        "Schley","Seminole","Stewart","Sumter","Taylor",
        "Terrell","Thomas","Tift","Turner","Webster",
        "Wilcox","Worth"
      ) ~ "South West",

      # METRO ATLANTA (subset of NW — we override)
      County %in% c(
        "Fulton","DeKalb","Cobb","Gwinnett","Clayton","Henry"
      ) ~ "Metro Atlanta",

      TRUE ~ NA_character_
    )
  )

model_data <- model_data %>%
  left_join(
    Question4 %>% select(Respondent_ID, Region),
    by = "Respondent_ID"
  ) %>%
  mutate(Region = factor(Region))

model_data <- model_data %>%
  mutate(
    Region = factor(Region,
      levels = c("North East","North West","South East","South West","Metro Atlanta")
    )
  )

model_data <- model_data %>%
  mutate(
    Region = ifelse(Region == "Metro Atlanta", "North West", Region),
    Region = factor(Region)
  )

model_data <- model_data %>%
  mutate(
    Region = case_when(
      Region == 1 ~ "North East",
      Region == 2 ~ "North West",
      Region == 3 ~ "South East",
      Region == 4 ~ "South West",
      TRUE ~ as.character(Region)
    ),
    Region = factor(Region)
  )
#Full Model
full_lm <- lm(
  Compliance_prop ~
    Age_num +
    Education_num +
    Employees_num +
    Years_num +
    Gender +
    Race_binary +
    Ethnicity +
    Q2_helpfulness +
    Region,
  data = model_data
)

summary(full_lm)
## 
## Call:
## lm(formula = Compliance_prop ~ Age_num + Education_num + Employees_num + 
##     Years_num + Gender + Race_binary + Ethnicity + Q2_helpfulness + 
##     Region, data = model_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.79068 -0.08849  0.01341  0.11731  0.33322 
## 
## Coefficients:
##                               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                   0.700977   0.105435   6.648 1.18e-10 ***
## Age_num                       0.012187   0.006275   1.942  0.05296 .  
## Education_num                -0.002698   0.005927  -0.455  0.64922    
## Employees_num                 0.004829   0.007636   0.632  0.52759    
## Years_num                    -0.006774   0.005583  -1.213  0.22584    
## GenderMale                   -0.012776   0.027534  -0.464  0.64293    
## GenderPrefer not to answer   -0.069635   0.160822  -0.433  0.66529    
## Race_binaryWhite             -0.098809   0.031250  -3.162  0.00171 ** 
## EthnicityNot Hispanic/Latino  0.002567   0.070604   0.036  0.97102    
## Q2_helpfulness                0.048101   0.010173   4.728 3.32e-06 ***
## RegionNorth West             -0.006972   0.027040  -0.258  0.79668    
## RegionSouth East             -0.003881   0.023915  -0.162  0.87118    
## RegionSouth West             -0.075107   0.023805  -3.155  0.00175 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1527 on 341 degrees of freedom
##   (172 observations deleted due to missingness)
## Multiple R-squared:  0.1482, Adjusted R-squared:  0.1182 
## F-statistic: 4.945 on 12 and 341 DF,  p-value: 1.477e-07
#Checking higher order terms
model_data <- model_data %>%
  mutate(
    Age_sq = Age_num^2,
    Education_sq = Education_num^2,
    Employees_sq = Employees_num^2,
    Years_sq = Years_num^2
  )

lm_poly <- lm(
  Compliance_prop ~
    Age_num + Age_sq +
    Education_num + Education_sq +
    Employees_num + Employees_sq +
    Years_num + Years_sq +
    Gender + Race_binary + Ethnicity +
    Q2_helpfulness + Region,
  data = model_data
)

summary(lm_poly)
## 
## Call:
## lm(formula = Compliance_prop ~ Age_num + Age_sq + Education_num + 
##     Education_sq + Employees_num + Employees_sq + Years_num + 
##     Years_sq + Gender + Race_binary + Ethnicity + Q2_helpfulness + 
##     Region, data = model_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.79518 -0.08983  0.01159  0.11487  0.33307 
## 
## Coefficients:
##                               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                   1.107088   0.180544   6.132 2.43e-09 ***
## Age_num                      -0.069309   0.037668  -1.840  0.06665 .  
## Age_sq                        0.007459   0.003423   2.179  0.03000 *  
## Education_num                -0.056488   0.046113  -1.225  0.22144    
## Education_sq                  0.005368   0.004577   1.173  0.24163    
## Employees_num                -0.018613   0.024471  -0.761  0.44742    
## Employees_sq                  0.003383   0.003521   0.961  0.33733    
## Years_num                    -0.050757   0.035426  -1.433  0.15285    
## Years_sq                      0.005993   0.004495   1.333  0.18331    
## GenderMale                   -0.004799   0.027468  -0.175  0.86140    
## GenderPrefer not to answer   -0.148742   0.162342  -0.916  0.36021    
## Race_binaryWhite             -0.096903   0.031052  -3.121  0.00196 ** 
## EthnicityNot Hispanic/Latino  0.014598   0.071063   0.205  0.83737    
## Q2_helpfulness                0.046799   0.010186   4.594 6.14e-06 ***
## RegionNorth West             -0.009235   0.026956  -0.343  0.73213    
## RegionSouth East             -0.004395   0.023884  -0.184  0.85410    
## RegionSouth West             -0.073587   0.023789  -3.093  0.00214 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1514 on 337 degrees of freedom
##   (172 observations deleted due to missingness)
## Multiple R-squared:  0.1725, Adjusted R-squared:  0.1333 
## F-statistic: 4.392 on 16 and 337 DF,  p-value: 7.012e-08
#Taking out missingness for step()
model_data_complete <- model_data %>%
  select(
    Avg_Compliance,
    Compliance_prop,
    Age_num, Age_sq,
    Education_num, Education_sq,
    Employees_num, Employees_sq,
    Years_num, Years_sq,
    Gender,
    Race_binary,
    Ethnicity,
    Q2_helpfulness,
    Region
  ) %>%
  drop_na()


#Refitting model wihtout missingness
lm_poly_complete <- lm(
  Compliance_prop ~
    Age_num + Age_sq +
    Education_num + Education_sq +
    Employees_num + Employees_sq +
    Years_num + Years_sq +
    Gender + Race_binary + Ethnicity +
    Q2_helpfulness + Region,
  data = model_data_complete
)

#Running step() model
step_model <- step(lm_poly_complete, direction = "both")
## Start:  AIC=-1320.19
## Compliance_prop ~ Age_num + Age_sq + Education_num + Education_sq + 
##     Employees_num + Employees_sq + Years_num + Years_sq + Gender + 
##     Race_binary + Ethnicity + Q2_helpfulness + Region
## 
##                  Df Sum of Sq    RSS     AIC
## - Gender          2   0.01934 7.7400 -1323.3
## - Ethnicity       1   0.00097 7.7216 -1322.2
## - Employees_num   1   0.01325 7.7339 -1321.6
## - Employees_sq    1   0.02115 7.7418 -1321.2
## - Education_sq    1   0.03152 7.7522 -1320.8
## - Education_num   1   0.03438 7.7551 -1320.6
## - Years_sq        1   0.04073 7.7614 -1320.3
## <none>                        7.7207 -1320.2
## - Years_num       1   0.04703 7.7677 -1320.0
## - Age_num         1   0.07756 7.7982 -1318.7
## - Age_sq          1   0.10880 7.8295 -1317.2
## - Race_binary     1   0.22312 7.9438 -1312.1
## - Region          3   0.35197 8.0726 -1310.4
## - Q2_helpfulness  1   0.48359 8.2043 -1300.7
## 
## Step:  AIC=-1323.3
## Compliance_prop ~ Age_num + Age_sq + Education_num + Education_sq + 
##     Employees_num + Employees_sq + Years_num + Years_sq + Race_binary + 
##     Ethnicity + Q2_helpfulness + Region
## 
##                  Df Sum of Sq    RSS     AIC
## - Ethnicity       1   0.00058 7.7406 -1325.3
## - Employees_num   1   0.01235 7.7524 -1324.7
## - Employees_sq    1   0.02087 7.7609 -1324.3
## - Education_sq    1   0.03024 7.7703 -1323.9
## - Education_num   1   0.03241 7.7724 -1323.8
## - Years_sq        1   0.04078 7.7808 -1323.4
## <none>                        7.7400 -1323.3
## - Years_num       1   0.04738 7.7874 -1323.1
## - Age_num         1   0.06673 7.8067 -1322.3
## - Age_sq          1   0.09758 7.8376 -1320.9
## + Gender          2   0.01934 7.7207 -1320.2
## - Race_binary     1   0.20818 7.9482 -1315.9
## - Region          3   0.35485 8.0949 -1313.4
## - Q2_helpfulness  1   0.48636 8.2264 -1303.7
## 
## Step:  AIC=-1325.28
## Compliance_prop ~ Age_num + Age_sq + Education_num + Education_sq + 
##     Employees_num + Employees_sq + Years_num + Years_sq + Race_binary + 
##     Q2_helpfulness + Region
## 
##                  Df Sum of Sq    RSS     AIC
## - Employees_num   1   0.01214 7.7527 -1326.7
## - Employees_sq    1   0.02040 7.7610 -1326.3
## - Education_sq    1   0.03030 7.7709 -1325.9
## - Education_num   1   0.03248 7.7731 -1325.8
## - Years_sq        1   0.04243 7.7830 -1325.3
## <none>                        7.7406 -1325.3
## - Years_num       1   0.04907 7.7897 -1325.0
## - Age_num         1   0.06617 7.8068 -1324.3
## + Ethnicity       1   0.00058 7.7400 -1323.3
## - Age_sq          1   0.09700 7.8376 -1322.9
## + Gender          2   0.01895 7.7216 -1322.2
## - Race_binary     1   0.20872 7.9493 -1317.9
## - Region          3   0.35467 8.0953 -1315.4
## - Q2_helpfulness  1   0.48695 8.2275 -1305.7
## 
## Step:  AIC=-1326.72
## Compliance_prop ~ Age_num + Age_sq + Education_num + Education_sq + 
##     Employees_sq + Years_num + Years_sq + Race_binary + Q2_helpfulness + 
##     Region
## 
##                  Df Sum of Sq    RSS     AIC
## - Employees_sq    1   0.01536 7.7681 -1328.0
## - Education_sq    1   0.03585 7.7886 -1327.1
## - Education_num   1   0.03793 7.7907 -1327.0
## - Years_sq        1   0.04209 7.7948 -1326.8
## <none>                        7.7527 -1326.7
## - Years_num       1   0.05006 7.8028 -1326.4
## - Age_num         1   0.07071 7.8234 -1325.5
## + Employees_num   1   0.01214 7.7406 -1325.3
## + Ethnicity       1   0.00038 7.7524 -1324.7
## - Age_sq          1   0.10604 7.8588 -1323.9
## + Gender          2   0.01813 7.7346 -1323.5
## - Race_binary     1   0.21641 7.9691 -1319.0
## - Region          3   0.34729 8.1000 -1317.2
## - Q2_helpfulness  1   0.49981 8.2526 -1306.6
## 
## Step:  AIC=-1328.02
## Compliance_prop ~ Age_num + Age_sq + Education_num + Education_sq + 
##     Years_num + Years_sq + Race_binary + Q2_helpfulness + Region
## 
##                  Df Sum of Sq    RSS     AIC
## - Education_sq    1   0.03701 7.8051 -1328.3
## - Education_num   1   0.03847 7.8066 -1328.3
## <none>                        7.7681 -1328.0
## - Years_sq        1   0.04641 7.8145 -1327.9
## - Years_num       1   0.05439 7.8225 -1327.5
## + Employees_sq    1   0.01536 7.7527 -1326.7
## - Age_num         1   0.07833 7.8464 -1326.5
## + Employees_num   1   0.00710 7.7610 -1326.3
## + Ethnicity       1   0.00001 7.7681 -1326.0
## - Age_sq          1   0.11140 7.8795 -1325.0
## + Gender          2   0.02070 7.7474 -1325.0
## - Race_binary     1   0.21614 7.9842 -1320.3
## - Region          3   0.33631 8.1044 -1319.0
## - Q2_helpfulness  1   0.50004 8.2681 -1307.9
## 
## Step:  AIC=-1328.34
## Compliance_prop ~ Age_num + Age_sq + Education_num + Years_num + 
##     Years_sq + Race_binary + Q2_helpfulness + Region
## 
##                  Df Sum of Sq    RSS     AIC
## - Education_num   1   0.00176 7.8069 -1330.3
## <none>                        7.8051 -1328.3
## - Years_sq        1   0.04470 7.8498 -1328.3
## + Education_sq    1   0.03701 7.7681 -1328.0
## - Years_num       1   0.05357 7.8587 -1327.9
## + Employees_sq    1   0.01652 7.7886 -1327.1
## - Age_num         1   0.07795 7.8830 -1326.8
## + Employees_num   1   0.00660 7.7985 -1326.6
## + Ethnicity       1   0.00001 7.8051 -1326.3
## - Age_sq          1   0.11145 7.9166 -1325.3
## + Gender          2   0.01935 7.7857 -1325.2
## - Race_binary     1   0.22222 8.0273 -1320.4
## - Region          3   0.33711 8.1422 -1319.4
## - Q2_helpfulness  1   0.51602 8.3211 -1307.7
## 
## Step:  AIC=-1330.26
## Compliance_prop ~ Age_num + Age_sq + Years_num + Years_sq + Race_binary + 
##     Q2_helpfulness + Region
## 
##                  Df Sum of Sq    RSS     AIC
## <none>                        7.8069 -1330.3
## - Years_sq        1   0.04503 7.8519 -1330.2
## - Years_num       1   0.05402 7.8609 -1329.8
## + Employees_sq    1   0.01530 7.7916 -1329.0
## - Age_num         1   0.07805 7.8849 -1328.7
## + Employees_num   1   0.00599 7.8009 -1328.5
## + Education_num   1   0.00176 7.8051 -1328.3
## + Education_sq    1   0.00030 7.8066 -1328.3
## + Ethnicity       1   0.00000 7.8069 -1328.3
## - Age_sq          1   0.11170 7.9186 -1327.2
## + Gender          2   0.01778 7.7891 -1327.1
## - Race_binary     1   0.22684 8.0337 -1322.1
## - Region          3   0.33770 8.1446 -1321.3
## - Q2_helpfulness  1   0.51875 8.3256 -1309.5
summary(step_model)
## 
## Call:
## lm(formula = Compliance_prop ~ Age_num + Age_sq + Years_num + 
##     Years_sq + Race_binary + Q2_helpfulness + Region, data = model_data_complete)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.79200 -0.09415  0.01411  0.11932  0.31944 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       0.953936   0.116659   8.177 5.66e-15 ***
## Age_num          -0.066969   0.036111  -1.855  0.06452 .  
## Age_sq            0.007312   0.003296   2.219  0.02717 *  
## Years_num        -0.053914   0.034944  -1.543  0.12378    
## Years_sq          0.006240   0.004430   1.409  0.15984    
## Race_binaryWhite -0.094954   0.030034  -3.162  0.00171 ** 
## Q2_helpfulness    0.047718   0.009981   4.781 2.59e-06 ***
## RegionNorth West -0.010113   0.026469  -0.382  0.70265    
## RegionSouth East -0.002645   0.023520  -0.112  0.91052    
## RegionSouth West -0.070659   0.023446  -3.014  0.00277 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1506 on 344 degrees of freedom
## Multiple R-squared:  0.1633, Adjusted R-squared:  0.1414 
## F-statistic:  7.46 on 9 and 344 DF,  p-value: 5.645e-10
nrow(model_data_complete)
## [1] 354
nobs(step_model)
## [1] 354

####Partial Model####

# Create grouped safety scores
safety_grouped_scores <- safety_scored %>%
  mutate(
    Behavior_Group = case_when(
      Safety_Measure %in% c("Gloves", "Long-Sleeved Shirts", "Long Pants", "Closed Toed Shoes") 
        ~ "Personal Protection",
      Safety_Measure %in% c("Wash Hands", "Wash Face", "Take a Shower", "Separating Clothing") 
        ~ "Post-Application Hygiene"
    )
  ) %>%
  group_by(Respondent_ID, Behavior_Group) %>%
  summarise(
    Avg_Compliance = mean(Response_score, na.rm = TRUE),
    n_items = sum(!is.na(Response_score)),
    .groups = "drop"
  ) %>%
  mutate(
    Compliance_Prop = Avg_Compliance / 2   # scale 0–1
  )
pp_data <- safety_grouped_scores %>%
  filter(Behavior_Group == "Personal Protection")

pah_data <- safety_grouped_scores %>%
  filter(Behavior_Group == "Post-Application Hygiene")

pp_model_data <- pp_data %>%
  left_join(model_data, by = "Respondent_ID") %>%
  filter(!is.na(Compliance_Prop))

pah_model_data <- pah_data %>%
  left_join(model_data, by = "Respondent_ID") %>%
  filter(!is.na(Compliance_Prop))
lm_pp <- lm(
  Compliance_Prop ~ Age_num + Age_sq +
    Years_num + Years_sq +
    Race_binary + Q2_helpfulness,
  data = pp_model_data
)

summary(lm_pp)
## 
## Call:
## lm(formula = Compliance_Prop ~ Age_num + Age_sq + Years_num + 
##     Years_sq + Race_binary + Q2_helpfulness, data = pp_model_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.84409 -0.11049  0.03091  0.14895  0.28809 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       0.935867   0.127500   7.340 1.34e-12 ***
## Age_num          -0.060410   0.039078  -1.546 0.122970    
## Age_sq            0.006548   0.003563   1.838 0.066865 .  
## Years_num        -0.054831   0.038002  -1.443 0.149896    
## Years_sq          0.005936   0.004827   1.230 0.219618    
## Race_binaryWhite -0.065263   0.031196  -2.092 0.037106 *  
## Q2_helpfulness    0.039551   0.011311   3.497 0.000528 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1741 on 375 degrees of freedom
##   (144 observations deleted due to missingness)
## Multiple R-squared:  0.07102,    Adjusted R-squared:  0.05615 
## F-statistic: 4.778 on 6 and 375 DF,  p-value: 0.0001038
lm_pah <- lm(
  Compliance_Prop ~ Age_num + Age_sq +
    Years_num + Years_sq +
    Race_binary + Q2_helpfulness,
  data = pah_model_data
)

summary(lm_pah)
## 
## Call:
## lm(formula = Compliance_Prop ~ Age_num + Age_sq + Years_num + 
##     Years_sq + Race_binary + Q2_helpfulness, data = pah_model_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.84524 -0.10733  0.03439  0.14980  0.33304 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       0.745251   0.132124   5.641 3.34e-08 ***
## Age_num          -0.051527   0.040495  -1.272   0.2040    
## Age_sq            0.005876   0.003692   1.592   0.1123    
## Years_num        -0.020295   0.039380  -0.515   0.6066    
## Years_sq          0.002332   0.005002   0.466   0.6413    
## Race_binaryWhite -0.055813   0.032327  -1.726   0.0851 .  
## Q2_helpfulness    0.059559   0.011721   5.081 5.93e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1804 on 375 degrees of freedom
##   (144 observations deleted due to missingness)
## Multiple R-squared:  0.09421,    Adjusted R-squared:  0.07972 
## F-statistic: 6.501 on 6 and 375 DF,  p-value: 1.538e-06

#Economic Data Analysis #Question 1b:

Q16pta<- Pesticide_Clean$Q16_4
table(Q16pta, useNA = "always")
## Q16pta
##      0      1    100   1000  10000 100000   1100   1200  12000 125000   1260 
##    158      5      4      7      9      4      1      2      2      1      1 
## 143000   1500  15000 150000   1600  18000      2    200   2000 200000   2090 
##      1      4      2      2      1      1      1      1      4      1      1 
##   2400  24000   2500  25000   2800   3000  30000   3100  31200  32000  33000 
##      3      1      5      4      1      3      6      1      1      1      1 
##  35000  36000  37440   4000  40000 400000   4400  45000  46000  47000    500 
##      2      1      1      2      7      1      1      1      1      1      4 
##   5000  50000  55000    600   6000  60000  75000   8000  80000   <NA> 
##      7     14      1      2      2      2      1      1      1    340
Q16ptb<- Pesticide_Clean$Q16_5
table(Q16ptb, useNA = "always")
## Q16ptb
##      0      1    100   1000  10000 100000    101 110000  12000  14000 143000 
##    165      4      3      1      6      6      1      1      3      1      1 
##    150   1500  15000 150000  16000   1620      2    200   2000  20000 200000 
##      1      2      3      2      1      1      2      1      7      2      1 
## 215000 225000  23000   2500  25000 250000   3000  30000  31200   3200  32000 
##      1      1      1      4      2      1      6      3      1      1      1 
##   3500  35000  38000    400  40000  40400  42000    450  45000 450000  45760 
##      1      1      1      1      5      1      1      1      2      1      1 
##    500   5000  50000  52000   5500  55000   6000  60000   6500  65000    700 
##      1      6      6      1      1      1      3      7      1      1      1 
##  70000    750   7500  75000    800   8000  80000   9000   <NA> 
##      2      1      1      2      1      1      4      1    338
table(is.na(Pesticide_Clean$Q16_4), is.na(Pesticide_Clean$Q16_5))
##        
##         FALSE TRUE
##   FALSE   289    4
##   TRUE      6  334
#Creating my variables
Q16<- data.frame(
  before = Pesticide_Clean$Q16_4,
  after = Pesticide_Clean$Q16_5
)

Q16<- Q16[!is.na(Q16$before) & !is.na(Q16$after), ]

Q16$change<- as.numeric(Q16$after) - as.numeric(Q16$before)

nrow(Q16)
## [1] 289
summary(Q16)
##     before             after               change       
##  Length:289         Length:289         Min.   :-143500  
##  Class :character   Class :character   1st Qu.:      0  
##  Mode  :character   Mode  :character   Median :      0  
##                                        Mean   :   3603  
##                                        3rd Qu.:   2000  
##                                        Max.   : 100000
ggplot(Q16, aes(x = change)) +
  geom_histogram(bins = 10, color = "black", fill = "purple4") +
  scale_x_continuous(labels = comma) +
  labs(
    title = "Histogram of Change",
    x = "Change (After - Before)",
    y = "Number of Respondents"
  ) +
  theme_minimal()

#Paired t-test
t.test(as.numeric(Q16$after), as.numeric(Q16$before), paired = TRUE)
## 
##  Paired t-test
## 
## data:  as.numeric(Q16$after) and as.numeric(Q16$before)
## t = 3.1318, df = 288, p-value = 0.001916
## alternative hypothesis: true mean difference is not equal to 0
## 95 percent confidence interval:
##  1338.567 5867.274
## sample estimates:
## mean difference 
##         3602.92

#Question 1d:

Q18pta<- Pesticide_Clean$Q18_4
table(Q18pta, useNA = "always")
## Q18pta
##      0      1    100   1000  10000 100000    101     11  12000   1400    150 
##    152      4      5     10     13      2      1      1      1      1      1 
##   1500  15000 150000  17000   1800     20    200   2000  20000   2200   2400 
##      3      2      1      1      1      1      1     14      6      1      3 
##   2500  25000   2800    300   3000  30000   3500  35000    400   4000  40000 
##      9      2      2      2      5      2      1      1      2      3      1 
## 400000  42000   4500    500   5000  50000  54235   5500    600   6000    750 
##      1      1      1      9     10      8      1      1      2      3      1 
##    800   8000  80000    850   <NA> 
##      1      3      1      1    334
Q18ptb<- Pesticide_Clean$Q18_5
table(Q18ptb, useNA = "always")
## Q18ptb
##      0      1    100   1000  10000   1200   1300 140000    150   1500  15000 
##    161      3      5      6      6      6      1      1      3      4      4 
##   1600  16000  17000   1800   1850      2    200   2000  20000   2400    250 
##      1      1      1      2      1      1      5      7      4      2      2 
##   2500  25000    300   3000  30000   3500    400   4000  40000 400000    450 
##      4      4      3      4      2      2      1      4      2      1      1 
##   4500  45000     50    500   5000  50000  51335   5500  58000      6    600 
##      1      1      1      9      7      3      1      1      1      1      2 
##   6000   7000    750   7500     80    800   8000  80000    855    900   9000 
##      1      1      1      1      1      3      4      1      1      1      1 
##   <NA> 
##    334
table(is.na(Pesticide_Clean$Q18_4), is.na(Pesticide_Clean$Q18_5))
##        
##         FALSE TRUE
##   FALSE   295    4
##   TRUE      4  330
#Creating my variables
Q18<- data.frame(
  before = Pesticide_Clean$Q18_4,
  after = Pesticide_Clean$Q18_5
)

Q18<- Q18[!is.na(Q18$before) & !is.na(Q18$after), ]

Q18$change<- as.numeric(Q18$after) - as.numeric(Q18$before)

nrow(Q18)
## [1] 295
summary(Q18)
##     before             after               change       
##  Length:295         Length:295         Min.   :-100000  
##  Class :character   Class :character   1st Qu.:  -1000  
##  Mode  :character   Mode  :character   Median :      0  
##                                        Mean   :  -1486  
##                                        3rd Qu.:      0  
##                                        Max.   :  60000
ggplot(Q18, aes(x = change)) +
  geom_histogram(bins = 10, color = "black", fill = "green4") +
  scale_x_continuous(labels = comma) +
  labs(
    title = "Histogram of Change",
    x = "Change (After - Before)",
    y = "Number of Respondents"
  ) +
  theme_minimal()

#Paired t-test
t.test(as.numeric(Q18$after), as.numeric(Q18$before), paired = TRUE)
## 
##  Paired t-test
## 
## data:  as.numeric(Q18$after) and as.numeric(Q18$before)
## t = -2.4126, df = 294, p-value = 0.01645
## alternative hypothesis: true mean difference is not equal to 0
## 95 percent confidence interval:
##  -2697.673  -273.751
## sample estimates:
## mean difference 
##       -1485.712

##Economic ANOVAs

profit_data <- data.frame(
  Respondent_ID = 1:nrow(Pesticide_Clean),
  profit_change = as.numeric(Pesticide_Clean$Q16_5) - as.numeric(Pesticide_Clean$Q16_4)
)

profit_data <- profit_data %>%
  filter(!is.na(profit_change)) %>%
  left_join(model_data, by = "Respondent_ID")
savings_data <- data.frame(
  Respondent_ID = 1:nrow(Pesticide_Clean),
  savings_change = as.numeric(Pesticide_Clean$Q18_5) - as.numeric(Pesticide_Clean$Q18_4)
)

savings_data <- savings_data %>%
  filter(!is.na(savings_change)) %>%
  left_join(model_data, by = "Respondent_ID")

#Profit ANOVAs

aov_profit_edu <- aov(profit_change ~ Education_num, data = profit_data)
summary(aov_profit_edu)
##                Df    Sum Sq   Mean Sq F value Pr(>F)
## Education_num   1 8.233e+08 823295030   2.082   0.15
## Residuals     276 1.092e+11 395513703               
## 11 observations deleted due to missingness
aov_profit_age <- aov(profit_change ~ Age_num, data = profit_data)
summary(aov_profit_age)
##              Df    Sum Sq   Mean Sq F value Pr(>F)
## Age_num       1 6.334e+07  63340118   0.159   0.69
## Residuals   276 1.099e+11 398267163               
## 11 observations deleted due to missingness
aov_profit_race <- aov(profit_change ~ Race_binary, data = profit_data)
summary(aov_profit_race)
##              Df    Sum Sq   Mean Sq F value Pr(>F)
## Race_binary   1 3.629e+07  36291677   0.091  0.763
## Residuals   275 1.099e+11 399762321               
## 12 observations deleted due to missingness
aov_profit_region <- aov(profit_change ~ Region, data = profit_data)
summary(aov_profit_region)
##              Df    Sum Sq   Mean Sq F value Pr(>F)
## Region        3 1.186e+08  39541157   0.099   0.96
## Residuals   276 1.099e+11 398129357               
## 9 observations deleted due to missingness
aov_profit_all <- aov(profit_change ~ Education_num + Age_num + Race_binary + Region, 
                      data=profit_data)
summary(aov_profit_all)
##                Df    Sum Sq   Mean Sq F value Pr(>F)
## Education_num   1 9.432e+08 943214099   2.267  0.133
## Age_num         1 3.620e+07  36199012   0.087  0.768
## Race_binary     1 6.746e+07  67455562   0.162  0.688
## Region          3 1.748e+08  58271682   0.140  0.936
## Residuals     261 1.086e+11 416045398               
## 21 observations deleted due to missingness

#Savings ANOVAs

aov_savings_edu <- aov(savings_change ~ Education_num, data=savings_data)
summary(aov_savings_edu)
##                Df    Sum Sq   Mean Sq F value Pr(>F)
## Education_num   1 8.091e+07  80909945   0.694  0.405
## Residuals     281 3.275e+10 116544089               
## 12 observations deleted due to missingness
aov_savings_age <- aov(savings_change ~ Age_num, data=savings_data)
summary(aov_savings_age)
##              Df    Sum Sq   Mean Sq F value Pr(>F)  
## Age_num       1 4.135e+08 413547671   3.585 0.0593 .
## Residuals   281 3.242e+10 115360325                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 12 observations deleted due to missingness
aov_savings_race <- aov(savings_change ~ Race_binary, data=savings_data)
summary(aov_savings_race)
##              Df    Sum Sq   Mean Sq F value Pr(>F)
## Race_binary   1 1.501e+07  15008566   0.128  0.721
## Residuals   280 3.281e+10 117187007               
## 13 observations deleted due to missingness
aov_savings_region <- aov(savings_change ~ Region, data=savings_data)
summary(aov_savings_region)
##              Df    Sum Sq   Mean Sq F value Pr(>F)
## Region        3 3.990e+08 133009513   1.163  0.324
## Residuals   284 3.248e+10 114357051               
## 7 observations deleted due to missingness
aov_savings_all <- aov(savings_change ~ Education_num + Age_num + Race_binary +
                         Region, data=savings_data)
summary(aov_savings_all)
##                Df    Sum Sq   Mean Sq F value Pr(>F)  
## Education_num   1 7.545e+07  75452158   0.637 0.4256  
## Age_num         1 4.218e+08 421765811   3.560 0.0603 .
## Race_binary     1 1.542e+07  15417740   0.130 0.7186  
## Region          3 5.493e+08 183084542   1.545 0.2031  
## Residuals     268 3.175e+10 118471894                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 20 observations deleted due to missingness

#Regression Models for Economics

economic_vars <- Pesticide_Clean %>%
  mutate(
    Respondent_ID = row_number(),
    
    profit_before = as.numeric(Q16_4),
    profit_after  = as.numeric(Q16_5),
    profit_change = profit_after - profit_before,
    
    savings_before = as.numeric(Q18_4),
    savings_after  = as.numeric(Q18_5),
    savings_change = savings_after - savings_before
  ) %>%
  select(Respondent_ID, profit_change, savings_change)
model_data_econ <- model_data %>%
  left_join(economic_vars, by = "Respondent_ID")

profit_model_data <- model_data_econ %>%
  filter(!is.na(profit_change))

profit_model_data <- profit_model_data %>%
  mutate(
    Age_sq = Age_num^2,
    Years_sq = Years_num^2
  )

savings_model_data <- model_data_econ %>%
  filter(!is.na(savings_change))

savings_model_data <- savings_model_data %>%
  mutate(
    Age_sq = Age_num^2,
    Years_sq = Years_num^2
  )
lm_profit <- lm(
  profit_change ~ Age_num + Age_sq + Education_num + Employees_num + Years_num + Years_sq +
    Gender + Race_binary + Ethnicity + Q2_helpfulness + Region,
  data = profit_model_data
)

summary(lm_profit)
## 
## Call:
## lm(formula = profit_change ~ Age_num + Age_sq + Education_num + 
##     Employees_num + Years_num + Years_sq + Gender + Race_binary + 
##     Ethnicity + Q2_helpfulness + Region, data = profit_model_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -144628   -5211   -2426    1188   89978 
## 
## Coefficients:
##                              Estimate Std. Error t value Pr(>|t|)
## (Intercept)                   -2120.8    24613.3  -0.086    0.931
## Age_num                       -4659.1     5914.6  -0.788    0.432
## Age_sq                          399.6      532.9   0.750    0.454
## Education_num                 -1164.4      904.0  -1.288    0.199
## Employees_num                  1628.7     1286.9   1.266    0.207
## Years_num                      1311.8     5439.5   0.241    0.810
## Years_sq                       -253.6      690.0  -0.367    0.714
## GenderMale                     5334.3     4067.9   1.311    0.191
## GenderPrefer not to answer    -7306.6    21949.9  -0.333    0.740
## Race_binaryWhite                439.2     5487.3   0.080    0.936
## EthnicityNot Hispanic/Latino  10711.3    15142.8   0.707    0.480
## Q2_helpfulness                  988.1     1596.9   0.619    0.537
## RegionNorth West               1750.3     4307.0   0.406    0.685
## RegionSouth East               2009.6     3701.5   0.543    0.588
## RegionSouth West               -263.1     3712.1  -0.071    0.944
## 
## Residual standard error: 19970 on 241 degrees of freedom
##   (33 observations deleted due to missingness)
## Multiple R-squared:  0.02611,    Adjusted R-squared:  -0.03046 
## F-statistic: 0.4615 on 14 and 241 DF,  p-value: 0.9512
profit_model_data_complete <- model_data_econ %>%
  select(
    profit_change,
    Age_num, Education_num, Employees_num, Years_num,
    Gender, Race_binary, Ethnicity, Q2_helpfulness, Region
  ) %>%
  drop_na() %>%
  mutate(
    Age_sq = Age_num^2,
    Years_sq = Years_num^2,
    Employees_sq = Employees_num^2
  )
lm_profit_complete <- lm(
  profit_change ~ Age_num + Age_sq +
    Years_num + Years_sq +
    Employees_num + Employees_sq +
    Education_num +
    Gender + Race_binary + Ethnicity +
    Q2_helpfulness + Region,
  data = profit_model_data_complete
)
step_profit <- step(lm_profit_complete, direction = "both")
## Start:  AIC=5085.96
## profit_change ~ Age_num + Age_sq + Years_num + Years_sq + Employees_num + 
##     Employees_sq + Education_num + Gender + Race_binary + Ethnicity + 
##     Q2_helpfulness + Region
## 
##                  Df Sum of Sq        RSS    AIC
## - Region          3 249876048 9.6211e+10 5080.6
## - Gender          2 719087784 9.6680e+10 5083.9
## - Race_binary     1   2431225 9.5964e+10 5084.0
## - Years_num       1  25932219 9.5987e+10 5084.0
## - Years_sq        1  61230330 9.6022e+10 5084.1
## - Employees_sq    1 106422000 9.6068e+10 5084.2
## - Ethnicity       1 137402049 9.6099e+10 5084.3
## - Q2_helpfulness  1 156914674 9.6118e+10 5084.4
## - Age_sq          1 237429060 9.6199e+10 5084.6
## - Age_num         1 253479019 9.6215e+10 5084.6
## - Employees_num   1 324204744 9.6285e+10 5084.8
## - Education_num   1 607295561 9.6569e+10 5085.6
## <none>                        9.5961e+10 5086.0
## 
## Step:  AIC=5080.63
## profit_change ~ Age_num + Age_sq + Years_num + Years_sq + Employees_num + 
##     Employees_sq + Education_num + Gender + Race_binary + Ethnicity + 
##     Q2_helpfulness
## 
##                  Df Sum of Sq        RSS    AIC
## - Gender          2 698140894 9.6909e+10 5078.5
## - Race_binary     1   3397090 9.6215e+10 5078.6
## - Years_num       1  22875472 9.6234e+10 5078.7
## - Years_sq        1  57173350 9.6268e+10 5078.8
## - Employees_sq    1 129857920 9.6341e+10 5079.0
## - Ethnicity       1 157988500 9.6369e+10 5079.0
## - Q2_helpfulness  1 186466293 9.6398e+10 5079.1
## - Age_sq          1 224493920 9.6436e+10 5079.2
## - Age_num         1 235824837 9.6447e+10 5079.3
## - Employees_num   1 347331385 9.6558e+10 5079.5
## - Education_num   1 584780009 9.6796e+10 5080.2
## <none>                        9.6211e+10 5080.6
## + Region          3 249876048 9.5961e+10 5086.0
## 
## Step:  AIC=5078.48
## profit_change ~ Age_num + Age_sq + Years_num + Years_sq + Employees_num + 
##     Employees_sq + Education_num + Race_binary + Ethnicity + 
##     Q2_helpfulness
## 
##                  Df Sum of Sq        RSS    AIC
## - Years_num       1  25109686 9.6934e+10 5076.5
## - Race_binary     1  26655392 9.6936e+10 5076.5
## - Years_sq        1  51494975 9.6961e+10 5076.6
## - Ethnicity       1  88085574 9.6997e+10 5076.7
## - Q2_helpfulness  1 108872215 9.7018e+10 5076.8
## - Age_num         1 117129851 9.7026e+10 5076.8
## - Age_sq          1 118716178 9.7028e+10 5076.8
## - Employees_sq    1 184362206 9.7094e+10 5077.0
## - Employees_num   1 400476023 9.7310e+10 5077.5
## - Education_num   1 662741115 9.7572e+10 5078.2
## <none>                        9.6909e+10 5078.5
## + Gender          2 698140894 9.6211e+10 5080.6
## + Region          3 228929158 9.6680e+10 5083.9
## 
## Step:  AIC=5076.54
## profit_change ~ Age_num + Age_sq + Years_sq + Employees_num + 
##     Employees_sq + Education_num + Race_binary + Ethnicity + 
##     Q2_helpfulness
## 
##                  Df Sum of Sq        RSS    AIC
## - Race_binary     1  24637648 9.6959e+10 5074.6
## - Ethnicity       1  80076841 9.7014e+10 5074.8
## - Q2_helpfulness  1  98845680 9.7033e+10 5074.8
## - Age_num         1 107915761 9.7042e+10 5074.8
## - Age_sq          1 111250211 9.7046e+10 5074.8
## - Employees_sq    1 180818464 9.7115e+10 5075.0
## - Years_sq        1 192904028 9.7127e+10 5075.1
## - Employees_num   1 393166613 9.7328e+10 5075.6
## - Education_num   1 658938692 9.7593e+10 5076.3
## <none>                        9.6934e+10 5076.5
## + Years_num       1  25109686 9.6909e+10 5078.5
## + Gender          2 700375108 9.6234e+10 5078.7
## + Region          3 228352718 9.6706e+10 5081.9
## 
## Step:  AIC=5074.61
## profit_change ~ Age_num + Age_sq + Years_sq + Employees_num + 
##     Employees_sq + Education_num + Ethnicity + Q2_helpfulness
## 
##                  Df Sum of Sq        RSS    AIC
## - Q2_helpfulness  1  89691164 9.7049e+10 5072.8
## - Ethnicity       1  92718743 9.7052e+10 5072.9
## - Age_num         1  97154067 9.7056e+10 5072.9
## - Age_sq          1 101473362 9.7060e+10 5072.9
## - Employees_sq    1 183478009 9.7142e+10 5073.1
## - Years_sq        1 209636703 9.7169e+10 5073.2
## - Employees_num   1 395001016 9.7354e+10 5073.6
## - Education_num   1 647474556 9.7606e+10 5074.3
## <none>                        9.6959e+10 5074.6
## + Race_binary     1  24637648 9.6934e+10 5076.5
## + Years_num       1  23091942 9.6936e+10 5076.5
## + Gender          2 722297734 9.6237e+10 5076.7
## + Region          3 230683723 9.6728e+10 5080.0
## 
## Step:  AIC=5072.85
## profit_change ~ Age_num + Age_sq + Years_sq + Employees_num + 
##     Employees_sq + Education_num + Ethnicity
## 
##                  Df Sum of Sq        RSS    AIC
## - Ethnicity       1  79114803 9.7128e+10 5071.1
## - Age_num         1 101669431 9.7150e+10 5071.1
## - Age_sq          1 105932786 9.7155e+10 5071.1
## - Employees_sq    1 178306559 9.7227e+10 5071.3
## - Years_sq        1 203418538 9.7252e+10 5071.4
## - Employees_num   1 381627215 9.7430e+10 5071.8
## - Education_num   1 669177428 9.7718e+10 5072.6
## <none>                        9.7049e+10 5072.8
## + Q2_helpfulness  1  89691164 9.6959e+10 5074.6
## + Race_binary     1  15483132 9.7033e+10 5074.8
## + Years_num       1  14201900 9.7034e+10 5074.8
## + Gender          2 639851741 9.6409e+10 5075.2
## + Region          3 256168158 9.6793e+10 5078.2
## 
## Step:  AIC=5071.05
## profit_change ~ Age_num + Age_sq + Years_sq + Employees_num + 
##     Employees_sq + Education_num
## 
##                  Df Sum of Sq        RSS    AIC
## - Age_num         1  81507139 9.7209e+10 5069.3
## - Age_sq          1  84337516 9.7212e+10 5069.3
## - Years_sq        1 199846190 9.7328e+10 5069.6
## - Employees_sq    1 242538386 9.7370e+10 5069.7
## - Employees_num   1 434780198 9.7563e+10 5070.2
## - Education_num   1 638693243 9.7767e+10 5070.7
## <none>                        9.7128e+10 5071.1
## + Ethnicity       1  79114803 9.7049e+10 5072.8
## + Q2_helpfulness  1  76087224 9.7052e+10 5072.9
## + Race_binary     1  25627414 9.7102e+10 5073.0
## + Years_num       1   8602700 9.7119e+10 5073.0
## + Gender          2 590380242 9.6537e+10 5073.5
## + Region          3 266594492 9.6861e+10 5076.4
## 
## Step:  AIC=5069.27
## profit_change ~ Age_sq + Years_sq + Employees_num + Employees_sq + 
##     Education_num
## 
##                  Df Sum of Sq        RSS    AIC
## - Age_sq          1   2846461 9.7212e+10 5067.3
## - Employees_sq    1 224036054 9.7433e+10 5067.9
## - Years_sq        1 246562286 9.7456e+10 5067.9
## - Employees_num   1 413312662 9.7623e+10 5068.4
## - Education_num   1 653750817 9.7863e+10 5069.0
## <none>                        9.7209e+10 5069.3
## + Age_num         1  81507139 9.7128e+10 5071.1
## + Q2_helpfulness  1  81448508 9.7128e+10 5071.1
## + Ethnicity       1  58952511 9.7150e+10 5071.1
## + Race_binary     1  14058944 9.7195e+10 5071.2
## + Years_num       1   4734004 9.7205e+10 5071.3
## + Gender          2 493787469 9.6716e+10 5072.0
## + Region          3 252918946 9.6956e+10 5074.6
## 
## Step:  AIC=5067.28
## profit_change ~ Years_sq + Employees_num + Employees_sq + Education_num
## 
##                  Df Sum of Sq        RSS    AIC
## - Employees_sq    1 221635381 9.7434e+10 5065.9
## - Years_sq        1 252961337 9.7465e+10 5065.9
## - Employees_num   1 421416235 9.7634e+10 5066.4
## - Education_num   1 653906501 9.7866e+10 5067.0
## <none>                        9.7212e+10 5067.3
## + Q2_helpfulness  1  81507150 9.7131e+10 5069.1
## + Ethnicity       1  57402813 9.7155e+10 5069.1
## + Race_binary     1  14874693 9.7197e+10 5069.2
## + Years_num       1   5564335 9.7207e+10 5069.3
## + Age_sq          1   2846461 9.7209e+10 5069.3
## + Age_num         1     16083 9.7212e+10 5069.3
## + Gender          2 495976936 9.6716e+10 5070.0
## + Region          3 254844049 9.6957e+10 5072.6
## 
## Step:  AIC=5065.86
## profit_change ~ Years_sq + Employees_num + Education_num
## 
##                  Df Sum of Sq        RSS    AIC
## - Years_sq        1 201524721 9.7635e+10 5064.4
## - Employees_num   1 416884560 9.7851e+10 5065.0
## - Education_num   1 721542080 9.8155e+10 5065.7
## <none>                        9.7434e+10 5065.9
## + Employees_sq    1 221635381 9.7212e+10 5067.3
## + Ethnicity       1 116006583 9.7318e+10 5067.6
## + Q2_helpfulness  1  70564304 9.7363e+10 5067.7
## + Race_binary     1  21001541 9.7413e+10 5067.8
## + Age_num         1   3991698 9.7430e+10 5067.8
## + Years_num       1   2193234 9.7432e+10 5067.9
## + Age_sq          1    445788 9.7433e+10 5067.9
## + Gender          2 532755708 9.6901e+10 5068.5
## + Region          3 290296921 9.7144e+10 5071.1
## 
## Step:  AIC=5064.39
## profit_change ~ Employees_num + Education_num
## 
##                  Df Sum of Sq        RSS    AIC
## - Employees_num   1 397830478 9.8033e+10 5063.4
## - Education_num   1 702797156 9.8338e+10 5064.2
## <none>                        9.7635e+10 5064.4
## + Years_sq        1 201524721 9.7434e+10 5065.9
## + Years_num       1 190562300 9.7445e+10 5065.9
## + Employees_sq    1 170198765 9.7465e+10 5065.9
## + Ethnicity       1 101942308 9.7533e+10 5066.1
## + Q2_helpfulness  1  67393930 9.7568e+10 5066.2
## + Age_num         1  37446621 9.7598e+10 5066.3
## + Race_binary     1  29177459 9.7606e+10 5066.3
## + Age_sq          1  20575366 9.7615e+10 5066.3
## + Gender          2 388825962 9.7246e+10 5067.4
## + Region          3 296394359 9.7339e+10 5069.6
## 
## Step:  AIC=5063.43
## profit_change ~ Education_num
## 
##                  Df Sum of Sq        RSS    AIC
## - Education_num   1 610232759 9.8643e+10 5063.0
## <none>                        9.8033e+10 5063.4
## + Employees_num   1 397830478 9.7635e+10 5064.4
## + Employees_sq    1 219518937 9.7814e+10 5064.9
## + Years_sq        1 182470639 9.7851e+10 5065.0
## + Years_num       1 176735654 9.7856e+10 5065.0
## + Age_num         1 128031410 9.7905e+10 5065.1
## + Age_sq          1  97199610 9.7936e+10 5065.2
## + Q2_helpfulness  1  54983202 9.7978e+10 5065.3
## + Ethnicity       1  33151619 9.8000e+10 5065.3
## + Race_binary     1  14490242 9.8019e+10 5065.4
## + Gender          2 315432468 9.7718e+10 5066.6
## + Region          3 233532300 9.7800e+10 5068.8
## 
## Step:  AIC=5063.02
## profit_change ~ 1
## 
##                  Df Sum of Sq        RSS    AIC
## <none>                        9.8643e+10 5063.0
## + Education_num   1 610232759 9.8033e+10 5063.4
## + Employees_num   1 305266082 9.8338e+10 5064.2
## + Years_sq        1 167903242 9.8475e+10 5064.6
## + Years_num       1 163788840 9.8480e+10 5064.6
## + Employees_sq    1 139478631 9.8504e+10 5064.7
## + Age_num         1 117236919 9.8526e+10 5064.7
## + Age_sq          1  86158035 9.8557e+10 5064.8
## + Q2_helpfulness  1  74838766 9.8569e+10 5064.8
## + Ethnicity       1  23637571 9.8620e+10 5065.0
## + Race_binary     1   5448408 9.8638e+10 5065.0
## + Gender          2 379741104 9.8264e+10 5066.0
## + Region          3 251002658 9.8392e+10 5068.4
summary(step_profit)
## 
## Call:
## lm(formula = profit_change ~ 1, data = profit_model_data_complete)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -146924   -3424   -3424   -1424   96576 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)   
## (Intercept)     3424       1229   2.786  0.00574 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 19670 on 255 degrees of freedom
savings_model_data_complete <- model_data_econ %>%
  select(
    savings_change,
    Age_num, Education_num, Employees_num, Years_num,
    Gender, Race_binary, Ethnicity, Q2_helpfulness, Region
  ) %>%
  drop_na() %>%
  mutate(
    Age_sq = Age_num^2,
    Years_sq = Years_num^2,
    Employees_sq = Employees_num^2
  )
lm_savings_complete <- lm(
  savings_change ~ Age_num + Age_sq +
    Years_num + Years_sq +
    Employees_num + Employees_sq +
    Education_num +
    Gender + Race_binary + Ethnicity +
    Q2_helpfulness + Region,
  data = savings_model_data_complete
)
step_savings <- step(lm_savings_complete, direction = "both")
## Start:  AIC=4950.46
## savings_change ~ Age_num + Age_sq + Years_num + Years_sq + Employees_num + 
##     Employees_sq + Education_num + Gender + Race_binary + Ethnicity + 
##     Q2_helpfulness + Region
## 
##                  Df Sum of Sq        RSS    AIC
## - Gender          2  34392193 3.0503e+10 4946.8
## - Region          3 437660322 3.0906e+10 4948.2
## - Years_sq        1   1642807 3.0470e+10 4948.5
## - Years_num       1  17096599 3.0486e+10 4948.6
## - Age_sq          1  32233122 3.0501e+10 4948.7
## - Ethnicity       1  35605013 3.0504e+10 4948.8
## - Employees_sq    1  39964530 3.0509e+10 4948.8
## - Race_binary     1  52496602 3.0521e+10 4948.9
## - Employees_num   1  58099761 3.0527e+10 4949.0
## - Education_num   1  69828779 3.0539e+10 4949.1
## - Age_num         1 110574425 3.0579e+10 4949.4
## <none>                        3.0469e+10 4950.5
## - Q2_helpfulness  1 792518321 3.1261e+10 4955.3
## 
## Step:  AIC=4946.76
## savings_change ~ Age_num + Age_sq + Years_num + Years_sq + Employees_num + 
##     Employees_sq + Education_num + Race_binary + Ethnicity + 
##     Q2_helpfulness + Region
## 
##                  Df Sum of Sq        RSS    AIC
## - Region          3 439715398 3.0943e+10 4944.6
## - Years_sq        1   1768538 3.0505e+10 4944.8
## - Years_num       1  17699274 3.0521e+10 4944.9
## - Age_sq          1  20099470 3.0523e+10 4944.9
## - Ethnicity       1  30288236 3.0533e+10 4945.0
## - Employees_sq    1  38148883 3.0541e+10 4945.1
## - Employees_num   1  54706856 3.0558e+10 4945.2
## - Race_binary     1  74559276 3.0578e+10 4945.4
## - Education_num   1  78884349 3.0582e+10 4945.4
## - Age_num         1  87763936 3.0591e+10 4945.5
## <none>                        3.0503e+10 4946.8
## + Gender          2  34392193 3.0469e+10 4950.5
## - Q2_helpfulness  1 790916506 3.1294e+10 4951.5
## 
## Step:  AIC=4944.55
## savings_change ~ Age_num + Age_sq + Years_num + Years_sq + Employees_num + 
##     Employees_sq + Education_num + Race_binary + Ethnicity + 
##     Q2_helpfulness
## 
##                  Df Sum of Sq        RSS    AIC
## - Years_sq        1   6626068 3.0950e+10 4942.6
## - Age_sq          1  13591785 3.0956e+10 4942.7
## - Ethnicity       1  16452076 3.0959e+10 4942.7
## - Years_num       1  30968968 3.0974e+10 4942.8
## - Employees_sq    1  39745519 3.0983e+10 4942.9
## - Employees_num   1  52581238 3.0995e+10 4943.0
## - Age_num         1  68576285 3.1011e+10 4943.1
## - Education_num   1  77801872 3.1021e+10 4943.2
## - Race_binary     1  82309991 3.1025e+10 4943.3
## <none>                        3.0943e+10 4944.6
## + Region          3 439715398 3.0503e+10 4946.8
## + Gender          2  36447270 3.0906e+10 4948.2
## - Q2_helpfulness  1 887948882 3.1831e+10 4950.1
## 
## Step:  AIC=4942.61
## savings_change ~ Age_num + Age_sq + Years_num + Employees_num + 
##     Employees_sq + Education_num + Race_binary + Ethnicity + 
##     Q2_helpfulness
## 
##                  Df Sum of Sq        RSS    AIC
## - Age_sq          1  12890329 3.0962e+10 4940.7
## - Ethnicity       1  14789552 3.0964e+10 4940.7
## - Employees_sq    1  43272154 3.0993e+10 4941.0
## - Employees_num   1  56825185 3.1006e+10 4941.1
## - Age_num         1  66484784 3.1016e+10 4941.2
## - Education_num   1  77899541 3.1027e+10 4941.3
## - Race_binary     1  80227980 3.1030e+10 4941.3
## <none>                        3.0950e+10 4942.6
## - Years_num       1 351370676 3.1301e+10 4943.6
## + Years_sq        1   6626068 3.0943e+10 4944.6
## + Region          3 444572928 3.0505e+10 4944.8
## + Gender          2  36906031 3.0913e+10 4946.3
## - Q2_helpfulness  1 881410685 3.1831e+10 4948.1
## 
## Step:  AIC=4940.72
## savings_change ~ Age_num + Years_num + Employees_num + Employees_sq + 
##     Education_num + Race_binary + Ethnicity + Q2_helpfulness
## 
##                  Df Sum of Sq        RSS    AIC
## - Ethnicity       1  13631491 3.0976e+10 4938.8
## - Employees_sq    1  51465384 3.1014e+10 4939.2
## - Employees_num   1  66552187 3.1029e+10 4939.3
## - Education_num   1  76029139 3.1038e+10 4939.4
## - Race_binary     1  76293335 3.1039e+10 4939.4
## <none>                        3.0962e+10 4940.7
## - Years_num       1 340227869 3.1303e+10 4941.6
## + Age_sq          1  12890329 3.0950e+10 4942.6
## + Years_sq        1   5924612 3.0956e+10 4942.7
## + Region          3 437755639 3.0525e+10 4942.9
## + Gender          2  25978258 3.0936e+10 4944.5
## - Age_num         1 734767698 3.1697e+10 4944.9
## - Q2_helpfulness  1 878910857 3.1841e+10 4946.1
## 
## Step:  AIC=4938.84
## savings_change ~ Age_num + Years_num + Employees_num + Employees_sq + 
##     Education_num + Race_binary + Q2_helpfulness
## 
##                  Df Sum of Sq        RSS    AIC
## - Employees_sq    1  43431412 3.1019e+10 4937.2
## - Employees_num   1  59951114 3.1036e+10 4937.4
## - Education_num   1  81628050 3.1058e+10 4937.5
## - Race_binary     1  95014878 3.1071e+10 4937.7
## <none>                        3.0976e+10 4938.8
## - Years_num       1 343743487 3.1320e+10 4939.8
## + Ethnicity       1  13631491 3.0962e+10 4940.7
## + Age_sq          1  11732268 3.0964e+10 4940.7
## + Years_sq        1   4444705 3.0972e+10 4940.8
## + Region          3 424286994 3.0552e+10 4941.2
## + Gender          2  22369987 3.0954e+10 4942.6
## - Age_num         1 739994959 3.1716e+10 4943.1
## - Q2_helpfulness  1 867933258 3.1844e+10 4944.2
## 
## Step:  AIC=4937.21
## savings_change ~ Age_num + Years_num + Employees_num + Education_num + 
##     Race_binary + Q2_helpfulness
## 
##                  Df Sum of Sq        RSS    AIC
## - Employees_num   1  21570814 3.1041e+10 4935.4
## - Race_binary     1  84288108 3.1104e+10 4935.9
## - Education_num   1  96355553 3.1116e+10 4936.0
## <none>                        3.1019e+10 4937.2
## - Years_num       1 311748801 3.1331e+10 4937.9
## + Employees_sq    1  43431412 3.0976e+10 4938.8
## + Age_sq          1  19439246 3.1000e+10 4939.0
## + Years_sq        1   7882401 3.1012e+10 4939.1
## + Ethnicity       1   5597518 3.1014e+10 4939.2
## + Region          3 429924530 3.0590e+10 4939.5
## + Gender          2  19578695 3.1000e+10 4941.0
## - Age_num         1 705120967 3.1725e+10 4941.2
## - Q2_helpfulness  1 883702352 3.1903e+10 4942.7
## 
## Step:  AIC=4935.39
## savings_change ~ Age_num + Years_num + Education_num + Race_binary + 
##     Q2_helpfulness
## 
##                  Df Sum of Sq        RSS    AIC
## - Race_binary     1  86206596 3.1127e+10 4934.1
## - Education_num   1  86694151 3.1128e+10 4934.1
## <none>                        3.1041e+10 4935.4
## - Years_num       1 299446819 3.1340e+10 4935.9
## + Employees_num   1  21570814 3.1019e+10 4937.2
## + Age_sq          1  20343410 3.1021e+10 4937.2
## + Ethnicity       1   9382022 3.1032e+10 4937.3
## + Years_sq        1   8064700 3.1033e+10 4937.3
## + Employees_sq    1   5051112 3.1036e+10 4937.4
## + Region          3 418792581 3.0622e+10 4937.8
## + Gender          2  17687721 3.1023e+10 4939.2
## - Age_num         1 697642229 3.1739e+10 4939.3
## - Q2_helpfulness  1 899968440 3.1941e+10 4941.0
## 
## Step:  AIC=4934.13
## savings_change ~ Age_num + Years_num + Education_num + Q2_helpfulness
## 
##                  Df Sum of Sq        RSS    AIC
## - Education_num   1 102562250 3.1230e+10 4933.0
## <none>                        3.1127e+10 4934.1
## - Years_num       1 274404385 3.1402e+10 4934.5
## + Race_binary     1  86206596 3.1041e+10 4935.4
## + Ethnicity       1  25960648 3.1101e+10 4935.9
## + Employees_num   1  23489302 3.1104e+10 4935.9
## + Age_sq          1  13256116 3.1114e+10 4936.0
## + Employees_sq    1   7371427 3.1120e+10 4936.1
## + Years_sq        1   4529548 3.1123e+10 4936.1
## + Region          3 419527886 3.0708e+10 4936.5
## - Age_num         1 661104471 3.1788e+10 4937.7
## + Gender          2  35677748 3.1092e+10 4937.8
## - Q2_helpfulness  1 840754028 3.1968e+10 4939.2
## 
## Step:  AIC=4933
## savings_change ~ Age_num + Years_num + Q2_helpfulness
## 
##                  Df Sum of Sq        RSS    AIC
## <none>                        3.1230e+10 4933.0
## - Years_num       1 268241482 3.1498e+10 4933.3
## + Education_num   1 102562250 3.1127e+10 4934.1
## + Race_binary     1 102074695 3.1128e+10 4934.1
## + Ethnicity       1  31817622 3.1198e+10 4934.7
## + Employees_num   1  12710008 3.1217e+10 4934.9
## + Age_sq          1  11291015 3.1219e+10 4934.9
## + Years_sq        1   4586972 3.1225e+10 4935.0
## + Employees_sq    1   1382305 3.1228e+10 4935.0
## + Region          3 420257796 3.0810e+10 4935.4
## + Gender          2  44793274 3.1185e+10 4936.6
## - Age_num         1 672862743 3.1903e+10 4936.6
## - Q2_helpfulness  1 816959623 3.2047e+10 4937.8
summary(step_savings)
## 
## Call:
## lm(formula = savings_change ~ Age_num + Years_num + Q2_helpfulness, 
##     data = savings_model_data_complete)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -94931  -1324    998   2850  61105 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)   
## (Intercept)      3656.8     4543.1   0.805   0.4216   
## Age_num          1163.2      490.5   2.371   0.0184 * 
## Years_num        -688.9      460.1  -1.497   0.1355   
## Q2_helpfulness  -2219.5      849.4  -2.613   0.0095 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10940 on 261 degrees of freedom
## Multiple R-squared:  0.04668,    Adjusted R-squared:  0.03572 
## F-statistic:  4.26 on 3 and 261 DF,  p-value: 0.005854