#Packages
library(tidyverse)
library(readxl)
library(dplyr)
library(ggplot2)
library(ggalluvial)
library(naniar)
library(knitr)
library(forcats)
library(kableExtra)
library(webshot2)
library(scales)
library(sf)
library(tigris)
library(viridis)
library(stringr)
library(sandwich)
library(ggeffects)
library(lmtest)
#Functions for graphs
plot_demo_bar <- function(data, var, xlab) {
ggplot(data, aes(x = {{ var }},
y = after_stat(..count.. / sum(..count..)),
fill = {{ var }})) +
geom_bar(color = "black") +
scale_y_continuous(
expand = c(0, 0),
name = "Percent",
labels = scales::percent_format(accuracy = 0.1)
) +
scale_fill_brewer(palette = "Pastel4") +
labs(x = xlab) +
theme_minimal(base_size = 12) +
theme(axis.text.x = element_text(angle = 30, hjust = 1))
}
#Importing the Data
Pesticide<- read_excel("Pesticide data.xlsx")
## New names:
## • `Q32...205` -> `Q32...190`
## • `Q32...207` -> `Q32...192`
##Columns that are completely deleted due to privacy or not needed
Pesticide$IPAddress<- NULL
Pesticide$Status<- NULL
Pesticide$RecipientLastName<- NULL
Pesticide$RecipientFirstName<- NULL
Pesticide$RecipientEmail<- NULL
Pesticide$ExternalReference<- NULL
Pesticide$LocationLatitude<- NULL
Pesticide$LocationLongitude<- NULL
Pesticide$StartDate<- NULL
Pesticide$EndDate<- NULL
Pesticide$Progress<- NULL
Pesticide$`Duration (in seconds)`<- NULL
Pesticide$Finished<- NULL
Pesticide$RecordedDate<- NULL
Pesticide_Clean<- read.csv("cleaned_Pesticide.csv")
Pesticide_Clean<- Pesticide_Clean[-1,]
#Qustion 1
Pesticide_License_Type<- Pesticide_Clean$Q2
Pesticide_License_Type_NA<- ifelse(is.na(Pesticide_License_Type), "Missing", as.character(Pesticide_License_Type))
recode for bar chart
Pesticide_License_Type <- as.data.frame(Pesticide_License_Type)
Pesticide_License_Type_recode <- Pesticide_License_Type %>%
mutate(
Applicator = dplyr::recode(Pesticide_License_Type,
"1" = "Private Applicator",
"2" = "Commerical Applicator",
"3" = "Commerical Applicator with Contractor License")
)
Pesticide_License_Type_recode <- Pesticide_License_Type_recode %>% drop_na()
applicator <- ggplot(data = Pesticide_License_Type_recode, aes(x = Applicator, fill = Applicator)) +
geom_bar() +
geom_text(stat = "count", aes(label = after_stat(count)),
vjust = -0.3, size = 4)+
scale_fill_manual(
values = c("steelblue", "darkorchid", "deeppink2", "darkred")
) +
labs(x = "License Type", y = "Count", fill = "License Type", title = "Counts of Type of Pesticide Applicator") +
theme_void()+
theme(axis.text.x = element_blank(),
axis.ticks.x = element_blank())
applicator
ggsave("~/Desktop/UGA/Fall 2025/STAT 5010W/Project/Charts/applicator_piechart.png", applicator)
## Saving 7 x 5 in image
Pesticide_License_Type <- Pesticide_License_Type %>% drop_na()
applicator <- plot_demo_bar(data=Pesticide_License_Type_recode, Applicator, "Type of Pesticide Applicator")
## Warning: Unknown palette: "Pastel4"
applicator
ggsave("~/Desktop/UGA/Fall 2025/STAT 5010W/Project/Charts/applicator_barchart.png", applicator)
## Saving 7 x 5 in image
length(Pesticide_License_Type) #Total Responses
## [1] 1
table(Pesticide_License_Type, useNA = "ifany")
## Pesticide_License_Type
## 1 2 3
## 556 38 3
prop.table(table(Pesticide_License_Type))*100 #Without Missing Data
## Pesticide_License_Type
## 1 2 3
## 93.1323283 6.3651591 0.5025126
prop.table(table(Pesticide_License_Type, useNA = "ifany"))*100 #With Missing Data
## Pesticide_License_Type
## 1 2 3
## 93.1323283 6.3651591 0.5025126
#Question 2
counts_Q2<- colSums(Pesticide_Clean[, c("Q3_1", "Q3_2", "Q3_3", "Q3_4", "Q3_5", "Q3_6", "Q3_7", "Q3_8", "Q3_9")] == 1, na.rm = TRUE)
table_Q2<- data.frame(
option = names(counts_Q2),
count = counts_Q2,
percent = round(counts_Q2/ sum(counts_Q2) *100, 1)
)
table_Q2
sum(rowSums(is.na(Pesticide_Clean[, c("Q3_1", "Q3_2", "Q3_3", "Q3_4", "Q3_5", "Q3_6", "Q3_7", "Q3_8", "Q3_9")])) == 9)
## [1] 36
(36/633) *100 # Percent Missing
## [1] 5.687204
#Question 3
Q2_reg<- Pesticide_Clean[, c("Q3_1", "Q3_2", "Q3_3", "Q3_4", "Q3_5", "Q3_6", "Q3_7", "Q3_8", "Q3_9")]
Q3_reg<- Pesticide_Clean[, c("Q4_1", "Q4_2", "Q4_3", "Q4_4", "Q4_5", "Q4_6", "Q4_7", "Q4_8", "Q4_9", "Q4_10", "Q4_11")]
Q2<- Pesticide_Clean[, c("Q3_1", "Q3_2", "Q3_3", "Q3_4", "Q3_5", "Q3_6", "Q3_7", "Q3_8", "Q3_9")]
Q3<- Pesticide_Clean[, c("Q4_1", "Q4_2", "Q4_3", "Q4_4", "Q4_5", "Q4_6", "Q4_7", "Q4_8", "Q4_9", "Q4_10", "Q4_11")]
Q2[is.na(Q2)] <- 0
Q3[is.na(Q3)] <- 0
skipLogic_Q3<- list(
Q4_1 = Q2$Q3_1 == 1 | Q2$Q3_2 == 1,
Q4_2 = Q2$Q3_4 == 1,
Q4_3 = Q2$Q3_5 == 1,
Q4_4 = Q2$Q3_3 == 1,
Q4_5 = Q2$Q3_2 == 1,
Q4_6 = Q2$Q3_6 == 1,
Q4_7 = Q2$Q3_7 == 1,
Q4_8 = Q2$Q3_1 == 1 | Q2$Q3_2 == 1,
Q4_9 = Q2$Q3_8 == 1,
Q4_10 = Q2$Q3_1 == 1 | Q2$Q3_2 == 1,
Q4_11 = Q2$Q3_8 == 1
)
Q4_table_title<- c(Q4_1 = "Plant Ag",
Q4_2 = "Animal Ag",
Q4_3 = "Forestry",
Q4_4 = "Ornamental/Turf",
Q4_5 = "Seed Treatment",
Q4_6 = "Aquatic",
Q4_7 = "ROW",
Q4_8 = "Aerial Equipment",
Q4_9 = "Pest Control",
Q4_10 = "Ag Commodity Fumigation",
Q4_11 = "Mosquito Control")
result_Q3<- data.frame(
Option = names(skipLogic_Q3),
Eligible = sapply(skipLogic_Q3, sum),
Selected = sapply(names(skipLogic_Q3), function(col) {
skip<- skipLogic_Q3[[col]]
sum(Q3[skip, col] == 1, na.rm = TRUE)
}
)
)
result_Q3$Percentage<- round(result_Q3$Selected / result_Q3$Eligible *100, 1)
result_Q3$Option<- Q4_table_title[result_Q3$Option]
results_Q3_table_ready<- result_Q3 %>%
rename(
"Pesticide License" = Option,
"Eligible Participants" = Eligible,
"Selected Participants" = Selected,
"Percentage Selected (%)" = Percentage
)
kable(results_Q3_table_ready,
caption = "Survey Question 3 Eligibility and Selection Summary",
row.names = FALSE)
| Pesticide License | Eligible Participants | Selected Participants | Percentage Selected (%) |
|---|---|---|---|
| Plant Ag | 360 | 331 | 91.9 |
| Animal Ag | 189 | 138 | 73.0 |
| Forestry | 136 | 101 | 74.3 |
| Ornamental/Turf | 87 | 67 | 77.0 |
| Seed Treatment | 211 | 26 | 12.3 |
| Aquatic | 24 | 14 | 58.3 |
| ROW | 26 | 15 | 57.7 |
| Aerial Equipment | 360 | 4 | 1.1 |
| Pest Control | 109 | 14 | 12.8 |
| Ag Commodity Fumigation | 360 | 18 | 5.0 |
| Mosquito Control | 109 | 17 | 15.6 |
#Q2 & Q3 Alluvial Plot:
Q2_Q3_Skip_Labels <- c(
Q3_1 = "Fruits/Vegetables",
Q3_2 = "Row Crops",
Q3_3 = "Green Industry",
Q3_4 = "Livestock",
Q3_5 = "Forestry",
Q3_6 = "Aquatic",
Q3_7 = "Utility/ROW",
Q3_8 = "Pest Management",
Q3_9 = "Other Industry",
Q4_1 = "Plant Ag",
Q4_2 = "Animal Ag",
Q4_3 = "Forestry",
Q4_4 = "Ornamental/Turf",
Q4_5 = "Seed Treatment",
Q4_6 = "Aquatic",
Q4_7 = "ROW",
Q4_8 = "Aerial Equipment",
Q4_9 = "Pest Control",
Q4_10 = "Ag Commodity Fumigation",
Q4_11 = "Mosquito Control"
)
Q2_long<- Q2 %>%
mutate(ID = 1:n()) %>%
pivot_longer(
cols = starts_with("Q3_"),
names_to = ("Q2_option"),
values_to = "selected"
) %>%
filter(selected == 1)
Q3_long<- Q3 %>%
mutate(ID = 1:n()) %>%
pivot_longer(
cols = starts_with("Q4_"),
names_to = ("Q3_option"),
values_to = "selected"
) %>%
filter(selected == 1)
long_format_merge_2_3<- merge(Q2_long[, c("ID", "Q2_option")],
Q3_long[, c("ID", "Q3_option")],
by = "ID")
long_format_merge_2_3$Q2_option<- Q2_Q3_Skip_Labels[long_format_merge_2_3$Q2_option]
long_format_merge_2_3$Q3_option<- Q2_Q3_Skip_Labels[long_format_merge_2_3$Q3_option]
Q2_colors<- c(
"Fruits/Vegetables" = "#8DD3C7",
"Row Crops" = "gold",
"Green Industry" ="#BEBADA",
"Livestock" ="#FB8072",
"Forestry" ="#80B1D3",
"Aquatic" ="#FBD462",
"Utility/ROW" ="#B3DE69",
"Pest Management" ="#FCCDE5",
"Other Industry" ="#D9D9D9"
)
alluvial <- ggplot(long_format_merge_2_3, aes(axis1 = Q2_option, axis2 = Q3_option)) +
geom_alluvium(aes(fill = Q2_option), alpha = 0.9, width = 1/15) +
geom_stratum(width = 1/8, fill= "grey95", color="black") +
geom_label(stat = "stratum", aes(label = after_stat(stratum)), size = 3) +
scale_fill_manual(values = Q2_colors,
name = "Industry Options") +
scale_x_discrete(limits = c("Survey Q2", "Survey Q3")) +
theme_void(base_size = (12))
alluvial
ggsave("~/Desktop/UGA/Fall 2025/STAT 5010W/Project/Charts/alluvial.png", alluvial)
## Saving 7 x 5 in image
#Question 4
Question4= Pesticide_Clean %>%
select(County_Code = Q24_1)
Question4= Question4 %>%
mutate(
County = dplyr::recode(as.character(County_Code),
"1"="Appling","2"="Atkinson","3"="Bacon","4"="Baker","5"="Baldwin",
"6"="Banks","7"="Barrow","8"="Bartow","9"="Ben Hill","10"="Berrien",
"11"="Bibb","12"="Bleckley","13"="Brantley","14"="Brooks","15"="Bryan",
"16"="Bulloch","17"="Burke","18"="Butts","19"="Calhoun","20"="Camden",
"21"="Candler","22"="Carroll","23"="Catoosa","24"="Charlton","25"="Chatham",
"26"="Chattahoochee","27"="Chattooga","28"="Cherokee","29"="Clarke","30"="Clay",
"31"="Clayton","32"="Clinch","33"="Cobb","34"="Coffee","35"="Colquitt",
"36"="Columbia","37"="Cook","38"="Coweta","39"="Crawford","40"="Crisp",
"41"="Dade","42"="Dawson","43"="Decatur","44"="DeKalb","45"="Dodge",
"46"="Dooly","47"="Dougherty","48"="Douglas","49"="Early","50"="Echols",
"51"="Effingham","52"="Elbert","53"="Emanuel","54"="Evans","55"="Fannin",
"56"="Fayette","57"="Floyd","58"="Forsyth","59"="Franklin","60"="Fulton",
"61"="Gilmer","62"="Glascock","63"="Glynn","64"="Gordon","65"="Grady",
"66"="Greene","67"="Gwinnett","68"="Habersham","69"="Hall","70"="Hancock",
"71"="Haralson","72"="Harris","73"="Hart","74"="Heard","75"="Henry",
"76"="Houston","77"="Irwin","78"="Jackson","79"="Jasper","80"="Jeff Davis",
"81"="Jefferson","82"="Jenkins","83"="Johnson","84"="Jones","85"="Lamar",
"86"="Lanier","87"="Laurens","88"="Lee","89"="Liberty","90"="Lincoln",
"91"="Long","92"="Lowndes","93"="Lumpkin","94"="McDuffie","95"="McIntosh",
"96"="Macon","97"="Madison","98"="Marion","99"="Meriwether","100"="Miller",
"101"="Mitchell","102"="Monroe","103"="Montgomery","104"="Morgan","105"="Murray",
"106"="Muscogee","107"="Newton","108"="Oconee","109"="Oglethorpe","110"="Paulding",
"111"="Peach","112"="Pickens","113"="Pierce","114"="Pike","115"="Polk",
"116"="Pulaski","117"="Putnam","118"="Quitman","119"="Rabun","120"="Randolph",
"121"="Richmond","122"="Rockdale","123"="Schley","124"="Screven","125"="Seminole",
"126"="Spalding","127"="Stephens","128"="Stewart","129"="Sumter","130"="Talbot",
"131"="Taliaferro","132"="Tattnall","133"="Taylor","134"="Telfair","135"="Terrell",
"136"="Thomas","137"="Tift","138"="Toombs","139"="Towns","140"="Treutlen",
"141"="Troup","142"="Turner","143"="Twiggs","144"="Union","145"="Upson",
"146"="Walker","147"="Walton","148"="Ware","149"="Warren","150"="Washington",
"151"="Wayne","152"="Webster","153"="Wheeler","154"="White","155"="Whitfield",
"156"="Wilcox","157"="Wilkes","158"="Wilkinson","159"="Worth"))
#Checking Missingness
data.frame(
Total = nrow(Question4),
Missing = sum(is.na(Question4$County)),
Percent_Missing = mean(is.na(Question4$County))*100
)
County_Counts= Question4%>%
count(County, name = "responses")
options(tigris_use_cache = TRUE)
ga_map= counties(state = "GA", class = "sf")
## Retrieving data for the year 2024
ga_map$County= gsub(" County", "", ga_map$NAME)
map_data= ga_map %>%
left_join(County_Counts, by = "County")
map_data$responses[is.na(map_data$responses)] <- 0
ggplot(map_data) +
geom_sf(aes(fill = responses), color = "white", size = 0.15) +
scale_fill_viridis(option = "plasma", direction = -1) +
labs(
title = "Survey Respondents by Georgia County",
subtitle = "Question 4: County Location",
fill = "Responses") +
theme_void() +
theme(
plot.title = element_text(face = "bold", size = 14),
legend.position = "right")
#Question 5
Pesticide_Clean$Q6<- factor(
Pesticide_Clean$Q6,
levels = c(1, 2),
labels = c("Yes", "No")
)
table(Pesticide_Clean$Q6, useNA = "ifany")
##
## Yes No <NA>
## 107 456 70
#Need more info to continue
#Question 6
Q6_data <- as.data.frame(Pesticide_Clean[,186])
colnames(Q6_data) <- c("Employees")
Q6_data$Employees <- as.character(Q6_data$Employees)
Q6_data <- Q6_data %>%
mutate(
Employees = dplyr::recode(as.character(Employees),
"1" = "Sole operator",
"2" = "Between 1 and 10 employees",
"3" = "Between 11 and 24 employees",
"4" = "Between 25 and 50 employees",
"5" = "Between 51 and 75 employees",
"6" = "Between 76 and 100 employees",
"7" = "More than 100 employees"
),
Employees = factor(Employees, levels = c(
"Sole operator",
"Between 1 and 10 employees",
"Between 11 and 24 employees",
"Between 25 and 50 employees",
"Between 51 and 75 employees",
"Between 76 and 100 employees",
"More than 100 employees"
))
) %>%
filter(!is.na(Employees))
Q6_table <- Q6_data %>%
count(Employees, name = "Count") %>%
arrange(Employees) %>% # optional but nice
mutate(
Percent_num = 100 * Count / sum(Count),
Percent = paste0(round(Percent_num, 1), "%")
) %>%
select(-Percent_num)
Q6_table %>%
kable(
caption = "Survey Question 6: Number of Employees",
col.names = c("Number of Employees", "Count", "Percent"),
align = c("l", "c", "c"),
booktabs = TRUE
) %>%
kable_styling(
full_width = FALSE,
bootstrap_options = c("striped", "hover", "condensed")
)
| Number of Employees | Count | Percent |
|---|---|---|
| Sole operator | 332 | 59% |
| Between 1 and 10 employees | 170 | 30.2% |
| Between 11 and 24 employees | 30 | 5.3% |
| Between 25 and 50 employees | 11 | 2% |
| Between 51 and 75 employees | 5 | 0.9% |
| Between 76 and 100 employees | 2 | 0.4% |
| More than 100 employees | 13 | 2.3% |
#Question 7
Q7_data <- as.data.frame(Pesticide_Clean[,187])
colnames(Q7_data) <- c("Years")
Q7_data <- Q7_data %>%
mutate(
Years = dplyr::recode(as.character(Years),
"1" = "Less than 1 year",
"2" = "Between 1 to 5 years",
"3" = "Between 6 to 10 years",
"4" = "Between 11 to 15 years",
"5" = "Between 16 to 20 years",
"6" = "More than 20 years"
),
Years = factor(Years, levels = c(
"Less than 1 year",
"Between 1 to 5 years",
"Between 6 to 10 years",
"Between 11 to 15 years",
"Between 16 to 20 years",
"More than 20 years"
))
) %>%
filter(!is.na(Years))
Q7_table <- Q7_data %>%
count(Years, name = "Count") %>%
arrange(Years) %>% # optional but explicit
mutate(
Percent_num = 100 * Count / sum(Count),
Percent = paste0(round(Percent_num, 1), "%")
) %>%
select(-Percent_num)
Q7_table %>%
kable(
caption = "Survey Question 7: How long have you held a license?",
col.names = c("Years Licensed", "Count", "Percent"),
align = c("l", "c", "c"),
booktabs = TRUE
) %>%
kable_styling(
full_width = FALSE,
bootstrap_options = c("striped", "hover", "condensed")
)
| Years Licensed | Count | Percent |
|---|---|---|
| Less than 1 year | 14 | 2.6% |
| Between 1 to 5 years | 203 | 37.7% |
| Between 6 to 10 years | 112 | 20.8% |
| Between 11 to 15 years | 54 | 10% |
| Between 16 to 20 years | 50 | 9.3% |
| More than 20 years | 106 | 19.7% |
#Question 8
Pesticide_Clean$Q10 <- as.numeric(as.character(Pesticide_Clean$Q10))
Pesticide_Clean$Q10<- factor(Pesticide_Clean$Q10,
levels = c(1, 2),
labels = c(
"Initial Certification",
"Recertification"
)
)
Pesticide_Clean$Q10<- fct_explicit_na(
Pesticide_Clean$Q10,
na_level = "Missing"
)
## Warning: `fct_explicit_na()` was deprecated in forcats 1.0.0.
## ℹ Please use `fct_na_value_to_level()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
counts_Q8<- table(Pesticide_Clean$Q10, useNA = "ifany")
counts_Q8
##
## Initial Certification Recertification Missing
## 222 317 94
percent_Q8<- round(prop.table(counts_Q8)*100, 1)
percent_Q8
##
## Initial Certification Recertification Missing
## 35.1 50.1 14.8
table_Q8<- data.frame(
Status = names(counts_Q8),
Count = as.numeric(counts_Q8),
Percent = percent_Q8
)
table_Q8$Percent.Var1<- NULL
table_Q8
##Certification Graph
#Fixing Data for Q9 and Q13
Q13<- Pesticide_Clean[, c("Q12_1", "Q12_2", "Q12_3", "Q12_4", "Q12_5", "Q12_6", "Q12_7")]
colnames(Q13)<- c(
"No training offered when I needed them",
"Training sessions were not held at a convenient time",
"The training costs too much money",
"A different trainer was closer",
"I attended a full day training workshop to receive multiple licenses",
"The instructor at my class was from UGA but a different company sponsored it",
"Other"
)
Q13_freq<- colSums(Q13 == 1, na.rm = TRUE)
Q13_freq
## No training offered when I needed them
## 16
## Training sessions were not held at a convenient time
## 13
## The training costs too much money
## 1
## A different trainer was closer
## 7
## I attended a full day training workshop to receive multiple licenses
## 3
## The instructor at my class was from UGA but a different company sponsored it
## 5
## Other
## 29
Pesticide_Clean$Q12_7_TEXT<- str_trim(tolower(Pesticide_Clean$Q12_7_TEXT))
Pesticide_Clean$Q12_7_TEXT<- gsub("extion", "extension", Pesticide_Clean$Q12_7_TEXT)
Pesticide_Clean$Q12_7_TEXT<- gsub("4", "extension", Pesticide_Clean$Q12_7_TEXT)
fix_skip_logic_Q9_Q13<- Pesticide_Clean$Q11_2 == 1 & Pesticide_Clean$Q12_7 == 1 & grepl("extension", Pesticide_Clean$Q12_7_TEXT, ignore.case = TRUE)
sum(fix_skip_logic_Q9_Q13)
## [1] 7
Pesticide_Clean$Q11_1[fix_skip_logic_Q9_Q13]<- 1
Pesticide_Clean$Q11_2[fix_skip_logic_Q9_Q13]<- NA
Q13_columns<- c("Q12_1", "Q12_2", "Q12_3", "Q12_4", "Q12_5", "Q12_6", "Q12_7")
Pesticide_Clean[fix_skip_logic_Q9_Q13, Q13_columns]<- NA
Pesticide_Clean$Q12_7_TEXT[fix_skip_logic_Q9_Q13]<- NA
#Question 9
counts_Q9<- colSums(Pesticide_Clean[, c("Q11_1", "Q11_2")] == 1, na.rm =TRUE)
names(counts_Q9)<- c("UGA", "Other")
eligibible_Q9<- sum(Pesticide_Clean$Q10 == "Recertification", na.rm = TRUE)
eligibible_Q9
## [1] 317
percent_Q9<- round(counts_Q9 / eligibible_Q9 * 100, 1)
percent_Q9
## UGA Other
## 85.8 17.0
table_Q9<- data.frame(
Training = names(counts_Q9),
Selected = as.numeric(counts_Q9),
Percent = percent_Q9
)
table_Q9
#Question 10
unique(Pesticide_Clean$Q32...205)
## [1] "0"
## [2] "can't remember"
## [3] "$20 "
## [4] NA
## [5] "$10 "
## [6] "$25 "
## [7] "$25.00 "
## [8] "15"
## [9] "25"
## [10] "$5 "
## [11] "$30 "
## [12] "10"
## [13] "$15 "
## [14] "Nothing"
## [15] "35"
## [16] "20"
## [17] "I0"
## [18] "i am not sure . may be 25.00"
## [19] "don't remember"
## [20] "I don’t remember"
## [21] "It was online---do not remember exact cost"
## [22] "50-100"
## [23] "not sure"
## [24] "$35 "
## [25] "50"
## [26] "$20.00 Application Fee"
## [27] "100"
## [28] "Not sure"
## [29] "5"
## [30] "don't remember - maybe $50"
## [31] "O"
## [32] "7.50 per hour"
## [33] "40"
## [34] "Don't remember"
## [35] "$5 per credit"
## [36] "0\n"
## [37] "Don’t remember"
## [38] "$12.50 "
## [39] "30?"
## [40] "I don't remember"
## [41] "?"
## [42] "110"
## [43] "$75 "
## [44] "$35.00 "
## [45] "N/a"
## [46] "$50.00 "
## [47] "Unsure"
## [48] "8"
Q10_unfiltered<- str_trim(tolower(Pesticide_Clean$Q32...205))
Q10_unfiltered<- gsub("\\$", "", Q10_unfiltered)
Q10_unfiltered<- gsub("\\?", "", Q10_unfiltered)
Q10_unfiltered<- gsub("application fee", "", Q10_unfiltered)
Q10_unfiltered<- gsub("per hour", "", Q10_unfiltered)
Q10_unfiltered<- gsub("i", "", Q10_unfiltered)
Q10_unfiltered[Q10_unfiltered == "nothng"]<- 0
Q10_clean<- Q10_unfiltered
Q10_miss_NA<- rep("Valid", length(Q10_clean))
excluded_values<- c(
"can't remember", " am not sure . may be 25.00", "don't remember", "don't remember", "t was onlne---do not remember exact cost", "not sure", "don't remember - maybe 50", "5 per credt", " don't remember", "?", "unsure", "o", "", "n/a", "don’t remember", " don’t remember", "50-100"
)
Q10_miss_NA[Q10_clean %in% excluded_values]<- "Excluded"
Q10_clean[Q10_clean %in% excluded_values]<- NA
Q10_miss_NA[is.na(Pesticide_Clean$Q32...205)]<- "Missing"
unique(Q10_clean)
## [1] "0" NA "20" "10" "25" "25.00" "15" "5"
## [9] "30" "35" "50" "20.00 " "100" "7.50 " "40" "12.50"
## [17] "110" "75" "35.00" "50.00" "8"
Q10_numeric<- as.numeric(gsub("[^0-9\\.]", "", Q10_clean))
Q10_ranges<- cut(
Q10_numeric,
breaks = c(-1, 0, 10, 30, 50, Inf),
labels = c("$0", "$1-10", "$11-30", "$31-50", "$51+"),
right = TRUE
)
counts_Q10<- table(Q10_ranges, useNA = "ifany")
counts_Q10
## Q10_ranges
## $0 $1-10 $11-30 $31-50 $51+ <NA>
## 459 15 27 10 3 119
percent_Q10<- round(prop.table(counts_Q10)*100, 1)
percent_Q10
## Q10_ranges
## $0 $1-10 $11-30 $31-50 $51+ <NA>
## 72.5 2.4 4.3 1.6 0.5 18.8
table_Q10<- data.frame(
Status = names(counts_Q10),
Count = as.numeric(counts_Q10),
Percent = percent_Q10
)
table_Q10$Percent.Q10_ranges<- NULL
table_Q10
kable(
table_Q10 %>% filter(!is.na(table_Q10$Status)),
col.names = c("Cost Range ($)", "Count", "Percent(%)")
)
| Cost Range ($) | Count | Percent(%) |
|---|---|---|
| $0 | 459 | 72.5 |
| $1-10 | 15 | 2.4 |
| $11-30 | 27 | 4.3 |
| $31-50 | 10 | 1.6 |
| $51+ | 3 | 0.5 |
#Bar Chart including the NA
#In this question NA is missing data and the values that were imput that were not appropriate for the question.
Q10_data_fram<- data.frame(Q10_ranges = fct_explicit_na(Q10_ranges, na_level = "Missing"), Status = Q10_miss_NA)
Q10_data_fram$Q10_ranges[Q10_data_fram$Status == "Excluded"]<- "Excluded"
## Warning in `[<-.factor`(`*tmp*`, Q10_data_fram$Status == "Excluded", value =
## structure(c(1L, : invalid factor level, NA generated
Q10_data_fram$Q10_ranges<- factor(Q10_data_fram$Q10_ranges)
ggplot(Q10_data_fram, aes(x = Q10_ranges)) +
geom_bar(fill = "coral") +
geom_text(stat = "count", aes(label = ..count..)) +
labs(
title = "Cost of Virtual Training per Hour",
x = "Cost Range ($ per hour)",
y = "Number of responses"
) +
theme_minimal()
## Warning: The dot-dot notation (`..count..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(count)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
#Question 11
unique(Pesticide_Clean$Q31)
## [1] "25" "0" "can't remember" "45"
## [5] "50" NA "30" "$75 "
## [9] "150" "$30 in person" "$25.00 " "60"
## [13] "Online classes" "$30 " "10" "40"
## [17] "75" "?" "20" "0\n"
## [21] "don't remember" "I don’t recall" "Less than $50" "100 (I think)"
## [25] "100" "200" "Not sure" "dont remember"
## [29] "Don't remember" "O" "Don’t remember" "2"
## [33] "15" "29" "N/a"
Q11_unfiltered<- str_trim(tolower(Pesticide_Clean$Q31))
Q11_unfiltered<- gsub("\\$", "", Q11_unfiltered)
Q11_unfiltered<- gsub("in person", "", Q11_unfiltered)
Q11_unfiltered<- gsub("i think", "", Q11_unfiltered)
Q11_unfiltered<- gsub("\\()", "", Q11_unfiltered)
Q11_clean<- Q11_unfiltered
Q11_miss_NA<- rep("Valid", length(Q11_clean))
excluded_values<- c( "?", "i don’t recall", "don't remember", "n/a", "o", "can't remember", "NA", "don't remember", "not sure","don't remember", "dont remember", "online classes", "don’t remember", "less than 50"
)
Q11_miss_NA[Q11_clean %in% excluded_values]<- "Excluded"
Q11_clean[Q11_clean %in% excluded_values]<- NA
Q11_miss_NA[is.na(Pesticide_Clean$Q31)]<- "Missing"
unique(Q11_clean)
## [1] "25" "0" NA "45" "50" "30" "75" "150" "30 "
## [10] "25.00" "60" "10" "40" "20" "100 " "100" "200" "2"
## [19] "15" "29"
Q11_numeric<- as.numeric(gsub("[^0-9\\.]", "", Q11_clean))
Q11_ranges<- cut(
Q11_numeric,
breaks = c(-1, 0, 10, 30, 50, Inf),
labels = c("0", "1-10", "11-30", "31-50", "51+"),
right = TRUE
)
counts_Q11<- table(Q11_ranges, useNA = "ifany")
counts_Q11
## Q11_ranges
## 0 1-10 11-30 31-50 51+ <NA>
## 489 3 17 6 7 111
percent_Q11<- round(prop.table(counts_Q11)*100, 1)
percent_Q11
## Q11_ranges
## 0 1-10 11-30 31-50 51+ <NA>
## 77.3 0.5 2.7 0.9 1.1 17.5
table_Q11<- data.frame(
Status = names(counts_Q11),
Count = as.numeric(counts_Q11),
Percent = percent_Q11
)
table_Q11$Percent.Q11_ranges<- NULL
table_Q11
kable(
table_Q11 %>% filter(!is.na(table_Q11$Status)),
col.names = c("Cost Range ($)", "Count", "Percent(%)")
)
| Cost Range ($) | Count | Percent(%) |
|---|---|---|
| 0 | 489 | 77.3 |
| 1-10 | 3 | 0.5 |
| 11-30 | 17 | 2.7 |
| 31-50 | 6 | 0.9 |
| 51+ | 7 | 1.1 |
#In this question NA is missing data and the values that were imput that were not appropriate for the question.
Q11_data_fram<- data.frame(Q11_ranges = fct_explicit_na(Q11_ranges, na_level = "Missing"), Status = Q11_miss_NA)
Q11_data_fram$Q11_ranges[Q11_data_fram$Status == "Excluded"]<- "Excluded"
## Warning in `[<-.factor`(`*tmp*`, Q11_data_fram$Status == "Excluded", value =
## structure(c(3L, : invalid factor level, NA generated
Q11_data_fram$Q11_ranges<- factor(Q11_data_fram$Q11_ranges)
ggplot(Q11_data_fram, aes(x = Q11_ranges)) +
geom_bar(fill = "steelblue") +
geom_text(stat = "count", aes(label = ..count..)) +
labs(
title = "Cost of Virtual Training per Half Day",
x = "Cost Range ($ per hour)",
y = "Number of responses"
) +
theme_minimal()
#Question 12
unique(Pesticide_Clean$Q32...207)
## [1] "50"
## [2] "0"
## [3] "can't remember"
## [4] "90"
## [5] "75"
## [6] NA
## [7] "70"
## [8] "125"
## [9] "$125 "
## [10] "200"
## [11] "50?"
## [12] "$40-50 in person"
## [13] "35"
## [14] "$50.00 "
## [15] "120"
## [16] "Online classes"
## [17] "20"
## [18] "25"
## [19] "40"
## [20] "85"
## [21] "60"
## [22] "0\n"
## [23] "don't remember"
## [24] "I don’t recalll. Approximately 250$"
## [25] "o"
## [26] "Less than $50"
## [27] "50-100"
## [28] "100"
## [29] "300"
## [30] "Not sure"
## [31] "dont remember"
## [32] "10"
## [33] "Don't remember"
## [34] "Don’t remember"
## [35] "O"
## [36] "15"
## [37] "N/a"
## [38] "Maybe $50 don't remember"
## [39] "\n0"
Q12_unfiltered<- str_trim(tolower(Pesticide_Clean$Q32...207))
Q12_unfiltered<- gsub("\\$", "", Q12_unfiltered)
Q12_unfiltered<- gsub("\\?", "", Q12_unfiltered)
Q12_unfiltered<- gsub("in person", "", Q12_unfiltered)
Q12_clean<- Q12_unfiltered
Q12_miss_NA<- rep("Valid", length(Q12_clean))
excluded_values<- c("can't remember", "online classes", "don't remember", "less than 50", "o", "not sure", "dont remember", "don’t remember", "maybe 50 don't remember"
)
Q12_miss_NA[Q12_clean %in% excluded_values]<- "Excluded"
Q12_clean[Q12_clean %in% excluded_values]<- NA
Q12_miss_NA[is.na(Pesticide_Clean$Q32...207)]<- "Missing"
Q12_clean[Q12_clean %in% c("40-50 ")]<- 45
Q12_clean[Q12_clean %in% c("50-100")]<- 75
Q12_clean[Q12_clean %in% c("i don’t recalll. approximately 250")]<- 250
unique(Q12_clean)
## [1] "50" "0" NA "90" "75" "70" "125" "200" "45"
## [10] "35" "50.00" "120" "20" "25" "40" "85" "60" "250"
## [19] "100" "300" "10" "15" "n/a"
Q12_numeric<- as.numeric(gsub("[^0-9\\.]", "", Q12_clean))
Q12_ranges<- cut(
Q12_numeric,
breaks = c(-1, 0, 10, 30, 50, Inf),
labels = c("0", "1-10", "11-30", "31-50", "51+"),
right = TRUE
)
counts_Q12<- table(Q12_ranges, useNA = "ifany")
counts_Q12
## Q12_ranges
## 0 1-10 11-30 31-50 51+ <NA>
## 489 1 8 11 15 109
percent_Q12<- round(prop.table(counts_Q12)*100, 1)
percent_Q12
## Q12_ranges
## 0 1-10 11-30 31-50 51+ <NA>
## 77.3 0.2 1.3 1.7 2.4 17.2
table_Q12<- data.frame(
Status = names(counts_Q12),
Count = as.numeric(counts_Q12),
Percent = percent_Q12
)
table_Q12$Percent.Q12_ranges<- NULL
table_Q12
kable(
table_Q12 %>% filter (!is.na(table_Q12$Status)),
col.names = c("Cost Range ($)", "Count", "Percent(%)")
)
| Cost Range ($) | Count | Percent(%) |
|---|---|---|
| 0 | 489 | 77.3 |
| 1-10 | 1 | 0.2 |
| 11-30 | 8 | 1.3 |
| 31-50 | 11 | 1.7 |
| 51+ | 15 | 2.4 |
#In this question NA is missing data and the values that were imput that were not appropriate for the question.
Q12_data_fram<- data.frame(Q12_ranges = fct_explicit_na(Q12_ranges, na_level = "Missing"), Status = Q12_miss_NA)
Q12_data_fram$Q12_ranges[Q12_data_fram$Status == "Excluded"]<- "Excluded"
## Warning in `[<-.factor`(`*tmp*`, Q12_data_fram$Status == "Excluded", value =
## structure(c(4L, : invalid factor level, NA generated
Q12_data_fram$Q12_ranges<- factor(Q12_data_fram$Q12_ranges)
ggplot(Q12_data_fram, aes(x = Q12_ranges)) +
geom_bar(fill = "violet") +
geom_text(stat = "count", aes(label = ..count..)) +
labs(
title = "Cost of A Full Day of Non UGA Training",
x = "Cost Range ($ per hour)",
y = "Number of responses"
) +
theme_minimal()
#Code to help create table
eligible_Q12 <- Pesticide_Clean$Q10 == "Recertification"
eligible_Q12[is.na(eligible_Q12)] <- FALSE
has_valid_Q12 <- !is.na(Q12_numeric)
Q12_missing_status <- dplyr::case_when(
!eligible_Q12 ~ "Not Eligible (Skip-Logic)",
eligible_Q12 & has_valid_Q12 ~ "Eligible, Answered",
eligible_Q12 & !has_valid_Q12 ~ "Eligible, No Response"
)
table(Q12_missing_status)
## Q12_missing_status
## Eligible, Answered Eligible, No Response Not Eligible (Skip-Logic)
## 308 9 316
#Table for Q8 and Q12
N_total <- nrow(Pesticide_Clean)
# -------------------------
# Q8 (Certification Status)
# -------------------------
q8_missing <- sum(Pesticide_Clean$Q10 == "Missing" | is.na(Pesticide_Clean$Q10), na.rm = TRUE)
q8_answered <- N_total - q8_missing
q8_resp_rate <- round(100 * q8_answered / N_total, 1)
# % nonresponse for Q8 (out of total)
q8_nonresp_pct <- round(100 * q8_missing / N_total, 1)
# -------------------------
# Q12 (Training Cost) — shown only if Recertification
# -------------------------
eligible_q12 <- Pesticide_Clean$Q10 == "Recertification"
eligible_q12[is.na(eligible_q12)] <- FALSE
N_eligible_q12 <- sum(eligible_q12)
N_not_eligible_q12 <- N_total - N_eligible_q12
answered_q12 <- eligible_q12 & !is.na(Q12_numeric)
N_answered_q12 <- sum(answered_q12, na.rm = TRUE)
N_nonresponse_q12 <- N_eligible_q12 - N_answered_q12
q12_resp_rate_eligible <- ifelse(
N_eligible_q12 > 0,
round(100 * N_answered_q12 / N_eligible_q12, 1),
NA_real_
)
# % nonresponse for Q12 (out of eligible)
q12_nonresp_pct <- ifelse(
N_eligible_q12 > 0,
round(100 * N_nonresponse_q12 / N_eligible_q12, 1),
NA_real_
)
# % skip-logic for Q12 (out of total)
q12_skip_pct <- round(100 * N_not_eligible_q12 / N_total, 1)
# -------------------------
# Build table (your preferred column order + 2 new % columns)
# -------------------------
missingness_tbl <- tibble(
Question = c(
"Q8 (Certification Status)",
"Q12 (Training Cost; shown only if Recertification)"
),
`Total N` = c(N_total, N_total),
Answered = c(q8_answered, N_answered_q12),
`Missing (Item Nonresponse)` = c(q8_missing, N_nonresponse_q12),
`Eligible (Saw Question)` = c(NA_integer_, N_eligible_q12),
`Not Eligible (Skip-Logic)` = c(NA_integer_, N_not_eligible_q12),
`Response Rate (Eligible)` = c(
paste0(q8_resp_rate, "%"),
paste0(q12_resp_rate_eligible, "%")
),
`Nonresponse Missingness %` = c(
paste0(q8_nonresp_pct, "%"),
paste0(q12_nonresp_pct, "%")
),
`Skip-Logic Missingness %` = c(
NA_character_,
paste0(q12_skip_pct, "%")
)
)
missingness_tbl %>%
kable(
caption = "Item Nonresponse and Skip-Logic Missingness for Questions 8 and 12",
align = "lcccccccc",
booktabs = TRUE
) %>%
kable_styling(
full_width = FALSE,
bootstrap_options = c("striped", "hover", "condensed")
)
| Question | Total N | Answered | Missing (Item Nonresponse) | Eligible (Saw Question) | Not Eligible (Skip-Logic) | Response Rate (Eligible) | Nonresponse Missingness % | Skip-Logic Missingness % |
|---|---|---|---|---|---|---|---|---|
| Q8 (Certification Status) | 633 | 539 | 94 | NA | NA | 85.2% | 14.8% | NA |
| Q12 (Training Cost; shown only if Recertification) | 633 | 308 | 9 | 317 | 316 | 97.2% | 2.8% | 49.9% |
#Question 13
Q13<- Pesticide_Clean[, c("Q12_1", "Q12_2", "Q12_3", "Q12_4", "Q12_5", "Q12_6", "Q12_7")]
colnames(Q13)<- c(
"No training offered when I needed them",
"Training sessions were not held at a convenient time",
"The training costs too much money",
"A different trainer was closer",
"I attended a full day training workshop to receive multiple licenses",
"The instructor at my class was from UGA but a different company sponsored it",
"Other"
)
Q13_freq<- colSums(Q13 == 1, na.rm = TRUE)
Q13_freq
## No training offered when I needed them
## 14
## Training sessions were not held at a convenient time
## 11
## The training costs too much money
## 1
## A different trainer was closer
## 7
## I attended a full day training workshop to receive multiple licenses
## 3
## The instructor at my class was from UGA but a different company sponsored it
## 5
## Other
## 22
#Additional Breakdown of Other Reasons for Question 13
unique(Pesticide_Clean$Q12_7_TEXT)
## [1] NA
## [2] "reciprocation agreement with neighboring state"
## [3] "i just studied the manuals and went to take the initial test. i can’t even remember where. i get all my updated training at ggcsa conferences."
## [4] "i trained myself. i read the book"
## [5] "i study the books myself"
## [6] "i learned what i needed through the handbook and classes at abac."
## [7] "i attend the annual training in pecan production meeting"
## [8] "i was not aware of training."
## [9] "i found a company that provides the training and i could set the time and duration. also in the previous questions, i was asked about how much it cost based on criteria that did not represent the \"time spent\" for recertification. i can tell you that the training cost me roughly $70 for all three courses. i think you should add another choice to fully explain each situation."
## [10] "as a quality control professional in commercial & military construction i attended several different training classes over my career including the us army coe in ft stewart, robins afb as well as classes on soil erosion & control measures on jobsites & roads & highways in georgia."
## [11] "offered free training locally."
## [12] "usually get credits thru in house training"
## [13] "i took the course online"
## [14] "the topics were not relevant to my operation"
## [15] "former ohio pco who moved 10 year ago. completed pa in ga online for my small private green setup"
## [16] "got application at county agents office"
## [17] "just took test at local spot"
## [18] "went on line"
## [19] "i have the private applicators license, so i just watched the videos and took a test at the end."
## [20] "didn't know you offered"
## [21] "online training."
## [22] "on the job training"
## [23] "took online"
#Demographics
demo_data <- Pesticide_Clean[,231:235]
colnames(demo_data) = c("Gender", "Race", "Ethnicity", "Age", "Education_Level")
demo_data <- demo_data %>% mutate(across(everything(), as.character))
demo_data <- demo_data %>%
mutate(
Gender = dplyr::recode(as.character(Gender),
"1" = "Male",
"2" = "Female",
"3" = "Prefer not to answer"),
Race = dplyr::recode(as.character(Race),
"1" = "American Indian or Alaska Native",
"2" = "Asian",
"3" = "Black/African American",
"4" = "Native Hawaiian or Other Pacific Islander",
"5" = "White",
"6" = "More than one race",
"7" = "Other"),
Ethnicity = dplyr::recode(as.character(Ethnicity),
"1" = "Hispanic/Latino",
"2" = "Not Hispanic/Latino"),
Age = dplyr::recode(as.character(Age),
"9" = "Under 18",
"10" = "18–24",
"11" = "25–34",
"12" = "35–44",
"13" = "45–54",
"14" = "55–64",
"15" = "65–74",
"16" = "75–84",
"17" = "85+"),
Education_Level = dplyr::recode(as.character(Education_Level),
"1" = "Less than 9th grade",
"2" = "9th to 12th grade, no diploma",
"3" = "High school graduate or GED",
"4" = "Some college, no degree",
"5" = "Associate’s degree",
"6" = "Bachelor’s degree",
"7" = "Graduate or professional degree"),
)
#Filtering Data
demo_gender <- demo_data[, 1, drop = FALSE] %>%
mutate(
Gender = factor(Gender, levels = c(
"Male",
"Female",
"Prefer not to answer"
))
) %>%
filter(!is.na(Gender))
demo_race <- demo_data[, 2, drop = FALSE] %>%
mutate(
Race = factor(Race, levels = c(
"American Indian or Alaska Native",
"Asian",
"Black/African American",
"Native Hawaiian or Other Pacific Islander",
"White",
"More than one race",
"Other"
))
) %>%
filter(!is.na(Race), Race != "")
demo_ethnicity <- demo_data[, 3, drop = FALSE] %>%
mutate(
Ethnicity = factor(Ethnicity, levels = c(
"Hispanic/Latino",
"Not Hispanic/Latino"
))
) %>%
filter(!is.na(Ethnicity), Ethnicity != "")
demo_age <- demo_data[, 4, drop = FALSE] %>%
mutate(
Age = factor(Age, levels = c(
"Under 18",
"18–24",
"25–34",
"35–44",
"45–54",
"55–64",
"65–74",
"75–84",
"85+"
))
) %>%
filter(!is.na(Age), Age != "")
demo_edu <- demo_data[, 5, drop = FALSE] %>%
mutate(
Education_Level = factor(Education_Level, levels = c(
"Less than 9th grade",
"9th to 12th grade, no diploma",
"High school graduate or GED",
"Some college, no degree",
"Associate’s degree",
"Bachelor’s degree",
"Graduate or professional degree"
))
) %>%
filter(!is.na(Education_Level), Education_Level != "")
gender_summary <- demo_gender %>%
count(Gender) %>%
mutate(percent = n / sum(n) * 100)
gender_summary
#Demmographics with NA Values
# Gender
plot_demo_bar(demo_data, Gender, "Gender")
## Warning: Unknown palette: "Pastel4"
# Race
plot_demo_bar(demo_data, Race, "Distribution of Race")
## Warning: Unknown palette: "Pastel4"
# Ethnicity
plot_demo_bar(demo_data, Ethnicity, "Ethnicity")
## Warning: Unknown palette: "Pastel4"
# Age (categories)
plot_demo_bar(demo_data, Age, "Distribution of Age")
## Warning: Unknown palette: "Pastel4"
# Education
plot_demo_bar(demo_data, Education_Level, "Distribution of Education Level")
## Warning: Unknown palette: "Pastel4"
#Taking out NA
# Gender
gender <- plot_demo_bar(demo_gender, Gender, "Gender")
## Warning: Unknown palette: "Pastel4"
gender
ggsave("~/Desktop/UGA/Fall 2025/STAT 5010W/Project/Charts/gender_barchart.png", gender)
## Saving 7 x 5 in image
# Race
race <- plot_demo_bar(demo_race, Race, "Distribution of Race")
## Warning: Unknown palette: "Pastel4"
race
ggsave("~/Desktop/UGA/Fall 2025/STAT 5010W/Project/Charts/race_barchart.png", race)
## Saving 7 x 5 in image
# Ethnicity
ethnicity <- plot_demo_bar(demo_ethnicity, Ethnicity, "Ethnicity")
## Warning: Unknown palette: "Pastel4"
ethnicity
ggsave("~/Desktop/UGA/Fall 2025/STAT 5010W/Project/Charts/ethnicity_barchart.png", ethnicity)
## Saving 7 x 5 in image
# Age (categories)
age <- plot_demo_bar(demo_age, Age, "Distribution of Age")
## Warning: Unknown palette: "Pastel4"
age
ggsave("~/Desktop/UGA/Fall 2025/STAT 5010W/Project/Charts/age_barchart.png", age)
## Saving 7 x 5 in image
# Education
education <- plot_demo_bar(demo_edu, Education_Level, "Distribution of Education Level")
## Warning: Unknown palette: "Pastel4"
education
ggsave("~/Desktop/UGA/Fall 2025/STAT 5010W/Project/Charts/education_barchart.png", education)
## Saving 7 x 5 in image
#Section 2 Midterm Outcomes: Behavior Change
#Creating Data Frame
safety_data <- Pesticide_Clean[,187:236]
safety_data <- safety_data[,16:23]
colnames(safety_data) = c("Gloves", "Long-Sleeved Shirts", "Long Pants",
"Closed Toed Shoes", "Wash Hands", "Wash Face",
"Take a Shower", "Separating Clothing")
safety_data <- safety_data %>%
mutate(
Gloves = dplyr::recode(as.character(Gloves),
"1" = "Always",
"2" = "Sometimes",
"3" = "Never"),
`Long-Sleeved Shirts` = dplyr::recode(as.character(`Long-Sleeved Shirts`),
"1" = "Always",
"2" = "Sometimes",
"3" = "Never"),
`Long Pants` = dplyr::recode(as.character(`Long Pants`),
"1" = "Always",
"2" = "Sometimes",
"3" = "Never"),
`Closed Toed Shoes` = dplyr::recode(as.character(`Closed Toed Shoes`),
"1" = "Always",
"2" = "Sometimes",
"3" = "Never"),
`Wash Hands` = dplyr::recode(as.character(`Wash Hands`),
"1" = "Always",
"2" = "Sometimes",
"3" = "Never"),
`Wash Face` = dplyr::recode(as.character(`Wash Face`),
"1" = "Always",
"2" = "Sometimes",
"3" = "Never"),
`Take a Shower` = dplyr::recode(as.character(`Take a Shower`),
"1" = "Always",
"2" = "Sometimes",
"3" = "Never"),
`Separating Clothing` = dplyr::recode(as.character(`Separating Clothing`),
"1" = "Always",
"2" = "Sometimes",
"3" = "Never")
)
#Checking Missingness for Each Column of Safety
safety_summary <- safety_data %>%
pivot_longer(
cols = everything(),
names_to = "Safety Measure",
values_to = "Response"
) %>%
group_by(`Safety Measure`) %>%
summarise(
Total_Responses = sum(!is.na(Response)),
NA_Count = sum(is.na(Response)),
Percent_Responded = round(
100 * Total_Responses / (Total_Responses + NA_Count), 1
),
.groups = "drop"
)
kable(
safety_summary,
col.names = c(
"Safety Measure",
"Total Responses",
"NA Count",
"Responded (%)"
),
align = c("l", "c", "c", "c"),
caption = "Response Rates for Safety Practice Questions"
)
| Safety Measure | Total Responses | NA Count | Responded (%) |
|---|---|---|---|
| Closed Toed Shoes | 526 | 107 | 83.1 |
| Gloves | 526 | 107 | 83.1 |
| Long Pants | 526 | 107 | 83.1 |
| Long-Sleeved Shirts | 526 | 107 | 83.1 |
| Separating Clothing | 526 | 107 | 83.1 |
| Take a Shower | 526 | 107 | 83.1 |
| Wash Face | 526 | 107 | 83.1 |
| Wash Hands | 526 | 107 | 83.1 |
#Checking that NA is is not partially missing
all_missing <- safety_data %>%
mutate(all_na = if_all(everything(), is.na)) %>%
pull(all_na)
table(all_missing)
## all_missing
## FALSE TRUE
## 526 107
#Removing Missigness for Data Visualization
safety_data_complete <- safety_data %>%
drop_na()
#Start Visualizaton
safety_pct <- safety_data_complete %>%
pivot_longer(
cols = everything(),
names_to = "Safety_Measure",
values_to = "Response"
) %>%
count(Safety_Measure, Response) %>%
group_by(Safety_Measure) %>%
mutate(
Percent = 100 * n / sum(n)
) %>%
ungroup() %>%
mutate(
Response = factor(Response, levels = c("Never", "Sometimes", "Always")),
Safety_Measure = factor(Safety_Measure, levels = colnames(safety_data))
)
safety <- ggplot(safety_pct, aes(x = Safety_Measure, y = Percent, fill = Response)) +
geom_col(position = "dodge") +
labs(
x = "Safety Measure",
y = "Percent of Respondents",
fill = "Response"
) +
scale_y_continuous(
limits = c(0, 100),
labels = scales::percent_format(scale = 1)) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 35, hjust = 1))
safety
ggsave("~/Desktop/UGA/Fall 2025/STAT 5010W/Project/Charts/safety_bar.png", safety)
## Saving 7 x 5 in image
#Creating groups (pre and post application)
safety_grouped <- safety_data_complete %>%
pivot_longer(
cols = everything(),
names_to = "Safety_Measure",
values_to = "Response"
) %>%
mutate(
Group = case_when(
Safety_Measure %in% c(
"Gloves",
"Long-Sleeved Shirts",
"Long Pants",
"Closed Toed Shoes"
) ~ "Personal Protection",
Safety_Measure %in% c(
"Separating Clothing",
"Wash Hands",
"Wash Face",
"Take a Shower"
) ~ "Post-Application Hygiene"
)
)
#Visualization of Groups
safety_long_pct <- safety_data %>%
pivot_longer(
cols = everything(),
names_to = "Safety_Measure",
values_to = "Response"
) %>%
mutate(
Group = case_when(
Safety_Measure %in% c("Gloves", "Long-Sleeved Shirts", "Long Pants",
"Closed Toed Shoes") ~ "Personal Protection",
Safety_Measure %in% c("Separating Clothing", "Wash Hands", "Wash Face",
"Take a Shower") ~ "Post-Application Hygiene",
TRUE ~ NA_character_
)
) %>%
filter(!is.na(Response)) %>%
count(Group, Safety_Measure, Response) %>%
group_by(Safety_Measure) %>%
mutate(Percent = 100 * n / sum(n)) %>%
ungroup() %>%
mutate(
Response = factor(Response, levels = c("Never", "Sometimes", "Always")),
Safety_Measure = factor(Safety_Measure, levels = colnames(safety_data))
)
# 3 bars per variable (dodged), percent on y-axis
ggplot(safety_long_pct, aes(x = Safety_Measure, y = Percent, fill = Response)) +
geom_col(position = position_dodge(width = 0.8), width = 0.7) +
facet_wrap(~ Group, scales = "free_x") +
labs(x = "Safety Measure", y = "Percent of Respondents", fill = "Response") +
scale_y_continuous(
limits = c(0, 100),
labels = scales::percent_format(scale = 1)) +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 35, hjust = 1),
legend.position = "top",
panel.grid.major.x = element_blank()
)
#Redoing grouping
safety_grouped_counts <- safety_data_complete %>%
pivot_longer(
cols = everything(),
names_to = "Safety_Measure",
values_to = "Response"
) %>%
mutate(
Response = factor(Response, levels = c("Always", "Sometimes", "Never")),
Group = case_when(
Safety_Measure %in% c("Gloves", "Long-Sleeved Shirts", "Long Pants", "Closed Toed Shoes") ~
"Personal Protection",
Safety_Measure %in% c("Separating Clothing", "Wash Hands", "Wash Face", "Take a Shower") ~
"Post-Application Hygiene",
TRUE ~ NA_character_
)
) %>%
filter(!is.na(Group), !is.na(Response), Response != "") %>%
count(Group, Response, name = "Count")
safety_grouped_counts <- safety_grouped_counts %>%
group_by(Group) %>%
mutate(
Percent = 100 * Count / sum(Count)
) %>%
ungroup()
#Plotting
ggplot(safety_grouped_counts, aes(x = Group, y = Percent, fill = Response)) +
geom_col(position = position_dodge(width = 0.8), color = "black") +
scale_y_continuous(labels = function(x) paste0(round(x, 1), "%"), expand = c(0, 0)) +
labs(x = NULL, y = "Percent", fill = "Response") +
theme_minimal(base_size = 12)
#Question 2 in Part 2
safety_data_q2 <- Pesticide_Clean$Q14
safety_data_q2 <- data.frame(Q2 = safety_data_q2)
safety_data_q2 <- safety_data_q2 %>%
mutate(
Q2 = dplyr::recode(as.character(Q2),
"1" = "Extremely helpful",
"2" = "Very helpful",
"3" = "Somewhat helpful",
"4" = "Slightly helpful",
"5" = "Not at all helpful"),
Q2 = factor(Q2, levels = c(
"Extremely helpful",
"Very helpful",
"Somewhat helpful",
"Slightly helpful",
"Not at all helpful"
), ordered = TRUE)
)
safetyQ2 <- plot_demo_bar(
data = safety_data_q2 %>% filter(!is.na(Q2)),
var = Q2,
xlab = "Helpfulness of Pesticide Applicator Training"
)
## Warning: Unknown palette: "Pastel4"
safetyQ2
ggsave("~/Desktop/UGA/Fall 2025/STAT 5010W/Project/Charts/safetyQ2.png", safetyQ2)
## Saving 7 x 5 in image
#Analysis of Safety with Demographics
demo_data <- demo_data %>%
mutate(Respondent_ID = row_number())
safety_data <- safety_data %>%
mutate(Respondent_ID = row_number())
base_data <- demo_data %>%
left_join(safety_data, by = "Respondent_ID")
base_data <- base_data %>%
mutate(
Q2_raw = Pesticide_Clean$Q14,
Q2_numeric = as.numeric(as.character(Q2_raw)),
Q2_helpfulness = 6 - Q2_numeric
)
#Creating Data Frame in Order of Respondent
safety_long <- base_data %>%
pivot_longer(
cols = c(
Gloves, `Long-Sleeved Shirts`, `Long Pants`, `Closed Toed Shoes`,
`Separating Clothing`, `Wash Hands`, `Wash Face`, `Take a Shower`
),
names_to = "Safety_Measure",
values_to = "Response"
) %>%
mutate(
Group = case_when(
Safety_Measure %in% c(
"Gloves", "Long-Sleeved Shirts", "Long Pants", "Closed Toed Shoes"
) ~ "Personal Protection",
Safety_Measure %in% c(
"Separating Clothing", "Wash Hands", "Wash Face", "Take a Shower"
) ~ "Post-Application Hygiene"
)
)
#Scoring
safety_scored <- safety_long %>%
mutate(
Response_score = case_when(
Response == "Always" ~ 2,
Response == "Sometimes" ~ 1,
Response == "Never" ~ 0,
TRUE ~ NA_real_
)
)
#Creating Safety Index
safety_index <- safety_scored %>%
group_by(Respondent_ID) %>%
summarise(
Avg_Compliance = mean(Response_score, na.rm = TRUE),
n_items = sum(!is.na(Response_score)),
.groups = "drop"
)
n_distinct(safety_index$Respondent_ID)
## [1] 633
safety_index <- safety_scored %>%
group_by(Respondent_ID) %>%
summarise(
Avg_Compliance = mean(Response_score, na.rm = TRUE),
n_items = sum(!is.na(Response_score)),
.groups = "drop"
) %>%
mutate(
Compliance_prop = Avg_Compliance / 2
)
#Starting Analysis
analysis_data <- safety_index %>%
filter(n_items > 0) %>%
left_join(base_data, by = "Respondent_ID")
nrow(analysis_data)
## [1] 526
#Checking dropped people
n_distinct(analysis_data$Respondent_ID)
## [1] 526
safety_index %>%
summarise(
total = n_distinct(Respondent_ID),
excluded = sum(n_items == 0)
) #excluded in 214
#Full Model with all Predictors
# Rebuild employee / years datasets with IDs
Q6_model <- as.data.frame(Pesticide_Clean[,186])
colnames(Q6_model) <- c("Employees")
Q6_model <- Q6_model %>%
mutate(
Respondent_ID = row_number(),
Employees_num = dplyr::recode(as.character(Employees),
"1" = "1",
"2" = "2",
"3" = "3",
"4" = "4",
"5" = "5",
"6" = "6",
"7" = "7"
),
Employees_num = as.numeric(Employees_num)
)
Q7_model <- as.data.frame(Pesticide_Clean[,187])
colnames(Q7_model) <- c("Years")
Q7_model <- Q7_model %>%
mutate(
Respondent_ID = row_number(),
Years_num = dplyr::recode(as.character(Years),
"1" = "1",
"2" = "2",
"3" = "3",
"4" = "4",
"5" = "5",
"6" = "6"
),
Years_num = as.numeric(Years_num)
)
# Create numeric versions of ordinal demographics
base_model <- base_data %>%
mutate(
Age_num = case_when(
Age == "Under 18" ~ 1,
Age == "18–24" ~ 2,
Age == "25–34" ~ 3,
Age == "35–44" ~ 4,
Age == "45–54" ~ 5,
Age == "55–64" ~ 6,
Age == "65–74" ~ 7,
Age == "75–84" ~ 8,
Age == "85+" ~ 9,
TRUE ~ NA_real_
),
Education_num = case_when(
Education_Level == "Less than 9th grade" ~ 1,
Education_Level == "9th to 12th grade, no diploma" ~ 2,
Education_Level == "High school graduate or GED" ~ 3,
Education_Level == "Some college, no degree" ~ 4,
Education_Level == "Associate’s degree" ~ 5,
Education_Level == "Bachelor’s degree" ~ 6,
Education_Level == "Graduate or professional degree" ~ 7,
TRUE ~ NA_real_
)
)
# Build final modeling dataset
model_data <- safety_index %>%
filter(n_items > 0) %>%
left_join(base_model, by = "Respondent_ID") %>%
left_join(Q6_model %>% select(Respondent_ID, Employees_num), by = "Respondent_ID") %>%
left_join(Q7_model %>% select(Respondent_ID, Years_num), by = "Respondent_ID")
# Make categorical predictors factors
model_data <- model_data %>%
mutate(
Gender = factor(Gender),
Race_binary = ifelse(Race == "White", "White", "Non-White"),
Race_binary = factor(Race_binary),
Ethnicity = factor(Ethnicity)
)
Question4 <- Question4 %>%
mutate(Respondent_ID = row_number()) %>%
mutate(
Region = case_when(
# NORTHEAST
County %in% c(
"Athens-Clarke","Baldwin","Banks","Barrow","Butts","Columbia",
"Dawson","Elbert","Fannin","Franklin","Gilmer","Glascock",
"Greene","Habersham","Hall","Hancock","Hart","Jackson",
"Jasper","Jones","Lincoln","Lumpkin","Madison","McDuffie",
"Monroe","Morgan","Oconee","Oglethorpe","Pickens","Putnam",
"Rabun","Richmond","Stephens","Taliaferro","Towns","Union",
"Walton","Warren","White","Wilkes"
) ~ "North East",
# NORTHWEST
County %in% c(
"Bartow","Bibb","Carroll","Catoosa","Chattahoochee","Chattooga",
"Cherokee","Clayton","Cobb","Coweta","Crawford","Dade",
"DeKalb","Douglas","Fayette","Floyd","Forsyth","Fulton",
"Gordon","Gwinnett","Haralson","Harris","Heard","Henry",
"Lamar","Meriwether","Muscogee","Murray","Newton","Paulding",
"Pike","Polk","Rockdale","Spalding","Talbot","Troup",
"Upson","Walker","Whitfield"
) ~ "North West",
# SOUTHEAST
County %in% c(
"Appling","Atkinson","Bacon","Bleckley","Brantley","Bryan",
"Bulloch","Burke","Camden","Candler","Charlton","Chatham",
"Coffee","Dodge","Effingham","Emanuel","Evans","Glynn",
"Jeff Davis","Jefferson","Jenkins","Johnson","Laurens",
"Liberty","Long","McIntosh","Montgomery","Pierce","Screven",
"Tattnall","Telfair","Toombs","Treutlen","Twiggs",
"Ware","Washington","Wayne","Wheeler","Wilkinson"
) ~ "South East",
# SOUTHWEST
County %in% c(
"Baker","Ben Hill","Berrien","Brooks","Calhoun","Clay",
"Clinch","Colquitt","Cook","Crisp","Decatur","Dooly",
"Dougherty","Early","Echols","Grady","Houston","Irwin",
"Lanier","Lee","Lowndes","Macon","Marion","Miller",
"Mitchell","Peach","Pulaski","Quitman","Randolph",
"Schley","Seminole","Stewart","Sumter","Taylor",
"Terrell","Thomas","Tift","Turner","Webster",
"Wilcox","Worth"
) ~ "South West",
# METRO ATLANTA (subset of NW — we override)
County %in% c(
"Fulton","DeKalb","Cobb","Gwinnett","Clayton","Henry"
) ~ "Metro Atlanta",
TRUE ~ NA_character_
)
)
model_data <- model_data %>%
left_join(
Question4 %>% select(Respondent_ID, Region),
by = "Respondent_ID"
) %>%
mutate(Region = factor(Region))
model_data <- model_data %>%
mutate(
Region = factor(Region,
levels = c("North East","North West","South East","South West","Metro Atlanta")
)
)
model_data <- model_data %>%
mutate(
Region = ifelse(Region == "Metro Atlanta", "North West", Region),
Region = factor(Region)
)
model_data <- model_data %>%
mutate(
Region = case_when(
Region == 1 ~ "North East",
Region == 2 ~ "North West",
Region == 3 ~ "South East",
Region == 4 ~ "South West",
TRUE ~ as.character(Region)
),
Region = factor(Region)
)
#Full Model
full_lm <- lm(
Compliance_prop ~
Age_num +
Education_num +
Employees_num +
Years_num +
Gender +
Race_binary +
Ethnicity +
Q2_helpfulness +
Region,
data = model_data
)
summary(full_lm)
##
## Call:
## lm(formula = Compliance_prop ~ Age_num + Education_num + Employees_num +
## Years_num + Gender + Race_binary + Ethnicity + Q2_helpfulness +
## Region, data = model_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.79068 -0.08849 0.01341 0.11731 0.33322
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.700977 0.105435 6.648 1.18e-10 ***
## Age_num 0.012187 0.006275 1.942 0.05296 .
## Education_num -0.002698 0.005927 -0.455 0.64922
## Employees_num 0.004829 0.007636 0.632 0.52759
## Years_num -0.006774 0.005583 -1.213 0.22584
## GenderMale -0.012776 0.027534 -0.464 0.64293
## GenderPrefer not to answer -0.069635 0.160822 -0.433 0.66529
## Race_binaryWhite -0.098809 0.031250 -3.162 0.00171 **
## EthnicityNot Hispanic/Latino 0.002567 0.070604 0.036 0.97102
## Q2_helpfulness 0.048101 0.010173 4.728 3.32e-06 ***
## RegionNorth West -0.006972 0.027040 -0.258 0.79668
## RegionSouth East -0.003881 0.023915 -0.162 0.87118
## RegionSouth West -0.075107 0.023805 -3.155 0.00175 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1527 on 341 degrees of freedom
## (172 observations deleted due to missingness)
## Multiple R-squared: 0.1482, Adjusted R-squared: 0.1182
## F-statistic: 4.945 on 12 and 341 DF, p-value: 1.477e-07
#Checking higher order terms
model_data <- model_data %>%
mutate(
Age_sq = Age_num^2,
Education_sq = Education_num^2,
Employees_sq = Employees_num^2,
Years_sq = Years_num^2
)
lm_poly <- lm(
Compliance_prop ~
Age_num + Age_sq +
Education_num + Education_sq +
Employees_num + Employees_sq +
Years_num + Years_sq +
Gender + Race_binary + Ethnicity +
Q2_helpfulness + Region,
data = model_data
)
summary(lm_poly)
##
## Call:
## lm(formula = Compliance_prop ~ Age_num + Age_sq + Education_num +
## Education_sq + Employees_num + Employees_sq + Years_num +
## Years_sq + Gender + Race_binary + Ethnicity + Q2_helpfulness +
## Region, data = model_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.79518 -0.08983 0.01159 0.11487 0.33307
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.107088 0.180544 6.132 2.43e-09 ***
## Age_num -0.069309 0.037668 -1.840 0.06665 .
## Age_sq 0.007459 0.003423 2.179 0.03000 *
## Education_num -0.056488 0.046113 -1.225 0.22144
## Education_sq 0.005368 0.004577 1.173 0.24163
## Employees_num -0.018613 0.024471 -0.761 0.44742
## Employees_sq 0.003383 0.003521 0.961 0.33733
## Years_num -0.050757 0.035426 -1.433 0.15285
## Years_sq 0.005993 0.004495 1.333 0.18331
## GenderMale -0.004799 0.027468 -0.175 0.86140
## GenderPrefer not to answer -0.148742 0.162342 -0.916 0.36021
## Race_binaryWhite -0.096903 0.031052 -3.121 0.00196 **
## EthnicityNot Hispanic/Latino 0.014598 0.071063 0.205 0.83737
## Q2_helpfulness 0.046799 0.010186 4.594 6.14e-06 ***
## RegionNorth West -0.009235 0.026956 -0.343 0.73213
## RegionSouth East -0.004395 0.023884 -0.184 0.85410
## RegionSouth West -0.073587 0.023789 -3.093 0.00214 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1514 on 337 degrees of freedom
## (172 observations deleted due to missingness)
## Multiple R-squared: 0.1725, Adjusted R-squared: 0.1333
## F-statistic: 4.392 on 16 and 337 DF, p-value: 7.012e-08
#Taking out missingness for step()
model_data_complete <- model_data %>%
select(
Avg_Compliance,
Compliance_prop,
Age_num, Age_sq,
Education_num, Education_sq,
Employees_num, Employees_sq,
Years_num, Years_sq,
Gender,
Race_binary,
Ethnicity,
Q2_helpfulness,
Region
) %>%
drop_na()
#Refitting model wihtout missingness
lm_poly_complete <- lm(
Compliance_prop ~
Age_num + Age_sq +
Education_num + Education_sq +
Employees_num + Employees_sq +
Years_num + Years_sq +
Gender + Race_binary + Ethnicity +
Q2_helpfulness + Region,
data = model_data_complete
)
#Running step() model
step_model <- step(lm_poly_complete, direction = "both")
## Start: AIC=-1320.19
## Compliance_prop ~ Age_num + Age_sq + Education_num + Education_sq +
## Employees_num + Employees_sq + Years_num + Years_sq + Gender +
## Race_binary + Ethnicity + Q2_helpfulness + Region
##
## Df Sum of Sq RSS AIC
## - Gender 2 0.01934 7.7400 -1323.3
## - Ethnicity 1 0.00097 7.7216 -1322.2
## - Employees_num 1 0.01325 7.7339 -1321.6
## - Employees_sq 1 0.02115 7.7418 -1321.2
## - Education_sq 1 0.03152 7.7522 -1320.8
## - Education_num 1 0.03438 7.7551 -1320.6
## - Years_sq 1 0.04073 7.7614 -1320.3
## <none> 7.7207 -1320.2
## - Years_num 1 0.04703 7.7677 -1320.0
## - Age_num 1 0.07756 7.7982 -1318.7
## - Age_sq 1 0.10880 7.8295 -1317.2
## - Race_binary 1 0.22312 7.9438 -1312.1
## - Region 3 0.35197 8.0726 -1310.4
## - Q2_helpfulness 1 0.48359 8.2043 -1300.7
##
## Step: AIC=-1323.3
## Compliance_prop ~ Age_num + Age_sq + Education_num + Education_sq +
## Employees_num + Employees_sq + Years_num + Years_sq + Race_binary +
## Ethnicity + Q2_helpfulness + Region
##
## Df Sum of Sq RSS AIC
## - Ethnicity 1 0.00058 7.7406 -1325.3
## - Employees_num 1 0.01235 7.7524 -1324.7
## - Employees_sq 1 0.02087 7.7609 -1324.3
## - Education_sq 1 0.03024 7.7703 -1323.9
## - Education_num 1 0.03241 7.7724 -1323.8
## - Years_sq 1 0.04078 7.7808 -1323.4
## <none> 7.7400 -1323.3
## - Years_num 1 0.04738 7.7874 -1323.1
## - Age_num 1 0.06673 7.8067 -1322.3
## - Age_sq 1 0.09758 7.8376 -1320.9
## + Gender 2 0.01934 7.7207 -1320.2
## - Race_binary 1 0.20818 7.9482 -1315.9
## - Region 3 0.35485 8.0949 -1313.4
## - Q2_helpfulness 1 0.48636 8.2264 -1303.7
##
## Step: AIC=-1325.28
## Compliance_prop ~ Age_num + Age_sq + Education_num + Education_sq +
## Employees_num + Employees_sq + Years_num + Years_sq + Race_binary +
## Q2_helpfulness + Region
##
## Df Sum of Sq RSS AIC
## - Employees_num 1 0.01214 7.7527 -1326.7
## - Employees_sq 1 0.02040 7.7610 -1326.3
## - Education_sq 1 0.03030 7.7709 -1325.9
## - Education_num 1 0.03248 7.7731 -1325.8
## - Years_sq 1 0.04243 7.7830 -1325.3
## <none> 7.7406 -1325.3
## - Years_num 1 0.04907 7.7897 -1325.0
## - Age_num 1 0.06617 7.8068 -1324.3
## + Ethnicity 1 0.00058 7.7400 -1323.3
## - Age_sq 1 0.09700 7.8376 -1322.9
## + Gender 2 0.01895 7.7216 -1322.2
## - Race_binary 1 0.20872 7.9493 -1317.9
## - Region 3 0.35467 8.0953 -1315.4
## - Q2_helpfulness 1 0.48695 8.2275 -1305.7
##
## Step: AIC=-1326.72
## Compliance_prop ~ Age_num + Age_sq + Education_num + Education_sq +
## Employees_sq + Years_num + Years_sq + Race_binary + Q2_helpfulness +
## Region
##
## Df Sum of Sq RSS AIC
## - Employees_sq 1 0.01536 7.7681 -1328.0
## - Education_sq 1 0.03585 7.7886 -1327.1
## - Education_num 1 0.03793 7.7907 -1327.0
## - Years_sq 1 0.04209 7.7948 -1326.8
## <none> 7.7527 -1326.7
## - Years_num 1 0.05006 7.8028 -1326.4
## - Age_num 1 0.07071 7.8234 -1325.5
## + Employees_num 1 0.01214 7.7406 -1325.3
## + Ethnicity 1 0.00038 7.7524 -1324.7
## - Age_sq 1 0.10604 7.8588 -1323.9
## + Gender 2 0.01813 7.7346 -1323.5
## - Race_binary 1 0.21641 7.9691 -1319.0
## - Region 3 0.34729 8.1000 -1317.2
## - Q2_helpfulness 1 0.49981 8.2526 -1306.6
##
## Step: AIC=-1328.02
## Compliance_prop ~ Age_num + Age_sq + Education_num + Education_sq +
## Years_num + Years_sq + Race_binary + Q2_helpfulness + Region
##
## Df Sum of Sq RSS AIC
## - Education_sq 1 0.03701 7.8051 -1328.3
## - Education_num 1 0.03847 7.8066 -1328.3
## <none> 7.7681 -1328.0
## - Years_sq 1 0.04641 7.8145 -1327.9
## - Years_num 1 0.05439 7.8225 -1327.5
## + Employees_sq 1 0.01536 7.7527 -1326.7
## - Age_num 1 0.07833 7.8464 -1326.5
## + Employees_num 1 0.00710 7.7610 -1326.3
## + Ethnicity 1 0.00001 7.7681 -1326.0
## - Age_sq 1 0.11140 7.8795 -1325.0
## + Gender 2 0.02070 7.7474 -1325.0
## - Race_binary 1 0.21614 7.9842 -1320.3
## - Region 3 0.33631 8.1044 -1319.0
## - Q2_helpfulness 1 0.50004 8.2681 -1307.9
##
## Step: AIC=-1328.34
## Compliance_prop ~ Age_num + Age_sq + Education_num + Years_num +
## Years_sq + Race_binary + Q2_helpfulness + Region
##
## Df Sum of Sq RSS AIC
## - Education_num 1 0.00176 7.8069 -1330.3
## <none> 7.8051 -1328.3
## - Years_sq 1 0.04470 7.8498 -1328.3
## + Education_sq 1 0.03701 7.7681 -1328.0
## - Years_num 1 0.05357 7.8587 -1327.9
## + Employees_sq 1 0.01652 7.7886 -1327.1
## - Age_num 1 0.07795 7.8830 -1326.8
## + Employees_num 1 0.00660 7.7985 -1326.6
## + Ethnicity 1 0.00001 7.8051 -1326.3
## - Age_sq 1 0.11145 7.9166 -1325.3
## + Gender 2 0.01935 7.7857 -1325.2
## - Race_binary 1 0.22222 8.0273 -1320.4
## - Region 3 0.33711 8.1422 -1319.4
## - Q2_helpfulness 1 0.51602 8.3211 -1307.7
##
## Step: AIC=-1330.26
## Compliance_prop ~ Age_num + Age_sq + Years_num + Years_sq + Race_binary +
## Q2_helpfulness + Region
##
## Df Sum of Sq RSS AIC
## <none> 7.8069 -1330.3
## - Years_sq 1 0.04503 7.8519 -1330.2
## - Years_num 1 0.05402 7.8609 -1329.8
## + Employees_sq 1 0.01530 7.7916 -1329.0
## - Age_num 1 0.07805 7.8849 -1328.7
## + Employees_num 1 0.00599 7.8009 -1328.5
## + Education_num 1 0.00176 7.8051 -1328.3
## + Education_sq 1 0.00030 7.8066 -1328.3
## + Ethnicity 1 0.00000 7.8069 -1328.3
## - Age_sq 1 0.11170 7.9186 -1327.2
## + Gender 2 0.01778 7.7891 -1327.1
## - Race_binary 1 0.22684 8.0337 -1322.1
## - Region 3 0.33770 8.1446 -1321.3
## - Q2_helpfulness 1 0.51875 8.3256 -1309.5
summary(step_model)
##
## Call:
## lm(formula = Compliance_prop ~ Age_num + Age_sq + Years_num +
## Years_sq + Race_binary + Q2_helpfulness + Region, data = model_data_complete)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.79200 -0.09415 0.01411 0.11932 0.31944
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.953936 0.116659 8.177 5.66e-15 ***
## Age_num -0.066969 0.036111 -1.855 0.06452 .
## Age_sq 0.007312 0.003296 2.219 0.02717 *
## Years_num -0.053914 0.034944 -1.543 0.12378
## Years_sq 0.006240 0.004430 1.409 0.15984
## Race_binaryWhite -0.094954 0.030034 -3.162 0.00171 **
## Q2_helpfulness 0.047718 0.009981 4.781 2.59e-06 ***
## RegionNorth West -0.010113 0.026469 -0.382 0.70265
## RegionSouth East -0.002645 0.023520 -0.112 0.91052
## RegionSouth West -0.070659 0.023446 -3.014 0.00277 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1506 on 344 degrees of freedom
## Multiple R-squared: 0.1633, Adjusted R-squared: 0.1414
## F-statistic: 7.46 on 9 and 344 DF, p-value: 5.645e-10
nrow(model_data_complete)
## [1] 354
nobs(step_model)
## [1] 354
####Partial Model####
# Create grouped safety scores
safety_grouped_scores <- safety_scored %>%
mutate(
Behavior_Group = case_when(
Safety_Measure %in% c("Gloves", "Long-Sleeved Shirts", "Long Pants", "Closed Toed Shoes")
~ "Personal Protection",
Safety_Measure %in% c("Wash Hands", "Wash Face", "Take a Shower", "Separating Clothing")
~ "Post-Application Hygiene"
)
) %>%
group_by(Respondent_ID, Behavior_Group) %>%
summarise(
Avg_Compliance = mean(Response_score, na.rm = TRUE),
n_items = sum(!is.na(Response_score)),
.groups = "drop"
) %>%
mutate(
Compliance_Prop = Avg_Compliance / 2 # scale 0–1
)
pp_data <- safety_grouped_scores %>%
filter(Behavior_Group == "Personal Protection")
pah_data <- safety_grouped_scores %>%
filter(Behavior_Group == "Post-Application Hygiene")
pp_model_data <- pp_data %>%
left_join(model_data, by = "Respondent_ID") %>%
filter(!is.na(Compliance_Prop))
pah_model_data <- pah_data %>%
left_join(model_data, by = "Respondent_ID") %>%
filter(!is.na(Compliance_Prop))
lm_pp <- lm(
Compliance_Prop ~ Age_num + Age_sq +
Years_num + Years_sq +
Race_binary + Q2_helpfulness,
data = pp_model_data
)
summary(lm_pp)
##
## Call:
## lm(formula = Compliance_Prop ~ Age_num + Age_sq + Years_num +
## Years_sq + Race_binary + Q2_helpfulness, data = pp_model_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.84409 -0.11049 0.03091 0.14895 0.28809
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.935867 0.127500 7.340 1.34e-12 ***
## Age_num -0.060410 0.039078 -1.546 0.122970
## Age_sq 0.006548 0.003563 1.838 0.066865 .
## Years_num -0.054831 0.038002 -1.443 0.149896
## Years_sq 0.005936 0.004827 1.230 0.219618
## Race_binaryWhite -0.065263 0.031196 -2.092 0.037106 *
## Q2_helpfulness 0.039551 0.011311 3.497 0.000528 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1741 on 375 degrees of freedom
## (144 observations deleted due to missingness)
## Multiple R-squared: 0.07102, Adjusted R-squared: 0.05615
## F-statistic: 4.778 on 6 and 375 DF, p-value: 0.0001038
lm_pah <- lm(
Compliance_Prop ~ Age_num + Age_sq +
Years_num + Years_sq +
Race_binary + Q2_helpfulness,
data = pah_model_data
)
summary(lm_pah)
##
## Call:
## lm(formula = Compliance_Prop ~ Age_num + Age_sq + Years_num +
## Years_sq + Race_binary + Q2_helpfulness, data = pah_model_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.84524 -0.10733 0.03439 0.14980 0.33304
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.745251 0.132124 5.641 3.34e-08 ***
## Age_num -0.051527 0.040495 -1.272 0.2040
## Age_sq 0.005876 0.003692 1.592 0.1123
## Years_num -0.020295 0.039380 -0.515 0.6066
## Years_sq 0.002332 0.005002 0.466 0.6413
## Race_binaryWhite -0.055813 0.032327 -1.726 0.0851 .
## Q2_helpfulness 0.059559 0.011721 5.081 5.93e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1804 on 375 degrees of freedom
## (144 observations deleted due to missingness)
## Multiple R-squared: 0.09421, Adjusted R-squared: 0.07972
## F-statistic: 6.501 on 6 and 375 DF, p-value: 1.538e-06
#Economic Data Analysis #Question 1b:
Q16pta<- Pesticide_Clean$Q16_4
table(Q16pta, useNA = "always")
## Q16pta
## 0 1 100 1000 10000 100000 1100 1200 12000 125000 1260
## 158 5 4 7 9 4 1 2 2 1 1
## 143000 1500 15000 150000 1600 18000 2 200 2000 200000 2090
## 1 4 2 2 1 1 1 1 4 1 1
## 2400 24000 2500 25000 2800 3000 30000 3100 31200 32000 33000
## 3 1 5 4 1 3 6 1 1 1 1
## 35000 36000 37440 4000 40000 400000 4400 45000 46000 47000 500
## 2 1 1 2 7 1 1 1 1 1 4
## 5000 50000 55000 600 6000 60000 75000 8000 80000 <NA>
## 7 14 1 2 2 2 1 1 1 340
Q16ptb<- Pesticide_Clean$Q16_5
table(Q16ptb, useNA = "always")
## Q16ptb
## 0 1 100 1000 10000 100000 101 110000 12000 14000 143000
## 165 4 3 1 6 6 1 1 3 1 1
## 150 1500 15000 150000 16000 1620 2 200 2000 20000 200000
## 1 2 3 2 1 1 2 1 7 2 1
## 215000 225000 23000 2500 25000 250000 3000 30000 31200 3200 32000
## 1 1 1 4 2 1 6 3 1 1 1
## 3500 35000 38000 400 40000 40400 42000 450 45000 450000 45760
## 1 1 1 1 5 1 1 1 2 1 1
## 500 5000 50000 52000 5500 55000 6000 60000 6500 65000 700
## 1 6 6 1 1 1 3 7 1 1 1
## 70000 750 7500 75000 800 8000 80000 9000 <NA>
## 2 1 1 2 1 1 4 1 338
table(is.na(Pesticide_Clean$Q16_4), is.na(Pesticide_Clean$Q16_5))
##
## FALSE TRUE
## FALSE 289 4
## TRUE 6 334
#Creating my variables
Q16<- data.frame(
before = Pesticide_Clean$Q16_4,
after = Pesticide_Clean$Q16_5
)
Q16<- Q16[!is.na(Q16$before) & !is.na(Q16$after), ]
Q16$change<- as.numeric(Q16$after) - as.numeric(Q16$before)
nrow(Q16)
## [1] 289
summary(Q16)
## before after change
## Length:289 Length:289 Min. :-143500
## Class :character Class :character 1st Qu.: 0
## Mode :character Mode :character Median : 0
## Mean : 3603
## 3rd Qu.: 2000
## Max. : 100000
ggplot(Q16, aes(x = change)) +
geom_histogram(bins = 10, color = "black", fill = "purple4") +
scale_x_continuous(labels = comma) +
labs(
title = "Histogram of Change",
x = "Change (After - Before)",
y = "Number of Respondents"
) +
theme_minimal()
#Paired t-test
t.test(as.numeric(Q16$after), as.numeric(Q16$before), paired = TRUE)
##
## Paired t-test
##
## data: as.numeric(Q16$after) and as.numeric(Q16$before)
## t = 3.1318, df = 288, p-value = 0.001916
## alternative hypothesis: true mean difference is not equal to 0
## 95 percent confidence interval:
## 1338.567 5867.274
## sample estimates:
## mean difference
## 3602.92
#Question 1d:
Q18pta<- Pesticide_Clean$Q18_4
table(Q18pta, useNA = "always")
## Q18pta
## 0 1 100 1000 10000 100000 101 11 12000 1400 150
## 152 4 5 10 13 2 1 1 1 1 1
## 1500 15000 150000 17000 1800 20 200 2000 20000 2200 2400
## 3 2 1 1 1 1 1 14 6 1 3
## 2500 25000 2800 300 3000 30000 3500 35000 400 4000 40000
## 9 2 2 2 5 2 1 1 2 3 1
## 400000 42000 4500 500 5000 50000 54235 5500 600 6000 750
## 1 1 1 9 10 8 1 1 2 3 1
## 800 8000 80000 850 <NA>
## 1 3 1 1 334
Q18ptb<- Pesticide_Clean$Q18_5
table(Q18ptb, useNA = "always")
## Q18ptb
## 0 1 100 1000 10000 1200 1300 140000 150 1500 15000
## 161 3 5 6 6 6 1 1 3 4 4
## 1600 16000 17000 1800 1850 2 200 2000 20000 2400 250
## 1 1 1 2 1 1 5 7 4 2 2
## 2500 25000 300 3000 30000 3500 400 4000 40000 400000 450
## 4 4 3 4 2 2 1 4 2 1 1
## 4500 45000 50 500 5000 50000 51335 5500 58000 6 600
## 1 1 1 9 7 3 1 1 1 1 2
## 6000 7000 750 7500 80 800 8000 80000 855 900 9000
## 1 1 1 1 1 3 4 1 1 1 1
## <NA>
## 334
table(is.na(Pesticide_Clean$Q18_4), is.na(Pesticide_Clean$Q18_5))
##
## FALSE TRUE
## FALSE 295 4
## TRUE 4 330
#Creating my variables
Q18<- data.frame(
before = Pesticide_Clean$Q18_4,
after = Pesticide_Clean$Q18_5
)
Q18<- Q18[!is.na(Q18$before) & !is.na(Q18$after), ]
Q18$change<- as.numeric(Q18$after) - as.numeric(Q18$before)
nrow(Q18)
## [1] 295
summary(Q18)
## before after change
## Length:295 Length:295 Min. :-100000
## Class :character Class :character 1st Qu.: -1000
## Mode :character Mode :character Median : 0
## Mean : -1486
## 3rd Qu.: 0
## Max. : 60000
ggplot(Q18, aes(x = change)) +
geom_histogram(bins = 10, color = "black", fill = "green4") +
scale_x_continuous(labels = comma) +
labs(
title = "Histogram of Change",
x = "Change (After - Before)",
y = "Number of Respondents"
) +
theme_minimal()
#Paired t-test
t.test(as.numeric(Q18$after), as.numeric(Q18$before), paired = TRUE)
##
## Paired t-test
##
## data: as.numeric(Q18$after) and as.numeric(Q18$before)
## t = -2.4126, df = 294, p-value = 0.01645
## alternative hypothesis: true mean difference is not equal to 0
## 95 percent confidence interval:
## -2697.673 -273.751
## sample estimates:
## mean difference
## -1485.712
##Economic ANOVAs
profit_data <- data.frame(
Respondent_ID = 1:nrow(Pesticide_Clean),
profit_change = as.numeric(Pesticide_Clean$Q16_5) - as.numeric(Pesticide_Clean$Q16_4)
)
profit_data <- profit_data %>%
filter(!is.na(profit_change)) %>%
left_join(model_data, by = "Respondent_ID")
savings_data <- data.frame(
Respondent_ID = 1:nrow(Pesticide_Clean),
savings_change = as.numeric(Pesticide_Clean$Q18_5) - as.numeric(Pesticide_Clean$Q18_4)
)
savings_data <- savings_data %>%
filter(!is.na(savings_change)) %>%
left_join(model_data, by = "Respondent_ID")
#Profit ANOVAs
aov_profit_edu <- aov(profit_change ~ Education_num, data = profit_data)
summary(aov_profit_edu)
## Df Sum Sq Mean Sq F value Pr(>F)
## Education_num 1 8.233e+08 823295030 2.082 0.15
## Residuals 276 1.092e+11 395513703
## 11 observations deleted due to missingness
aov_profit_age <- aov(profit_change ~ Age_num, data = profit_data)
summary(aov_profit_age)
## Df Sum Sq Mean Sq F value Pr(>F)
## Age_num 1 6.334e+07 63340118 0.159 0.69
## Residuals 276 1.099e+11 398267163
## 11 observations deleted due to missingness
aov_profit_race <- aov(profit_change ~ Race_binary, data = profit_data)
summary(aov_profit_race)
## Df Sum Sq Mean Sq F value Pr(>F)
## Race_binary 1 3.629e+07 36291677 0.091 0.763
## Residuals 275 1.099e+11 399762321
## 12 observations deleted due to missingness
aov_profit_region <- aov(profit_change ~ Region, data = profit_data)
summary(aov_profit_region)
## Df Sum Sq Mean Sq F value Pr(>F)
## Region 3 1.186e+08 39541157 0.099 0.96
## Residuals 276 1.099e+11 398129357
## 9 observations deleted due to missingness
aov_profit_all <- aov(profit_change ~ Education_num + Age_num + Race_binary + Region,
data=profit_data)
summary(aov_profit_all)
## Df Sum Sq Mean Sq F value Pr(>F)
## Education_num 1 9.432e+08 943214099 2.267 0.133
## Age_num 1 3.620e+07 36199012 0.087 0.768
## Race_binary 1 6.746e+07 67455562 0.162 0.688
## Region 3 1.748e+08 58271682 0.140 0.936
## Residuals 261 1.086e+11 416045398
## 21 observations deleted due to missingness
#Savings ANOVAs
aov_savings_edu <- aov(savings_change ~ Education_num, data=savings_data)
summary(aov_savings_edu)
## Df Sum Sq Mean Sq F value Pr(>F)
## Education_num 1 8.091e+07 80909945 0.694 0.405
## Residuals 281 3.275e+10 116544089
## 12 observations deleted due to missingness
aov_savings_age <- aov(savings_change ~ Age_num, data=savings_data)
summary(aov_savings_age)
## Df Sum Sq Mean Sq F value Pr(>F)
## Age_num 1 4.135e+08 413547671 3.585 0.0593 .
## Residuals 281 3.242e+10 115360325
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 12 observations deleted due to missingness
aov_savings_race <- aov(savings_change ~ Race_binary, data=savings_data)
summary(aov_savings_race)
## Df Sum Sq Mean Sq F value Pr(>F)
## Race_binary 1 1.501e+07 15008566 0.128 0.721
## Residuals 280 3.281e+10 117187007
## 13 observations deleted due to missingness
aov_savings_region <- aov(savings_change ~ Region, data=savings_data)
summary(aov_savings_region)
## Df Sum Sq Mean Sq F value Pr(>F)
## Region 3 3.990e+08 133009513 1.163 0.324
## Residuals 284 3.248e+10 114357051
## 7 observations deleted due to missingness
aov_savings_all <- aov(savings_change ~ Education_num + Age_num + Race_binary +
Region, data=savings_data)
summary(aov_savings_all)
## Df Sum Sq Mean Sq F value Pr(>F)
## Education_num 1 7.545e+07 75452158 0.637 0.4256
## Age_num 1 4.218e+08 421765811 3.560 0.0603 .
## Race_binary 1 1.542e+07 15417740 0.130 0.7186
## Region 3 5.493e+08 183084542 1.545 0.2031
## Residuals 268 3.175e+10 118471894
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 20 observations deleted due to missingness
#Regression Models for Economics
economic_vars <- Pesticide_Clean %>%
mutate(
Respondent_ID = row_number(),
profit_before = as.numeric(Q16_4),
profit_after = as.numeric(Q16_5),
profit_change = profit_after - profit_before,
savings_before = as.numeric(Q18_4),
savings_after = as.numeric(Q18_5),
savings_change = savings_after - savings_before
) %>%
select(Respondent_ID, profit_change, savings_change)
model_data_econ <- model_data %>%
left_join(economic_vars, by = "Respondent_ID")
profit_model_data <- model_data_econ %>%
filter(!is.na(profit_change))
profit_model_data <- profit_model_data %>%
mutate(
Age_sq = Age_num^2,
Years_sq = Years_num^2
)
savings_model_data <- model_data_econ %>%
filter(!is.na(savings_change))
savings_model_data <- savings_model_data %>%
mutate(
Age_sq = Age_num^2,
Years_sq = Years_num^2
)
lm_profit <- lm(
profit_change ~ Age_num + Age_sq + Education_num + Employees_num + Years_num + Years_sq +
Gender + Race_binary + Ethnicity + Q2_helpfulness + Region,
data = profit_model_data
)
summary(lm_profit)
##
## Call:
## lm(formula = profit_change ~ Age_num + Age_sq + Education_num +
## Employees_num + Years_num + Years_sq + Gender + Race_binary +
## Ethnicity + Q2_helpfulness + Region, data = profit_model_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -144628 -5211 -2426 1188 89978
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2120.8 24613.3 -0.086 0.931
## Age_num -4659.1 5914.6 -0.788 0.432
## Age_sq 399.6 532.9 0.750 0.454
## Education_num -1164.4 904.0 -1.288 0.199
## Employees_num 1628.7 1286.9 1.266 0.207
## Years_num 1311.8 5439.5 0.241 0.810
## Years_sq -253.6 690.0 -0.367 0.714
## GenderMale 5334.3 4067.9 1.311 0.191
## GenderPrefer not to answer -7306.6 21949.9 -0.333 0.740
## Race_binaryWhite 439.2 5487.3 0.080 0.936
## EthnicityNot Hispanic/Latino 10711.3 15142.8 0.707 0.480
## Q2_helpfulness 988.1 1596.9 0.619 0.537
## RegionNorth West 1750.3 4307.0 0.406 0.685
## RegionSouth East 2009.6 3701.5 0.543 0.588
## RegionSouth West -263.1 3712.1 -0.071 0.944
##
## Residual standard error: 19970 on 241 degrees of freedom
## (33 observations deleted due to missingness)
## Multiple R-squared: 0.02611, Adjusted R-squared: -0.03046
## F-statistic: 0.4615 on 14 and 241 DF, p-value: 0.9512
profit_model_data_complete <- model_data_econ %>%
select(
profit_change,
Age_num, Education_num, Employees_num, Years_num,
Gender, Race_binary, Ethnicity, Q2_helpfulness, Region
) %>%
drop_na() %>%
mutate(
Age_sq = Age_num^2,
Years_sq = Years_num^2,
Employees_sq = Employees_num^2
)
lm_profit_complete <- lm(
profit_change ~ Age_num + Age_sq +
Years_num + Years_sq +
Employees_num + Employees_sq +
Education_num +
Gender + Race_binary + Ethnicity +
Q2_helpfulness + Region,
data = profit_model_data_complete
)
step_profit <- step(lm_profit_complete, direction = "both")
## Start: AIC=5085.96
## profit_change ~ Age_num + Age_sq + Years_num + Years_sq + Employees_num +
## Employees_sq + Education_num + Gender + Race_binary + Ethnicity +
## Q2_helpfulness + Region
##
## Df Sum of Sq RSS AIC
## - Region 3 249876048 9.6211e+10 5080.6
## - Gender 2 719087784 9.6680e+10 5083.9
## - Race_binary 1 2431225 9.5964e+10 5084.0
## - Years_num 1 25932219 9.5987e+10 5084.0
## - Years_sq 1 61230330 9.6022e+10 5084.1
## - Employees_sq 1 106422000 9.6068e+10 5084.2
## - Ethnicity 1 137402049 9.6099e+10 5084.3
## - Q2_helpfulness 1 156914674 9.6118e+10 5084.4
## - Age_sq 1 237429060 9.6199e+10 5084.6
## - Age_num 1 253479019 9.6215e+10 5084.6
## - Employees_num 1 324204744 9.6285e+10 5084.8
## - Education_num 1 607295561 9.6569e+10 5085.6
## <none> 9.5961e+10 5086.0
##
## Step: AIC=5080.63
## profit_change ~ Age_num + Age_sq + Years_num + Years_sq + Employees_num +
## Employees_sq + Education_num + Gender + Race_binary + Ethnicity +
## Q2_helpfulness
##
## Df Sum of Sq RSS AIC
## - Gender 2 698140894 9.6909e+10 5078.5
## - Race_binary 1 3397090 9.6215e+10 5078.6
## - Years_num 1 22875472 9.6234e+10 5078.7
## - Years_sq 1 57173350 9.6268e+10 5078.8
## - Employees_sq 1 129857920 9.6341e+10 5079.0
## - Ethnicity 1 157988500 9.6369e+10 5079.0
## - Q2_helpfulness 1 186466293 9.6398e+10 5079.1
## - Age_sq 1 224493920 9.6436e+10 5079.2
## - Age_num 1 235824837 9.6447e+10 5079.3
## - Employees_num 1 347331385 9.6558e+10 5079.5
## - Education_num 1 584780009 9.6796e+10 5080.2
## <none> 9.6211e+10 5080.6
## + Region 3 249876048 9.5961e+10 5086.0
##
## Step: AIC=5078.48
## profit_change ~ Age_num + Age_sq + Years_num + Years_sq + Employees_num +
## Employees_sq + Education_num + Race_binary + Ethnicity +
## Q2_helpfulness
##
## Df Sum of Sq RSS AIC
## - Years_num 1 25109686 9.6934e+10 5076.5
## - Race_binary 1 26655392 9.6936e+10 5076.5
## - Years_sq 1 51494975 9.6961e+10 5076.6
## - Ethnicity 1 88085574 9.6997e+10 5076.7
## - Q2_helpfulness 1 108872215 9.7018e+10 5076.8
## - Age_num 1 117129851 9.7026e+10 5076.8
## - Age_sq 1 118716178 9.7028e+10 5076.8
## - Employees_sq 1 184362206 9.7094e+10 5077.0
## - Employees_num 1 400476023 9.7310e+10 5077.5
## - Education_num 1 662741115 9.7572e+10 5078.2
## <none> 9.6909e+10 5078.5
## + Gender 2 698140894 9.6211e+10 5080.6
## + Region 3 228929158 9.6680e+10 5083.9
##
## Step: AIC=5076.54
## profit_change ~ Age_num + Age_sq + Years_sq + Employees_num +
## Employees_sq + Education_num + Race_binary + Ethnicity +
## Q2_helpfulness
##
## Df Sum of Sq RSS AIC
## - Race_binary 1 24637648 9.6959e+10 5074.6
## - Ethnicity 1 80076841 9.7014e+10 5074.8
## - Q2_helpfulness 1 98845680 9.7033e+10 5074.8
## - Age_num 1 107915761 9.7042e+10 5074.8
## - Age_sq 1 111250211 9.7046e+10 5074.8
## - Employees_sq 1 180818464 9.7115e+10 5075.0
## - Years_sq 1 192904028 9.7127e+10 5075.1
## - Employees_num 1 393166613 9.7328e+10 5075.6
## - Education_num 1 658938692 9.7593e+10 5076.3
## <none> 9.6934e+10 5076.5
## + Years_num 1 25109686 9.6909e+10 5078.5
## + Gender 2 700375108 9.6234e+10 5078.7
## + Region 3 228352718 9.6706e+10 5081.9
##
## Step: AIC=5074.61
## profit_change ~ Age_num + Age_sq + Years_sq + Employees_num +
## Employees_sq + Education_num + Ethnicity + Q2_helpfulness
##
## Df Sum of Sq RSS AIC
## - Q2_helpfulness 1 89691164 9.7049e+10 5072.8
## - Ethnicity 1 92718743 9.7052e+10 5072.9
## - Age_num 1 97154067 9.7056e+10 5072.9
## - Age_sq 1 101473362 9.7060e+10 5072.9
## - Employees_sq 1 183478009 9.7142e+10 5073.1
## - Years_sq 1 209636703 9.7169e+10 5073.2
## - Employees_num 1 395001016 9.7354e+10 5073.6
## - Education_num 1 647474556 9.7606e+10 5074.3
## <none> 9.6959e+10 5074.6
## + Race_binary 1 24637648 9.6934e+10 5076.5
## + Years_num 1 23091942 9.6936e+10 5076.5
## + Gender 2 722297734 9.6237e+10 5076.7
## + Region 3 230683723 9.6728e+10 5080.0
##
## Step: AIC=5072.85
## profit_change ~ Age_num + Age_sq + Years_sq + Employees_num +
## Employees_sq + Education_num + Ethnicity
##
## Df Sum of Sq RSS AIC
## - Ethnicity 1 79114803 9.7128e+10 5071.1
## - Age_num 1 101669431 9.7150e+10 5071.1
## - Age_sq 1 105932786 9.7155e+10 5071.1
## - Employees_sq 1 178306559 9.7227e+10 5071.3
## - Years_sq 1 203418538 9.7252e+10 5071.4
## - Employees_num 1 381627215 9.7430e+10 5071.8
## - Education_num 1 669177428 9.7718e+10 5072.6
## <none> 9.7049e+10 5072.8
## + Q2_helpfulness 1 89691164 9.6959e+10 5074.6
## + Race_binary 1 15483132 9.7033e+10 5074.8
## + Years_num 1 14201900 9.7034e+10 5074.8
## + Gender 2 639851741 9.6409e+10 5075.2
## + Region 3 256168158 9.6793e+10 5078.2
##
## Step: AIC=5071.05
## profit_change ~ Age_num + Age_sq + Years_sq + Employees_num +
## Employees_sq + Education_num
##
## Df Sum of Sq RSS AIC
## - Age_num 1 81507139 9.7209e+10 5069.3
## - Age_sq 1 84337516 9.7212e+10 5069.3
## - Years_sq 1 199846190 9.7328e+10 5069.6
## - Employees_sq 1 242538386 9.7370e+10 5069.7
## - Employees_num 1 434780198 9.7563e+10 5070.2
## - Education_num 1 638693243 9.7767e+10 5070.7
## <none> 9.7128e+10 5071.1
## + Ethnicity 1 79114803 9.7049e+10 5072.8
## + Q2_helpfulness 1 76087224 9.7052e+10 5072.9
## + Race_binary 1 25627414 9.7102e+10 5073.0
## + Years_num 1 8602700 9.7119e+10 5073.0
## + Gender 2 590380242 9.6537e+10 5073.5
## + Region 3 266594492 9.6861e+10 5076.4
##
## Step: AIC=5069.27
## profit_change ~ Age_sq + Years_sq + Employees_num + Employees_sq +
## Education_num
##
## Df Sum of Sq RSS AIC
## - Age_sq 1 2846461 9.7212e+10 5067.3
## - Employees_sq 1 224036054 9.7433e+10 5067.9
## - Years_sq 1 246562286 9.7456e+10 5067.9
## - Employees_num 1 413312662 9.7623e+10 5068.4
## - Education_num 1 653750817 9.7863e+10 5069.0
## <none> 9.7209e+10 5069.3
## + Age_num 1 81507139 9.7128e+10 5071.1
## + Q2_helpfulness 1 81448508 9.7128e+10 5071.1
## + Ethnicity 1 58952511 9.7150e+10 5071.1
## + Race_binary 1 14058944 9.7195e+10 5071.2
## + Years_num 1 4734004 9.7205e+10 5071.3
## + Gender 2 493787469 9.6716e+10 5072.0
## + Region 3 252918946 9.6956e+10 5074.6
##
## Step: AIC=5067.28
## profit_change ~ Years_sq + Employees_num + Employees_sq + Education_num
##
## Df Sum of Sq RSS AIC
## - Employees_sq 1 221635381 9.7434e+10 5065.9
## - Years_sq 1 252961337 9.7465e+10 5065.9
## - Employees_num 1 421416235 9.7634e+10 5066.4
## - Education_num 1 653906501 9.7866e+10 5067.0
## <none> 9.7212e+10 5067.3
## + Q2_helpfulness 1 81507150 9.7131e+10 5069.1
## + Ethnicity 1 57402813 9.7155e+10 5069.1
## + Race_binary 1 14874693 9.7197e+10 5069.2
## + Years_num 1 5564335 9.7207e+10 5069.3
## + Age_sq 1 2846461 9.7209e+10 5069.3
## + Age_num 1 16083 9.7212e+10 5069.3
## + Gender 2 495976936 9.6716e+10 5070.0
## + Region 3 254844049 9.6957e+10 5072.6
##
## Step: AIC=5065.86
## profit_change ~ Years_sq + Employees_num + Education_num
##
## Df Sum of Sq RSS AIC
## - Years_sq 1 201524721 9.7635e+10 5064.4
## - Employees_num 1 416884560 9.7851e+10 5065.0
## - Education_num 1 721542080 9.8155e+10 5065.7
## <none> 9.7434e+10 5065.9
## + Employees_sq 1 221635381 9.7212e+10 5067.3
## + Ethnicity 1 116006583 9.7318e+10 5067.6
## + Q2_helpfulness 1 70564304 9.7363e+10 5067.7
## + Race_binary 1 21001541 9.7413e+10 5067.8
## + Age_num 1 3991698 9.7430e+10 5067.8
## + Years_num 1 2193234 9.7432e+10 5067.9
## + Age_sq 1 445788 9.7433e+10 5067.9
## + Gender 2 532755708 9.6901e+10 5068.5
## + Region 3 290296921 9.7144e+10 5071.1
##
## Step: AIC=5064.39
## profit_change ~ Employees_num + Education_num
##
## Df Sum of Sq RSS AIC
## - Employees_num 1 397830478 9.8033e+10 5063.4
## - Education_num 1 702797156 9.8338e+10 5064.2
## <none> 9.7635e+10 5064.4
## + Years_sq 1 201524721 9.7434e+10 5065.9
## + Years_num 1 190562300 9.7445e+10 5065.9
## + Employees_sq 1 170198765 9.7465e+10 5065.9
## + Ethnicity 1 101942308 9.7533e+10 5066.1
## + Q2_helpfulness 1 67393930 9.7568e+10 5066.2
## + Age_num 1 37446621 9.7598e+10 5066.3
## + Race_binary 1 29177459 9.7606e+10 5066.3
## + Age_sq 1 20575366 9.7615e+10 5066.3
## + Gender 2 388825962 9.7246e+10 5067.4
## + Region 3 296394359 9.7339e+10 5069.6
##
## Step: AIC=5063.43
## profit_change ~ Education_num
##
## Df Sum of Sq RSS AIC
## - Education_num 1 610232759 9.8643e+10 5063.0
## <none> 9.8033e+10 5063.4
## + Employees_num 1 397830478 9.7635e+10 5064.4
## + Employees_sq 1 219518937 9.7814e+10 5064.9
## + Years_sq 1 182470639 9.7851e+10 5065.0
## + Years_num 1 176735654 9.7856e+10 5065.0
## + Age_num 1 128031410 9.7905e+10 5065.1
## + Age_sq 1 97199610 9.7936e+10 5065.2
## + Q2_helpfulness 1 54983202 9.7978e+10 5065.3
## + Ethnicity 1 33151619 9.8000e+10 5065.3
## + Race_binary 1 14490242 9.8019e+10 5065.4
## + Gender 2 315432468 9.7718e+10 5066.6
## + Region 3 233532300 9.7800e+10 5068.8
##
## Step: AIC=5063.02
## profit_change ~ 1
##
## Df Sum of Sq RSS AIC
## <none> 9.8643e+10 5063.0
## + Education_num 1 610232759 9.8033e+10 5063.4
## + Employees_num 1 305266082 9.8338e+10 5064.2
## + Years_sq 1 167903242 9.8475e+10 5064.6
## + Years_num 1 163788840 9.8480e+10 5064.6
## + Employees_sq 1 139478631 9.8504e+10 5064.7
## + Age_num 1 117236919 9.8526e+10 5064.7
## + Age_sq 1 86158035 9.8557e+10 5064.8
## + Q2_helpfulness 1 74838766 9.8569e+10 5064.8
## + Ethnicity 1 23637571 9.8620e+10 5065.0
## + Race_binary 1 5448408 9.8638e+10 5065.0
## + Gender 2 379741104 9.8264e+10 5066.0
## + Region 3 251002658 9.8392e+10 5068.4
summary(step_profit)
##
## Call:
## lm(formula = profit_change ~ 1, data = profit_model_data_complete)
##
## Residuals:
## Min 1Q Median 3Q Max
## -146924 -3424 -3424 -1424 96576
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3424 1229 2.786 0.00574 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 19670 on 255 degrees of freedom
savings_model_data_complete <- model_data_econ %>%
select(
savings_change,
Age_num, Education_num, Employees_num, Years_num,
Gender, Race_binary, Ethnicity, Q2_helpfulness, Region
) %>%
drop_na() %>%
mutate(
Age_sq = Age_num^2,
Years_sq = Years_num^2,
Employees_sq = Employees_num^2
)
lm_savings_complete <- lm(
savings_change ~ Age_num + Age_sq +
Years_num + Years_sq +
Employees_num + Employees_sq +
Education_num +
Gender + Race_binary + Ethnicity +
Q2_helpfulness + Region,
data = savings_model_data_complete
)
step_savings <- step(lm_savings_complete, direction = "both")
## Start: AIC=4950.46
## savings_change ~ Age_num + Age_sq + Years_num + Years_sq + Employees_num +
## Employees_sq + Education_num + Gender + Race_binary + Ethnicity +
## Q2_helpfulness + Region
##
## Df Sum of Sq RSS AIC
## - Gender 2 34392193 3.0503e+10 4946.8
## - Region 3 437660322 3.0906e+10 4948.2
## - Years_sq 1 1642807 3.0470e+10 4948.5
## - Years_num 1 17096599 3.0486e+10 4948.6
## - Age_sq 1 32233122 3.0501e+10 4948.7
## - Ethnicity 1 35605013 3.0504e+10 4948.8
## - Employees_sq 1 39964530 3.0509e+10 4948.8
## - Race_binary 1 52496602 3.0521e+10 4948.9
## - Employees_num 1 58099761 3.0527e+10 4949.0
## - Education_num 1 69828779 3.0539e+10 4949.1
## - Age_num 1 110574425 3.0579e+10 4949.4
## <none> 3.0469e+10 4950.5
## - Q2_helpfulness 1 792518321 3.1261e+10 4955.3
##
## Step: AIC=4946.76
## savings_change ~ Age_num + Age_sq + Years_num + Years_sq + Employees_num +
## Employees_sq + Education_num + Race_binary + Ethnicity +
## Q2_helpfulness + Region
##
## Df Sum of Sq RSS AIC
## - Region 3 439715398 3.0943e+10 4944.6
## - Years_sq 1 1768538 3.0505e+10 4944.8
## - Years_num 1 17699274 3.0521e+10 4944.9
## - Age_sq 1 20099470 3.0523e+10 4944.9
## - Ethnicity 1 30288236 3.0533e+10 4945.0
## - Employees_sq 1 38148883 3.0541e+10 4945.1
## - Employees_num 1 54706856 3.0558e+10 4945.2
## - Race_binary 1 74559276 3.0578e+10 4945.4
## - Education_num 1 78884349 3.0582e+10 4945.4
## - Age_num 1 87763936 3.0591e+10 4945.5
## <none> 3.0503e+10 4946.8
## + Gender 2 34392193 3.0469e+10 4950.5
## - Q2_helpfulness 1 790916506 3.1294e+10 4951.5
##
## Step: AIC=4944.55
## savings_change ~ Age_num + Age_sq + Years_num + Years_sq + Employees_num +
## Employees_sq + Education_num + Race_binary + Ethnicity +
## Q2_helpfulness
##
## Df Sum of Sq RSS AIC
## - Years_sq 1 6626068 3.0950e+10 4942.6
## - Age_sq 1 13591785 3.0956e+10 4942.7
## - Ethnicity 1 16452076 3.0959e+10 4942.7
## - Years_num 1 30968968 3.0974e+10 4942.8
## - Employees_sq 1 39745519 3.0983e+10 4942.9
## - Employees_num 1 52581238 3.0995e+10 4943.0
## - Age_num 1 68576285 3.1011e+10 4943.1
## - Education_num 1 77801872 3.1021e+10 4943.2
## - Race_binary 1 82309991 3.1025e+10 4943.3
## <none> 3.0943e+10 4944.6
## + Region 3 439715398 3.0503e+10 4946.8
## + Gender 2 36447270 3.0906e+10 4948.2
## - Q2_helpfulness 1 887948882 3.1831e+10 4950.1
##
## Step: AIC=4942.61
## savings_change ~ Age_num + Age_sq + Years_num + Employees_num +
## Employees_sq + Education_num + Race_binary + Ethnicity +
## Q2_helpfulness
##
## Df Sum of Sq RSS AIC
## - Age_sq 1 12890329 3.0962e+10 4940.7
## - Ethnicity 1 14789552 3.0964e+10 4940.7
## - Employees_sq 1 43272154 3.0993e+10 4941.0
## - Employees_num 1 56825185 3.1006e+10 4941.1
## - Age_num 1 66484784 3.1016e+10 4941.2
## - Education_num 1 77899541 3.1027e+10 4941.3
## - Race_binary 1 80227980 3.1030e+10 4941.3
## <none> 3.0950e+10 4942.6
## - Years_num 1 351370676 3.1301e+10 4943.6
## + Years_sq 1 6626068 3.0943e+10 4944.6
## + Region 3 444572928 3.0505e+10 4944.8
## + Gender 2 36906031 3.0913e+10 4946.3
## - Q2_helpfulness 1 881410685 3.1831e+10 4948.1
##
## Step: AIC=4940.72
## savings_change ~ Age_num + Years_num + Employees_num + Employees_sq +
## Education_num + Race_binary + Ethnicity + Q2_helpfulness
##
## Df Sum of Sq RSS AIC
## - Ethnicity 1 13631491 3.0976e+10 4938.8
## - Employees_sq 1 51465384 3.1014e+10 4939.2
## - Employees_num 1 66552187 3.1029e+10 4939.3
## - Education_num 1 76029139 3.1038e+10 4939.4
## - Race_binary 1 76293335 3.1039e+10 4939.4
## <none> 3.0962e+10 4940.7
## - Years_num 1 340227869 3.1303e+10 4941.6
## + Age_sq 1 12890329 3.0950e+10 4942.6
## + Years_sq 1 5924612 3.0956e+10 4942.7
## + Region 3 437755639 3.0525e+10 4942.9
## + Gender 2 25978258 3.0936e+10 4944.5
## - Age_num 1 734767698 3.1697e+10 4944.9
## - Q2_helpfulness 1 878910857 3.1841e+10 4946.1
##
## Step: AIC=4938.84
## savings_change ~ Age_num + Years_num + Employees_num + Employees_sq +
## Education_num + Race_binary + Q2_helpfulness
##
## Df Sum of Sq RSS AIC
## - Employees_sq 1 43431412 3.1019e+10 4937.2
## - Employees_num 1 59951114 3.1036e+10 4937.4
## - Education_num 1 81628050 3.1058e+10 4937.5
## - Race_binary 1 95014878 3.1071e+10 4937.7
## <none> 3.0976e+10 4938.8
## - Years_num 1 343743487 3.1320e+10 4939.8
## + Ethnicity 1 13631491 3.0962e+10 4940.7
## + Age_sq 1 11732268 3.0964e+10 4940.7
## + Years_sq 1 4444705 3.0972e+10 4940.8
## + Region 3 424286994 3.0552e+10 4941.2
## + Gender 2 22369987 3.0954e+10 4942.6
## - Age_num 1 739994959 3.1716e+10 4943.1
## - Q2_helpfulness 1 867933258 3.1844e+10 4944.2
##
## Step: AIC=4937.21
## savings_change ~ Age_num + Years_num + Employees_num + Education_num +
## Race_binary + Q2_helpfulness
##
## Df Sum of Sq RSS AIC
## - Employees_num 1 21570814 3.1041e+10 4935.4
## - Race_binary 1 84288108 3.1104e+10 4935.9
## - Education_num 1 96355553 3.1116e+10 4936.0
## <none> 3.1019e+10 4937.2
## - Years_num 1 311748801 3.1331e+10 4937.9
## + Employees_sq 1 43431412 3.0976e+10 4938.8
## + Age_sq 1 19439246 3.1000e+10 4939.0
## + Years_sq 1 7882401 3.1012e+10 4939.1
## + Ethnicity 1 5597518 3.1014e+10 4939.2
## + Region 3 429924530 3.0590e+10 4939.5
## + Gender 2 19578695 3.1000e+10 4941.0
## - Age_num 1 705120967 3.1725e+10 4941.2
## - Q2_helpfulness 1 883702352 3.1903e+10 4942.7
##
## Step: AIC=4935.39
## savings_change ~ Age_num + Years_num + Education_num + Race_binary +
## Q2_helpfulness
##
## Df Sum of Sq RSS AIC
## - Race_binary 1 86206596 3.1127e+10 4934.1
## - Education_num 1 86694151 3.1128e+10 4934.1
## <none> 3.1041e+10 4935.4
## - Years_num 1 299446819 3.1340e+10 4935.9
## + Employees_num 1 21570814 3.1019e+10 4937.2
## + Age_sq 1 20343410 3.1021e+10 4937.2
## + Ethnicity 1 9382022 3.1032e+10 4937.3
## + Years_sq 1 8064700 3.1033e+10 4937.3
## + Employees_sq 1 5051112 3.1036e+10 4937.4
## + Region 3 418792581 3.0622e+10 4937.8
## + Gender 2 17687721 3.1023e+10 4939.2
## - Age_num 1 697642229 3.1739e+10 4939.3
## - Q2_helpfulness 1 899968440 3.1941e+10 4941.0
##
## Step: AIC=4934.13
## savings_change ~ Age_num + Years_num + Education_num + Q2_helpfulness
##
## Df Sum of Sq RSS AIC
## - Education_num 1 102562250 3.1230e+10 4933.0
## <none> 3.1127e+10 4934.1
## - Years_num 1 274404385 3.1402e+10 4934.5
## + Race_binary 1 86206596 3.1041e+10 4935.4
## + Ethnicity 1 25960648 3.1101e+10 4935.9
## + Employees_num 1 23489302 3.1104e+10 4935.9
## + Age_sq 1 13256116 3.1114e+10 4936.0
## + Employees_sq 1 7371427 3.1120e+10 4936.1
## + Years_sq 1 4529548 3.1123e+10 4936.1
## + Region 3 419527886 3.0708e+10 4936.5
## - Age_num 1 661104471 3.1788e+10 4937.7
## + Gender 2 35677748 3.1092e+10 4937.8
## - Q2_helpfulness 1 840754028 3.1968e+10 4939.2
##
## Step: AIC=4933
## savings_change ~ Age_num + Years_num + Q2_helpfulness
##
## Df Sum of Sq RSS AIC
## <none> 3.1230e+10 4933.0
## - Years_num 1 268241482 3.1498e+10 4933.3
## + Education_num 1 102562250 3.1127e+10 4934.1
## + Race_binary 1 102074695 3.1128e+10 4934.1
## + Ethnicity 1 31817622 3.1198e+10 4934.7
## + Employees_num 1 12710008 3.1217e+10 4934.9
## + Age_sq 1 11291015 3.1219e+10 4934.9
## + Years_sq 1 4586972 3.1225e+10 4935.0
## + Employees_sq 1 1382305 3.1228e+10 4935.0
## + Region 3 420257796 3.0810e+10 4935.4
## + Gender 2 44793274 3.1185e+10 4936.6
## - Age_num 1 672862743 3.1903e+10 4936.6
## - Q2_helpfulness 1 816959623 3.2047e+10 4937.8
summary(step_savings)
##
## Call:
## lm(formula = savings_change ~ Age_num + Years_num + Q2_helpfulness,
## data = savings_model_data_complete)
##
## Residuals:
## Min 1Q Median 3Q Max
## -94931 -1324 998 2850 61105
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3656.8 4543.1 0.805 0.4216
## Age_num 1163.2 490.5 2.371 0.0184 *
## Years_num -688.9 460.1 -1.497 0.1355
## Q2_helpfulness -2219.5 849.4 -2.613 0.0095 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10940 on 261 degrees of freedom
## Multiple R-squared: 0.04668, Adjusted R-squared: 0.03572
## F-statistic: 4.26 on 3 and 261 DF, p-value: 0.005854