pacman::p_load(
tidyverse, # includes ggplot2 and other
stringr, # working with characters
scales, # transform numbers
ggrepel, # smartly-placed labels
gghighlight, # highlight one part of plot
RColorBrewer, # color scales
tinytex,
DAAG
)
## The following packages have been unloaded:
## tidyverse
## Installing package into 'C:/Users/naima/AppData/Local/R/win-library/4.2'
## (as 'lib' is unspecified)
## also installing the dependency 'cli'
## Warning: unable to access index for repository http://www.stats.ox.ac.uk/pub/RWin/bin/windows/contrib/4.2:
## cannot open URL 'http://www.stats.ox.ac.uk/pub/RWin/bin/windows/contrib/4.2/PACKAGES'
## package 'cli' successfully unpacked and MD5 sums checked
## Warning: cannot remove prior installation of package 'cli'
## Warning in file.copy(savedcopy, lib, recursive = TRUE): problem copying
## C:\Users\naima\AppData\Local\R\win-library\4.2\00LOCK\cli\libs\x64\cli.dll to
## C:\Users\naima\AppData\Local\R\win-library\4.2\cli\libs\x64\cli.dll: Permission
## denied
## Warning: restored 'cli'
## package 'tidyverse' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\naima\AppData\Local\Temp\Rtmpcncgqx\downloaded_packages
##
## tidyverse installed
## Warning: package 'tidyverse' was built under R version 4.2.3
## Warning: package 'tibble' was built under R version 4.2.3
## Warning: package 'tidyr' was built under R version 4.2.3
## Warning in pacman::p_load(tidyverse, stringr, scales, ggrepel, gghighlight, : Failed to install/load:
## tidyverse
the data that we have
# (Tu): I changed the initial dataset
install.packages("readxl")
## Installing package into 'C:/Users/naima/AppData/Local/R/win-library/4.2'
## (as 'lib' is unspecified)
## Warning: unable to access index for repository http://www.stats.ox.ac.uk/pub/RWin/bin/windows/contrib/4.2:
## cannot open URL 'http://www.stats.ox.ac.uk/pub/RWin/bin/windows/contrib/4.2/PACKAGES'
## package 'readxl' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\naima\AppData\Local\Temp\Rtmpcncgqx\downloaded_packages
library(readxl)
## Warning: package 'readxl' was built under R version 4.2.3
#set working directory (path) and read dataset
setwd("C:/Users/naima/OneDrive/Documenten/R_studio_files/ARM_assignement")
clean_data2 <- read_excel("~/R_studio_files/ARM_assignement/clean_data_Tu.xlsx")
#clean_data2<-read_excel("C:/Users/HA TU/OneDrive - Universiteit Antwerpen/Uantwerp study/semester 1.2/ARM/cleaning data/data exported from REDCAp/clean_data_Tu.xlsx")
head(clean_data2)
## # A tibble: 6 × 37
## record_id woonplaats cg_participate cg_time cg_freq cg_new home_gard
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 12 2 2 NA NA 0 2
## 2 14 1 2 NA NA 0 2
## 3 15 1 2 NA NA 0 2
## 4 16 2 2 NA NA 0 2
## 5 17 2 2 NA NA 0 2
## 6 18 2 2 NA NA 0 2
## # ℹ 30 more variables: supermarket <dbl>, sparet1 <dbl>, sparet2 <dbl>,
## # fruit_freq <dbl>, fruit_medium <dbl>, fruit_small <dbl>,
## # fruit_groot_schijf <dbl>, fruit_big <chr>, fruit_handful <dbl>,
## # fruit_small2 <dbl>, fruit_handful_cherish <dbl>, fuit_grapes <dbl>,
## # fruit_lychees <dbl>, fruit_salad <dbl>, fruit_juice <dbl>, veg_freq <dbl>,
## # veg_cooked <dbl>, veg_cooked_spoon <dbl>, veg_green_brocolli_cauli <dbl>,
## # veg_legume <dbl>, veg_lettuce <dbl>, veg_raw <dbl>, veg_juice <dbl>, …
Descriptive Statistics
Inferencial Statistics Exposure: We need two groups CG-participats only if for more than 6 months and more than once a week
Outcome: On an average day not average week (exclude days per week)
Sum: fruit and vegetables
install.packages("magrittr") # package installations are only needed the first time you use it
## Installing package into 'C:/Users/naima/AppData/Local/R/win-library/4.2'
## (as 'lib' is unspecified)
## Warning: unable to access index for repository http://www.stats.ox.ac.uk/pub/RWin/bin/windows/contrib/4.2:
## cannot open URL 'http://www.stats.ox.ac.uk/pub/RWin/bin/windows/contrib/4.2/PACKAGES'
## package 'magrittr' successfully unpacked and MD5 sums checked
## Warning: cannot remove prior installation of package 'magrittr'
## Warning in file.copy(savedcopy, lib, recursive = TRUE): problem copying
## C:\Users\naima\AppData\Local\R\win-library\4.2\00LOCK\magrittr\libs\x64\magrittr.dll
## to
## C:\Users\naima\AppData\Local\R\win-library\4.2\magrittr\libs\x64\magrittr.dll:
## Permission denied
## Warning: restored 'magrittr'
##
## The downloaded binary packages are in
## C:\Users\naima\AppData\Local\Temp\Rtmpcncgqx\downloaded_packages
install.packages("dplyr") # alternative installation of the %>%
## Installing package into 'C:/Users/naima/AppData/Local/R/win-library/4.2'
## (as 'lib' is unspecified)
## Warning: unable to access index for repository http://www.stats.ox.ac.uk/pub/RWin/bin/windows/contrib/4.2:
## cannot open URL 'http://www.stats.ox.ac.uk/pub/RWin/bin/windows/contrib/4.2/PACKAGES'
## package 'dplyr' successfully unpacked and MD5 sums checked
## Warning: cannot remove prior installation of package 'dplyr'
## Warning in file.copy(savedcopy, lib, recursive = TRUE): problem copying
## C:\Users\naima\AppData\Local\R\win-library\4.2\00LOCK\dplyr\libs\x64\dplyr.dll
## to C:\Users\naima\AppData\Local\R\win-library\4.2\dplyr\libs\x64\dplyr.dll:
## Permission denied
## Warning: restored 'dplyr'
##
## The downloaded binary packages are in
## C:\Users\naima\AppData\Local\Temp\Rtmpcncgqx\downloaded_packages
library(magrittr) # needs to be run every time you start R and want to use %>%
## Warning: package 'magrittr' was built under R version 4.2.3
library(dplyr) # alternatively, this also loads %>%
## Warning: package 'dplyr' was built under R version 4.2.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
min_max_table <- sapply(clean_data2, function(x) c(Min = min(x, na.rm = TRUE), Max = max(x, na.rm = TRUE)))
min_max_table <- as.data.frame(min_max_table)
min_max_table
## record_id woonplaats cg_participate cg_time cg_freq cg_new home_gard
## Min 12 1 1 1 1 0 1
## Max 160 2 2 2 2 1 2
## supermarket sparet1 sparet2 fruit_freq fruit_medium fruit_small
## Min 2 0 1 0 0 0
## Max 5 24 24 72 5 4
## fruit_groot_schijf fruit_big fruit_handful fruit_small2
## Min 0 0 0 0
## Max 2 3 7 2
## fruit_handful_cherish fuit_grapes fruit_lychees fruit_salad fruit_juice
## Min 0 0 0 0 0
## Max 2 5 1 4 7
## veg_freq veg_cooked veg_cooked_spoon veg_green_brocolli_cauli veg_legume
## Min 3 0 0 0 0
## Max 715 222 20 5 5
## veg_lettuce veg_raw veg_juice age gender demo_weight height ethnicity edu
## Min 0 0 0 20 1 43 1.7 1 1
## Max 8 7 7 81 4 105 190 8 5
## urban_harvest_q_complete
## Min 0
## Max 2
# <Tu> extract these record with invalid values
clean_data2 %>%
filter(veg_freq==715|veg_cooked==222|sparet1==24|sparet2==24)
## # A tibble: 3 × 37
## record_id woonplaats cg_participate cg_time cg_freq cg_new home_gard
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 71 1 2 NA NA 0 2
## 2 96 2 1 2 1 1 1
## 3 128 2 1 2 1 1 1
## # ℹ 30 more variables: supermarket <dbl>, sparet1 <dbl>, sparet2 <dbl>,
## # fruit_freq <dbl>, fruit_medium <dbl>, fruit_small <dbl>,
## # fruit_groot_schijf <dbl>, fruit_big <chr>, fruit_handful <dbl>,
## # fruit_small2 <dbl>, fruit_handful_cherish <dbl>, fuit_grapes <dbl>,
## # fruit_lychees <dbl>, fruit_salad <dbl>, fruit_juice <dbl>, veg_freq <dbl>,
## # veg_cooked <dbl>, veg_cooked_spoon <dbl>, veg_green_brocolli_cauli <dbl>,
## # veg_legume <dbl>, veg_lettuce <dbl>, veg_raw <dbl>, veg_juice <dbl>, …
# <Tu> As these values are on key covariates, I will exclude these records (71, 96, 128) later in the analysis
Now to collapse the fruit variables to a new variable called fruit_avrg (for that I needed to change all these variables to numeric)
fruit_variables <- c("fruit_medium", "fruit_small", "fruit_groot_schijf", "fruit_big", "fruit_handful",
"fruit_small2", "fruit_handful_cherish", "fuit_grapes", "fruit_lychees", "fruit_salad",
"fruit_juice")
# Convert the variables to numeric
clean_data2[, fruit_variables] <- lapply(clean_data2[, fruit_variables], as.numeric)
clean_data2$fruit_avrg <- rowSums(clean_data2[, fruit_variables], na.rm = TRUE)
head(clean_data2)
## # A tibble: 6 × 38
## record_id woonplaats cg_participate cg_time cg_freq cg_new home_gard
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 12 2 2 NA NA 0 2
## 2 14 1 2 NA NA 0 2
## 3 15 1 2 NA NA 0 2
## 4 16 2 2 NA NA 0 2
## 5 17 2 2 NA NA 0 2
## 6 18 2 2 NA NA 0 2
## # ℹ 31 more variables: supermarket <dbl>, sparet1 <dbl>, sparet2 <dbl>,
## # fruit_freq <dbl>, fruit_medium <dbl>, fruit_small <dbl>,
## # fruit_groot_schijf <dbl>, fruit_big <dbl>, fruit_handful <dbl>,
## # fruit_small2 <dbl>, fruit_handful_cherish <dbl>, fuit_grapes <dbl>,
## # fruit_lychees <dbl>, fruit_salad <dbl>, fruit_juice <dbl>, veg_freq <dbl>,
## # veg_cooked <dbl>, veg_cooked_spoon <dbl>, veg_green_brocolli_cauli <dbl>,
## # veg_legume <dbl>, veg_lettuce <dbl>, veg_raw <dbl>, veg_juice <dbl>, …
Now the same for vegetables (new variable called veg_avrg)
veg_variables <- c("veg_cooked", "veg_cooked_spoon", "veg_green_brocolli_cauli", "veg_legume", "veg_lettuce",
"veg_raw", "veg_juice")
# Convert the variables to numeric
clean_data2[, veg_variables] <- lapply(clean_data2[, veg_variables], as.numeric)
clean_data2$veg_avrg <- rowSums(clean_data2[, veg_variables], na.rm = TRUE)
head(clean_data2)
## # A tibble: 6 × 39
## record_id woonplaats cg_participate cg_time cg_freq cg_new home_gard
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 12 2 2 NA NA 0 2
## 2 14 1 2 NA NA 0 2
## 3 15 1 2 NA NA 0 2
## 4 16 2 2 NA NA 0 2
## 5 17 2 2 NA NA 0 2
## 6 18 2 2 NA NA 0 2
## # ℹ 32 more variables: supermarket <dbl>, sparet1 <dbl>, sparet2 <dbl>,
## # fruit_freq <dbl>, fruit_medium <dbl>, fruit_small <dbl>,
## # fruit_groot_schijf <dbl>, fruit_big <dbl>, fruit_handful <dbl>,
## # fruit_small2 <dbl>, fruit_handful_cherish <dbl>, fuit_grapes <dbl>,
## # fruit_lychees <dbl>, fruit_salad <dbl>, fruit_juice <dbl>, veg_freq <dbl>,
## # veg_cooked <dbl>, veg_cooked_spoon <dbl>, veg_green_brocolli_cauli <dbl>,
## # veg_legume <dbl>, veg_lettuce <dbl>, veg_raw <dbl>, veg_juice <dbl>, …
Now to remove the single variables that are not needed anymore
clean_data3 <- subset(clean_data2, select = -c(fruit_medium, fruit_small, fruit_groot_schijf, fruit_big, fruit_handful,
fruit_small2, fruit_handful_cherish, fuit_grapes, fruit_lychees, fruit_salad,
fruit_juice, veg_cooked, veg_cooked_spoon, veg_green_brocolli_cauli, veg_legume, veg_lettuce,
veg_raw, veg_juice, urban_harvest_q_complete
))
head(clean_data3)
## # A tibble: 6 × 20
## record_id woonplaats cg_participate cg_time cg_freq cg_new home_gard
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 12 2 2 NA NA 0 2
## 2 14 1 2 NA NA 0 2
## 3 15 1 2 NA NA 0 2
## 4 16 2 2 NA NA 0 2
## 5 17 2 2 NA NA 0 2
## 6 18 2 2 NA NA 0 2
## # ℹ 13 more variables: supermarket <dbl>, sparet1 <dbl>, sparet2 <dbl>,
## # fruit_freq <dbl>, veg_freq <dbl>, age <dbl>, gender <dbl>,
## # demo_weight <dbl>, height <dbl>, ethnicity <dbl>, edu <dbl>,
## # fruit_avrg <dbl>, veg_avrg <dbl>
Now to add the combined variable of fruit and vegetables comb_avrg
comb_variables <- c("fruit_avrg", "veg_avrg")
# Convert the variables to numeric
clean_data3[, comb_variables] <- lapply(clean_data3[, comb_variables], as.numeric)
clean_data3$comb_avrg <- rowSums(clean_data3[, comb_variables], na.rm = TRUE)
head(clean_data3)
## # A tibble: 6 × 21
## record_id woonplaats cg_participate cg_time cg_freq cg_new home_gard
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 12 2 2 NA NA 0 2
## 2 14 1 2 NA NA 0 2
## 3 15 1 2 NA NA 0 2
## 4 16 2 2 NA NA 0 2
## 5 17 2 2 NA NA 0 2
## 6 18 2 2 NA NA 0 2
## # ℹ 14 more variables: supermarket <dbl>, sparet1 <dbl>, sparet2 <dbl>,
## # fruit_freq <dbl>, veg_freq <dbl>, age <dbl>, gender <dbl>,
## # demo_weight <dbl>, height <dbl>, ethnicity <dbl>, edu <dbl>,
## # fruit_avrg <dbl>, veg_avrg <dbl>, comb_avrg <dbl>
Now to collabs the spare time variables (they will remain in clean_data5) Matteo: I will go with weekly here (to make it dayly it just needs to be divided by 7) Tu: I add new variable comb_avrg_collapse to catogorize the outcome
library(dplyr)
clean_data4 <- clean_data3 %>%
mutate(sparet_w = sparet1 * 5 + sparet2 * 2,
comb_avrg_collapse = ifelse(comb_avrg>=5,1,0))
clean_data5 <- subset(clean_data4, select = -c(sparet1, sparet2,
cg_time, cg_freq,cg_participate))
head(clean_data5)
## # A tibble: 6 × 18
## record_id woonplaats cg_new home_gard supermarket fruit_freq veg_freq age
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 12 2 0 2 4 1 6 34
## 2 14 1 0 2 4 72 7 33
## 3 15 1 0 2 4 7 7 32
## 4 16 2 0 2 5 6 7 35
## 5 17 2 0 2 4 3 6 41
## 6 18 2 0 2 3 4 7 23
## # ℹ 10 more variables: gender <dbl>, demo_weight <dbl>, height <dbl>,
## # ethnicity <dbl>, edu <dbl>, fruit_avrg <dbl>, veg_avrg <dbl>,
## # comb_avrg <dbl>, sparet_w <dbl>, comb_avrg_collapse <dbl>
# exclude record with invalid values
clean_data_exclude_invalid <- clean_data5 %>%
filter(record_id!=71 &
record_id!=96 &
record_id!=128)
Descriptive Statistics
lets go
1. How many participates eat 5 or more portions of fruit or vegetable per day?
participants_comb_avg_5_or_more <- clean_data_exclude_invalid %>%
filter(comb_avrg >= 5) %>%
nrow()
participants_comb_avg_5_or_more
## [1] 85
# Tu: add proportions
participants_comb_avg_5_or_more/ nrow(clean_data_exclude_invalid)*100
## [1] 85.85859
2. How many participants eat fruit/vegeatable every of day the week? - How many participants eat fruit every day of the week?
sum(clean_data_exclude_invalid$fruit_freq == 7)
## [1] 46
sum(clean_data_exclude_invalid$veg_freq == 7)
## [1] 71
sum(clean_data_exclude_invalid$fruit_freq == 7 | clean_data_exclude_invalid$veg_freq == 7)
## [1] 78
sum(clean_data_exclude_invalid$fruit_freq == 7 & clean_data_exclude_invalid$veg_freq == 7)
## [1] 39
`` Compare daily intake of fruit/veg in cg vs ncg
# Subset the data for community gardeners and non-community gardeners
community_gardeners <- subset(clean_data_exclude_invalid, cg_new == 1)
non_community_gardeners <- subset(clean_data_exclude_invalid, cg_new == 0)
# Calculate the proportion of community gardeners and non-community gardeners meeting the criteria
prop_community_gardeners <- mean(community_gardeners$fruit_freq == 7 & community_gardeners$veg_freq == 7)
prop_non_community_gardeners <- mean(non_community_gardeners$fruit_freq == 7 & non_community_gardeners$veg_freq == 7)
# Print the proportions
print(prop_community_gardeners)
## [1] 0.5862069
print(prop_non_community_gardeners)
## [1] 0.3142857
significance in proportion who have a daily intake in fruit and veg
# Sample sizes for each group (assuming you have this information)
n_community_gardeners <-31
n_non_community_gardeners <- 71
# Calculate the pooled proportion
p_pooled <- (prop_community_gardeners * n_community_gardeners + prop_non_community_gardeners * n_non_community_gardeners) / (n_community_gardeners + n_non_community_gardeners)
# Calculate the standard error
se <- sqrt(p_pooled * (1 - p_pooled) * (1 / n_community_gardeners + 1 / n_non_community_gardeners))
# Calculate the test statistic (z-score)
z <- (prop_community_gardeners - prop_non_community_gardeners) / se
# Calculate the p-value
p_value <- 2 * pnorm(-abs(z)) # 2-tailed test
# Print the test statistic and p-value
print(z)
## [1] 2.58174
print(p_value)
## [1] 0.009830371
plot daily intake cg vs non-cg
library(ggplot2)
# Data
prop_community_gardeners <- 0.5862069
prop_non_community_gardeners <- 0.3142857
# Create a data frame for plotting
data <- data.frame(
Group = c("Community Gardeners", "Non-Community Gardeners"),
Proportion = c(prop_community_gardeners, prop_non_community_gardeners)
)
# Create the bar plot
ggplot(data, aes(x = Group, y = Proportion, fill = Group)) +
geom_bar(stat = "identity", position = "dodge", width = 0.5) +
labs(
title = "Daily Intake of Fruit/Vegetables",
y = "Proportion",
fill = "Group"
) +
theme_minimal()
analysis on the subset of respondents who consume fruit and vegetables
every day (fruit_freq == 7 and veg_freq == 7) and determine how many eat
5 servings a day
# Subset the data for respondents with daily intake of fruit and vegetables
subset_daily_intake <- subset(clean_data_exclude_invalid, fruit_freq == 7 & veg_freq == 7)
# Subset the data further for community gardeners and non-community gardeners
cg_daily_intake <- subset(subset_daily_intake, cg_new == 1)
non_cg_daily_intake <- subset(subset_daily_intake, cg_new == 0)
# Calculate the number of respondents eating 5 servings a day for each group
num_cg_5_servings <- sum(cg_daily_intake$comb_avrg_collapse == 1)
num_non_cg_5_servings <- sum(non_cg_daily_intake$comb_avrg_collapse == 1)
# Print the results
print(num_cg_5_servings)
## [1] 17
print(num_non_cg_5_servings)
## [1] 18
#There are 17 community gardeners within the subset of respondents who consume fruit and vegetables every day and also eat 5 servings a day.
#There are 18 non-community gardeners within the same subset who consume fruit and vegetables every day and also eat 5 servings a day.
#statistical signficant?
# Define the proportions
prop_cg <- 17 # Number of community gardeners eating 5 servings a day
prop_non_cg <- 18 # Number of non-community gardeners eating 5 servings a day
# Sample sizes for each group (assuming you have this information)
n_cg <- 31 # Total number of community gardeners in the subset
n_non_cg <- 71 # Total number of non-community gardeners in the subset
# Calculate the proportions
prop_cg <- prop_cg / n_cg
prop_non_cg <- prop_non_cg / n_non_cg
# Calculate the pooled proportion
p_pooled <- (prop_cg * n_cg + prop_non_cg * n_non_cg) / (n_cg + n_non_cg)
# Calculate the standard error
se <- sqrt(p_pooled * (1 - p_pooled) * (1 / n_cg + 1 / n_non_cg))
# Calculate the test statistic (z-score)
z <- (prop_cg - prop_non_cg) / se
# Calculate the p-value
p_value <- 2 * pnorm(-abs(z)) # 2-tailed test
# Print the test statistic and p-value
print(z)
## [1] 2.885116
print(p_value)
## [1] 0.00391269
#The result of the z-test indicates a z-score of approximately 2.885116 and a corresponding two-tailed p-value of approximately 0.00391269. significant difference in the proportions of individuals eating 5 servings of fruit and vegetables a day between community gardeners and non-community gardeners.
Number of cg and noncg with daily intake
# Number of community gardeners with daily intake
num_cg_daily_intake <- sum(clean_data_exclude_invalid$cg_new == 1 & clean_data_exclude_invalid$fruit_freq == 7 & clean_data_exclude_invalid$veg_freq == 7)
# Number of non-community gardeners with daily intake
num_non_cg_daily_intake <- sum(clean_data_exclude_invalid$cg_new == 0 & clean_data_exclude_invalid$fruit_freq == 7 & clean_data_exclude_invalid$veg_freq == 7)
# Print the results
print(num_cg_daily_intake)
## [1] 17
print(num_non_cg_daily_intake)
## [1] 22
plot
library(ggplot2)
# Create a data frame for plotting
data_plot <- data.frame(
Group = c("Community Gardeners", "Non-Community Gardeners"),
Proportion = c(prop_cg, prop_non_cg)
)
# Create the bar plot
ggplot(data_plot, aes(x = Group, y = Proportion, fill = Group)) +
geom_bar(stat = "identity", position = "dodge", width = 0.5) +
labs(
title = "Proportion of Individuals Eating 5 Servings of Fruit/Vegetables a Day",
y = "Proportion",
fill = "Group"
) +
theme_minimal()
Plot community and non-community gardeners who eat fruit veg every day
and also 5 servings
library(ggplot2)
# Data
community_gardeners <- 17 # Number of community gardeners eating 5 servings a day
non_community_gardeners <- 18 # Number of non-community gardeners eating 5 servings a day
# Total number of individuals in each group
total_cg <- 31 # Total number of community gardeners in the subset
total_non_cg <- 71 # Total number of non-community gardeners in the subset
# Calculate proportions
prop_cg_5_servings <- community_gardeners / total_cg
prop_non_cg_5_servings <- non_community_gardeners / total_non_cg
prop_cg_not_5_servings <- 1 - prop_cg_5_servings
prop_non_cg_not_5_servings <- 1 - prop_non_cg_5_servings
# Create a data frame for plotting
data <- data.frame(
Group = rep(c("Community Gardeners", "Non-Community Gardeners"), each = 2),
Status = rep(c("Five Servings", "Not Five Servings"), times = 2),
Proportion = c(prop_cg_5_servings, prop_cg_not_5_servings, prop_non_cg_5_servings, prop_non_cg_not_5_servings)
)
# Create the bar plot
ggplot(data, aes(x = Group, y = Proportion, fill = Status)) +
geom_bar(stat = "identity", position = "dodge") +
labs(
title = "Proportion of Individuals Eating Daily Fruit/Veg and 5 Servings a Day",
y = "Proportion",
fill = "Status"
) +
theme_minimal()
Average number of fruit and veg in group of cg and non cg with daily
intake
# Subset the data for individuals eating fruit/vegetables every day
daily_intake_data <- data[clean_data_exclude_invalid$fruit_freq == 7 & data$veg_freq == 7, ]
# Subset the data for community gardeners and calculate averages
cg_daily_intake_data <- daily_intake_data[daily_intake_data$cg_new == 1, ]
if (nrow(cg_daily_intake_data) > 0) {
avg_fruit_cg <- mean(cg_daily_intake_data$fruit_avrg, na.rm = TRUE)
avg_veg_cg <- mean(cg_daily_intake_data$veg_avrg, na.rm = TRUE)
avg_comb_cg <- mean(cg_daily_intake_data$comb_avrg, na.rm = TRUE)
} else {
avg_fruit_cg <- "No data"
avg_veg_cg <- "No data"
avg_comb_cg <- "No data"
}
# Subset the data for non-community gardeners and calculate averages
non_cg_daily_intake_data <- daily_intake_data[daily_intake_data$cg_new == 0, ]
if (nrow(non_cg_daily_intake_data) > 0) {
avg_fruit_non_cg <- mean(non_cg_daily_intake_data$fruit_avrg, na.rm = TRUE)
avg_veg_non_cg <- mean(non_cg_daily_intake_data$veg_avrg, na.rm = TRUE)
avg_comb_non_cg <- mean(non_cg_daily_intake_data$comb_avrg, na.rm = TRUE)
} else {
avg_fruit_non_cg <- "No data"
avg_veg_non_cg <- "No data"
avg_comb_non_cg <- "No data"
}
# Print the results
print("Community Gardeners:")
## [1] "Community Gardeners:"
print(paste("Average servings of fruit:", avg_fruit_cg))
## [1] "Average servings of fruit: No data"
print(paste("Average servings of vegetables:", avg_veg_cg))
## [1] "Average servings of vegetables: No data"
print(paste("Average combined servings:", avg_comb_cg))
## [1] "Average combined servings: No data"
print("Non-Community Gardeners:")
## [1] "Non-Community Gardeners:"
print(paste("Average servings of fruit:", avg_fruit_non_cg))
## [1] "Average servings of fruit: No data"
print(paste("Average servings of vegetables:", avg_veg_non_cg))
## [1] "Average servings of vegetables: No data"
print(paste("Average combined servings:", avg_comb_non_cg))
## [1] "Average combined servings: No data"
3. Descriptive stats for fruit/day and vegetable/day (cg yes / cg no) CG yes group Fruit consumption per day in the community gardeners
summary_stats_fruit_avrg <- clean_data_exclude_invalid %>%
filter(cg_new == 1) %>%
summarize(
mean_fruit_avrg = mean(fruit_avrg, na.rm = TRUE),
median_fruit_avrg = median(fruit_avrg, na.rm = TRUE),
min_fruit_avrg = min(fruit_avrg, na.rm = TRUE),
max_fruit_avrg = max(fruit_avrg, na.rm = TRUE),
sd_fruit_avrg = sd(fruit_avrg, na.rm = TRUE)
)
summary_stats_fruit_avrg
## # A tibble: 1 × 5
## mean_fruit_avrg median_fruit_avrg min_fruit_avrg max_fruit_avrg sd_fruit_avrg
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 4.66 3 1 14 3.19
Vegetable consumption per day cg yes group
summary_stats_veg_avrg <- clean_data_exclude_invalid %>%
filter(cg_new == 1) %>%
summarize(
mean_veg_avrg = mean(veg_avrg, na.rm = TRUE),
median_veg_avrg = median(veg_avrg, na.rm = TRUE),
min_veg_avrg = min(veg_avrg, na.rm = TRUE),
max_veg_avrg = max(veg_avrg, na.rm = TRUE),
sd_veg_avrg = sd(veg_avrg, na.rm = TRUE)
)
summary_stats_veg_avrg
## # A tibble: 1 × 5
## mean_veg_avrg median_veg_avrg min_veg_avrg max_veg_avrg sd_veg_avrg
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 9.76 5 2 52 11.5
Combined consumption per day cg yes group
summary_stats_comb_avrg <- clean_data_exclude_invalid %>%
filter(cg_new == 1) %>%
summarize(
mean_comb_avrg = mean(comb_avrg, na.rm = TRUE),
median_comb_avrg = median(comb_avrg, na.rm = TRUE),
min_comb_avrg = min(comb_avrg, na.rm = TRUE),
max_comb_avrg = max(comb_avrg, na.rm = TRUE),
sd_comb_avrg = sd(comb_avrg, na.rm = TRUE)
)
summary_stats_comb_avrg
## # A tibble: 1 × 5
## mean_comb_avrg median_comb_avrg min_comb_avrg max_comb_avrg sd_comb_avrg
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 14.4 10 4 59 12.8
CG no group Fruit consumption per day in the cg no group
summary_stats_fruit_avrg <- clean_data_exclude_invalid %>%
filter(cg_new == 0) %>%
summarize(
mean_fruit_avrg = mean(fruit_avrg, na.rm = TRUE),
median_fruit_avrg = median(fruit_avrg, na.rm = TRUE),
min_fruit_avrg = min(fruit_avrg, na.rm = TRUE),
max_fruit_avrg = max(fruit_avrg, na.rm = TRUE),
sd_fruit_avrg = sd(fruit_avrg, na.rm = TRUE)
)
summary_stats_fruit_avrg
## # A tibble: 1 × 5
## mean_fruit_avrg median_fruit_avrg min_fruit_avrg max_fruit_avrg sd_fruit_avrg
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 3.79 3 1 14 2.47
Vegetable consumption per day cg no group
summary_stats_veg_avrg <- clean_data_exclude_invalid %>%
filter(cg_new == 0) %>%
summarize(
mean_veg_avrg = mean(veg_avrg, na.rm = TRUE),
median_veg_avrg = median(veg_avrg, na.rm = TRUE),
min_veg_avrg = min(veg_avrg, na.rm = TRUE),
max_veg_avrg = max(veg_avrg, na.rm = TRUE),
sd_veg_avrg = sd(veg_avrg, na.rm = TRUE)
)
summary_stats_veg_avrg
## # A tibble: 1 × 5
## mean_veg_avrg median_veg_avrg min_veg_avrg max_veg_avrg sd_veg_avrg
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 6.36 5 1 25 4.21
Combined consumption per day cg no group
summary_stats_comb_avrg <- clean_data_exclude_invalid %>%
filter(cg_new == 0) %>%
summarize(
mean_comb_avrg = mean(comb_avrg, na.rm = TRUE),
median_comb_avrg = median(comb_avrg, na.rm = TRUE),
min_comb_avrg = min(comb_avrg, na.rm = TRUE),
max_comb_avrg = max(comb_avrg, na.rm = TRUE),
sd_comb_avrg = sd(comb_avrg, na.rm = TRUE)
)
summary_stats_comb_avrg
## # A tibble: 1 × 5
## mean_comb_avrg median_comb_avrg min_comb_avrg max_comb_avrg sd_comb_avrg
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 10.1 10 2 29 5.79
library(openxlsx)
## Warning: package 'openxlsx' was built under R version 4.2.2
write.xlsx(clean_data5, "clean_data5.xlsx")
Perform indpendent t-test to compare mean number of servings F/V between the two groups (cg vs ncg)
#variantie test
var.test(comb_avrg~cg_new, data=clean_data5)
##
## F test to compare two variances
##
## data: comb_avrg by cg_new
## F = 0.021255, num df = 70, denom df = 30, p-value < 2.2e-16
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
## 0.01107329 0.03781766
## sample estimates:
## ratio of variances
## 0.02125534
var.test(fruit_avrg~cg_new, data=clean_data5)
##
## F test to compare two variances
##
## data: fruit_avrg by cg_new
## F = 0.61539, num df = 70, denom df = 30, p-value = 0.09873
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
## 0.3205943 1.0948985
## sample estimates:
## ratio of variances
## 0.6153856
var.test(veg_avrg~cg_new, data = clean_data5)
##
## F test to compare two variances
##
## data: veg_avrg by cg_new
## F = 0.012228, num df = 70, denom df = 30, p-value < 2.2e-16
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
## 0.00637019 0.02175557
## sample estimates:
## ratio of variances
## 0.01222767
#equal variance is assumed, dus je kan geen t-test doen, wel Welsh' test
#p-value <0.05, dus you reject H0 = strong evidence to assume unequal variance
#t-test --> Welsh' test
#unequal variance, dus je vult in var_equal= FALSE
t.test(comb_avrg~cg_new, data=clean_data5,var.equal=FALSE)
##
## Welch Two Sample t-test
##
## data: comb_avrg by cg_new
## t = -1.4977, df = 30.558, p-value = 0.1445
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
## -26.180632 4.017979
## sample estimates:
## mean in group 0 mean in group 1
## 10.33803 21.41935
t.test(fruit_avrg~cg_new, data=clean_data5,var.equal=FALSE)
##
## Welch Two Sample t-test
##
## data: fruit_avrg by cg_new
## t = -1.4285, df = 46.838, p-value = 0.1598
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
## -2.1742651 0.3687222
## sample estimates:
## mean in group 0 mean in group 1
## 3.774648 4.677419
t.test(veg_avrg~cg_new, data=clean_data5,var.equal=FALSE)
##
## Welch Two Sample t-test
##
## data: veg_avrg by cg_new
## t = -1.3811, df = 30.321, p-value = 0.1773
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
## -25.22275 4.86564
## sample estimates:
## mean in group 0 mean in group 1
## 6.56338 16.74194
#p-value = geen statistisch significant verschil
# Separate data into two groups based on cg_new
group1 <- clean_data5$comb_avrg[clean_data5$cg_new == 0]
group2 <- clean_data5$comb_avrg[clean_data5$cg_new == 1]
# Perform Welch's t-test
result <- t.test(group1, group2, var.equal = FALSE)
# Print the result
print(result)
##
## Welch Two Sample t-test
##
## data: group1 and group2
## t = -1.4977, df = 30.558, p-value = 0.1445
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -26.180632 4.017979
## sample estimates:
## mean of x mean of y
## 10.33803 21.41935
Perform chi-square test to compare minimum intake of servings fruit/veg in cg vs non cg
# Create a contingency table
contingency_table <- table(clean_data5$cg_new, clean_data5$comb_avrg_collapse)
# Perform chi-square test of proportions
result <- prop.test(contingency_table)
## Warning in prop.test(contingency_table): Chi-squared approximation may be
## incorrect
# Print the result
print(result)
##
## 2-sample test for equality of proportions with continuity correction
##
## data: contingency_table
## X-squared = 1.2052, df = 1, p-value = 0.2723
## alternative hypothesis: two.sided
## 95 percent confidence interval:
## -0.04146538 0.25046129
## sample estimates:
## prop 1 prop 2
## 0.16901408 0.06451613
Perform Fisher test
# Perform Fisher's exact test
result_fisher <- fisher.test(contingency_table)
# Print the result
print(result_fisher)
##
## Fisher's Exact Test for Count Data
##
## data: contingency_table
## p-value = 0.2175
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
## 0.5902052 28.6167328
## sample estimates:
## odds ratio
## 2.923272
Now to the regression (inferential analysis)
lm_model_uni <- lm(comb_avrg ~ cg_new, data = clean_data_exclude_invalid)
# Step 4: Evaluate the Model
summary(lm_model_uni)
##
## Call:
## lm(formula = comb_avrg ~ cg_new, data = clean_data_exclude_invalid)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.414 -6.143 -1.143 2.722 44.586
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 10.143 1.011 10.036 <2e-16 ***
## cg_new 4.271 1.867 2.287 0.0244 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.456 on 97 degrees of freedom
## Multiple R-squared: 0.05117, Adjusted R-squared: 0.04139
## F-statistic: 5.231 on 1 and 97 DF, p-value: 0.02436
plot(lm_model_uni)
Now lets do multiple linear regression with our minimal sufficient
set
# Fit the multiple linear regression model
lm_model_multiple <- lm(comb_avrg ~ cg_new + age + gender + edu + sparet_w + ethnicity + home_gard, data = clean_data_exclude_invalid)
# Summarize the model
summary(lm_model_multiple)
##
## Call:
## lm(formula = comb_avrg ~ cg_new + age + gender + edu + sparet_w +
## ethnicity + home_gard, data = clean_data_exclude_invalid)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.891 -5.245 -1.560 2.844 43.036
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 14.24721 10.58627 1.346 0.1817
## cg_new 3.16189 2.32757 1.358 0.1777
## age -0.03155 0.06942 -0.454 0.6506
## gender -0.80710 1.64303 -0.491 0.6244
## edu 0.76888 1.34351 0.572 0.5685
## sparet_w 0.03158 0.04758 0.664 0.5086
## ethnicity 0.14697 0.58618 0.251 0.8026
## home_gard -3.74672 2.00456 -1.869 0.0648 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.462 on 91 degrees of freedom
## Multiple R-squared: 0.1087, Adjusted R-squared: 0.04009
## F-statistic: 1.585 on 7 and 91 DF, p-value: 0.1499
# Install and load the broom package
# Install the latest version of the cli package from CRAN
#install.packages("cli") # Install or update the cli package
# Load the updated cli package
#library(cli)
#install.packages("broom")
#library(broom)
#model_multiple_diagnostic <- augment(lm_model_multiple)
#plot(lm_model_multiple)
# Extract residuals
residuals <- residuals(lm_model_multiple)
# Extract fitted values
fitted_values <- fitted(lm_model_multiple)
# Extract leverage
leverage <- influence.measures(lm_model_multiple)$hat
clean_data_exclude_invalid[c(82,25,83), ]
## # A tibble: 3 × 18
## record_id woonplaats cg_new home_gard supermarket fruit_freq veg_freq age
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 140 2 1 1 5 7 7 26
## 2 45 2 1 1 4 7 7 54
## 3 141 1 1 1 5 7 7 65
## # ℹ 10 more variables: gender <dbl>, demo_weight <dbl>, height <dbl>,
## # ethnicity <dbl>, edu <dbl>, fruit_avrg <dbl>, veg_avrg <dbl>,
## # comb_avrg <dbl>, sparet_w <dbl>, comb_avrg_collapse <dbl>
#fit model again, use log_transform the outcome
lm_model_multiple_log <- lm(log(comb_avrg) ~ cg_new + age + gender + edu + sparet_w + ethnicity + home_gard, data = clean_data_exclude_invalid)
summary(lm_model_multiple_log)
##
## Call:
## lm(formula = log(comb_avrg) ~ cg_new + age + gender + edu + sparet_w +
## ethnicity + home_gard, data = clean_data_exclude_invalid)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.1964 -0.4874 -0.0147 0.4022 1.6671
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.086495 0.772369 2.701 0.00824 **
## cg_new 0.101424 0.169818 0.597 0.55182
## age 0.001610 0.005065 0.318 0.75126
## gender -0.015350 0.119875 -0.128 0.89839
## edu 0.107237 0.098022 1.094 0.27684
## sparet_w 0.002196 0.003471 0.633 0.52852
## ethnicity -0.019960 0.042768 -0.467 0.64182
## home_gard -0.299969 0.146252 -2.051 0.04314 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6174 on 91 degrees of freedom
## Multiple R-squared: 0.1345, Adjusted R-squared: 0.06796
## F-statistic: 2.021 on 7 and 91 DF, p-value: 0.06077
plot(lm_model_multiple_log)
#library(tidyverse)
#install.packages("broom")
#library(broom)
#model_diagnostic <-augment(lm_model_multiple_log)
#library(ggplot2)
# check for normality
##model_diagnostic %>%
# ggplot(aes(.resid))+
#geom_histogram(binwidth = 0.05, colour="white")
##shapiro.test(model_diagnostic$.resid)
# p-value >0.05 -> not deviate from normality
summary(lm_model_multiple_log)
##
## Call:
## lm(formula = log(comb_avrg) ~ cg_new + age + gender + edu + sparet_w +
## ethnicity + home_gard, data = clean_data_exclude_invalid)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.1964 -0.4874 -0.0147 0.4022 1.6671
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.086495 0.772369 2.701 0.00824 **
## cg_new 0.101424 0.169818 0.597 0.55182
## age 0.001610 0.005065 0.318 0.75126
## gender -0.015350 0.119875 -0.128 0.89839
## edu 0.107237 0.098022 1.094 0.27684
## sparet_w 0.002196 0.003471 0.633 0.52852
## ethnicity -0.019960 0.042768 -0.467 0.64182
## home_gard -0.299969 0.146252 -2.051 0.04314 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6174 on 91 degrees of freedom
## Multiple R-squared: 0.1345, Adjusted R-squared: 0.06796
## F-statistic: 2.021 on 7 and 91 DF, p-value: 0.06077
p-value for cg_new = 0.55 >0.05 -> not significant adjusted R squared = 0.068 => goodness of fit is very poor (I think we should include other variables such as BMI, city…)
clean_data_exclude_invalid
## # A tibble: 99 × 18
## record_id woonplaats cg_new home_gard supermarket fruit_freq veg_freq age
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 12 2 0 2 4 1 6 34
## 2 14 1 0 2 4 72 7 33
## 3 15 1 0 2 4 7 7 32
## 4 16 2 0 2 5 6 7 35
## 5 17 2 0 2 4 3 6 41
## 6 18 2 0 2 3 4 7 23
## 7 19 1 0 2 5 7 7 51
## 8 20 1 0 2 5 5 5 58
## 9 22 2 1 1 5 5 3 68
## 10 23 2 0 2 4 5 7 55
## # ℹ 89 more rows
## # ℹ 10 more variables: gender <dbl>, demo_weight <dbl>, height <dbl>,
## # ethnicity <dbl>, edu <dbl>, fruit_avrg <dbl>, veg_avrg <dbl>,
## # comb_avrg <dbl>, sparet_w <dbl>, comb_avrg_collapse <dbl>
logistic_model <- glm(comb_avrg_collapse ~ cg_new + age + gender + edu + sparet_w + ethnicity + home_gard, data = clean_data_exclude_invalid, family = "binomial")
summary(logistic_model)
##
## Call:
## glm(formula = comb_avrg_collapse ~ cg_new + age + gender + edu +
## sparet_w + ethnicity + home_gard, family = "binomial", data = clean_data_exclude_invalid)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.6624 0.1364 0.3096 0.5662 1.4723
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.43604 3.72201 0.386 0.6996
## cg_new 0.16239 0.98659 0.165 0.8693
## age 0.05142 0.02654 1.937 0.0527 .
## gender -0.01577 0.68186 -0.023 0.9816
## edu 0.59511 0.42782 1.391 0.1642
## sparet_w -0.03517 0.02012 -1.748 0.0805 .
## ethnicity -0.27602 0.19164 -1.440 0.1498
## home_gard -1.58803 0.92944 -1.709 0.0875 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 80.689 on 98 degrees of freedom
## Residual deviance: 62.430 on 91 degrees of freedom
## AIC: 78.43
##
## Number of Fisher Scoring iterations: 6
still not significant