pacman::p_load(
  tidyverse,      # includes ggplot2 and other
  stringr,        # working with characters   
  scales,         # transform numbers
  ggrepel,        # smartly-placed labels
  gghighlight,    # highlight one part of plot
  RColorBrewer,    # color scales
  tinytex,
  DAAG
)
## The following packages have been unloaded:
## tidyverse
## Installing package into 'C:/Users/naima/AppData/Local/R/win-library/4.2'
## (as 'lib' is unspecified)
## also installing the dependency 'cli'
## Warning: unable to access index for repository http://www.stats.ox.ac.uk/pub/RWin/bin/windows/contrib/4.2:
##   cannot open URL 'http://www.stats.ox.ac.uk/pub/RWin/bin/windows/contrib/4.2/PACKAGES'
## package 'cli' successfully unpacked and MD5 sums checked
## Warning: cannot remove prior installation of package 'cli'
## Warning in file.copy(savedcopy, lib, recursive = TRUE): problem copying
## C:\Users\naima\AppData\Local\R\win-library\4.2\00LOCK\cli\libs\x64\cli.dll to
## C:\Users\naima\AppData\Local\R\win-library\4.2\cli\libs\x64\cli.dll: Permission
## denied
## Warning: restored 'cli'
## package 'tidyverse' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\naima\AppData\Local\Temp\Rtmpcncgqx\downloaded_packages
## 
## tidyverse installed
## Warning: package 'tidyverse' was built under R version 4.2.3
## Warning: package 'tibble' was built under R version 4.2.3
## Warning: package 'tidyr' was built under R version 4.2.3
## Warning in pacman::p_load(tidyverse, stringr, scales, ggrepel, gghighlight, : Failed to install/load:
## tidyverse

the data that we have

# (Tu): I changed the initial dataset
install.packages("readxl")
## Installing package into 'C:/Users/naima/AppData/Local/R/win-library/4.2'
## (as 'lib' is unspecified)
## Warning: unable to access index for repository http://www.stats.ox.ac.uk/pub/RWin/bin/windows/contrib/4.2:
##   cannot open URL 'http://www.stats.ox.ac.uk/pub/RWin/bin/windows/contrib/4.2/PACKAGES'
## package 'readxl' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\naima\AppData\Local\Temp\Rtmpcncgqx\downloaded_packages
library(readxl)
## Warning: package 'readxl' was built under R version 4.2.3
#set working directory (path) and read dataset 
setwd("C:/Users/naima/OneDrive/Documenten/R_studio_files/ARM_assignement")
clean_data2 <- read_excel("~/R_studio_files/ARM_assignement/clean_data_Tu.xlsx")
#clean_data2<-read_excel("C:/Users/HA TU/OneDrive - Universiteit Antwerpen/Uantwerp study/semester 1.2/ARM/cleaning data/data exported from REDCAp/clean_data_Tu.xlsx")

head(clean_data2)
## # A tibble: 6 × 37
##   record_id woonplaats cg_participate cg_time cg_freq cg_new home_gard
##       <dbl>      <dbl>          <dbl>   <dbl>   <dbl>  <dbl>     <dbl>
## 1        12          2              2      NA      NA      0         2
## 2        14          1              2      NA      NA      0         2
## 3        15          1              2      NA      NA      0         2
## 4        16          2              2      NA      NA      0         2
## 5        17          2              2      NA      NA      0         2
## 6        18          2              2      NA      NA      0         2
## # ℹ 30 more variables: supermarket <dbl>, sparet1 <dbl>, sparet2 <dbl>,
## #   fruit_freq <dbl>, fruit_medium <dbl>, fruit_small <dbl>,
## #   fruit_groot_schijf <dbl>, fruit_big <chr>, fruit_handful <dbl>,
## #   fruit_small2 <dbl>, fruit_handful_cherish <dbl>, fuit_grapes <dbl>,
## #   fruit_lychees <dbl>, fruit_salad <dbl>, fruit_juice <dbl>, veg_freq <dbl>,
## #   veg_cooked <dbl>, veg_cooked_spoon <dbl>, veg_green_brocolli_cauli <dbl>,
## #   veg_legume <dbl>, veg_lettuce <dbl>, veg_raw <dbl>, veg_juice <dbl>, …

Descriptive Statistics

Inferencial Statistics Exposure: We need two groups CG-participats only if for more than 6 months and more than once a week

Outcome: On an average day not average week (exclude days per week)

Sum: fruit and vegetables

install.packages("magrittr") # package installations are only needed the first time you use it
## Installing package into 'C:/Users/naima/AppData/Local/R/win-library/4.2'
## (as 'lib' is unspecified)
## Warning: unable to access index for repository http://www.stats.ox.ac.uk/pub/RWin/bin/windows/contrib/4.2:
##   cannot open URL 'http://www.stats.ox.ac.uk/pub/RWin/bin/windows/contrib/4.2/PACKAGES'
## package 'magrittr' successfully unpacked and MD5 sums checked
## Warning: cannot remove prior installation of package 'magrittr'
## Warning in file.copy(savedcopy, lib, recursive = TRUE): problem copying
## C:\Users\naima\AppData\Local\R\win-library\4.2\00LOCK\magrittr\libs\x64\magrittr.dll
## to
## C:\Users\naima\AppData\Local\R\win-library\4.2\magrittr\libs\x64\magrittr.dll:
## Permission denied
## Warning: restored 'magrittr'
## 
## The downloaded binary packages are in
##  C:\Users\naima\AppData\Local\Temp\Rtmpcncgqx\downloaded_packages
install.packages("dplyr")    # alternative installation of the %>%
## Installing package into 'C:/Users/naima/AppData/Local/R/win-library/4.2'
## (as 'lib' is unspecified)
## Warning: unable to access index for repository http://www.stats.ox.ac.uk/pub/RWin/bin/windows/contrib/4.2:
##   cannot open URL 'http://www.stats.ox.ac.uk/pub/RWin/bin/windows/contrib/4.2/PACKAGES'
## package 'dplyr' successfully unpacked and MD5 sums checked
## Warning: cannot remove prior installation of package 'dplyr'
## Warning in file.copy(savedcopy, lib, recursive = TRUE): problem copying
## C:\Users\naima\AppData\Local\R\win-library\4.2\00LOCK\dplyr\libs\x64\dplyr.dll
## to C:\Users\naima\AppData\Local\R\win-library\4.2\dplyr\libs\x64\dplyr.dll:
## Permission denied
## Warning: restored 'dplyr'
## 
## The downloaded binary packages are in
##  C:\Users\naima\AppData\Local\Temp\Rtmpcncgqx\downloaded_packages
library(magrittr) # needs to be run every time you start R and want to use %>%
## Warning: package 'magrittr' was built under R version 4.2.3
library(dplyr)    # alternatively, this also loads %>%
## Warning: package 'dplyr' was built under R version 4.2.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
min_max_table <- sapply(clean_data2, function(x) c(Min = min(x, na.rm = TRUE), Max = max(x, na.rm = TRUE)))
min_max_table <- as.data.frame(min_max_table)
min_max_table
##     record_id woonplaats cg_participate cg_time cg_freq cg_new home_gard
## Min        12          1              1       1       1      0         1
## Max       160          2              2       2       2      1         2
##     supermarket sparet1 sparet2 fruit_freq fruit_medium fruit_small
## Min           2       0       1          0            0           0
## Max           5      24      24         72            5           4
##     fruit_groot_schijf fruit_big fruit_handful fruit_small2
## Min                  0         0             0            0
## Max                  2         3             7            2
##     fruit_handful_cherish fuit_grapes fruit_lychees fruit_salad fruit_juice
## Min                     0           0             0           0           0
## Max                     2           5             1           4           7
##     veg_freq veg_cooked veg_cooked_spoon veg_green_brocolli_cauli veg_legume
## Min        3          0                0                        0          0
## Max      715        222               20                        5          5
##     veg_lettuce veg_raw veg_juice age gender demo_weight height ethnicity edu
## Min           0       0         0  20      1          43    1.7         1   1
## Max           8       7         7  81      4         105    190         8   5
##     urban_harvest_q_complete
## Min                        0
## Max                        2
# <Tu> extract these record with invalid values
clean_data2 %>% 
  filter(veg_freq==715|veg_cooked==222|sparet1==24|sparet2==24)
## # A tibble: 3 × 37
##   record_id woonplaats cg_participate cg_time cg_freq cg_new home_gard
##       <dbl>      <dbl>          <dbl>   <dbl>   <dbl>  <dbl>     <dbl>
## 1        71          1              2      NA      NA      0         2
## 2        96          2              1       2       1      1         1
## 3       128          2              1       2       1      1         1
## # ℹ 30 more variables: supermarket <dbl>, sparet1 <dbl>, sparet2 <dbl>,
## #   fruit_freq <dbl>, fruit_medium <dbl>, fruit_small <dbl>,
## #   fruit_groot_schijf <dbl>, fruit_big <chr>, fruit_handful <dbl>,
## #   fruit_small2 <dbl>, fruit_handful_cherish <dbl>, fuit_grapes <dbl>,
## #   fruit_lychees <dbl>, fruit_salad <dbl>, fruit_juice <dbl>, veg_freq <dbl>,
## #   veg_cooked <dbl>, veg_cooked_spoon <dbl>, veg_green_brocolli_cauli <dbl>,
## #   veg_legume <dbl>, veg_lettuce <dbl>, veg_raw <dbl>, veg_juice <dbl>, …
# <Tu> As these values are on key covariates, I will exclude these records (71, 96, 128) later in the analysis

Now to collapse the fruit variables to a new variable called fruit_avrg (for that I needed to change all these variables to numeric)

fruit_variables <- c("fruit_medium", "fruit_small", "fruit_groot_schijf", "fruit_big", "fruit_handful", 
                     "fruit_small2", "fruit_handful_cherish", "fuit_grapes", "fruit_lychees", "fruit_salad", 
                     "fruit_juice")

# Convert the variables to numeric
clean_data2[, fruit_variables] <- lapply(clean_data2[, fruit_variables], as.numeric)

clean_data2$fruit_avrg <- rowSums(clean_data2[, fruit_variables], na.rm = TRUE)
head(clean_data2)
## # A tibble: 6 × 38
##   record_id woonplaats cg_participate cg_time cg_freq cg_new home_gard
##       <dbl>      <dbl>          <dbl>   <dbl>   <dbl>  <dbl>     <dbl>
## 1        12          2              2      NA      NA      0         2
## 2        14          1              2      NA      NA      0         2
## 3        15          1              2      NA      NA      0         2
## 4        16          2              2      NA      NA      0         2
## 5        17          2              2      NA      NA      0         2
## 6        18          2              2      NA      NA      0         2
## # ℹ 31 more variables: supermarket <dbl>, sparet1 <dbl>, sparet2 <dbl>,
## #   fruit_freq <dbl>, fruit_medium <dbl>, fruit_small <dbl>,
## #   fruit_groot_schijf <dbl>, fruit_big <dbl>, fruit_handful <dbl>,
## #   fruit_small2 <dbl>, fruit_handful_cherish <dbl>, fuit_grapes <dbl>,
## #   fruit_lychees <dbl>, fruit_salad <dbl>, fruit_juice <dbl>, veg_freq <dbl>,
## #   veg_cooked <dbl>, veg_cooked_spoon <dbl>, veg_green_brocolli_cauli <dbl>,
## #   veg_legume <dbl>, veg_lettuce <dbl>, veg_raw <dbl>, veg_juice <dbl>, …

Now the same for vegetables (new variable called veg_avrg)

veg_variables <- c("veg_cooked", "veg_cooked_spoon", "veg_green_brocolli_cauli", "veg_legume", "veg_lettuce", 
                     "veg_raw", "veg_juice")

# Convert the variables to numeric
clean_data2[, veg_variables] <- lapply(clean_data2[, veg_variables], as.numeric)

clean_data2$veg_avrg <- rowSums(clean_data2[, veg_variables], na.rm = TRUE)

head(clean_data2)
## # A tibble: 6 × 39
##   record_id woonplaats cg_participate cg_time cg_freq cg_new home_gard
##       <dbl>      <dbl>          <dbl>   <dbl>   <dbl>  <dbl>     <dbl>
## 1        12          2              2      NA      NA      0         2
## 2        14          1              2      NA      NA      0         2
## 3        15          1              2      NA      NA      0         2
## 4        16          2              2      NA      NA      0         2
## 5        17          2              2      NA      NA      0         2
## 6        18          2              2      NA      NA      0         2
## # ℹ 32 more variables: supermarket <dbl>, sparet1 <dbl>, sparet2 <dbl>,
## #   fruit_freq <dbl>, fruit_medium <dbl>, fruit_small <dbl>,
## #   fruit_groot_schijf <dbl>, fruit_big <dbl>, fruit_handful <dbl>,
## #   fruit_small2 <dbl>, fruit_handful_cherish <dbl>, fuit_grapes <dbl>,
## #   fruit_lychees <dbl>, fruit_salad <dbl>, fruit_juice <dbl>, veg_freq <dbl>,
## #   veg_cooked <dbl>, veg_cooked_spoon <dbl>, veg_green_brocolli_cauli <dbl>,
## #   veg_legume <dbl>, veg_lettuce <dbl>, veg_raw <dbl>, veg_juice <dbl>, …

Now to remove the single variables that are not needed anymore

clean_data3 <- subset(clean_data2, select = -c(fruit_medium, fruit_small, fruit_groot_schijf, fruit_big, fruit_handful, 
                     fruit_small2, fruit_handful_cherish, fuit_grapes, fruit_lychees, fruit_salad, 
                     fruit_juice, veg_cooked, veg_cooked_spoon, veg_green_brocolli_cauli, veg_legume, veg_lettuce, 
                     veg_raw, veg_juice, urban_harvest_q_complete
                     ))
head(clean_data3)
## # A tibble: 6 × 20
##   record_id woonplaats cg_participate cg_time cg_freq cg_new home_gard
##       <dbl>      <dbl>          <dbl>   <dbl>   <dbl>  <dbl>     <dbl>
## 1        12          2              2      NA      NA      0         2
## 2        14          1              2      NA      NA      0         2
## 3        15          1              2      NA      NA      0         2
## 4        16          2              2      NA      NA      0         2
## 5        17          2              2      NA      NA      0         2
## 6        18          2              2      NA      NA      0         2
## # ℹ 13 more variables: supermarket <dbl>, sparet1 <dbl>, sparet2 <dbl>,
## #   fruit_freq <dbl>, veg_freq <dbl>, age <dbl>, gender <dbl>,
## #   demo_weight <dbl>, height <dbl>, ethnicity <dbl>, edu <dbl>,
## #   fruit_avrg <dbl>, veg_avrg <dbl>

Now to add the combined variable of fruit and vegetables comb_avrg

comb_variables <- c("fruit_avrg", "veg_avrg")

# Convert the variables to numeric
clean_data3[, comb_variables] <- lapply(clean_data3[, comb_variables], as.numeric)

clean_data3$comb_avrg <- rowSums(clean_data3[, comb_variables], na.rm = TRUE)

head(clean_data3)
## # A tibble: 6 × 21
##   record_id woonplaats cg_participate cg_time cg_freq cg_new home_gard
##       <dbl>      <dbl>          <dbl>   <dbl>   <dbl>  <dbl>     <dbl>
## 1        12          2              2      NA      NA      0         2
## 2        14          1              2      NA      NA      0         2
## 3        15          1              2      NA      NA      0         2
## 4        16          2              2      NA      NA      0         2
## 5        17          2              2      NA      NA      0         2
## 6        18          2              2      NA      NA      0         2
## # ℹ 14 more variables: supermarket <dbl>, sparet1 <dbl>, sparet2 <dbl>,
## #   fruit_freq <dbl>, veg_freq <dbl>, age <dbl>, gender <dbl>,
## #   demo_weight <dbl>, height <dbl>, ethnicity <dbl>, edu <dbl>,
## #   fruit_avrg <dbl>, veg_avrg <dbl>, comb_avrg <dbl>

Now to collabs the spare time variables (they will remain in clean_data5) Matteo: I will go with weekly here (to make it dayly it just needs to be divided by 7) Tu: I add new variable comb_avrg_collapse to catogorize the outcome

library(dplyr)

clean_data4 <- clean_data3 %>%
  mutate(sparet_w = sparet1 * 5 + sparet2 * 2,
         comb_avrg_collapse = ifelse(comb_avrg>=5,1,0))

Now remove the unnecesarry variables

clean_data5 <- subset(clean_data4, select = -c(sparet1, sparet2,
                                               cg_time, cg_freq,cg_participate))
                                               
head(clean_data5)
## # A tibble: 6 × 18
##   record_id woonplaats cg_new home_gard supermarket fruit_freq veg_freq   age
##       <dbl>      <dbl>  <dbl>     <dbl>       <dbl>      <dbl>    <dbl> <dbl>
## 1        12          2      0         2           4          1        6    34
## 2        14          1      0         2           4         72        7    33
## 3        15          1      0         2           4          7        7    32
## 4        16          2      0         2           5          6        7    35
## 5        17          2      0         2           4          3        6    41
## 6        18          2      0         2           3          4        7    23
## # ℹ 10 more variables: gender <dbl>, demo_weight <dbl>, height <dbl>,
## #   ethnicity <dbl>, edu <dbl>, fruit_avrg <dbl>, veg_avrg <dbl>,
## #   comb_avrg <dbl>, sparet_w <dbl>, comb_avrg_collapse <dbl>
# exclude record with invalid values
clean_data_exclude_invalid <- clean_data5 %>% 
  filter(record_id!=71 &
           record_id!=96 &
           record_id!=128)

Now the data is ready for the analysis

Descriptive Statistics add proportions for question 1,2 1. How many participates eat 5 or more portions of fruit or vegetable per day? 2. How many participants eat fruit/vegeatable every of the week? 3. Descriptive stats for fruit/day and vegetable/day (cg yes / cg no) 4. Do they meet the recommendation comparing cg yes and cg no

lets go

1. How many participates eat 5 or more portions of fruit or vegetable per day?

participants_comb_avg_5_or_more <- clean_data_exclude_invalid %>%
  filter(comb_avrg >= 5) %>%
  nrow()

participants_comb_avg_5_or_more
## [1] 85
# Tu: add proportions
participants_comb_avg_5_or_more/ nrow(clean_data_exclude_invalid)*100
## [1] 85.85859

2. How many participants eat fruit/vegeatable every of day the week? - How many participants eat fruit every day of the week?

sum(clean_data_exclude_invalid$fruit_freq == 7)
## [1] 46
sum(clean_data_exclude_invalid$veg_freq == 7)
## [1] 71
sum(clean_data_exclude_invalid$fruit_freq == 7 | clean_data_exclude_invalid$veg_freq == 7)
## [1] 78
sum(clean_data_exclude_invalid$fruit_freq == 7 & clean_data_exclude_invalid$veg_freq == 7)
## [1] 39

`` Compare daily intake of fruit/veg in cg vs ncg

# Subset the data for community gardeners and non-community gardeners
community_gardeners <- subset(clean_data_exclude_invalid, cg_new == 1)
non_community_gardeners <- subset(clean_data_exclude_invalid, cg_new == 0)

# Calculate the proportion of community gardeners and non-community gardeners meeting the criteria
prop_community_gardeners <- mean(community_gardeners$fruit_freq == 7 & community_gardeners$veg_freq == 7)
prop_non_community_gardeners <- mean(non_community_gardeners$fruit_freq == 7 & non_community_gardeners$veg_freq == 7)

# Print the proportions
print(prop_community_gardeners)
## [1] 0.5862069
print(prop_non_community_gardeners)
## [1] 0.3142857

significance in proportion who have a daily intake in fruit and veg

# Sample sizes for each group (assuming you have this information)
n_community_gardeners <-31
n_non_community_gardeners <- 71

# Calculate the pooled proportion
p_pooled <- (prop_community_gardeners * n_community_gardeners + prop_non_community_gardeners * n_non_community_gardeners) / (n_community_gardeners + n_non_community_gardeners)

# Calculate the standard error
se <- sqrt(p_pooled * (1 - p_pooled) * (1 / n_community_gardeners + 1 / n_non_community_gardeners))

# Calculate the test statistic (z-score)
z <- (prop_community_gardeners - prop_non_community_gardeners) / se

# Calculate the p-value
p_value <- 2 * pnorm(-abs(z))  # 2-tailed test

# Print the test statistic and p-value
print(z)
## [1] 2.58174
print(p_value)
## [1] 0.009830371

plot daily intake cg vs non-cg

library(ggplot2)

# Data
prop_community_gardeners <- 0.5862069
prop_non_community_gardeners <- 0.3142857

# Create a data frame for plotting
data <- data.frame(
  Group = c("Community Gardeners", "Non-Community Gardeners"),
  Proportion = c(prop_community_gardeners, prop_non_community_gardeners)
)

# Create the bar plot
ggplot(data, aes(x = Group, y = Proportion, fill = Group)) +
  geom_bar(stat = "identity", position = "dodge", width = 0.5) +
  labs(
    title = "Daily Intake of Fruit/Vegetables",
    y = "Proportion",
    fill = "Group"
  ) +
  theme_minimal()

analysis on the subset of respondents who consume fruit and vegetables every day (fruit_freq == 7 and veg_freq == 7) and determine how many eat 5 servings a day

# Subset the data for respondents with daily intake of fruit and vegetables
subset_daily_intake <- subset(clean_data_exclude_invalid, fruit_freq == 7 & veg_freq == 7)

# Subset the data further for community gardeners and non-community gardeners
cg_daily_intake <- subset(subset_daily_intake, cg_new == 1)
non_cg_daily_intake <- subset(subset_daily_intake, cg_new == 0)

# Calculate the number of respondents eating 5 servings a day for each group
num_cg_5_servings <- sum(cg_daily_intake$comb_avrg_collapse == 1)
num_non_cg_5_servings <- sum(non_cg_daily_intake$comb_avrg_collapse == 1)

# Print the results
print(num_cg_5_servings)
## [1] 17
print(num_non_cg_5_servings)
## [1] 18
#There are 17 community gardeners within the subset of respondents who consume fruit and vegetables every day and also eat 5 servings a day.
#There are 18 non-community gardeners within the same subset who consume fruit and vegetables every day and also eat 5 servings a day.



#statistical signficant?
# Define the proportions
prop_cg <- 17  # Number of community gardeners eating 5 servings a day
prop_non_cg <- 18  # Number of non-community gardeners eating 5 servings a day

# Sample sizes for each group (assuming you have this information)
n_cg <- 31  # Total number of community gardeners in the subset
n_non_cg <- 71  # Total number of non-community gardeners in the subset

# Calculate the proportions
prop_cg <- prop_cg / n_cg
prop_non_cg <- prop_non_cg / n_non_cg

# Calculate the pooled proportion
p_pooled <- (prop_cg * n_cg + prop_non_cg * n_non_cg) / (n_cg + n_non_cg)

# Calculate the standard error
se <- sqrt(p_pooled * (1 - p_pooled) * (1 / n_cg + 1 / n_non_cg))

# Calculate the test statistic (z-score)
z <- (prop_cg - prop_non_cg) / se

# Calculate the p-value
p_value <- 2 * pnorm(-abs(z))  # 2-tailed test

# Print the test statistic and p-value
print(z)
## [1] 2.885116
print(p_value)
## [1] 0.00391269
#The result of the z-test indicates a z-score of approximately 2.885116 and a corresponding two-tailed p-value of approximately 0.00391269. significant difference in the proportions of individuals eating 5 servings of fruit and vegetables a day between community gardeners and non-community gardeners. 

Number of cg and noncg with daily intake

# Number of community gardeners with daily intake
num_cg_daily_intake <- sum(clean_data_exclude_invalid$cg_new == 1 & clean_data_exclude_invalid$fruit_freq == 7 & clean_data_exclude_invalid$veg_freq == 7)

# Number of non-community gardeners with daily intake
num_non_cg_daily_intake <- sum(clean_data_exclude_invalid$cg_new == 0 & clean_data_exclude_invalid$fruit_freq == 7 & clean_data_exclude_invalid$veg_freq == 7)

# Print the results
print(num_cg_daily_intake)
## [1] 17
print(num_non_cg_daily_intake)
## [1] 22

plot

library(ggplot2)

# Create a data frame for plotting
data_plot <- data.frame(
  Group = c("Community Gardeners", "Non-Community Gardeners"),
  Proportion = c(prop_cg, prop_non_cg)
)

# Create the bar plot
ggplot(data_plot, aes(x = Group, y = Proportion, fill = Group)) +
  geom_bar(stat = "identity", position = "dodge", width = 0.5) +
  labs(
    title = "Proportion of Individuals Eating 5 Servings of Fruit/Vegetables a Day",
    y = "Proportion",
    fill = "Group"
  ) +
  theme_minimal()

Plot community and non-community gardeners who eat fruit veg every day and also 5 servings

library(ggplot2)

# Data
community_gardeners <- 17  # Number of community gardeners eating 5 servings a day
non_community_gardeners <- 18  # Number of non-community gardeners eating 5 servings a day

# Total number of individuals in each group
total_cg <- 31  # Total number of community gardeners in the subset
total_non_cg <- 71  # Total number of non-community gardeners in the subset

# Calculate proportions
prop_cg_5_servings <- community_gardeners / total_cg
prop_non_cg_5_servings <- non_community_gardeners / total_non_cg
prop_cg_not_5_servings <- 1 - prop_cg_5_servings
prop_non_cg_not_5_servings <- 1 - prop_non_cg_5_servings

# Create a data frame for plotting
data <- data.frame(
  Group = rep(c("Community Gardeners", "Non-Community Gardeners"), each = 2),
  Status = rep(c("Five Servings", "Not Five Servings"), times = 2),
  Proportion = c(prop_cg_5_servings, prop_cg_not_5_servings, prop_non_cg_5_servings, prop_non_cg_not_5_servings)
)

# Create the bar plot
ggplot(data, aes(x = Group, y = Proportion, fill = Status)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(
    title = "Proportion of Individuals Eating Daily Fruit/Veg and 5 Servings a Day",
    y = "Proportion",
    fill = "Status"
  ) +
  theme_minimal()

Average number of fruit and veg in group of cg and non cg with daily intake

# Subset the data for individuals eating fruit/vegetables every day
daily_intake_data <- data[clean_data_exclude_invalid$fruit_freq == 7 & data$veg_freq == 7, ]

# Subset the data for community gardeners and calculate averages
cg_daily_intake_data <- daily_intake_data[daily_intake_data$cg_new == 1, ]
if (nrow(cg_daily_intake_data) > 0) {
  avg_fruit_cg <- mean(cg_daily_intake_data$fruit_avrg, na.rm = TRUE)
  avg_veg_cg <- mean(cg_daily_intake_data$veg_avrg, na.rm = TRUE)
  avg_comb_cg <- mean(cg_daily_intake_data$comb_avrg, na.rm = TRUE)
} else {
  avg_fruit_cg <- "No data"
  avg_veg_cg <- "No data"
  avg_comb_cg <- "No data"
}

# Subset the data for non-community gardeners and calculate averages
non_cg_daily_intake_data <- daily_intake_data[daily_intake_data$cg_new == 0, ]
if (nrow(non_cg_daily_intake_data) > 0) {
  avg_fruit_non_cg <- mean(non_cg_daily_intake_data$fruit_avrg, na.rm = TRUE)
  avg_veg_non_cg <- mean(non_cg_daily_intake_data$veg_avrg, na.rm = TRUE)
  avg_comb_non_cg <- mean(non_cg_daily_intake_data$comb_avrg, na.rm = TRUE)
} else {
  avg_fruit_non_cg <- "No data"
  avg_veg_non_cg <- "No data"
  avg_comb_non_cg <- "No data"
}

# Print the results
print("Community Gardeners:")
## [1] "Community Gardeners:"
print(paste("Average servings of fruit:", avg_fruit_cg))
## [1] "Average servings of fruit: No data"
print(paste("Average servings of vegetables:", avg_veg_cg))
## [1] "Average servings of vegetables: No data"
print(paste("Average combined servings:", avg_comb_cg))
## [1] "Average combined servings: No data"
print("Non-Community Gardeners:")
## [1] "Non-Community Gardeners:"
print(paste("Average servings of fruit:", avg_fruit_non_cg))
## [1] "Average servings of fruit: No data"
print(paste("Average servings of vegetables:", avg_veg_non_cg))
## [1] "Average servings of vegetables: No data"
print(paste("Average combined servings:", avg_comb_non_cg))
## [1] "Average combined servings: No data"

3. Descriptive stats for fruit/day and vegetable/day (cg yes / cg no) CG yes group Fruit consumption per day in the community gardeners

summary_stats_fruit_avrg <- clean_data_exclude_invalid %>%
  filter(cg_new == 1) %>%
  summarize(
    mean_fruit_avrg = mean(fruit_avrg, na.rm = TRUE),
    median_fruit_avrg = median(fruit_avrg, na.rm = TRUE),
    min_fruit_avrg = min(fruit_avrg, na.rm = TRUE),
    max_fruit_avrg = max(fruit_avrg, na.rm = TRUE),
    sd_fruit_avrg = sd(fruit_avrg, na.rm = TRUE)
  )

summary_stats_fruit_avrg
## # A tibble: 1 × 5
##   mean_fruit_avrg median_fruit_avrg min_fruit_avrg max_fruit_avrg sd_fruit_avrg
##             <dbl>             <dbl>          <dbl>          <dbl>         <dbl>
## 1            4.66                 3              1             14          3.19

Vegetable consumption per day cg yes group

summary_stats_veg_avrg <- clean_data_exclude_invalid %>%
  filter(cg_new == 1) %>%
  summarize(
    mean_veg_avrg = mean(veg_avrg, na.rm = TRUE),
    median_veg_avrg = median(veg_avrg, na.rm = TRUE),
    min_veg_avrg = min(veg_avrg, na.rm = TRUE),
    max_veg_avrg = max(veg_avrg, na.rm = TRUE),
    sd_veg_avrg = sd(veg_avrg, na.rm = TRUE)
  )

summary_stats_veg_avrg
## # A tibble: 1 × 5
##   mean_veg_avrg median_veg_avrg min_veg_avrg max_veg_avrg sd_veg_avrg
##           <dbl>           <dbl>        <dbl>        <dbl>       <dbl>
## 1          9.76               5            2           52        11.5

Combined consumption per day cg yes group

summary_stats_comb_avrg <- clean_data_exclude_invalid %>%
  filter(cg_new == 1) %>%
  summarize(
    mean_comb_avrg = mean(comb_avrg, na.rm = TRUE),
    median_comb_avrg = median(comb_avrg, na.rm = TRUE),
    min_comb_avrg = min(comb_avrg, na.rm = TRUE),
    max_comb_avrg = max(comb_avrg, na.rm = TRUE),
    sd_comb_avrg = sd(comb_avrg, na.rm = TRUE)
  )

summary_stats_comb_avrg
## # A tibble: 1 × 5
##   mean_comb_avrg median_comb_avrg min_comb_avrg max_comb_avrg sd_comb_avrg
##            <dbl>            <dbl>         <dbl>         <dbl>        <dbl>
## 1           14.4               10             4            59         12.8

CG no group Fruit consumption per day in the cg no group

summary_stats_fruit_avrg <- clean_data_exclude_invalid %>%
  filter(cg_new == 0) %>%
  summarize(
    mean_fruit_avrg = mean(fruit_avrg, na.rm = TRUE),
    median_fruit_avrg = median(fruit_avrg, na.rm = TRUE),
    min_fruit_avrg = min(fruit_avrg, na.rm = TRUE),
    max_fruit_avrg = max(fruit_avrg, na.rm = TRUE),
    sd_fruit_avrg = sd(fruit_avrg, na.rm = TRUE)
  )

summary_stats_fruit_avrg
## # A tibble: 1 × 5
##   mean_fruit_avrg median_fruit_avrg min_fruit_avrg max_fruit_avrg sd_fruit_avrg
##             <dbl>             <dbl>          <dbl>          <dbl>         <dbl>
## 1            3.79                 3              1             14          2.47

Vegetable consumption per day cg no group

summary_stats_veg_avrg <- clean_data_exclude_invalid %>%
  filter(cg_new == 0) %>%
  summarize(
    mean_veg_avrg = mean(veg_avrg, na.rm = TRUE),
    median_veg_avrg = median(veg_avrg, na.rm = TRUE),
    min_veg_avrg = min(veg_avrg, na.rm = TRUE),
    max_veg_avrg = max(veg_avrg, na.rm = TRUE),
    sd_veg_avrg = sd(veg_avrg, na.rm = TRUE)
  )

summary_stats_veg_avrg
## # A tibble: 1 × 5
##   mean_veg_avrg median_veg_avrg min_veg_avrg max_veg_avrg sd_veg_avrg
##           <dbl>           <dbl>        <dbl>        <dbl>       <dbl>
## 1          6.36               5            1           25        4.21

Combined consumption per day cg no group

summary_stats_comb_avrg <- clean_data_exclude_invalid %>%
  filter(cg_new == 0) %>%
  summarize(
    mean_comb_avrg = mean(comb_avrg, na.rm = TRUE),
    median_comb_avrg = median(comb_avrg, na.rm = TRUE),
    min_comb_avrg = min(comb_avrg, na.rm = TRUE),
    max_comb_avrg = max(comb_avrg, na.rm = TRUE),
    sd_comb_avrg = sd(comb_avrg, na.rm = TRUE)
  )

summary_stats_comb_avrg
## # A tibble: 1 × 5
##   mean_comb_avrg median_comb_avrg min_comb_avrg max_comb_avrg sd_comb_avrg
##            <dbl>            <dbl>         <dbl>         <dbl>        <dbl>
## 1           10.1               10             2            29         5.79
library(openxlsx)
## Warning: package 'openxlsx' was built under R version 4.2.2
write.xlsx(clean_data5, "clean_data5.xlsx")

Perform indpendent t-test to compare mean number of servings F/V between the two groups (cg vs ncg)

#variantie test
var.test(comb_avrg~cg_new, data=clean_data5)
## 
##  F test to compare two variances
## 
## data:  comb_avrg by cg_new
## F = 0.021255, num df = 70, denom df = 30, p-value < 2.2e-16
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
##  0.01107329 0.03781766
## sample estimates:
## ratio of variances 
##         0.02125534
var.test(fruit_avrg~cg_new, data=clean_data5)
## 
##  F test to compare two variances
## 
## data:  fruit_avrg by cg_new
## F = 0.61539, num df = 70, denom df = 30, p-value = 0.09873
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
##  0.3205943 1.0948985
## sample estimates:
## ratio of variances 
##          0.6153856
var.test(veg_avrg~cg_new, data = clean_data5)
## 
##  F test to compare two variances
## 
## data:  veg_avrg by cg_new
## F = 0.012228, num df = 70, denom df = 30, p-value < 2.2e-16
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
##  0.00637019 0.02175557
## sample estimates:
## ratio of variances 
##         0.01222767
#equal variance is assumed, dus je kan geen t-test doen, wel Welsh' test
#p-value <0.05, dus you reject H0 = strong evidence to assume unequal variance

#t-test --> Welsh' test
#unequal variance, dus je vult in var_equal= FALSE
t.test(comb_avrg~cg_new, data=clean_data5,var.equal=FALSE)
## 
##  Welch Two Sample t-test
## 
## data:  comb_avrg by cg_new
## t = -1.4977, df = 30.558, p-value = 0.1445
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
##  -26.180632   4.017979
## sample estimates:
## mean in group 0 mean in group 1 
##        10.33803        21.41935
t.test(fruit_avrg~cg_new, data=clean_data5,var.equal=FALSE)
## 
##  Welch Two Sample t-test
## 
## data:  fruit_avrg by cg_new
## t = -1.4285, df = 46.838, p-value = 0.1598
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
##  -2.1742651  0.3687222
## sample estimates:
## mean in group 0 mean in group 1 
##        3.774648        4.677419
t.test(veg_avrg~cg_new, data=clean_data5,var.equal=FALSE)
## 
##  Welch Two Sample t-test
## 
## data:  veg_avrg by cg_new
## t = -1.3811, df = 30.321, p-value = 0.1773
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
##  -25.22275   4.86564
## sample estimates:
## mean in group 0 mean in group 1 
##         6.56338        16.74194
#p-value = geen statistisch significant verschil
# Separate data into two groups based on cg_new
group1 <- clean_data5$comb_avrg[clean_data5$cg_new == 0]
group2 <- clean_data5$comb_avrg[clean_data5$cg_new == 1]

# Perform Welch's t-test
result <- t.test(group1, group2, var.equal = FALSE)

# Print the result
print(result)
## 
##  Welch Two Sample t-test
## 
## data:  group1 and group2
## t = -1.4977, df = 30.558, p-value = 0.1445
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -26.180632   4.017979
## sample estimates:
## mean of x mean of y 
##  10.33803  21.41935

Perform chi-square test to compare minimum intake of servings fruit/veg in cg vs non cg

# Create a contingency table
contingency_table <- table(clean_data5$cg_new, clean_data5$comb_avrg_collapse)

# Perform chi-square test of proportions
result <- prop.test(contingency_table)
## Warning in prop.test(contingency_table): Chi-squared approximation may be
## incorrect
# Print the result
print(result)
## 
##  2-sample test for equality of proportions with continuity correction
## 
## data:  contingency_table
## X-squared = 1.2052, df = 1, p-value = 0.2723
## alternative hypothesis: two.sided
## 95 percent confidence interval:
##  -0.04146538  0.25046129
## sample estimates:
##     prop 1     prop 2 
## 0.16901408 0.06451613

Perform Fisher test

# Perform Fisher's exact test
result_fisher <- fisher.test(contingency_table)

# Print the result
print(result_fisher)
## 
##  Fisher's Exact Test for Count Data
## 
## data:  contingency_table
## p-value = 0.2175
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##   0.5902052 28.6167328
## sample estimates:
## odds ratio 
##   2.923272

Now to the regression (inferential analysis)

lm_model_uni <- lm(comb_avrg ~ cg_new, data = clean_data_exclude_invalid)

# Step 4: Evaluate the Model
summary(lm_model_uni)
## 
## Call:
## lm(formula = comb_avrg ~ cg_new, data = clean_data_exclude_invalid)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -10.414  -6.143  -1.143   2.722  44.586 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   10.143      1.011  10.036   <2e-16 ***
## cg_new         4.271      1.867   2.287   0.0244 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.456 on 97 degrees of freedom
## Multiple R-squared:  0.05117,    Adjusted R-squared:  0.04139 
## F-statistic: 5.231 on 1 and 97 DF,  p-value: 0.02436
plot(lm_model_uni)

Now lets do multiple linear regression with our minimal sufficient set

# Fit the multiple linear regression model
lm_model_multiple <- lm(comb_avrg ~ cg_new + age + gender + edu + sparet_w + ethnicity + home_gard, data = clean_data_exclude_invalid)

# Summarize the model
summary(lm_model_multiple)
## 
## Call:
## lm(formula = comb_avrg ~ cg_new + age + gender + edu + sparet_w + 
##     ethnicity + home_gard, data = clean_data_exclude_invalid)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -9.891 -5.245 -1.560  2.844 43.036 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept) 14.24721   10.58627   1.346   0.1817  
## cg_new       3.16189    2.32757   1.358   0.1777  
## age         -0.03155    0.06942  -0.454   0.6506  
## gender      -0.80710    1.64303  -0.491   0.6244  
## edu          0.76888    1.34351   0.572   0.5685  
## sparet_w     0.03158    0.04758   0.664   0.5086  
## ethnicity    0.14697    0.58618   0.251   0.8026  
## home_gard   -3.74672    2.00456  -1.869   0.0648 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.462 on 91 degrees of freedom
## Multiple R-squared:  0.1087, Adjusted R-squared:  0.04009 
## F-statistic: 1.585 on 7 and 91 DF,  p-value: 0.1499

: model diagnostics

# Install and load the broom package

# Install the latest version of the cli package from CRAN
#install.packages("cli")  # Install or update the cli package

# Load the updated cli package
#library(cli)
#install.packages("broom")
#library(broom)

#model_multiple_diagnostic <- augment(lm_model_multiple)

#plot(lm_model_multiple)

# Extract residuals
residuals <- residuals(lm_model_multiple)

# Extract fitted values
fitted_values <- fitted(lm_model_multiple)

# Extract leverage
leverage <- influence.measures(lm_model_multiple)$hat

we can see many assumptions are not satisfied; obs. 82, 25, 83 can be influential

clean_data_exclude_invalid[c(82,25,83), ]
## # A tibble: 3 × 18
##   record_id woonplaats cg_new home_gard supermarket fruit_freq veg_freq   age
##       <dbl>      <dbl>  <dbl>     <dbl>       <dbl>      <dbl>    <dbl> <dbl>
## 1       140          2      1         1           5          7        7    26
## 2        45          2      1         1           4          7        7    54
## 3       141          1      1         1           5          7        7    65
## # ℹ 10 more variables: gender <dbl>, demo_weight <dbl>, height <dbl>,
## #   ethnicity <dbl>, edu <dbl>, fruit_avrg <dbl>, veg_avrg <dbl>,
## #   comb_avrg <dbl>, sparet_w <dbl>, comb_avrg_collapse <dbl>

but I don’t see unusual in these obs.

#fit model again, use log_transform the outcome
lm_model_multiple_log <- lm(log(comb_avrg) ~ cg_new + age + gender + edu + sparet_w + ethnicity + home_gard, data = clean_data_exclude_invalid)

summary(lm_model_multiple_log)
## 
## Call:
## lm(formula = log(comb_avrg) ~ cg_new + age + gender + edu + sparet_w + 
##     ethnicity + home_gard, data = clean_data_exclude_invalid)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.1964 -0.4874 -0.0147  0.4022  1.6671 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)   
## (Intercept)  2.086495   0.772369   2.701  0.00824 **
## cg_new       0.101424   0.169818   0.597  0.55182   
## age          0.001610   0.005065   0.318  0.75126   
## gender      -0.015350   0.119875  -0.128  0.89839   
## edu          0.107237   0.098022   1.094  0.27684   
## sparet_w     0.002196   0.003471   0.633  0.52852   
## ethnicity   -0.019960   0.042768  -0.467  0.64182   
## home_gard   -0.299969   0.146252  -2.051  0.04314 * 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6174 on 91 degrees of freedom
## Multiple R-squared:  0.1345, Adjusted R-squared:  0.06796 
## F-statistic: 2.021 on 7 and 91 DF,  p-value: 0.06077
plot(lm_model_multiple_log)

I see the assumptions are more satisfied

#library(tidyverse)
#install.packages("broom")
#library(broom)
#model_diagnostic <-augment(lm_model_multiple_log)
#library(ggplot2)
# check for normality
##model_diagnostic %>% 
 # ggplot(aes(.resid))+
  #geom_histogram(binwidth = 0.05, colour="white")

##shapiro.test(model_diagnostic$.resid)
# p-value >0.05 -> not deviate from normality

interpret the result

summary(lm_model_multiple_log)
## 
## Call:
## lm(formula = log(comb_avrg) ~ cg_new + age + gender + edu + sparet_w + 
##     ethnicity + home_gard, data = clean_data_exclude_invalid)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.1964 -0.4874 -0.0147  0.4022  1.6671 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)   
## (Intercept)  2.086495   0.772369   2.701  0.00824 **
## cg_new       0.101424   0.169818   0.597  0.55182   
## age          0.001610   0.005065   0.318  0.75126   
## gender      -0.015350   0.119875  -0.128  0.89839   
## edu          0.107237   0.098022   1.094  0.27684   
## sparet_w     0.002196   0.003471   0.633  0.52852   
## ethnicity   -0.019960   0.042768  -0.467  0.64182   
## home_gard   -0.299969   0.146252  -2.051  0.04314 * 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6174 on 91 degrees of freedom
## Multiple R-squared:  0.1345, Adjusted R-squared:  0.06796 
## F-statistic: 2.021 on 7 and 91 DF,  p-value: 0.06077

p-value for cg_new = 0.55 >0.05 -> not significant adjusted R squared = 0.068 => goodness of fit is very poor (I think we should include other variables such as BMI, city…)

logistic model, just to explore

clean_data_exclude_invalid
## # A tibble: 99 × 18
##    record_id woonplaats cg_new home_gard supermarket fruit_freq veg_freq   age
##        <dbl>      <dbl>  <dbl>     <dbl>       <dbl>      <dbl>    <dbl> <dbl>
##  1        12          2      0         2           4          1        6    34
##  2        14          1      0         2           4         72        7    33
##  3        15          1      0         2           4          7        7    32
##  4        16          2      0         2           5          6        7    35
##  5        17          2      0         2           4          3        6    41
##  6        18          2      0         2           3          4        7    23
##  7        19          1      0         2           5          7        7    51
##  8        20          1      0         2           5          5        5    58
##  9        22          2      1         1           5          5        3    68
## 10        23          2      0         2           4          5        7    55
## # ℹ 89 more rows
## # ℹ 10 more variables: gender <dbl>, demo_weight <dbl>, height <dbl>,
## #   ethnicity <dbl>, edu <dbl>, fruit_avrg <dbl>, veg_avrg <dbl>,
## #   comb_avrg <dbl>, sparet_w <dbl>, comb_avrg_collapse <dbl>
logistic_model <- glm(comb_avrg_collapse ~ cg_new + age + gender + edu + sparet_w + ethnicity + home_gard, data = clean_data_exclude_invalid, family = "binomial")
summary(logistic_model)
## 
## Call:
## glm(formula = comb_avrg_collapse ~ cg_new + age + gender + edu + 
##     sparet_w + ethnicity + home_gard, family = "binomial", data = clean_data_exclude_invalid)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.6624   0.1364   0.3096   0.5662   1.4723  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)  
## (Intercept)  1.43604    3.72201   0.386   0.6996  
## cg_new       0.16239    0.98659   0.165   0.8693  
## age          0.05142    0.02654   1.937   0.0527 .
## gender      -0.01577    0.68186  -0.023   0.9816  
## edu          0.59511    0.42782   1.391   0.1642  
## sparet_w    -0.03517    0.02012  -1.748   0.0805 .
## ethnicity   -0.27602    0.19164  -1.440   0.1498  
## home_gard   -1.58803    0.92944  -1.709   0.0875 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 80.689  on 98  degrees of freedom
## Residual deviance: 62.430  on 91  degrees of freedom
## AIC: 78.43
## 
## Number of Fisher Scoring iterations: 6

still not significant