#CREATE AND EXPLORE YOUR DATASET
#generating a random dataset of 800
set.seed(123)
n<-800
#simulate age (18-80) that randomly assign ages from 18 to 80
age <- sample(18:80, n, replace = TRUE)
#Assign foodline
foodline <- sample(c("North","South"),n, replace = TRUE)
#Potato salad exposure especially north tent has contaminated potato salad
potato_salad <- ifelse(foodline=="North",rbinom(n,1,0.7),rbinom(n,1,0.3))
#Illness variable about 40% got sick, attack rate around 35% and north tent has a higher illness
illness_prob <- ifelse(foodline=="North"& potato_salad==1,0.65,0.20)
illness_status <- rbinom(n,1,illness_prob)
#symptoms severity where older adults (>50) are more severe
symptom_severity <- ifelse(age > 50 & illness_status==1,sample(7:10,n,replace = TRUE),sample(1:6,n,replace = TRUE))
#adding missing value inorder to make the dateset realistic
age[sample(1:n,20)]<- NA
potato_salad[sample(1:n,15)]<-NA
#Build dataframe
festival_data <- data.frame(age, foodline,potato_salad,illness_status,symptom_severity)
head(festival_data)
str(festival_data)
## 'data.frame': 800 obs. of 5 variables:
## $ age : int 48 32 68 31 20 59 67 71 60 54 ...
## $ foodline : chr "North" "North" "North" "North" ...
## $ potato_salad : int 1 1 1 0 1 0 1 1 0 1 ...
## $ illness_status : int 1 1 1 0 0 0 1 1 1 1 ...
## $ symptom_severity: int 3 2 8 6 4 2 7 7 7 7 ...
summary(festival_data)
## age foodline potato_salad illness_status
## Min. :18.00 Length:800 Min. :0.0000 Min. :0.0000
## 1st Qu.:34.00 Class :character 1st Qu.:0.0000 1st Qu.:0.0000
## Median :49.00 Mode :character Median :1.0000 Median :0.0000
## Mean :49.16 Mean :0.5159 Mean :0.3925
## 3rd Qu.:65.00 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :80.00 Max. :1.0000 Max. :1.0000
## NA's :20 NA's :15
## symptom_severity
## Min. : 1.000
## 1st Qu.: 2.000
## Median : 4.000
## Mean : 4.421
## 3rd Qu.: 6.000
## Max. :10.000
##
#HOW MANY PPLE GOT SICK
table(festival_data$illness_status)
##
## 0 1
## 486 314
#average age of participants ignoring missing values
mean(festival_data$age,na.rm = TRUE)
## [1] 49.15769
#missing values
colSums(is.na(festival_data))
## age foodline potato_salad illness_status
## 20 0 15 0
## symptom_severity
## 0
library(magrittr)
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.5.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.5.3
## Warning: package 'ggplot2' was built under R version 4.5.3
## Warning: package 'tidyr' was built under R version 4.5.3
## Warning: package 'readr' was built under R version 4.5.3
## Warning: package 'purrr' was built under R version 4.5.3
## Warning: package 'stringr' was built under R version 4.5.3
## Warning: package 'forcats' was built under R version 4.5.3
## Warning: package 'lubridate' was built under R version 4.5.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.1 ✔ readr 2.2.0
## ✔ ggplot2 4.0.3 ✔ stringr 1.6.0
## ✔ lubridate 1.9.5 ✔ tibble 3.3.1
## ✔ purrr 1.2.1 ✔ tidyr 1.3.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ tidyr::extract() masks magrittr::extract()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ purrr::set_names() masks magrittr::set_names()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
#filtering to show only sick people
sick_people <- festival_data %>% filter(illness_status==1)
#group by foodline and count sick vs not sick
festival_data %>% group_by(foodline,illness_status) %>% summarise(count=n())
## `summarise()` has regrouped the output.
## ℹ Summaries were computed grouped by foodline and illness_status.
## ℹ Output is grouped by foodline.
## ℹ Use `summarise(.groups = "drop_last")` to silence this message.
## ℹ Use `summarise(.by = c(foodline, illness_status))` for per-operation grouping
## (`?dplyr::dplyr_by`) instead.
#Creating a new variable age group
festival_data<- festival_data %>% mutate(age_group=case_when(age>=18 & age <=35~"young",age>35~"older"))
#Reshaping my data incase more food were available like burger and juice from wide to long format
festival_data$burger<- rbinom(n,1,0.5)
festival_data$juice <- rbinom(n,1,0.5)
long_data <- festival_data %>% pivot_longer(cols = c(potato_salad,burger,juice),names_to = "food",values_to = "consumed")
#how many food eaten with different people
festival_data<-festival_data%>%mutate(food_count=rowSums(select(.,potato_salad,burger,juice),na.rm = TRUE))
# Data with tables and statistics summary by table average by ilnness
festival_data%>%group_by(illness_status)%>%summarise(mean_age=mean(age,na.rm=TRUE))
#count by foodline
festival_data %>% group_by(foodline,illness_status) %>% summarise(count=n())
## `summarise()` has regrouped the output.
## ℹ Summaries were computed grouped by foodline and illness_status.
## ℹ Output is grouped by foodline.
## ℹ Use `summarise(.groups = "drop_last")` to silence this message.
## ℹ Use `summarise(.by = c(foodline, illness_status))` for per-operation grouping
## (`?dplyr::dplyr_by`) instead.
#Descriptive statistics for age
summary(festival_data$age)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 18.00 34.00 49.00 49.16 65.00 80.00 20
#comparing ages BETWEEN SICK AND NOT sick
festival_data %>% group_by(illness_status) %>% summarise(mean_age=mean(age,na.rm = TRUE))
#Frequency tables
table(festival_data$foodline)
##
## North South
## 408 392
table(festival_data$illness_status)
##
## 0 1
## 486 314
#percentage of those who got sick in foodline
prop.table(
table(festival_data$foodline,
festival_data$illness_status),
1)
##
## 0 1
## North 0.4656863 0.5343137
## South 0.7551020 0.2448980
#Statistical tests
#t test qn do sick and non sick differ in age meaning p<0.05 means significant difference
t.test(age~illness_status,
data=festival_data)
##
## Welch Two Sample t-test
##
## data: age by illness_status
## t = 0.45378, df = 651.27, p-value = 0.6501
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
## -1.972232 3.157737
## sample estimates:
## mean in group 0 mean in group 1
## 49.38947 48.79672
#chi-square test
#qn is illness associated with food line?
chisq.test(
table(festival_data$foodline,
festival_data$illness_status))
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: table(festival_data$foodline, festival_data$illness_status)
## X-squared = 69.02, df = 1, p-value < 2.2e-16
#attack rates sick/exposed
festival_data %>%
group_by(foodline) %>%
summarise(
attack_rate=
mean(illness_status)*100
)
#Food item with the highest risk meaning potato salad = likely contaminated source
festival_data %>%
group_by(potato_salad) %>%
summarise(
illness_rate=
mean(illness_status))
#creating a risk score higher score risky foods eaten
festival_data <- festival_data %>%
mutate(risk_score=
potato_salad+
burger+
juice)
#Logistic Regression
model1 <- glm(
illness_status~
age+
foodline,
family=binomial,
data=festival_data)
summary(model1)
##
## Call:
## glm(formula = illness_status ~ age + foodline, family = binomial,
## data = festival_data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.163060 0.232836 0.700 0.484
## age -0.000761 0.004324 -0.176 0.860
## foodlineSouth -1.248350 0.155685 -8.018 1.07e-15 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1043.96 on 779 degrees of freedom
## Residual deviance: 975.61 on 777 degrees of freedom
## (20 observations deleted due to missingness)
## AIC: 981.61
##
## Number of Fisher Scoring iterations: 4
#adding variables
model2 <- glm(
illness_status~
age+
foodline+
potato_salad+
risk_score,
family=binomial,
data=festival_data)
#Compare
summary(model2)
##
## Call:
## glm(formula = illness_status ~ age + foodline + potato_salad +
## risk_score, family = binomial, data = festival_data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.7290840 0.2884823 -2.527 0.0115 *
## age 0.0001264 0.0044990 0.028 0.9776
## foodlineSouth -0.8238491 0.1716750 -4.799 1.60e-06 ***
## potato_salad 1.0294434 0.2099907 4.902 9.47e-07 ***
## risk_score 0.0630773 0.1150792 0.548 0.5836
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1025.52 on 764 degrees of freedom
## Residual deviance: 918.21 on 760 degrees of freedom
## (35 observations deleted due to missingness)
## AIC: 928.21
##
## Number of Fisher Scoring iterations: 4
#Odds ratio
exp(coef(model2))
## (Intercept) age foodlineSouth potato_salad risk_score
## 0.4823506 1.0001264 0.4387397 2.7995072 1.0651091
#confidence level
exp(confint(model2))
## Waiting for profiling to be done...
## 2.5 % 97.5 %
## (Intercept) 0.2728709 0.8465737
## age 0.9913352 1.0089906
## foodlineSouth 0.3129793 0.6138346
## potato_salad 1.8593766 4.2389547
## risk_score 0.8500662 1.3352645
#Linear regression
lm_model <- lm(
symptom_severity~age,
data=festival_data)
summary(lm_model)
##
## Call:
## lm(formula = symptom_severity ~ age, data = festival_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.8504 -1.8692 -0.0991 1.8104 5.5097
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.098250 0.248133 8.456 <2e-16 ***
## age 0.046902 0.004746 9.883 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.36 on 778 degrees of freedom
## (20 observations deleted due to missingness)
## Multiple R-squared: 0.1115, Adjusted R-squared: 0.1104
## F-statistic: 97.67 on 1 and 778 DF, p-value: < 2.2e-16
#Visualization
#barchart
ggplot(festival_data,aes(foodline,fill=as.factor(illness_status)))+ geom_bar()+ labs(title = "Illness Rates by Food Line")

#Histogram
ggplot(festival_data,
aes(age))+
geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.
## Warning: Removed 20 rows containing non-finite outside the scale range
## (`stat_bin()`).

#boxplot
ggplot(festival_data,
aes(as.factor(
illness_status),
age))+
geom_boxplot()
## Warning: Removed 20 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

#Plot publication ready
labs(
title="Title",
x="X label",
y="Y label"
)
## <ggplot2::labels> List of 3
## $ x : chr "X label"
## $ y : chr "Y label"
## $ title: chr "Title"