#CREATE AND EXPLORE YOUR DATASET

#generating a random dataset of 800
set.seed(123)
n<-800

#simulate age (18-80) that randomly assign ages from 18 to 80
age <- sample(18:80, n, replace = TRUE)
#Assign foodline
foodline <- sample(c("North","South"),n, replace = TRUE)

#Potato salad exposure especially north tent has contaminated potato salad
potato_salad <- ifelse(foodline=="North",rbinom(n,1,0.7),rbinom(n,1,0.3))

#Illness variable about 40% got sick, attack rate around 35% and north tent has a higher illness
illness_prob <- ifelse(foodline=="North"& potato_salad==1,0.65,0.20)
illness_status <- rbinom(n,1,illness_prob)

#symptoms severity where older adults (>50) are more severe
symptom_severity <- ifelse(age > 50 & illness_status==1,sample(7:10,n,replace = TRUE),sample(1:6,n,replace = TRUE))

#adding missing value inorder to make the dateset realistic
age[sample(1:n,20)]<- NA
potato_salad[sample(1:n,15)]<-NA

#Build dataframe
festival_data <- data.frame(age, foodline,potato_salad,illness_status,symptom_severity)
head(festival_data)
str(festival_data)
## 'data.frame':    800 obs. of  5 variables:
##  $ age             : int  48 32 68 31 20 59 67 71 60 54 ...
##  $ foodline        : chr  "North" "North" "North" "North" ...
##  $ potato_salad    : int  1 1 1 0 1 0 1 1 0 1 ...
##  $ illness_status  : int  1 1 1 0 0 0 1 1 1 1 ...
##  $ symptom_severity: int  3 2 8 6 4 2 7 7 7 7 ...
summary(festival_data)
##       age          foodline          potato_salad    illness_status  
##  Min.   :18.00   Length:800         Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:34.00   Class :character   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :49.00   Mode  :character   Median :1.0000   Median :0.0000  
##  Mean   :49.16                      Mean   :0.5159   Mean   :0.3925  
##  3rd Qu.:65.00                      3rd Qu.:1.0000   3rd Qu.:1.0000  
##  Max.   :80.00                      Max.   :1.0000   Max.   :1.0000  
##  NA's   :20                         NA's   :15                       
##  symptom_severity
##  Min.   : 1.000  
##  1st Qu.: 2.000  
##  Median : 4.000  
##  Mean   : 4.421  
##  3rd Qu.: 6.000  
##  Max.   :10.000  
## 
#HOW MANY PPLE GOT SICK 
table(festival_data$illness_status)
## 
##   0   1 
## 486 314
#average age of participants ignoring missing values
mean(festival_data$age,na.rm = TRUE)
## [1] 49.15769
#missing values
colSums(is.na(festival_data))
##              age         foodline     potato_salad   illness_status 
##               20                0               15                0 
## symptom_severity 
##                0
library(magrittr)
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.5.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.5.3
## Warning: package 'ggplot2' was built under R version 4.5.3
## Warning: package 'tidyr' was built under R version 4.5.3
## Warning: package 'readr' was built under R version 4.5.3
## Warning: package 'purrr' was built under R version 4.5.3
## Warning: package 'stringr' was built under R version 4.5.3
## Warning: package 'forcats' was built under R version 4.5.3
## Warning: package 'lubridate' was built under R version 4.5.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.1     ✔ readr     2.2.0
## ✔ ggplot2   4.0.3     ✔ stringr   1.6.0
## ✔ lubridate 1.9.5     ✔ tibble    3.3.1
## ✔ purrr     1.2.1     ✔ tidyr     1.3.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ tidyr::extract()   masks magrittr::extract()
## ✖ dplyr::filter()    masks stats::filter()
## ✖ dplyr::lag()       masks stats::lag()
## ✖ purrr::set_names() masks magrittr::set_names()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
#filtering to show only sick people
sick_people <- festival_data %>% filter(illness_status==1)
#group by foodline and count sick vs not sick
festival_data %>% group_by(foodline,illness_status) %>% summarise(count=n())
## `summarise()` has regrouped the output.
## ℹ Summaries were computed grouped by foodline and illness_status.
## ℹ Output is grouped by foodline.
## ℹ Use `summarise(.groups = "drop_last")` to silence this message.
## ℹ Use `summarise(.by = c(foodline, illness_status))` for per-operation grouping
##   (`?dplyr::dplyr_by`) instead.
#Creating a new variable age group
festival_data<- festival_data %>% mutate(age_group=case_when(age>=18 & age <=35~"young",age>35~"older"))

#Reshaping my data incase more food were available like burger and juice from wide to long format
festival_data$burger<- rbinom(n,1,0.5)
festival_data$juice <- rbinom(n,1,0.5)
long_data <- festival_data %>% pivot_longer(cols = c(potato_salad,burger,juice),names_to = "food",values_to = "consumed")
#how many food eaten with different people 
festival_data<-festival_data%>%mutate(food_count=rowSums(select(.,potato_salad,burger,juice),na.rm = TRUE))

# Data with tables and statistics summary by table average by ilnness
festival_data%>%group_by(illness_status)%>%summarise(mean_age=mean(age,na.rm=TRUE))
#count by foodline
festival_data %>% group_by(foodline,illness_status) %>% summarise(count=n())
## `summarise()` has regrouped the output.
## ℹ Summaries were computed grouped by foodline and illness_status.
## ℹ Output is grouped by foodline.
## ℹ Use `summarise(.groups = "drop_last")` to silence this message.
## ℹ Use `summarise(.by = c(foodline, illness_status))` for per-operation grouping
##   (`?dplyr::dplyr_by`) instead.
#Descriptive statistics for age 
summary(festival_data$age)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   18.00   34.00   49.00   49.16   65.00   80.00      20
#comparing ages BETWEEN SICK AND NOT sick
festival_data %>% group_by(illness_status) %>% summarise(mean_age=mean(age,na.rm = TRUE))
#Frequency tables
table(festival_data$foodline)
## 
## North South 
##   408   392
table(festival_data$illness_status)
## 
##   0   1 
## 486 314
#percentage of those who got sick in  foodline
prop.table(
  table(festival_data$foodline,
        festival_data$illness_status),
  1)
##        
##                 0         1
##   North 0.4656863 0.5343137
##   South 0.7551020 0.2448980
#Statistical tests
#t test qn do sick and non sick differ in age meaning p<0.05 means significant difference
t.test(age~illness_status,
       data=festival_data)
## 
##  Welch Two Sample t-test
## 
## data:  age by illness_status
## t = 0.45378, df = 651.27, p-value = 0.6501
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
##  -1.972232  3.157737
## sample estimates:
## mean in group 0 mean in group 1 
##        49.38947        48.79672
#chi-square test
#qn is illness associated with food line?
chisq.test(
  table(festival_data$foodline,
        festival_data$illness_status))
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  table(festival_data$foodline, festival_data$illness_status)
## X-squared = 69.02, df = 1, p-value < 2.2e-16
#attack rates sick/exposed
festival_data %>%
  group_by(foodline) %>%
  summarise(
    attack_rate=
      mean(illness_status)*100
  )
#Food item with the highest risk meaning potato salad = likely contaminated source
festival_data %>%
  group_by(potato_salad) %>%
  summarise(
    illness_rate=
      mean(illness_status))
#creating a risk score higher score risky foods eaten
festival_data <- festival_data %>%
  mutate(risk_score=
           potato_salad+
           burger+
           juice)
#Logistic Regression
model1 <- glm(
  illness_status~
    age+
    foodline,
  
  family=binomial,
  
  data=festival_data)

summary(model1)
## 
## Call:
## glm(formula = illness_status ~ age + foodline, family = binomial, 
##     data = festival_data)
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    0.163060   0.232836   0.700    0.484    
## age           -0.000761   0.004324  -0.176    0.860    
## foodlineSouth -1.248350   0.155685  -8.018 1.07e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1043.96  on 779  degrees of freedom
## Residual deviance:  975.61  on 777  degrees of freedom
##   (20 observations deleted due to missingness)
## AIC: 981.61
## 
## Number of Fisher Scoring iterations: 4
#adding variables
model2 <- glm(
  illness_status~
    age+
    foodline+
    potato_salad+
    risk_score,
  
  family=binomial,
  
  data=festival_data)
#Compare
summary(model2)
## 
## Call:
## glm(formula = illness_status ~ age + foodline + potato_salad + 
##     risk_score, family = binomial, data = festival_data)
## 
## Coefficients:
##                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   -0.7290840  0.2884823  -2.527   0.0115 *  
## age            0.0001264  0.0044990   0.028   0.9776    
## foodlineSouth -0.8238491  0.1716750  -4.799 1.60e-06 ***
## potato_salad   1.0294434  0.2099907   4.902 9.47e-07 ***
## risk_score     0.0630773  0.1150792   0.548   0.5836    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1025.52  on 764  degrees of freedom
## Residual deviance:  918.21  on 760  degrees of freedom
##   (35 observations deleted due to missingness)
## AIC: 928.21
## 
## Number of Fisher Scoring iterations: 4
#Odds ratio 
exp(coef(model2))
##   (Intercept)           age foodlineSouth  potato_salad    risk_score 
##     0.4823506     1.0001264     0.4387397     2.7995072     1.0651091
#confidence level
exp(confint(model2))
## Waiting for profiling to be done...
##                   2.5 %    97.5 %
## (Intercept)   0.2728709 0.8465737
## age           0.9913352 1.0089906
## foodlineSouth 0.3129793 0.6138346
## potato_salad  1.8593766 4.2389547
## risk_score    0.8500662 1.3352645
#Linear regression
lm_model <- lm(
  symptom_severity~age,
  
  data=festival_data)

summary(lm_model)
## 
## Call:
## lm(formula = symptom_severity ~ age, data = festival_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.8504 -1.8692 -0.0991  1.8104  5.5097 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 2.098250   0.248133   8.456   <2e-16 ***
## age         0.046902   0.004746   9.883   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.36 on 778 degrees of freedom
##   (20 observations deleted due to missingness)
## Multiple R-squared:  0.1115, Adjusted R-squared:  0.1104 
## F-statistic: 97.67 on 1 and 778 DF,  p-value: < 2.2e-16
#Visualization
#barchart
ggplot(festival_data,aes(foodline,fill=as.factor(illness_status)))+ geom_bar()+ labs(title = "Illness Rates by Food Line")

#Histogram
ggplot(festival_data,
       aes(age))+
  
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.
## Warning: Removed 20 rows containing non-finite outside the scale range
## (`stat_bin()`).

#boxplot
ggplot(festival_data,
       aes(as.factor(
         illness_status),
         age))+
  
  geom_boxplot()
## Warning: Removed 20 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

#Plot publication ready
labs(
  title="Title",
  x="X label",
  y="Y label"
)
## <ggplot2::labels> List of 3
##  $ x    : chr "X label"
##  $ y    : chr "Y label"
##  $ title: chr "Title"