#step1: identify general folder where data is stored
getwd()
## [1] "/cloud/project"
#step2: list file in specific folder
list.files("/cloud/project/data")
## [1] "community_health_genetic_risk_dataset.csv"
#step3: load dataset in the folder
community_health <- read.csv("/cloud/project/data/community_health_genetic_risk_dataset.csv")
head(community_health)
##   ParticipantID Age Gender   Region UrbanRural HeightCm WeightKg  BMI EyeColor
## 1         P0001  56 Female  Central      Urban    172.7     92.5 31.0     Blue
## 2         P0002  37 Female Northern      Rural    165.1     89.0 32.7    Brown
## 3         P0003  52   Male  Central      Rural    170.1     81.0 28.0    Hazel
## 4         P0004  28 Female  Eastern      Urban    154.0     82.9 35.0    Brown
## 5         P0005  36 Female  Eastern      Urban    187.7     42.9 12.2     Blue
## 6         P0006  65   Male  Western      Urban    162.8     67.1 25.3     Gray
##   BloodType ExerciseHoursPerWeek DailyScreenTimeHours SleepHours  SmokingStatus
## 1         B                  2.2                  5.8        4.0     Non-Smoker
## 2         O                  4.7                  5.2        5.3     Non-Smoker
## 3         O                  2.2                  5.8        9.2  Former Smoker
## 4         A                  3.5                  7.7        5.0     Non-Smoker
## 5        AB                  2.5                  6.3        5.0 Current Smoker
## 6        AB                  3.6                  3.1        8.8     Non-Smoker
##   AlcoholConsumptionLevel AverageHeartRate SystolicBP DiastolicBP
## 1                Moderate             90.4      150.1        84.8
## 2                Moderate             76.7      152.9        77.7
## 3                Moderate             82.8      140.2        71.6
## 4                     Low             65.9      151.4        79.8
## 5                Moderate             78.8      120.1        78.4
## 6                Moderate             77.9      151.9        81.9
##   CholesterolLevel GlucoseLevel StressLevel FamilyHistoryDiabetes
## 1            231.8        131.1         Low                     0
## 2            321.2        108.6      Medium                     0
## 3            238.0        102.7         Low                     0
## 4            253.2        121.0      Medium                     0
## 5            176.6         83.9        High                     0
## 6            212.9        110.3        High                     0
##   GeneticRiskScore MonthlyMedicalCost SickDaysLastYear DepressionScore
## 1             65.0             924.06               10            10.9
## 2             69.4            9299.36               10            15.3
## 3             60.9             916.99               11             8.8
## 4             59.8             936.11                6            15.4
## 5             57.7             587.04                9            17.8
## 6             53.1             709.10               11            14.2
##   CognitiveTestScore DiagnosisRisk FollowUpMonth TreatmentResponseScore
## 1               95.7             0    2024-03-29                   61.0
## 2              127.0             0    2024-05-15                   49.5
## 3              108.9             0    2024-03-02                   63.8
## 4               88.5             0    2024-12-08                   75.2
## 5               92.3             0    2024-05-30                   60.6
## 6              103.1             0    2024-08-25                   58.4
# WHAT IS THE AVERAGE BMI ACROSS DIFFERENT AGE GROUPS?
# step1: create age categories
age_groups <- cut(community_health$Age,
                  breaks=c(18,35,45,55,65,75),
                  labels=c("18-35","36-45","46-55","56-65","66-75"),
                  include.lowest = TRUE)

# step2: install dplyr package
install.packages("dplyr")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.6'
## (as 'lib' is unspecified)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# step3: create a new column age groups
community_health <- community_health %>%
  mutate(
    Age_Groups = age_groups
  )

# step4: aggregate and group by age groups
bmi_status <- community_health %>%
  group_by (Age_Groups)%>%
  summarise(AverageBMI = mean(BMI,na.rm=TRUE),.groups='drop')

# step5: print results
print(bmi_status)
## # A tibble: 5 × 2
##   Age_Groups AverageBMI
##   <fct>           <dbl>
## 1 18-35            25.3
## 2 36-45            25.6
## 3 46-55            25.3
## 4 56-65            26.6
## 5 66-75            26.1
# step6: visualize output
install.packages("ggplot2")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.6'
## (as 'lib' is unspecified)
library(ggplot2)

ggplot(data=bmi_status,mapping=aes(x=Age_Groups,y=AverageBMI,size=AverageBMI))+
  geom_point(color = 'red')+ labs(title='Average BMI by Age Group')

#3. WHAT ARE THE MINIMUM, MAXIMUM, AND QUARTILE VALUES FOR SYSTOLICBP?
summary(community_health$SystolicBP)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   108.8   136.7   145.0   145.1   153.6   187.3
#4. WHAT IS THE STANDARD DEVIATION OF GLUCOSE LEVELS? 
sd(community_health$GlucoseLevel)
## [1] 13.74917
#5. WHICH AGE GROUP HAS THE HIGHEST AVERAGE CHOLESTEROL LEVEL? 
# step1: aggregate and group by age groups
age_cholestrol <- community_health %>%
  group_by(Age_Groups)%>%
  summarise(Avg_Cholestrol = mean(CholesterolLevel,na.rm=TRUE),.groups='drop')
print(age_cholestrol)
## # A tibble: 5 × 2
##   Age_Groups Avg_Cholestrol
##   <fct>               <dbl>
## 1 18-35                204.
## 2 36-45                215.
## 3 46-55                224.
## 4 56-65                227.
## 5 66-75                230.
# step2: visualize output
ggplot(data=age_cholestrol,mapping=aes(x=Age_Groups,y=Avg_Cholestrol, fill=Age_Groups))+
  geom_col(color='black')+labs(title='Choletrol Level by Age Group')

#6. WHAT IS THE DISTRIBUTION OF EYECOLOR AMONG PARTICIPANTS?

# step1: count of eye color grouped by eye color types
eye_color_distribution <- community_health %>%
  group_by(EyeColor)%>%
  summarise(EyeColorCount=n(),.groups='drop')
print(eye_color_distribution)
## # A tibble: 5 × 2
##   EyeColor EyeColorCount
##   <chr>            <int>
## 1 Blue               196
## 2 Brown              160
## 3 Gray               239
## 4 Green              197
## 5 Hazel              208
# step2: visualize output
ggplot(data=eye_color_distribution, mapping=aes(x=EyeColor,y=EyeColorCount))+
  geom_col(color='black',fill='grey') + labs(title='Distribution of EyeColor')

#7. WHAT IS THE RATE/PROPORTION OF SMOKING STATUS PARTICIPANTS? 
# step1: calculate the rate and group by smoker status
participant_proportion <- community_health %>%
  group_by(SmokingStatus)%>%
  summarise(smoke_status_count = n())%>%
  mutate (
    rate = smoke_status_count / sum(smoke_status_count)*100
  )
print(participant_proportion)
## # A tibble: 3 × 3
##   SmokingStatus  smoke_status_count  rate
##   <chr>                       <int> <dbl>
## 1 Current Smoker                206  20.6
## 2 Former Smoker                 184  18.4
## 3 Non-Smoker                    610  61
# step2: visualize output
ggplot(data=participant_proportion,mapping=aes(x='',y=smoke_status_count,fill=SmokingStatus))+
  geom_bar(stat = "identity", width = 1) +
  coord_polar(theta = "y") +
  labs(
    title = "Distribution of Smoking Status",
    fill = "Smoking Status"
  ) +
  theme_void()

#8. WHAT IS THE RATE OF PARTICIPANTS WHO HAVE AND DONT HAVE A FAMILYHISTORYDIABETES? 
# step1: count and group by family history diabetes
diabetes_status <- community_health %>%
  group_by(FamilyHistoryDiabetes)%>%
  summarise(diabetes_count=n())%>%
  mutate(
    diabetes_rate = diabetes_count / sum(diabetes_count)*100
  )
print(diabetes_status)
## # A tibble: 2 × 3
##   FamilyHistoryDiabetes diabetes_count diabetes_rate
##                   <int>          <int>         <dbl>
## 1                     0            703          70.3
## 2                     1            297          29.7
# Step2:visualize output
ggplot(data=diabetes_status,mapping=aes(x='',y=diabetes_count,fill=FamilyHistoryDiabetes))+
  geom_bar(stat = "identity", width = 1) +
  coord_polar(theta = "y") +
  geom_text(aes(label = paste0(round(diabetes_rate, 1), "%")),
            position = position_stack(vjust = 0.5)) +
  labs(
    title = "Distribution of Participants by Family History of Diabetes",
    fill = "Family History"
  ) +
  theme_void()

#9. IS SMOKINGSTATUS ASSOCIATED WITH DIAGNOSISRISK? 
# step1: build contingency table
tbl_1 <- table(community_health$SmokingStatus, community_health$DiagnosisRisk)

# step2: calculate degree of association
smoking_diagnosis_association <-chisq.test(tbl_1)
print(smoking_diagnosis_association)
## 
##  Pearson's Chi-squared test
## 
## data:  tbl_1
## X-squared = 98.948, df = 2, p-value < 2.2e-16

The above result indicates is a statistically significant association between the two categorical variables (p < 0.001). The large chi-square value (98.95) indicates that the observed distribution is extremely unlikely to have occurred by chance alone.

#10. IS GENDER ASSOCIATED WITH STRESSLEVEL? 
# step1: create a contingency table
tbl_2 <- table(community_health$Gender,community_health$StressLevel)

# step2: calculate degree of association
gender_stress_association <- chisq.test(tbl_2)
print(gender_stress_association)
## 
##  Pearson's Chi-squared test
## 
## data:  tbl_2
## X-squared = 3.1332, df = 2, p-value = 0.2088

There is no statistically significant association between the two variables (p = 0.2088). The chi-square value (3.13) suggests that any observed difference is likely due to chance, not a real relationship.

#11. IS URBANRURAL ASSOCIATED WITH ALCOHOLCONSUMPTIONLEVEL? 
# step1: create a contingency table
tbl3 <- table(community_health$UrbanRural,community_health$AlcoholConsumptionLevel)

# step2: calculate degree of association between variables
rural_urban_alcohol_association <- chisq.test(tbl3)
print(rural_urban_alcohol_association)
## 
##  Pearson's Chi-squared test
## 
## data:  tbl3
## X-squared = 1.3448, df = 2, p-value = 0.5105

There is no statistically significant association between the variables (p = 0.5105). The very low chi-square value (1.34) confirms that the observed distribution is very close to what would be expected by chance alone. In conclusion, no relationship exists, the variables are independent.

# 12. IS THERE A SIGNIFICANT DIFFERENCE IN BMI BETWEEN MALES AND FEMALES? 
t_result1 <- t.test(BMI ~ Gender, data = community_health, na.rm =TRUE)
print(t_result1)
## 
##  Welch Two Sample t-test
## 
## data:  BMI by Gender
## t = -3.0115, df = 993.17, p-value = 0.002665
## alternative hypothesis: true difference in means between group Female and group Male is not equal to 0
## 95 percent confidence interval:
##  -1.9760186 -0.4168035
## sample estimates:
## mean in group Female   mean in group Male 
##             25.10568             26.30210

There is a statistically significant difference in BMI between males and females (p = 0.0027). Males have a significantly higher mean BMI (26.30) compared to females (25.11), with a mean difference of approximately -1.2 units.

# 13. DO CURRENT SMOKERS HAVE HIGHER CHOLESTEROL LEVELS THAN NON-SMOKERS? 
anova_result <- aov(CholesterolLevel ~ SmokingStatus, data = community_health, na.rm = TRUE)
## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
##  extra argument 'na.rm' will be disregarded
summary(anova_result)
##                Df  Sum Sq Mean Sq F value Pr(>F)
## SmokingStatus   2     820   409.9   0.389  0.678
## Residuals     997 1050469  1053.6

Since p-value (0.678) > alpha (0.05), we fail to reject the null hypothesis. In Conclusion, there is insufficient evidence to conclude that smoking status affects cholesterol levels. The observed differences in group means are likely due to random chance.

# 14. DOES MONTHLYMEDICALCOST DIFFER SIGNIFICANTLY ACROSS REGIONS? 
aov_result2 <- aov(MonthlyMedicalCost ~ Region,data=community_health,na.rm=TRUE)
## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
##  extra argument 'na.rm' will be disregarded
summary(aov_result2)
##              Df    Sum Sq Mean Sq F value Pr(>F)
## Region        3    737899  245966   0.332  0.802
## Residuals   996 738033277  740997

Since p-value (0.802 > alpha value 0.05, we fail to reject the null hypothesis) There is no evidence that monthly medical costs differ across regions

#15. DO COGNITIVETESTSCORES VARY ACROSS BLOODTYPE GROUPS? 
anova_result3 <- aov(CognitiveTestScore ~ BloodType,data=community_health,na.rm=TRUE)
## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
##  extra argument 'na.rm' will be disregarded
summary(anova_result3)
##              Df Sum Sq Mean Sq F value Pr(>F)  
## BloodType     3    554  184.75   2.218 0.0844 .
## Residuals   996  82949   83.28                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Since p-value (0.0844) > alpha (0.05), we fail to reject the null hypothesis. However, the result is marginally significant (p < 0.10), suggesting a weak trend that does not meet conventional statistical significance.

#16. IS THERE A CORRELATION BETWEEN BMI AND BLOODPRESSURE? 
corr1 <- cor.test(community_health$BMI,community_health$SystolicBP,use = 'complete.obs')
print(corr1)
## 
##  Pearson's product-moment correlation
## 
## data:  community_health$BMI and community_health$SystolicBP
## t = 26.576, df = 998, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.6059447 0.6786642
## sample estimates:
##       cor 
## 0.6437555

Since p-value (< 0.001) < alpha (0.05), we REJECT the null hypothesis. There is a statistically significant, moderate-to-strong positive correlation between BMI and systolic blood pressure (r = 0.644, p < 0.001).

#17. HOW STRONGLY IS DAILYSCREENTIMEHOURS RELATED TO DEPRESSIONSCORE? 
corr2 <- cor.test(community_health$DailyScreenTimeHours,community_health$DepressionScore,use='complete.obs')
print(corr2)
## 
##  Pearson's product-moment correlation
## 
## data:  community_health$DailyScreenTimeHours and community_health$DepressionScore
## t = 12.888, df = 998, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.3233273 0.4296796
## sample estimates:
##       cor 
## 0.3777487

Since p-value (< 0.001) < alpha (0.05), we REJECT the null hypothesis. There is a statistically significant, weak to moderate positive correlation between BMI and systolic blood pressure (r = 0.3777, p < 0.001).

#18. IS EXERCISEHOURSPERWEEK NEGATIVELY CORRELATED WITH BMI? 
corr3 <- cor.test(community_health$ExerciseHoursPerWeek,community_health$BMI,use='complete.obs')
print(corr3)
## 
##  Pearson's product-moment correlation
## 
## data:  community_health$ExerciseHoursPerWeek and community_health$BMI
## t = 0.016088, df = 998, p-value = 0.9872
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.06148576  0.06250038
## sample estimates:
##          cor 
## 0.0005092686

Exercise hours per week is not significantly correlated with BMI(r = 0.0005, p = 0.9872), suggesting no linear relationship between these variables in this population.

#19. DOES STRESSLEVEL SIGNIFICANTLY PREDICT DEPRESSIONSCORE? 
linear_model1 <- lm(DepressionScore~StressLevel,data=community_health)
summary(linear_model1)
## 
## Call:
## lm(formula = DepressionScore ~ StressLevel, data = community_health)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -11.8066  -2.9961  -0.0066   2.7972  13.1934 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        15.2928     0.2940  52.024  < 2e-16 ***
## StressLevelLow     -2.6861     0.3859  -6.960 6.16e-12 ***
## StressLevelMedium  -2.2866     0.3488  -6.555 8.91e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.229 on 997 degrees of freedom
## Multiple R-squared:  0.05305,    Adjusted R-squared:  0.05115 
## F-statistic: 27.93 on 2 and 997 DF,  p-value: 1.582e-12

The intercept value of 15.2928 represents the estimated depression score for the reference stress category (StressLevelHigh) Compared to the reference category, participants with low stress had depression scores lower by 2.6861 points, while participants with medium stress had depression scores lower by 2.2866 points. Both relationships were statistically significant (p<0.001). In conclusion, stress level significantly affects depression score.

#20. CAN EXERCISEHOURSPERWEEK PREDICT CHOLESTEROLLEVEL? 
linear_model2 <- lm(CholesterolLevel~ExerciseHoursPerWeek,data=community_health)
summary(linear_model2)
## 
## Call:
## lm(formula = CholesterolLevel ~ ExerciseHoursPerWeek, data = community_health)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -70.967 -18.260  -2.689  11.469 228.766 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          217.9866     2.2997  94.791   <2e-16 ***
## ExerciseHoursPerWeek  -0.1371     0.5229  -0.262    0.793    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 32.45 on 998 degrees of freedom
## Multiple R-squared:  6.893e-05,  Adjusted R-squared:  -0.000933 
## F-statistic: 0.06879 on 1 and 998 DF,  p-value: 0.7932

The predicted average cholesterol level for someone with 0 hours of exercise per week is 217.9866. For each additional hour of exercise per week, cholesterol level decreases by only 0.14, but this effect is NOT statistically significant (p = 0.793 > alpha 0.05)

#21. CAN SLEEPHOURS AND SCREENTIME PREDICT COGNITIVETESTSCORE? 
linear_model3 <- lm(CognitiveTestScore~DailyScreenTimeHours+SleepHours,data=community_health)
summary(linear_model3)
## 
## Call:
## lm(formula = CognitiveTestScore ~ DailyScreenTimeHours + SleepHours, 
##     data = community_health)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -26.345  -5.493  -0.050   5.052  34.507 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          103.4853     1.5220  67.994   <2e-16 ***
## DailyScreenTimeHours  -1.4481     0.1304 -11.109   <2e-16 ***
## SleepHours             1.6760     0.1803   9.295   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.322 on 997 degrees of freedom
## Multiple R-squared:  0.1731, Adjusted R-squared:  0.1714 
## F-statistic: 104.3 on 2 and 997 DF,  p-value: < 2.2e-16

The predicted average cognitive test score is 103.5 when Daily Screen Time is 0 hours and Sleep Hours is 0 — though this is an extrapolation since sleep hours cannot be zero in reality. Each additional hour of screen time decreases cognitive score by 1.45 points holding sleep constant Each additional hour of sleep increases cognitive score by 1.68 points holding screen time constant Since the p-value (< 2.2e-16) is less than alpha (0.05), the overall regression model is statistically significant. In conclusion, both daily screen time and sleep hours are significant predictors of cognitive test scores.

#22. DOES FAMILYHISTORYDIABETES SIGNIFICANTLY INCREASE DISEASE RISK? 
logistical_model1 <- glm(DiagnosisRisk ~ FamilyHistoryDiabetes, data=community_health,family='binomial')
summary(logistical_model1)
## 
## Call:
## glm(formula = DiagnosisRisk ~ FamilyHistoryDiabetes, family = "binomial", 
##     data = community_health)
## 
## Coefficients:
##                       Estimate Std. Error z value Pr(>|z|)    
## (Intercept)            -5.4525     0.5786  -9.424  < 2e-16 ***
## FamilyHistoryDiabetes   4.0364     0.5968   6.763 1.35e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 459.42  on 999  degrees of freedom
## Residual deviance: 332.04  on 998  degrees of freedom
## AIC: 336.04
## 
## Number of Fisher Scoring iterations: 8

Intercept = -5.4525 indicates that without family history, the chance of high diagnostic risk is very low. Since p-value (1.35e-11) < 0.05, we REJECT the null hypothesis Having a family history of diabetes is associated with 56.6 times higher odds of being at high diagnosis risk (applied exponential to 4.04 to derrive 56.6)

#23. CAN A MODEL USING AGE, BMI, GLUCOSELEVEL, AND STRESSLEVEL ACCURATELY CLASSIFY DIAGNOSISRISK?
logistical_model2 <- glm(DiagnosisRisk~Age+BMI+GlucoseLevel, data=community_health,family='binomial')
summary(logistical_model2)
## 
## Call:
## glm(formula = DiagnosisRisk ~ Age + BMI + GlucoseLevel, family = "binomial", 
##     data = community_health)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -8.065541   1.229057  -6.562 5.30e-11 ***
## Age          -0.004017   0.008173  -0.492    0.623    
## BMI           0.147874   0.030209   4.895 9.83e-07 ***
## GlucoseLevel  0.011436   0.013554   0.844    0.399    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 459.42  on 999  degrees of freedom
## Residual deviance: 398.56  on 996  degrees of freedom
## AIC: 406.56
## 
## Number of Fisher Scoring iterations: 6

When Age = 0, BMI = 0, and GlucoseLevel = 0, the log-odds of the diagnostic outcome being true is -8.07, indicating a very low baseline probability of the outcome.

Age had a coefficient of -0.004017, meaning increasing age slightly decreased the odds of the outcome. Converting the coefficient using e−0.004017e^{-0.004017}e−0.004017 gives an odds ratio of approximately 0.996, indicating that each 1-year increase in age decreases the odds of the outcome by about 0.4%. However, the p-value for Age was 0.623, which is greater than 0.05, showing that age was not a statistically significant predictor of the outcome.

BMI had a coefficient of 0.147874, meaning increasing BMI increased the odds of the outcome. Converting the coefficient using e0.147874e^{0.147874}e0.147874 gives an odds ratio of approximately 1.16, indicating that each 1-unit increase in BMI increases the odds of the outcome by about 16%. The p-value for BMI was 9.83e-07, which is far below 0.05, showing a statistically significant relationship between BMI and the outcome.

GlucoseLevel had a coefficient of 0.011436, meaning increasing glucose level slightly increased the odds of the outcome. Converting the coefficient using e0.011436e^{0.011436}e0.011436 gives an odds ratio of approximately 1.012, indicating that each 1-unit increase in glucose level increases the odds of the outcome by about 1.2%. However, the p-value for GlucoseLevel was 0.399, which is greater than 0.05, indicating that glucose level was not a statistically significant predictor of the outcome in this model.

#24. HOW DOES MONTHLYMEDICALCOST TREND OVER FOLLOWUPMONTH? 

#step0: install lubridate
install.packages("lubridate")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.6'
## (as 'lib' is unspecified)
library(lubridate) 
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
#step1: convert followupmonth column to date format
community_health$FollowUpMonth <- as.Date(community_health$FollowUpMonth)

#step2: extract month from follow up month
community_health <- community_health %>%
  mutate(
    month = month(FollowUpMonth, label = TRUE)
  )

#step3: calculate total cost by month
cost_trend <- community_health %>%
  group_by(month)%>%
  summarise(TotalCost=sum(MonthlyMedicalCost,na.rm=TRUE),.groups='drop')
print(cost_trend)
## # A tibble: 12 × 2
##    month TotalCost
##    <ord>     <dbl>
##  1 Jan      63773.
##  2 Feb      87663.
##  3 Mar      91140.
##  4 Apr      81556.
##  5 May      97270.
##  6 Jun      62690.
##  7 Jul      67527.
##  8 Aug      79232.
##  9 Sep      59252.
## 10 Oct      80539.
## 11 Nov      76698.
## 12 Dec      94247.
#step4: visualize output
ggplot(data=cost_trend,mapping=aes(x=month, y=TotalCost,fill=month, group=1,color=month))+geom_line(size = 1)+ geom_point(size = 2)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

  labs(title='How Monthly Medical Costs Trend over Followup Months')
## <ggplot2::labels> List of 1
##  $ title: chr "How Monthly Medical Costs Trend over Followup Months"
#25. IS THERE SEASONALITY IN SICKDAYSLASTYEAR REPORTING? 
# Step1: Extract month from FollowUpMonth
library(lubridate)

community_health <- community_health %>%
  mutate(
    month = month(FollowUpMonth, label = TRUE)
  )

# Step2: Calculate average sick days by month
seasonality_sick <- community_health %>%
  group_by(month) %>%
  summarise(
    AvgSickDays = mean(SickDaysLastYear, na.rm = TRUE),
    TotalSickDays = sum(SickDaysLastYear, na.rm = TRUE),
    Count = n(),
    .groups = 'drop'
  )

# Step3: Print results
print(seasonality_sick)
## # A tibble: 12 × 4
##    month AvgSickDays TotalSickDays Count
##    <ord>       <dbl>         <int> <int>
##  1 Jan          7.61           571    75
##  2 Feb          7.69           692    90
##  3 Mar          7.48           628    84
##  4 Apr          7.57           636    84
##  5 May          7.54           679    90
##  6 Jun          7.76           590    76
##  7 Jul          7.55           574    76
##  8 Aug          7.59           660    87
##  9 Sep          7.42           542    73
## 10 Oct          7.55           649    86
## 11 Nov          7.64           680    89
## 12 Dec          8.1            729    90
# Step4: Visualize seasonality
ggplot(seasonality_sick, aes(x = month, y = AvgSickDays, group = 1)) +
  geom_line(color = "blue", size = 1) +
  geom_point(color = "red", size = 2) +
  labs(title = "Seasonality in Sick Days Reporting",
       x = "Month",
       y = "Average Sick Days") +
  theme_minimal()

#26. WHICH PARTICIPANTS HAVE ABNORMALLY LOW OR HIGH BMI VALUES? 
#step1: calculate Q1, Q3 and IQR
Q1 <- quantile(community_health$BMI,0.25,na.rm = TRUE)
Q3 <- quantile(community_health$BMI,0.75,na.rm = TRUE)
IQR <- Q3 - Q1

#step2: define bounds
lower_bound <- Q1 - 1.5 * IQR
upper_bound <- Q3 + 1.5 * IQR

#step3: visualize outliers using vline
ggplot(data=community_health, mapping=aes(x = BMI)) +
  geom_histogram(bins = 30, fill = "lightblue", color = "black") +
  geom_vline(xintercept = c(lower_bound, upper_bound), 
             color = "red", linetype = "dashed", size = 1) +
  labs(title = "Distribution of BMI Levels",
       subtitle = "Red dashed lines show outlier boundaries",
       x = "BMI",
       y = "Count") +
  theme_minimal()