#step1: identify general folder where data is stored
getwd()
## [1] "/cloud/project"
#step2: list file in specific folder
list.files("/cloud/project/data")
## [1] "community_health_genetic_risk_dataset.csv"
#step3: load dataset in the folder
community_health <- read.csv("/cloud/project/data/community_health_genetic_risk_dataset.csv")
head(community_health)
## ParticipantID Age Gender Region UrbanRural HeightCm WeightKg BMI EyeColor
## 1 P0001 56 Female Central Urban 172.7 92.5 31.0 Blue
## 2 P0002 37 Female Northern Rural 165.1 89.0 32.7 Brown
## 3 P0003 52 Male Central Rural 170.1 81.0 28.0 Hazel
## 4 P0004 28 Female Eastern Urban 154.0 82.9 35.0 Brown
## 5 P0005 36 Female Eastern Urban 187.7 42.9 12.2 Blue
## 6 P0006 65 Male Western Urban 162.8 67.1 25.3 Gray
## BloodType ExerciseHoursPerWeek DailyScreenTimeHours SleepHours SmokingStatus
## 1 B 2.2 5.8 4.0 Non-Smoker
## 2 O 4.7 5.2 5.3 Non-Smoker
## 3 O 2.2 5.8 9.2 Former Smoker
## 4 A 3.5 7.7 5.0 Non-Smoker
## 5 AB 2.5 6.3 5.0 Current Smoker
## 6 AB 3.6 3.1 8.8 Non-Smoker
## AlcoholConsumptionLevel AverageHeartRate SystolicBP DiastolicBP
## 1 Moderate 90.4 150.1 84.8
## 2 Moderate 76.7 152.9 77.7
## 3 Moderate 82.8 140.2 71.6
## 4 Low 65.9 151.4 79.8
## 5 Moderate 78.8 120.1 78.4
## 6 Moderate 77.9 151.9 81.9
## CholesterolLevel GlucoseLevel StressLevel FamilyHistoryDiabetes
## 1 231.8 131.1 Low 0
## 2 321.2 108.6 Medium 0
## 3 238.0 102.7 Low 0
## 4 253.2 121.0 Medium 0
## 5 176.6 83.9 High 0
## 6 212.9 110.3 High 0
## GeneticRiskScore MonthlyMedicalCost SickDaysLastYear DepressionScore
## 1 65.0 924.06 10 10.9
## 2 69.4 9299.36 10 15.3
## 3 60.9 916.99 11 8.8
## 4 59.8 936.11 6 15.4
## 5 57.7 587.04 9 17.8
## 6 53.1 709.10 11 14.2
## CognitiveTestScore DiagnosisRisk FollowUpMonth TreatmentResponseScore
## 1 95.7 0 2024-03-29 61.0
## 2 127.0 0 2024-05-15 49.5
## 3 108.9 0 2024-03-02 63.8
## 4 88.5 0 2024-12-08 75.2
## 5 92.3 0 2024-05-30 60.6
## 6 103.1 0 2024-08-25 58.4
# WHAT IS THE AVERAGE BMI ACROSS DIFFERENT AGE GROUPS?
# step1: create age categories
age_groups <- cut(community_health$Age,
breaks=c(18,35,45,55,65,75),
labels=c("18-35","36-45","46-55","56-65","66-75"),
include.lowest = TRUE)
# step2: install dplyr package
install.packages("dplyr")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.6'
## (as 'lib' is unspecified)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# step3: create a new column age groups
community_health <- community_health %>%
mutate(
Age_Groups = age_groups
)
# step4: aggregate and group by age groups
bmi_status <- community_health %>%
group_by (Age_Groups)%>%
summarise(AverageBMI = mean(BMI,na.rm=TRUE),.groups='drop')
# step5: print results
print(bmi_status)
## # A tibble: 5 × 2
## Age_Groups AverageBMI
## <fct> <dbl>
## 1 18-35 25.3
## 2 36-45 25.6
## 3 46-55 25.3
## 4 56-65 26.6
## 5 66-75 26.1
# step6: visualize output
install.packages("ggplot2")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.6'
## (as 'lib' is unspecified)
library(ggplot2)
ggplot(data=bmi_status,mapping=aes(x=Age_Groups,y=AverageBMI,size=AverageBMI))+
geom_point(color = 'red')+ labs(title='Average BMI by Age Group')
#3. WHAT ARE THE MINIMUM, MAXIMUM, AND QUARTILE VALUES FOR SYSTOLICBP?
summary(community_health$SystolicBP)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 108.8 136.7 145.0 145.1 153.6 187.3
#4. WHAT IS THE STANDARD DEVIATION OF GLUCOSE LEVELS?
sd(community_health$GlucoseLevel)
## [1] 13.74917
#5. WHICH AGE GROUP HAS THE HIGHEST AVERAGE CHOLESTEROL LEVEL?
# step1: aggregate and group by age groups
age_cholestrol <- community_health %>%
group_by(Age_Groups)%>%
summarise(Avg_Cholestrol = mean(CholesterolLevel,na.rm=TRUE),.groups='drop')
print(age_cholestrol)
## # A tibble: 5 × 2
## Age_Groups Avg_Cholestrol
## <fct> <dbl>
## 1 18-35 204.
## 2 36-45 215.
## 3 46-55 224.
## 4 56-65 227.
## 5 66-75 230.
# step2: visualize output
ggplot(data=age_cholestrol,mapping=aes(x=Age_Groups,y=Avg_Cholestrol, fill=Age_Groups))+
geom_col(color='black')+labs(title='Choletrol Level by Age Group')
#6. WHAT IS THE DISTRIBUTION OF EYECOLOR AMONG PARTICIPANTS?
# step1: count of eye color grouped by eye color types
eye_color_distribution <- community_health %>%
group_by(EyeColor)%>%
summarise(EyeColorCount=n(),.groups='drop')
print(eye_color_distribution)
## # A tibble: 5 × 2
## EyeColor EyeColorCount
## <chr> <int>
## 1 Blue 196
## 2 Brown 160
## 3 Gray 239
## 4 Green 197
## 5 Hazel 208
# step2: visualize output
ggplot(data=eye_color_distribution, mapping=aes(x=EyeColor,y=EyeColorCount))+
geom_col(color='black',fill='grey') + labs(title='Distribution of EyeColor')
#7. WHAT IS THE RATE/PROPORTION OF SMOKING STATUS PARTICIPANTS?
# step1: calculate the rate and group by smoker status
participant_proportion <- community_health %>%
group_by(SmokingStatus)%>%
summarise(smoke_status_count = n())%>%
mutate (
rate = smoke_status_count / sum(smoke_status_count)*100
)
print(participant_proportion)
## # A tibble: 3 × 3
## SmokingStatus smoke_status_count rate
## <chr> <int> <dbl>
## 1 Current Smoker 206 20.6
## 2 Former Smoker 184 18.4
## 3 Non-Smoker 610 61
# step2: visualize output
ggplot(data=participant_proportion,mapping=aes(x='',y=smoke_status_count,fill=SmokingStatus))+
geom_bar(stat = "identity", width = 1) +
coord_polar(theta = "y") +
labs(
title = "Distribution of Smoking Status",
fill = "Smoking Status"
) +
theme_void()
#8. WHAT IS THE RATE OF PARTICIPANTS WHO HAVE AND DONT HAVE A FAMILYHISTORYDIABETES?
# step1: count and group by family history diabetes
diabetes_status <- community_health %>%
group_by(FamilyHistoryDiabetes)%>%
summarise(diabetes_count=n())%>%
mutate(
diabetes_rate = diabetes_count / sum(diabetes_count)*100
)
print(diabetes_status)
## # A tibble: 2 × 3
## FamilyHistoryDiabetes diabetes_count diabetes_rate
## <int> <int> <dbl>
## 1 0 703 70.3
## 2 1 297 29.7
# Step2:visualize output
ggplot(data=diabetes_status,mapping=aes(x='',y=diabetes_count,fill=FamilyHistoryDiabetes))+
geom_bar(stat = "identity", width = 1) +
coord_polar(theta = "y") +
geom_text(aes(label = paste0(round(diabetes_rate, 1), "%")),
position = position_stack(vjust = 0.5)) +
labs(
title = "Distribution of Participants by Family History of Diabetes",
fill = "Family History"
) +
theme_void()
#9. IS SMOKINGSTATUS ASSOCIATED WITH DIAGNOSISRISK?
# step1: build contingency table
tbl_1 <- table(community_health$SmokingStatus, community_health$DiagnosisRisk)
# step2: calculate degree of association
smoking_diagnosis_association <-chisq.test(tbl_1)
print(smoking_diagnosis_association)
##
## Pearson's Chi-squared test
##
## data: tbl_1
## X-squared = 98.948, df = 2, p-value < 2.2e-16
The above result indicates is a statistically significant association between the two categorical variables (p < 0.001). The large chi-square value (98.95) indicates that the observed distribution is extremely unlikely to have occurred by chance alone.
#10. IS GENDER ASSOCIATED WITH STRESSLEVEL?
# step1: create a contingency table
tbl_2 <- table(community_health$Gender,community_health$StressLevel)
# step2: calculate degree of association
gender_stress_association <- chisq.test(tbl_2)
print(gender_stress_association)
##
## Pearson's Chi-squared test
##
## data: tbl_2
## X-squared = 3.1332, df = 2, p-value = 0.2088
There is no statistically significant association between the two variables (p = 0.2088). The chi-square value (3.13) suggests that any observed difference is likely due to chance, not a real relationship.
#11. IS URBANRURAL ASSOCIATED WITH ALCOHOLCONSUMPTIONLEVEL?
# step1: create a contingency table
tbl3 <- table(community_health$UrbanRural,community_health$AlcoholConsumptionLevel)
# step2: calculate degree of association between variables
rural_urban_alcohol_association <- chisq.test(tbl3)
print(rural_urban_alcohol_association)
##
## Pearson's Chi-squared test
##
## data: tbl3
## X-squared = 1.3448, df = 2, p-value = 0.5105
There is no statistically significant association between the variables (p = 0.5105). The very low chi-square value (1.34) confirms that the observed distribution is very close to what would be expected by chance alone. In conclusion, no relationship exists, the variables are independent.
# 12. IS THERE A SIGNIFICANT DIFFERENCE IN BMI BETWEEN MALES AND FEMALES?
t_result1 <- t.test(BMI ~ Gender, data = community_health, na.rm =TRUE)
print(t_result1)
##
## Welch Two Sample t-test
##
## data: BMI by Gender
## t = -3.0115, df = 993.17, p-value = 0.002665
## alternative hypothesis: true difference in means between group Female and group Male is not equal to 0
## 95 percent confidence interval:
## -1.9760186 -0.4168035
## sample estimates:
## mean in group Female mean in group Male
## 25.10568 26.30210
There is a statistically significant difference in BMI between males and females (p = 0.0027). Males have a significantly higher mean BMI (26.30) compared to females (25.11), with a mean difference of approximately -1.2 units.
# 13. DO CURRENT SMOKERS HAVE HIGHER CHOLESTEROL LEVELS THAN NON-SMOKERS?
anova_result <- aov(CholesterolLevel ~ SmokingStatus, data = community_health, na.rm = TRUE)
## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
## extra argument 'na.rm' will be disregarded
summary(anova_result)
## Df Sum Sq Mean Sq F value Pr(>F)
## SmokingStatus 2 820 409.9 0.389 0.678
## Residuals 997 1050469 1053.6
Since p-value (0.678) > alpha (0.05), we fail to reject the null hypothesis. In Conclusion, there is insufficient evidence to conclude that smoking status affects cholesterol levels. The observed differences in group means are likely due to random chance.
# 14. DOES MONTHLYMEDICALCOST DIFFER SIGNIFICANTLY ACROSS REGIONS?
aov_result2 <- aov(MonthlyMedicalCost ~ Region,data=community_health,na.rm=TRUE)
## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
## extra argument 'na.rm' will be disregarded
summary(aov_result2)
## Df Sum Sq Mean Sq F value Pr(>F)
## Region 3 737899 245966 0.332 0.802
## Residuals 996 738033277 740997
Since p-value (0.802 > alpha value 0.05, we fail to reject the null hypothesis) There is no evidence that monthly medical costs differ across regions
#15. DO COGNITIVETESTSCORES VARY ACROSS BLOODTYPE GROUPS?
anova_result3 <- aov(CognitiveTestScore ~ BloodType,data=community_health,na.rm=TRUE)
## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
## extra argument 'na.rm' will be disregarded
summary(anova_result3)
## Df Sum Sq Mean Sq F value Pr(>F)
## BloodType 3 554 184.75 2.218 0.0844 .
## Residuals 996 82949 83.28
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Since p-value (0.0844) > alpha (0.05), we fail to reject the null hypothesis. However, the result is marginally significant (p < 0.10), suggesting a weak trend that does not meet conventional statistical significance.
#16. IS THERE A CORRELATION BETWEEN BMI AND BLOODPRESSURE?
corr1 <- cor.test(community_health$BMI,community_health$SystolicBP,use = 'complete.obs')
print(corr1)
##
## Pearson's product-moment correlation
##
## data: community_health$BMI and community_health$SystolicBP
## t = 26.576, df = 998, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.6059447 0.6786642
## sample estimates:
## cor
## 0.6437555
Since p-value (< 0.001) < alpha (0.05), we REJECT the null hypothesis. There is a statistically significant, moderate-to-strong positive correlation between BMI and systolic blood pressure (r = 0.644, p < 0.001).
#17. HOW STRONGLY IS DAILYSCREENTIMEHOURS RELATED TO DEPRESSIONSCORE?
corr2 <- cor.test(community_health$DailyScreenTimeHours,community_health$DepressionScore,use='complete.obs')
print(corr2)
##
## Pearson's product-moment correlation
##
## data: community_health$DailyScreenTimeHours and community_health$DepressionScore
## t = 12.888, df = 998, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.3233273 0.4296796
## sample estimates:
## cor
## 0.3777487
Since p-value (< 0.001) < alpha (0.05), we REJECT the null hypothesis. There is a statistically significant, weak to moderate positive correlation between BMI and systolic blood pressure (r = 0.3777, p < 0.001).
#18. IS EXERCISEHOURSPERWEEK NEGATIVELY CORRELATED WITH BMI?
corr3 <- cor.test(community_health$ExerciseHoursPerWeek,community_health$BMI,use='complete.obs')
print(corr3)
##
## Pearson's product-moment correlation
##
## data: community_health$ExerciseHoursPerWeek and community_health$BMI
## t = 0.016088, df = 998, p-value = 0.9872
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.06148576 0.06250038
## sample estimates:
## cor
## 0.0005092686
Exercise hours per week is not significantly correlated with BMI(r = 0.0005, p = 0.9872), suggesting no linear relationship between these variables in this population.
#19. DOES STRESSLEVEL SIGNIFICANTLY PREDICT DEPRESSIONSCORE?
linear_model1 <- lm(DepressionScore~StressLevel,data=community_health)
summary(linear_model1)
##
## Call:
## lm(formula = DepressionScore ~ StressLevel, data = community_health)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11.8066 -2.9961 -0.0066 2.7972 13.1934
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 15.2928 0.2940 52.024 < 2e-16 ***
## StressLevelLow -2.6861 0.3859 -6.960 6.16e-12 ***
## StressLevelMedium -2.2866 0.3488 -6.555 8.91e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.229 on 997 degrees of freedom
## Multiple R-squared: 0.05305, Adjusted R-squared: 0.05115
## F-statistic: 27.93 on 2 and 997 DF, p-value: 1.582e-12
The intercept value of 15.2928 represents the estimated depression score for the reference stress category (StressLevelHigh) Compared to the reference category, participants with low stress had depression scores lower by 2.6861 points, while participants with medium stress had depression scores lower by 2.2866 points. Both relationships were statistically significant (p<0.001). In conclusion, stress level significantly affects depression score.
#20. CAN EXERCISEHOURSPERWEEK PREDICT CHOLESTEROLLEVEL?
linear_model2 <- lm(CholesterolLevel~ExerciseHoursPerWeek,data=community_health)
summary(linear_model2)
##
## Call:
## lm(formula = CholesterolLevel ~ ExerciseHoursPerWeek, data = community_health)
##
## Residuals:
## Min 1Q Median 3Q Max
## -70.967 -18.260 -2.689 11.469 228.766
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 217.9866 2.2997 94.791 <2e-16 ***
## ExerciseHoursPerWeek -0.1371 0.5229 -0.262 0.793
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 32.45 on 998 degrees of freedom
## Multiple R-squared: 6.893e-05, Adjusted R-squared: -0.000933
## F-statistic: 0.06879 on 1 and 998 DF, p-value: 0.7932
The predicted average cholesterol level for someone with 0 hours of exercise per week is 217.9866. For each additional hour of exercise per week, cholesterol level decreases by only 0.14, but this effect is NOT statistically significant (p = 0.793 > alpha 0.05)
#21. CAN SLEEPHOURS AND SCREENTIME PREDICT COGNITIVETESTSCORE?
linear_model3 <- lm(CognitiveTestScore~DailyScreenTimeHours+SleepHours,data=community_health)
summary(linear_model3)
##
## Call:
## lm(formula = CognitiveTestScore ~ DailyScreenTimeHours + SleepHours,
## data = community_health)
##
## Residuals:
## Min 1Q Median 3Q Max
## -26.345 -5.493 -0.050 5.052 34.507
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 103.4853 1.5220 67.994 <2e-16 ***
## DailyScreenTimeHours -1.4481 0.1304 -11.109 <2e-16 ***
## SleepHours 1.6760 0.1803 9.295 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.322 on 997 degrees of freedom
## Multiple R-squared: 0.1731, Adjusted R-squared: 0.1714
## F-statistic: 104.3 on 2 and 997 DF, p-value: < 2.2e-16
The predicted average cognitive test score is 103.5 when Daily Screen Time is 0 hours and Sleep Hours is 0 — though this is an extrapolation since sleep hours cannot be zero in reality. Each additional hour of screen time decreases cognitive score by 1.45 points holding sleep constant Each additional hour of sleep increases cognitive score by 1.68 points holding screen time constant Since the p-value (< 2.2e-16) is less than alpha (0.05), the overall regression model is statistically significant. In conclusion, both daily screen time and sleep hours are significant predictors of cognitive test scores.
#22. DOES FAMILYHISTORYDIABETES SIGNIFICANTLY INCREASE DISEASE RISK?
logistical_model1 <- glm(DiagnosisRisk ~ FamilyHistoryDiabetes, data=community_health,family='binomial')
summary(logistical_model1)
##
## Call:
## glm(formula = DiagnosisRisk ~ FamilyHistoryDiabetes, family = "binomial",
## data = community_health)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -5.4525 0.5786 -9.424 < 2e-16 ***
## FamilyHistoryDiabetes 4.0364 0.5968 6.763 1.35e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 459.42 on 999 degrees of freedom
## Residual deviance: 332.04 on 998 degrees of freedom
## AIC: 336.04
##
## Number of Fisher Scoring iterations: 8
Intercept = -5.4525 indicates that without family history, the chance of high diagnostic risk is very low. Since p-value (1.35e-11) < 0.05, we REJECT the null hypothesis Having a family history of diabetes is associated with 56.6 times higher odds of being at high diagnosis risk (applied exponential to 4.04 to derrive 56.6)
#23. CAN A MODEL USING AGE, BMI, GLUCOSELEVEL, AND STRESSLEVEL ACCURATELY CLASSIFY DIAGNOSISRISK?
logistical_model2 <- glm(DiagnosisRisk~Age+BMI+GlucoseLevel, data=community_health,family='binomial')
summary(logistical_model2)
##
## Call:
## glm(formula = DiagnosisRisk ~ Age + BMI + GlucoseLevel, family = "binomial",
## data = community_health)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -8.065541 1.229057 -6.562 5.30e-11 ***
## Age -0.004017 0.008173 -0.492 0.623
## BMI 0.147874 0.030209 4.895 9.83e-07 ***
## GlucoseLevel 0.011436 0.013554 0.844 0.399
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 459.42 on 999 degrees of freedom
## Residual deviance: 398.56 on 996 degrees of freedom
## AIC: 406.56
##
## Number of Fisher Scoring iterations: 6
When Age = 0, BMI = 0, and GlucoseLevel = 0, the log-odds of the diagnostic outcome being true is -8.07, indicating a very low baseline probability of the outcome.
Age had a coefficient of -0.004017, meaning increasing age slightly decreased the odds of the outcome. Converting the coefficient using e−0.004017e^{-0.004017}e−0.004017 gives an odds ratio of approximately 0.996, indicating that each 1-year increase in age decreases the odds of the outcome by about 0.4%. However, the p-value for Age was 0.623, which is greater than 0.05, showing that age was not a statistically significant predictor of the outcome.
BMI had a coefficient of 0.147874, meaning increasing BMI increased the odds of the outcome. Converting the coefficient using e0.147874e^{0.147874}e0.147874 gives an odds ratio of approximately 1.16, indicating that each 1-unit increase in BMI increases the odds of the outcome by about 16%. The p-value for BMI was 9.83e-07, which is far below 0.05, showing a statistically significant relationship between BMI and the outcome.
GlucoseLevel had a coefficient of 0.011436, meaning increasing glucose level slightly increased the odds of the outcome. Converting the coefficient using e0.011436e^{0.011436}e0.011436 gives an odds ratio of approximately 1.012, indicating that each 1-unit increase in glucose level increases the odds of the outcome by about 1.2%. However, the p-value for GlucoseLevel was 0.399, which is greater than 0.05, indicating that glucose level was not a statistically significant predictor of the outcome in this model.
#24. HOW DOES MONTHLYMEDICALCOST TREND OVER FOLLOWUPMONTH?
#step0: install lubridate
install.packages("lubridate")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.6'
## (as 'lib' is unspecified)
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
#step1: convert followupmonth column to date format
community_health$FollowUpMonth <- as.Date(community_health$FollowUpMonth)
#step2: extract month from follow up month
community_health <- community_health %>%
mutate(
month = month(FollowUpMonth, label = TRUE)
)
#step3: calculate total cost by month
cost_trend <- community_health %>%
group_by(month)%>%
summarise(TotalCost=sum(MonthlyMedicalCost,na.rm=TRUE),.groups='drop')
print(cost_trend)
## # A tibble: 12 × 2
## month TotalCost
## <ord> <dbl>
## 1 Jan 63773.
## 2 Feb 87663.
## 3 Mar 91140.
## 4 Apr 81556.
## 5 May 97270.
## 6 Jun 62690.
## 7 Jul 67527.
## 8 Aug 79232.
## 9 Sep 59252.
## 10 Oct 80539.
## 11 Nov 76698.
## 12 Dec 94247.
#step4: visualize output
ggplot(data=cost_trend,mapping=aes(x=month, y=TotalCost,fill=month, group=1,color=month))+geom_line(size = 1)+ geom_point(size = 2)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
labs(title='How Monthly Medical Costs Trend over Followup Months')
## <ggplot2::labels> List of 1
## $ title: chr "How Monthly Medical Costs Trend over Followup Months"
#25. IS THERE SEASONALITY IN SICKDAYSLASTYEAR REPORTING?
# Step1: Extract month from FollowUpMonth
library(lubridate)
community_health <- community_health %>%
mutate(
month = month(FollowUpMonth, label = TRUE)
)
# Step2: Calculate average sick days by month
seasonality_sick <- community_health %>%
group_by(month) %>%
summarise(
AvgSickDays = mean(SickDaysLastYear, na.rm = TRUE),
TotalSickDays = sum(SickDaysLastYear, na.rm = TRUE),
Count = n(),
.groups = 'drop'
)
# Step3: Print results
print(seasonality_sick)
## # A tibble: 12 × 4
## month AvgSickDays TotalSickDays Count
## <ord> <dbl> <int> <int>
## 1 Jan 7.61 571 75
## 2 Feb 7.69 692 90
## 3 Mar 7.48 628 84
## 4 Apr 7.57 636 84
## 5 May 7.54 679 90
## 6 Jun 7.76 590 76
## 7 Jul 7.55 574 76
## 8 Aug 7.59 660 87
## 9 Sep 7.42 542 73
## 10 Oct 7.55 649 86
## 11 Nov 7.64 680 89
## 12 Dec 8.1 729 90
# Step4: Visualize seasonality
ggplot(seasonality_sick, aes(x = month, y = AvgSickDays, group = 1)) +
geom_line(color = "blue", size = 1) +
geom_point(color = "red", size = 2) +
labs(title = "Seasonality in Sick Days Reporting",
x = "Month",
y = "Average Sick Days") +
theme_minimal()
#26. WHICH PARTICIPANTS HAVE ABNORMALLY LOW OR HIGH BMI VALUES?
#step1: calculate Q1, Q3 and IQR
Q1 <- quantile(community_health$BMI,0.25,na.rm = TRUE)
Q3 <- quantile(community_health$BMI,0.75,na.rm = TRUE)
IQR <- Q3 - Q1
#step2: define bounds
lower_bound <- Q1 - 1.5 * IQR
upper_bound <- Q3 + 1.5 * IQR
#step3: visualize outliers using vline
ggplot(data=community_health, mapping=aes(x = BMI)) +
geom_histogram(bins = 30, fill = "lightblue", color = "black") +
geom_vline(xintercept = c(lower_bound, upper_bound),
color = "red", linetype = "dashed", size = 1) +
labs(title = "Distribution of BMI Levels",
subtitle = "Red dashed lines show outlier boundaries",
x = "BMI",
y = "Count") +
theme_minimal()