# Define the project and data folders once, so every path below is built from
# a single definition instead of repeating the same absolute path
project_dir <- "/cloud/project"
data_dir <- file.path(project_dir, "diabetes")
# Establish working directory path
getwd()
## [1] "/cloud/project/reports"
# Establish folder path
list.files(project_dir)
## [1] "analysis" "diabetes" "project.Rproj" "reports"
# Establish files in the folder
list.files(data_dir)
## [1] "glucose_readings.csv" "medications.csv" "patients.csv"
# Load the datasets in the folder
# glucose readings
glucose <- read.csv(file.path(data_dir, "glucose_readings.csv"))
head(glucose)
## ReadingID PatientID ReadingDate ReadingTime
## 1 198be0e4-8f30-4fe9-a223-8609cb69d9a9 78 2024-10-20 02:29:01
## 2 1a0006e4-c812-438b-93f1-47dca45c85d0 2 2025-02-27 22:05:06
## 3 516ba944-e438-49f5-9024-e23da26362ec 17 2024-09-06 21:19:53
## 4 05caa7e4-4ef4-49c2-b5e2-2b49a51eb5ab 77 2025-03-25 14:03:34
## 5 4fc1ffa8-c083-4b67-a2ee-ea0d33209c54 34 2024-07-23 19:27:27
## 6 2a078eab-4b87-4fb7-9478-3bc0e24b07ec 29 2024-05-03 08:05:08
## GlucoseLevel BeforeMeal MedicationTaken
## 1 113.0 False False
## 2 217.7 False False
## 3 101.2 False False
## 4 100.9 False False
## 5 199.4 True True
## 6 185.6 False False
# medications
medications <- read.csv(file.path(data_dir, "medications.csv"))
head(medications)
## MedicationID PatientID MedicationName Dosage
## 1 53c77720-5045-4fd9-a490-4dda9f715356 93 Glipizide 972mg
## 2 aab4d2cb-bce3-4f86-ad32-9b7ddced19c6 8 Metformin 112mg
## 3 9f2bbdee-73f7-453f-8250-00d1b86345b8 27 Sitagliptin 908mg
## 4 ff8c3ba7-6930-40a8-a149-b566d096e93c 2 Sitagliptin 456mg
## 5 16565c4c-3b46-4a7b-9f3c-1f6d321f4c14 11 Insulin 565mg
## 6 57a0711e-ec9b-4698-a7be-114fcc9a84af 76 Insulin 555mg
## Frequency StartDate EndDate
## 1 Once daily 2022-12-07
## 2 Three times daily 2022-05-08
## 3 Once daily 2020-06-22 2020-11-02
## 4 Twice daily 2020-09-24 2023-11-14
## 5 Three times daily 2020-12-28 2023-10-12
## 6 Three times daily 2021-04-12
# patients
patients <- read.csv(file.path(data_dir, "patients.csv"))
head(patients)
## PatientID Name Age Gender BMI DiabetesType DiagnosisDate
## 1 1 Ryan Clarke 79 Male 31.0 Gestational 2016-11-14
## 2 2 Adam Harris 75 Male 28.4 Type 1 2024-10-03
## 3 3 Danielle Harmon 28 Female 18.6 Gestational 2020-06-29
## 4 4 Nicole Smith 78 Male 37.4 Type 2 2025-03-02
## 5 5 Christopher Fernandez 25 Female 31.3 Type 1 2024-08-02
## 6 6 Marc Whitney 19 Female 27.9 Type 2 2023-09-30
## HbA1c Location
## 1 5.3 West Christopherfort
## 2 7.5 South Billy
## 3 5.6 Keithburgh
## 4 7.7 West Alexandraland
## 5 6.6 Lake Shelby
## 6 7.4 Milesstad
# How does Type 1 diabetes vary by Age
# step0: install the required packages only when they are missing, so that
# re-running the script does not reinstall them every time
required_packages <- c("tidyverse", "dplyr")
is_installed <- vapply(
  required_packages,
  requireNamespace,
  logical(1),
  quietly = TRUE
)
if (!all(is_installed)) {
  install.packages(required_packages[!is_installed])
}
# step1: create age categories
library (tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
# Bin each patient's age into four labelled bands; include.lowest = TRUE keeps
# the lowest break (18) inside the first band
age_category <- cut(
  patients$Age,
  breaks = c(18, 35, 55, 65, 85),
  labels = c("18-35", "36-55", "56-65", "66-85"),
  include.lowest = TRUE
)
# step2: attach the bands to the patients table as Age_Category
patients <- patients |>
  mutate(Age_Category = age_category)
# step3: keep the Type 1 patients and count them per age band
type1_age <- patients |>
  filter(DiabetesType == "Type 1") |>
  group_by(Age_Category, DiabetesType) |>
  summarise(count = n(), .groups = "drop")
# step4: print results
print(type1_age)
## # A tibble: 4 × 3
## Age_Category DiabetesType count
## <fct> <chr> <int>
## 1 18-35 Type 1 12
## 2 36-55 Type 1 5
## 3 56-65 Type 1 2
## 4 66-85 Type 1 10
# visualize results
library (ggplot2)
# Bar chart of the Type 1 patient count in each age band
ggplot(type1_age, aes(x = Age_Category, y = count)) +
  geom_col(colour = "black", fill = "lightblue") +
  labs(title = "How Type 1 Diabetes Varies by Age Category")

# What is the gender distribution of diabetes patients by diabetes type?
# step1: count patients for every gender / diabetes type combination
library(dplyr)
gender_distribution <- patients |>
  group_by(Gender, DiabetesType) |>
  summarise(count = n(), .groups = "drop")
# step2: print results
print(gender_distribution)
## # A tibble: 6 × 3
## Gender DiabetesType count
## <chr> <chr> <int>
## 1 Female Gestational 15
## 2 Female Type 1 14
## 3 Female Type 2 10
## 4 Male Gestational 25
## 5 Male Type 1 15
## 6 Male Type 2 21
# step3: visualize results as side-by-side bars, one bar per gender within
# each diabetes type (title typo "Diabetest" corrected to "Diabetes")
library(ggplot2)
ggplot(
  data = gender_distribution,
  mapping = aes(x = DiabetesType, y = count, fill = Gender)
) +
  geom_col(color = "black", position = "dodge") +
  labs(title = "Gender distribution of Patients by Diabetes Type")

# What is the distribution of patients by months of the year
# Parse DiagnosisDate as a Date, then derive the abbreviated month label from it
library(lubridate)
patients <- patients |>
  mutate(DiagnosisDate = as.Date(DiagnosisDate, format = "%Y-%m-%d")) |>
  mutate(diagnosis_month = month(DiagnosisDate, label = TRUE, abbr = TRUE))
# Step1: count patients in each diagnosis month
month_distribution <- patients |>
  group_by(diagnosis_month) |>
  summarise(count = n(), .groups = "drop")
# Step2: print results
print(month_distribution)
## # A tibble: 12 × 2
## diagnosis_month count
## <ord> <int>
## 1 Jan 8
## 2 Feb 5
## 3 Mar 11
## 4 Apr 11
## 5 May 8
## 6 Jun 11
## 7 Jul 12
## 8 Aug 8
## 9 Sep 5
## 10 Oct 7
## 11 Nov 9
## 12 Dec 5
# Step3: bar chart of the patient count per diagnosis month
ggplot(month_distribution, aes(x = diagnosis_month, y = count)) +
  geom_col(colour = "black", fill = "orange") +
  labs(title = "Distribution of Patients by Month")

# How does average blood glucose level vary by year?
# Step1: extract the calendar year of each patient's diagnosis date
patients <- patients |>
  mutate(diagnosis_year = year(DiagnosisDate))
# Step2: mean HbA1c per diagnosis year, stored as average_glucose
# (the value averaged is HbA1c from the patients table; the glucose readings
# table is not used in this step)
yearly_glucose <- patients |>
  group_by(diagnosis_year) |>
  summarise(average_glucose = mean(HbA1c, na.rm = TRUE), .groups = "drop")
# Step3: print results
print(yearly_glucose)
## # A tibble: 11 × 2
## diagnosis_year average_glucose
## <dbl> <dbl>
## 1 2015 6.61
## 2 2016 7.4
## 3 2017 7.63
## 4 2018 8.23
## 5 2019 7.28
## 6 2020 6.72
## 7 2021 7.82
## 8 2022 6.9
## 9 2023 6.72
## 10 2024 7.54
## 11 2025 7.46
# Step4: visualize results
# The plotted value is the mean HbA1c grouped by diagnosis year, so the title
# and axis labels say so. The year is converted to a factor so that every year
# gets its own tick rather than breaks chosen by a continuous scale.
ggplot(
  data = yearly_glucose,
  mapping = aes(x = factor(diagnosis_year), y = average_glucose)
) +
  geom_col(color = "black", fill = "lightgreen") +
  labs(
    title = "Average HbA1c by Diagnosis Year",
    x = "Diagnosis year",
    y = "Average HbA1c"
  )

# relationship between BMI and diabetes type
# Step1: mean BMI (ignoring missing values) for each diabetes type
relationship <- patients |>
  group_by(DiabetesType) |>
  summarise(Average_BMI = mean(BMI, na.rm = TRUE), .groups = "drop")
# Step2: print results
print(relationship)
## # A tibble: 3 × 2
## DiabetesType Average_BMI
## <chr> <dbl>
## 1 Gestational 29.2
## 2 Type 1 30.4
## 3 Type 2 28.3
# Step3: bar chart of the average BMI for each diabetes type
ggplot(relationship, aes(x = DiabetesType, y = Average_BMI)) +
  geom_col(colour = "black", fill = "brown") +
  labs(title = "Relationship between BMI and Diabetes Type")

# How does HbA1c differ across different locations
# Step1: calculate the average HbA1c per location and keep the 10 highest
location_variance <- patients %>%
  group_by(Location) %>%
  # na.rm = TRUE matches the other aggregations in this script, so a single
  # missing HbA1c value does not turn a location's average into NA
  summarise(Average_hbA1c = mean(HbA1c, na.rm = TRUE), .groups = "drop") %>%
  arrange(desc(Average_hbA1c)) %>%
  # slice_head() states the "first 10 rows" intent directly
  slice_head(n = 10)
# Step2: print results
print(location_variance)
## # A tibble: 10 × 2
## Location Average_hbA1c
## <chr> <dbl>
## 1 East Charles 10
## 2 Thomasborough 10
## 3 New Clarenceborough 9.9
## 4 South Tiffany 9.9
## 5 Brittanytown 9.8
## 6 Suzannechester 9.8
## 7 Hallview 9.7
## 8 Perryshire 9.7
## 9 Guyberg 9.5
## 10 Nicholsonburgh 9.5
# Predict blood glucose levels based on patient age, BMI and diabetes type
# Step1: convert diabetes type to a factor so lm() treats it as categorical
patients$DiabetesType <- as.factor(patients$DiabetesType)
# Step2: remove rows with missing values if they exist
patients <- na.omit(patients)
# Step3: develop a linear model
# lm() has no na.rm argument (it handles missing rows through na.action, and
# they were already dropped in Step2). Passing na.rm = TRUE only produced an
# "extra argument 'na.rm' will be disregarded" warning, so it is removed; the
# fitted model is unchanged.
linear_model <- lm(HbA1c ~ Age + BMI + DiabetesType, data = patients)
summary(linear_model)
##
## Call:
## lm(formula = HbA1c ~ Age + BMI + DiabetesType, data = patients)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.2524 -0.9363 -0.1226 0.9872 2.7879
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.722574 0.811270 9.519 1.76e-15 ***
## Age 0.002917 0.006665 0.438 0.663
## BMI -0.022284 0.024817 -0.898 0.372
## DiabetesTypeType 1 -0.238708 0.340521 -0.701 0.485
## DiabetesTypeType 2 0.479514 0.331849 1.445 0.152
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.383 on 95 degrees of freedom
## Multiple R-squared: 0.05811, Adjusted R-squared: 0.01845
## F-statistic: 1.465 on 4 and 95 DF, p-value: 0.2189
## Result interpretation
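# Age Effect: the Age coefficient is positive but tiny (about 0.003 per year) and not statistically significant (p = 0.663).
# BMI Effect: the BMI coefficient is slightly negative (-0.022) and not statistically significant (p = 0.372), so this model gives no evidence that higher BMI raises HbA1c.
# Diabetes Type Effect: relative to the Gestational baseline, Type 1 is lower (-0.24) and Type 2 is higher (+0.48), but neither difference is significant (p = 0.485 and p = 0.152).
# Overall: the model explains under 6% of the variance (R-squared = 0.058) and the overall F-test is not significant (p = 0.2189).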
# Does age and BMI influence the likelihood of requiring insulin therapy
# Step1: attach medication records to each patient, matching on PatientID
patients_medications <- patients |>
  left_join(medications, by = "PatientID")
# Step2: fit a multinomial logistic regression of medication on Age and BMI
library(nnet)
logistical_model <- multinom(
  MedicationName ~ Age + BMI,
  data = patients_medications
)
## # weights: 24 (15 variable)
## initial value 358.351894
## iter 10 value 354.383075
## final value 351.071738
## converged
summary(logistical_model)
## Call:
## multinom(formula = MedicationName ~ Age + BMI, data = patients_medications)
##
## Coefficients:
## (Intercept) Age BMI
## Glyburide -2.17520310 0.016619710 0.04529407
## Insulin -0.78895110 0.003840269 0.01793532
## Metformin -0.78348096 0.004535179 0.03129824
## Pioglitazone 0.05281292 -0.006520543 0.01782065
## Sitagliptin 0.38673031 -0.013366548 0.02334799
##
## Std. Errors:
## (Intercept) Age BMI
## Glyburide 1.454155 0.01369074 0.04419263
## Insulin 1.477902 0.01387006 0.04565861
## Metformin 1.342333 0.01254349 0.04137871
## Pioglitazone 1.376724 0.01288151 0.04276861
## Sitagliptin 1.336780 0.01252996 0.04156062
##
## Residual Deviance: 702.1435
## AIC: 732.1435
## interpretation of the above
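# Coefficients are log-odds relative to the reference medication (the one level not listed above, apparently Glipizide).
# Glyburide: positive Age and BMI coefficients, i.e. relatively more likely than the reference for older patients and those with a higher BMI.
# Insulin & Metformin: small positive Age and BMI coefficients, suggesting minimal impact.
# Pioglitazone & Sitagliptin: negative Age coefficients, i.e. relatively more likely than the reference for younger patients.
# Caveat: every Age and BMI coefficient is smaller than twice its standard error, so none of these effects is statistically significant.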
# Is there a significant difference in glucose levels before meals
# Step1: two-sample t-test of GlucoseLevel split by the BeforeMeal flag
t_result <- t.test(GlucoseLevel ~ BeforeMeal, data = glucose)
print(t_result)
##
## Welch Two Sample t-test
##
## data: GlucoseLevel by BeforeMeal
## t = 0.48785, df = 496.09, p-value = 0.6259
## alternative hypothesis: true difference in means between group False and group True is not equal to 0
## 95 percent confidence interval:
## -7.003773 11.630714
## sample estimates:
## mean in group False mean in group True
## 161.8916 159.5782
## interpretation of the result above
# p-value = 0.6259 (> 0.05), so there is no significant difference in mean glucose level between readings taken before a meal (True) and those not taken before a meal (False)
# Do men and women differ in their HbA1c levels?
# Step1: one-way ANOVA of HbA1c with Gender as the grouping factor
anova_result <- aov(HbA1c ~ Gender, data = patients)
# Step2: print the ANOVA table
summary(anova_result)
## Df Sum Sq Mean Sq F value Pr(>F)
## Gender 1 0.01 0.0132 0.007 0.935
## Residuals 98 193.00 1.9694
##Interpretation of the results above
# since the p value = 0.935
# it suggests there's no significant difference in HbA1c levels between genders
# Is there an association between diabetes type and medication type
# Step1: Pearson chi-squared test on the diabetes type x medication name table
chi_result <- chisq.test(
  x = patients_medications$DiabetesType,
  y = patients_medications$MedicationName
)
print(chi_result)
##
## Pearson's Chi-squared test
##
## data: patients_medications$DiabetesType and patients_medications$MedicationName
## X-squared = 3.3744, df = 10, p-value = 0.9712
#interpretation of the above
# The p-value (0.9712) is much larger than the common significance level of 0.05
# As a result, we fail to reject the null hypothesis.
# There is no significant association between DiabetesType and MedicationName. The two variables appear to be independent of each other.
# What percentage of patients fall into different age groups
# Step1: count patients per age band, then express each count as a
# percentage of the overall total
age_group_rate <- patients |>
  group_by(Age_Category) |>
  summarise(patient_count = n()) |>
  mutate(patient_total = sum(patient_count)) |>
  mutate(patient_rate = (patient_count / patient_total) * 100)
# Step2: print results
print(age_group_rate)
## # A tibble: 4 × 4
## Age_Category patient_count patient_total patient_rate
## <fct> <int> <int> <dbl>
## 1 18-35 31 100 31
## 2 36-55 25 100 25
## 3 56-65 9 100 9
## 4 66-85 35 100 35
# Step3: bar chart of the percentage of patients in each age band
ggplot(
  age_group_rate,
  aes(x = Age_Category, y = patient_rate, fill = Age_Category)
) +
  geom_col(colour = "black") +
  labs(title = "Percentage of patients by Age Groups")

# What is the variability in HbA1c levels among diabetes patients
# Step1: calculate the HbA1c quartiles and interquartile range
# na.rm = TRUE is passed to IQR() as well as to quantile(): without it IQR()
# stops with an error on a missing value while the quantile() calls succeed
iqr_HbA1c <- IQR(patients$HbA1c, na.rm = TRUE)
q1_HbA1c <- quantile(patients$HbA1c, 0.25, na.rm = TRUE)
q3_HbA1c <- quantile(patients$HbA1c, 0.75, na.rm = TRUE)
# Step2: calculate the lower and upper limits (1.5 * IQR beyond the quartiles)
lower_limits <- q1_HbA1c - 1.5 * iqr_HbA1c
upper_limits <- q3_HbA1c + 1.5 * iqr_HbA1c
# Step3: flag HbA1c values that fall outside the limits
HbA1c_outliers <- patients$HbA1c < lower_limits | patients$HbA1c > upper_limits
# Step4: print results
# which() skips NA flags, so a missing HbA1c is not reported as an NA outlier
outlier_values <- patients$HbA1c[which(HbA1c_outliers)]
print(outlier_values)
## numeric(0)