# Report the directory this script is currently running from
getwd()
## [1] "/cloud/project/reports"
# List the top-level contents of the project folder
list.files(path = "/cloud/project")
## [1] "analysis"      "diabetes"      "project.Rproj" "reports"
# List the data files available in the diabetes folder
list.files(path = "/cloud/project/diabetes")
## [1] "glucose_readings.csv" "medications.csv"      "patients.csv"
# Read each CSV in the diabetes folder into its own data frame
# glucose readings: load the file, then preview the first six rows
glucose <- read.csv(file = "/cloud/project/diabetes/glucose_readings.csv")
head(glucose, n = 6L)
##                              ReadingID PatientID ReadingDate ReadingTime
## 1 198be0e4-8f30-4fe9-a223-8609cb69d9a9        78  2024-10-20    02:29:01
## 2 1a0006e4-c812-438b-93f1-47dca45c85d0         2  2025-02-27    22:05:06
## 3 516ba944-e438-49f5-9024-e23da26362ec        17  2024-09-06    21:19:53
## 4 05caa7e4-4ef4-49c2-b5e2-2b49a51eb5ab        77  2025-03-25    14:03:34
## 5 4fc1ffa8-c083-4b67-a2ee-ea0d33209c54        34  2024-07-23    19:27:27
## 6 2a078eab-4b87-4fb7-9478-3bc0e24b07ec        29  2024-05-03    08:05:08
##   GlucoseLevel BeforeMeal MedicationTaken
## 1        113.0      False           False
## 2        217.7      False           False
## 3        101.2      False           False
## 4        100.9      False           False
## 5        199.4       True            True
## 6        185.6      False           False
# medications: load the medication records, then preview the first six rows
medications <- read.csv(file = "/cloud/project/diabetes/medications.csv")
head(medications, n = 6L)
##                           MedicationID PatientID MedicationName Dosage
## 1 53c77720-5045-4fd9-a490-4dda9f715356        93      Glipizide  972mg
## 2 aab4d2cb-bce3-4f86-ad32-9b7ddced19c6         8      Metformin  112mg
## 3 9f2bbdee-73f7-453f-8250-00d1b86345b8        27    Sitagliptin  908mg
## 4 ff8c3ba7-6930-40a8-a149-b566d096e93c         2    Sitagliptin  456mg
## 5 16565c4c-3b46-4a7b-9f3c-1f6d321f4c14        11        Insulin  565mg
## 6 57a0711e-ec9b-4698-a7be-114fcc9a84af        76        Insulin  555mg
##           Frequency  StartDate    EndDate
## 1        Once daily 2022-12-07           
## 2 Three times daily 2022-05-08           
## 3        Once daily 2020-06-22 2020-11-02
## 4       Twice daily 2020-09-24 2023-11-14
## 5 Three times daily 2020-12-28 2023-10-12
## 6 Three times daily 2021-04-12
# patients: load the patient records, then preview the first six rows
patients <- read.csv(file = "/cloud/project/diabetes/patients.csv")
head(patients, n = 6L)
##   PatientID                  Name Age Gender  BMI DiabetesType DiagnosisDate
## 1         1           Ryan Clarke  79   Male 31.0  Gestational    2016-11-14
## 2         2           Adam Harris  75   Male 28.4       Type 1    2024-10-03
## 3         3       Danielle Harmon  28 Female 18.6  Gestational    2020-06-29
## 4         4          Nicole Smith  78   Male 37.4       Type 2    2025-03-02
## 5         5 Christopher Fernandez  25 Female 31.3       Type 1    2024-08-02
## 6         6          Marc Whitney  19 Female 27.9       Type 2    2023-09-30
##   HbA1c             Location
## 1   5.3 West Christopherfort
## 2   7.5          South Billy
## 3   5.6           Keithburgh
## 4   7.7   West Alexandraland
## 5   6.6          Lake Shelby
## 6   7.4            Milesstad
# How does Type 1 diabetes vary by Age

# step0: install r packages
# Install tidyverse only when it is missing, so re-running the script does
# not re-download the package or need network access every time.
if (!requireNamespace("tidyverse", quietly = TRUE)) {
  install.packages("tidyverse")
}
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.4'
## (as 'lib' is unspecified)
# dplyr is attached as part of the core tidyverse; install it on its own
# only if it is still missing.
if (!requireNamespace("dplyr", quietly = TRUE)) {
  install.packages("dplyr")
}
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.4'
## (as 'lib' is unspecified)
# step1: create age categories
library (tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
# Bin ages into four bands. Intervals are closed on the right and
# include.lowest keeps age 18 in the first band; ages outside 18-85 match
# no band and become NA.
age_category <- cut(
  x = patients[["Age"]],
  breaks = c(18, 35, 55, 65, 85),
  labels = c("18-35", "36-55", "56-65", "66-85"),
  include.lowest = TRUE
)

# step2: attach the band to every patient as the Age_Category column
patients <- mutate(patients, Age_Category = age_category)

# step3: keep Type 1 patients and count them per age band
type1_age <- patients |>
  filter(DiabetesType == "Type 1") |>
  group_by(Age_Category, DiabetesType) |>
  summarise(count = n(), .groups = "drop")

# step4: show the resulting table
print(type1_age)
## # A tibble: 4 × 3
##   Age_Category DiabetesType count
##   <fct>        <chr>        <int>
## 1 18-35        Type 1          12
## 2 36-55        Type 1           5
## 3 56-65        Type 1           2
## 4 66-85        Type 1          10
# visualize results
library(ggplot2)
# Bar chart: one bar per age band, bar height = number of Type 1 patients
ggplot(type1_age, aes(x = Age_Category, y = count)) +
  geom_col(colour = "black", fill = "lightblue") +
  labs(title = "How Type 1 Diabetes Varies by Age Category")

# What is the gender distribution of diabetes patients by diabetes type?

# step1: count patients for every gender / diabetes type combination
library(dplyr)
gender_distribution <- patients |>
  group_by(Gender, DiabetesType) |>
  summarise(count = n(), .groups = "drop")

# step2: show the counts
print(gender_distribution)
## # A tibble: 6 × 3
##   Gender DiabetesType count
##   <chr>  <chr>        <int>
## 1 Female Gestational     15
## 2 Female Type 1          14
## 3 Female Type 2          10
## 4 Male   Gestational     25
## 5 Male   Type 1          15
## 6 Male   Type 2          21
# step3: visualize results
library(ggplot2)
# Side-by-side bars (position = "dodge") compare the genders within each
# diabetes type; bar height is the patient count.
ggplot(
  data = gender_distribution,
  mapping = aes(x = DiabetesType, y = count, fill = Gender)
) +
  geom_col(color = "black", position = "dodge") +
  labs(title = "Gender distribution of Patients by Diabetes Type")

# What is the distribution of patients by months of the year

# Parse DiagnosisDate from "YYYY-MM-DD" text into a Date, then derive the
# abbreviated month label from it
library(lubridate)
patients <- patients |>
  mutate(DiagnosisDate = as.Date(DiagnosisDate, format = "%Y-%m-%d")) |>
  mutate(diagnosis_month = month(DiagnosisDate, label = TRUE, abbr = TRUE))


# Step1: count patients per diagnosis month
month_distribution <- patients |>
  group_by(diagnosis_month) |>
  summarise(count = n(), .groups = "drop")

# Step2: show the monthly counts
print(month_distribution)
## # A tibble: 12 × 2
##    diagnosis_month count
##    <ord>           <int>
##  1 Jan                 8
##  2 Feb                 5
##  3 Mar                11
##  4 Apr                11
##  5 May                 8
##  6 Jun                11
##  7 Jul                12
##  8 Aug                 8
##  9 Sep                 5
## 10 Oct                 7
## 11 Nov                 9
## 12 Dec                 5
# Step3: bar chart of patient counts, one bar per diagnosis month
ggplot(month_distribution, aes(x = diagnosis_month, y = count)) +
  geom_col(colour = "black", fill = "orange") +
  labs(title = "Distribution of Patients by Month")

# How does average blood glucose control (measured by HbA1c) vary by diagnosis year?

# Step1: extract the calendar year of each patient's diagnosis date
patients <- patients |>
  mutate(diagnosis_year = year(DiagnosisDate))

# Step2: mean HbA1c per diagnosis year, ignoring missing values
# (the result column is named average_glucose but holds mean HbA1c)
yearly_glucose <- patients |>
  group_by(diagnosis_year) |>
  summarise(
    average_glucose = mean(HbA1c, na.rm = TRUE),
    .groups = "drop"
  )

# Step3: show the yearly averages
print(yearly_glucose)
## # A tibble: 11 × 2
##    diagnosis_year average_glucose
##             <dbl>           <dbl>
##  1           2015            6.61
##  2           2016            7.4 
##  3           2017            7.63
##  4           2018            8.23
##  5           2019            7.28
##  6           2020            6.72
##  7           2021            7.82
##  8           2022            6.9 
##  9           2023            6.72
## 10           2024            7.54
## 11           2025            7.46
# Step4: visualize results
# diagnosis_year is numeric, so it is wrapped in factor() to get one labelled
# bar per year rather than a continuous axis that can show fractional ticks.
# The labels name the plotted measure: average_glucose holds the mean HbA1c
# per diagnosis year, not a blood glucose reading.
ggplot(
  data = yearly_glucose,
  mapping = aes(x = factor(diagnosis_year), y = average_glucose)
) +
  geom_col(color = "black", fill = "lightgreen") +
  labs(
    title = "Average HbA1c by Diagnosis Year",
    x = "Diagnosis year",
    y = "Mean HbA1c"
  )

# relationship between BMI and diabetes type

# Step1: calculate the mean BMI for each diabetes type, ignoring missing values
relationship <- patients |>
  group_by(DiabetesType) |>
  summarise(
    Average_BMI = mean(BMI, na.rm = TRUE),
    .groups = "drop"
  )

# Step2: show the per-type averages
print(relationship)
## # A tibble: 3 × 2
##   DiabetesType Average_BMI
##   <chr>              <dbl>
## 1 Gestational         29.2
## 2 Type 1              30.4
## 3 Type 2              28.3
# Step3: bar chart of mean BMI, one bar per diabetes type
ggplot(relationship, aes(x = DiabetesType, y = Average_BMI)) +
  geom_col(colour = "black", fill = "brown") +
  labs(title = "Relationship between BMI and Diabetes Type")

#How does HbA1c differ across different locations

# Step1: calculate the average HbA1c per location and keep the ten highest
# na.rm = TRUE matches the other summaries in this script, so one missing
# HbA1c value does not turn a whole location's average into NA.
location_variance <- patients %>%
  group_by(Location) %>%
  summarise(Average_hbA1c = mean(HbA1c, na.rm = TRUE), .groups = "drop") %>%
  arrange(desc(Average_hbA1c)) %>%
  slice_head(n = 10)

# Step2: print results
print(location_variance)
## # A tibble: 10 × 2
##    Location            Average_hbA1c
##    <chr>                       <dbl>
##  1 East Charles                 10  
##  2 Thomasborough                10  
##  3 New Clarenceborough           9.9
##  4 South Tiffany                 9.9
##  5 Brittanytown                  9.8
##  6 Suzannechester                9.8
##  7 Hallview                      9.7
##  8 Perryshire                    9.7
##  9 Guyberg                       9.5
## 10 Nicholsonburgh                9.5
# Predict blood glucose levels based on patient age, BMI and diabetes type

# Step1: encode diabetes type as a factor so lm() dummy-codes it
# (the first level, "Gestational", becomes the reference category)
patients$DiabetesType <- as.factor(patients$DiabetesType)

# Step2: drop every row that has a missing value in any column
patients <- na.omit(patients)

# Step3: fit the linear model of HbA1c on age, BMI and diabetes type
# lm() has no na.rm argument (it was ignored with a warning); missing values
# are controlled by na.action, and incomplete rows were already dropped above.
linear_model <- lm(HbA1c ~ Age + BMI + DiabetesType, data = patients)
summary(linear_model)
## 
## Call:
## lm(formula = HbA1c ~ Age + BMI + DiabetesType, data = patients, 
##     na.rm = TRUE)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.2524 -0.9363 -0.1226  0.9872  2.7879 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         7.722574   0.811270   9.519 1.76e-15 ***
## Age                 0.002917   0.006665   0.438    0.663    
## BMI                -0.022284   0.024817  -0.898    0.372    
## DiabetesTypeType 1 -0.238708   0.340521  -0.701    0.485    
## DiabetesTypeType 2  0.479514   0.331849   1.445    0.152    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.383 on 95 degrees of freedom
## Multiple R-squared:  0.05811,    Adjusted R-squared:  0.01845 
## F-statistic: 1.465 on 4 and 95 DF,  p-value: 0.2189
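## Result interpretation
# Overall fit: the model explains about 6% of the variance in HbA1c (R-squared = 0.058) and is not significant overall (F-test p = 0.219).

# Age Effect: the coefficient is slightly positive (0.003 per year of age) but not statistically significant (p = 0.663).

# BMI Effect: the coefficient is slightly negative (-0.022 per BMI unit) and not statistically significant (p = 0.372), so this model gives no evidence that higher BMI raises HbA1c.

# Diabetes Type Effect: relative to the Gestational baseline, Type 1 is estimated lower (-0.239) and Type 2 higher (+0.480), so Type 2 is estimated higher than Type 1; neither difference is statistically significant (p = 0.485 and p = 0.152).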
# Do age and BMI influence the likelihood of requiring insulin therapy?

# Step1: attach each patient's medication rows, matching on PatientID
# (left join: every patient row is kept, once per matching medication row)
patients_medications <- patients |>
  left_join(medications, by = "PatientID")

# Step2: multinomial logistic regression of medication name on age and BMI
# NOTE(review): this models every medication class at once rather than
# insulin vs. not-insulin; confirm that this is the intended model.
library(nnet)
logistical_model <- multinom(
  MedicationName ~ Age + BMI,
  data = patients_medications
)
## # weights:  24 (15 variable)
## initial  value 358.351894 
## iter  10 value 354.383075
## final  value 351.071738 
## converged
# Show the coefficients, standard errors, residual deviance and AIC of the fit
summary(logistical_model)
## Call:
## multinom(formula = MedicationName ~ Age + BMI, data = patients_medications)
## 
## Coefficients:
##              (Intercept)          Age        BMI
## Glyburide    -2.17520310  0.016619710 0.04529407
## Insulin      -0.78895110  0.003840269 0.01793532
## Metformin    -0.78348096  0.004535179 0.03129824
## Pioglitazone  0.05281292 -0.006520543 0.01782065
## Sitagliptin   0.38673031 -0.013366548 0.02334799
## 
## Std. Errors:
##              (Intercept)        Age        BMI
## Glyburide       1.454155 0.01369074 0.04419263
## Insulin         1.477902 0.01387006 0.04565861
## Metformin       1.342333 0.01254349 0.04137871
## Pioglitazone    1.376724 0.01288151 0.04276861
## Sitagliptin     1.336780 0.01252996 0.04156062
## 
## Residual Deviance: 702.1435 
## AIC: 732.1435
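## interpretation of the above
# Coefficients are log-odds relative to the baseline medication (presumably Glipizide, which appears in the data but not in the table).
# Every Age and BMI coefficient is smaller than about 1.3 standard errors, so none is statistically distinguishable from zero; the patterns below are directional only.

# Glyburide: Positive Age and BMI coefficients (the largest in the table), i.e. a tendency toward older patients and those with a higher BMI.

# Insulin & Metformin: Weak positive coefficients for Age and BMI, suggesting minimal impact; in particular, this model gives no evidence that Age or BMI influences insulin prescription.

# Pioglitazone & Sitagliptin: Negative Age coefficients, i.e. a tendency toward younger patients.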
# Is there a significant difference in glucose levels before meals

# Step1: Welch two-sample t-test comparing mean glucose level between the
# two BeforeMeal groups, then show the result
t_result <- t.test(GlucoseLevel ~ BeforeMeal, data = glucose)
print(t_result)
## 
##  Welch Two Sample t-test
## 
## data:  GlucoseLevel by BeforeMeal
## t = 0.48785, df = 496.09, p-value = 0.6259
## alternative hypothesis: true difference in means between group False and group True is not equal to 0
## 95 percent confidence interval:
##  -7.003773 11.630714
## sample estimates:
## mean in group False  mean in group True 
##            161.8916            159.5782
## interpretation of the result above
# p-value = 0.6259 (> 0.05), so there is no significant difference in mean glucose level between readings taken before meals (True) and readings that were not (False).
# Do men and women differ in their HbA1c levels? 

# Step1: one-way ANOVA testing whether mean HbA1c differs between genders
anova_result <- aov(HbA1c ~ Gender, data = patients)

# Step2: show the ANOVA table
summary(anova_result)
##             Df Sum Sq Mean Sq F value Pr(>F)
## Gender       1   0.01  0.0132   0.007  0.935
## Residuals   98 193.00  1.9694
##Interpretation of the results above
# since the p value = 0.935 
# it suggests there's no significant difference in HbA1c levels between genders
# Is there an association between diabetes type and medication type

# Step1: Pearson chi-squared test of independence between diabetes type and
# medication name, then show the result
chi_result <- chisq.test(
  x = patients_medications$DiabetesType,
  y = patients_medications$MedicationName
)
print(chi_result)
## 
##  Pearson's Chi-squared test
## 
## data:  patients_medications$DiabetesType and patients_medications$MedicationName
## X-squared = 3.3744, df = 10, p-value = 0.9712
#interpretation of the above
# The p-value (0.9712) is much larger than the common significance level of 0.05
# As a result, we fail to reject the null hypothesis.
# There is no evidence of a significant association between DiabetesType and MedicationName; the data are consistent with the two variables being independent.
# What percentage of patients fall into different age groups 

# Step1: count patients per age band, then express each count as a
# percentage of the overall patient total
age_group_rate <- patients |>
  group_by(Age_Category) |>
  summarise(patient_count = n()) |>
  mutate(
    patient_total = sum(patient_count),
    patient_rate = (patient_count / patient_total) * 100
  )

# Step2: show the percentages
print(age_group_rate)
## # A tibble: 4 × 4
##   Age_Category patient_count patient_total patient_rate
##   <fct>                <int>         <int>        <dbl>
## 1 18-35                   31           100           31
## 2 36-55                   25           100           25
## 3 56-65                    9           100            9
## 4 66-85                   35           100           35
# Step3: bar chart of the patient percentage per age band, coloured by band
ggplot(
  age_group_rate,
  aes(x = Age_Category, y = patient_rate, fill = Age_Category)
) +
  geom_col(colour = "black") +
  labs(title = "Percentage of patients by Age Groups")

# What is the variability in HbA1c levels among diabetes patients

# Step1: quartiles and interquartile range of HbA1c
# na.rm = TRUE is passed to IQR() as well as quantile(), so all three
# statistics treat missing values the same way (IQR() errors on NA otherwise).
iqr_HbA1c <- IQR(patients$HbA1c, na.rm = TRUE)
q1_HbA1c <- quantile(patients$HbA1c, 0.25, na.rm = TRUE)
q3_HbA1c <- quantile(patients$HbA1c, 0.75, na.rm = TRUE)

# Step2: lower and upper fences, 1.5 * IQR beyond each quartile
lower_limits <- q1_HbA1c - 1.5 * iqr_HbA1c
upper_limits <- q3_HbA1c + 1.5 * iqr_HbA1c

# Step3: flag values outside the fences; a missing value is never flagged,
# so the mask contains no NA and the subset below returns real values only
HbA1c_outliers <- !is.na(patients$HbA1c) &
  (patients$HbA1c < lower_limits | patients$HbA1c > upper_limits)

# Step4: print results
outlier_values <- patients$HbA1c[HbA1c_outliers]
print(outlier_values)
## numeric(0)