#import libraries
#install.packages("openxlsx")   # run once
library(openxlsx)
## Warning: package 'openxlsx' was built under R version 4.5.2
library(survival)
## Warning: package 'survival' was built under R version 4.5.2

Read the dataset

#read the dataset uonhealthinform.xlsx file # Ensure both the dataset  and the notebook file are in the same directory
df <- read.xlsx(file.choose())
attach(df)

#view data
View(df)

#check structure of data
str(df)
## 'data.frame':    11 obs. of  12 variables:
##  $ Time            : num  10 15 20 25 30 35 40 45 50 55 ...
##  $ Event           : num  1 1 0 1 1 0 1 1 0 0 ...
##  $ Patient_ID      : num  2356 4589 3 37 1213 ...
##  $ Age             : num  23 68 97 45 63 47 89 90 27 34 ...
##  $ Gender          : chr  "M" "F" "F" "M" ...
##  $ Height_cm       : num  234 123 345 267 876 356 321 768 986 246 ...
##  $ Weight_kg       : num  56 76 86 79 34 56 29 55 89 43 ...
##  $ Blood_Pressure  : num  124 89 100 78 129 117 115 111 85 120 ...
##  $ Disease_category: chr  "Stroke" "TB" "Malaria" "Malaria" ...
##  $ Region          : chr  "Urban" "Urban" "Rural" "Urban" ...
##  $ County          : chr  "Nakuru" "Isiolo" "Mombasa" "Kisumu" ...
##  $ Education_level : chr  "Primary" "Secondary" "College" "University" ...
#check first variables rows
head(df)

#create A simple linear regression model without predictors/covariances

#we use ~1 if there is no predictor variables/covariance
model0 =lm(Blood_Pressure~1, data=df)

#check model summary
summary(model0)
## 
## Call:
## lm(formula = Blood_Pressure ~ 1, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -27.727 -13.727   5.273  12.773  23.273 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  105.727      5.175   20.43 1.74e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 17.16 on 10 degrees of freedom

#Create multiple linear regression Model with predictors/covariances

#we use ~1 if there is no predictor variables/covariance
model1 =lm(Blood_Pressure~Age+factor(Gender)+Weight_kg)

#check model summary
summary(model1)
## 
## Call:
## lm(formula = Blood_Pressure ~ Age + factor(Gender) + Weight_kg)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -16.116  -7.252   1.834   6.130  14.699 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     149.46407   17.62397   8.481 6.27e-05 ***
## Age              -0.05405    0.14994  -0.361  0.72910    
## factor(Gender)M   1.20910    7.53639   0.160  0.87707    
## Weight_kg        -0.68512    0.19272  -3.555  0.00928 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11.94 on 7 degrees of freedom
## Multiple R-squared:  0.6615, Adjusted R-squared:  0.5164 
## F-statistic: 4.559 on 3 and 7 DF,  p-value: 0.04508

#observation: A multiple linear regression model showed that weight was significantly associated with blood pressure (coefficient = −0.69, p = 0.009), while age and gender were not significant predictors having a p=0.73. The model explained approximately 66%(0.6615) of the variance in blood pressure (adjusted R-squared = 0.52% or (0.5164)) and was statistically significant overall (F(3,7) = 4.56, p = 0.045).

The multiple linear regression model perfomed better than the simple linear regression model, as indicated by the higher R-squared value (0.6615 vs. 0.5234) and significant F-statistic (p = 0.045).

#checking the 2 Model comparison (ANOVA)

anova(model0, model1)

#observation: f-statistic = 4.56, p-value = 0.045 The ANOVA results indicate that the multiple linear regression model (model1) is significantly better at explaining the variance in blood pressure compared to the simple model (model0) that includes only the intercept. The p-value of 0.045 suggests that adding the predictors; age, gender and Weight_kg, significantly improves the model fit.

#Checking for linear models

#1. Linearity we assume each predictor to have a lienear relationship with the outcome variable(blood-pressure)

plot(model1, which = 1)

#observation No clear pattern or curves, we escecpt residual to be scattered around zero.However this is a small dataset so we may not see a clear pattern.

#2. Normality of Residuals

#plot
plot(model1, which = 2)

#shapiro-wilk test
shapiro.test(residuals(model1))
## 
##  Shapiro-Wilk normality test
## 
## data:  residuals(model1)
## W = 0.96933, p-value = 0.8797

#observation * The Q-Q plot shows that the residuals generally follow a straight line, this is acceptable normality. * The Shapiro-Wilk test yields a p-value of 0.8, this is >0.5, we fail to reject the null hypothesis and conclude that the residuals are normally distributed.

#3. checking for Homoscedasticity

plot(model1, which = 3)

observation: There’s no funnel shape, this means the variance of residuals is constant across all levels of fitted values, indicating homoscedasticity.

#4. Checking for independence of Residuals Data were collected from different individuals, so we assume independence of residuals.

#5. Checking for Multicollinearity

#Checking for multicollinearity
library(car)
## Warning: package 'car' was built under R version 4.5.2
## Loading required package: carData
vif(model1)
##            Age factor(Gender)      Weight_kg 
##       1.055340       1.087127       1.073281

#observation: All VIF values are below 5, indicating no significant multicollinearity among the predictors.

# conclusion

Compared to the model0, the multiple linear regression model1 including age, gender, and weight provided a significantly better fit to the data (F(3,7) = 4.56, p = 0.045). Among the predictors, weight was significantly associated with blood pressure, while age and gender were not. Model diagnostics indicated no major violations of linear regression assumptions, though interpretation is limited by the small sample size.

Survival Analysis #create Kplan-Meier survival curves

#call survival libary
library(survival)
#create survival object
surv_object <- Surv(Time, Event)
#fit Kaplan-Meier survival model
km_fit <- survfit(surv_object ~ 1)
#plot survival values
#plot survival values
km_fit$surv
##  [1] 0.9090909 0.8181818 0.8181818 0.7159091 0.6136364 0.6136364 0.4909091
##  [8] 0.3681818 0.3681818 0.3681818 0.0000000
#plot seurval 
plot(km_fit, xlab = "Time (months)", ylab = "Survival Probability",
     main = "Kaplan-Meier Survival Curve", col = "blue", lwd = 2)

observation * The Kaplan–Meier survival curve demonstrates a progressive decline in survival over time, with stepwise decreases corresponding to observed events. The estimated median survival time is approximately 40 months. Confidence intervals widen at later follow-up periods due to decreasing numbers at risk, indicating increased uncertainty in long-term survival estimates.

#Fit the cox model

#create survival object
surv_object <- Surv(Time, Event)

#fit cox proportional hazards model
cox_model2 <- coxph(surv_object ~ Age + factor(Gender) + Weight_kg)

#view model summary
summary(cox_model2)
## Call:
## coxph(formula = surv_object ~ Age + factor(Gender) + Weight_kg)
## 
##   n= 11, number of events= 7 
## 
##                      coef exp(coef)  se(coef)      z Pr(>|z|)
## Age              0.004015  1.004023  0.019940  0.201    0.840
## factor(Gender)M  0.547043  1.728136  0.903612  0.605    0.545
## Weight_kg       -0.005610  0.994406  0.026076 -0.215    0.830
## 
##                 exp(coef) exp(-coef) lower .95 upper .95
## Age                1.0040     0.9960    0.9655     1.044
## factor(Gender)M    1.7281     0.5787    0.2941    10.156
## Weight_kg          0.9944     1.0056    0.9449     1.047
## 
## Concordance= 0.538  (se = 0.136 )
## Likelihood ratio test= 0.65  on 3 df,   p=0.9
## Wald test            = 0.59  on 3 df,   p=0.9
## Score (logrank) test = 0.61  on 3 df,   p=0.9

#observation: The fit assesses how Age, Gender, and Weight affect the hazard (risk) of experiencing the event over time.

Weight (kg): HR = 0.994, 95% CI: 0.94 – 1.05, p-value: 0.830, Each 1 kg increase in weight is associated with a 0.6% reduction in hazard, holding age and gender constant. This means Weight is not a significant predictor of event risk.

Overal lmodel performance

#Checking proportional hazards assumption in a Cox regression model

#check proportional hazards assumption 
cox.zph(cox_model2)
##                 chisq df     p
## Age            4.5664  1 0.033
## factor(Gender) 0.0123  1 0.912
## Weight_kg      1.9451  1 0.163
## GLOBAL         5.6990  3 0.127

#observation:

Cox-model assumes the effect of each covariate on the hazard is constant over time.

*GLOBAL:p_value= 0.127 this is >0.05, we fail to reject the null hypothesis and conclude that overall, the proportional hazards assumption holds for the model.Taken together, the model does not show strong evidence of a global violation.

#Visualizing Blood Pressure Incidence on Kenya Map

#install and call the libraries
#install.packages("sf")  # run once
library(sf) # for maps
## Warning: package 'sf' was built under R version 4.5.2
## Linking to GEOS 3.13.1, GDAL 3.11.4, PROJ 9.7.0; sf_use_s2() is TRUE
library(ggrepel)# for boundaries
## Warning: package 'ggrepel' was built under R version 4.5.2
## Loading required package: ggplot2
library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
## 
##     recode
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(geodata)
## Warning: package 'geodata' was built under R version 4.5.2
## Loading required package: terra
## Warning: package 'terra' was built under R version 4.5.2
## terra 1.8.86
library(terra)
library(rnaturalearth)
## Warning: package 'rnaturalearth' was built under R version 4.5.2
# Load Kenya counties
kenya_counties <- geodata::gadm(country = "KEN", level = 1, path = tempdir())
kenya_counties <- st_as_sf(kenya_counties)

# pick a projected CRS for Kenya
kenya_counties_proj <- st_transform(kenya_counties, 32737)  # WGS84 / UTM zone 37S

# BP data
bp_data <- data.frame(
  county = c("Nairobi", "Mombasa", "Kisumu", "Nakuru", "Eldoret"),
  lon = c(36.8219, 39.6682, 34.7617, 36.0667, 35.2698),
  lat = c(-1.2921, -4.0435, -0.0917, -0.3031, 0.5143),
  incidence = c(28.4, 25.1, 30.2, 27.0, 26.3)
)

# BP points
bp_sf <- st_as_sf(bp_data, coords = c("lon", "lat"), crs = 4326)
bp_sf_proj <- st_transform(bp_sf, 32737)

# Plot map with BP incidence points
coords <- st_coordinates(bp_sf) 
bp_sf$X <- coords[,1] # prevent overlapping
bp_sf$Y <- coords[,2]
ggplot() +
  geom_sf(data = kenya_counties_proj, fill = "gray95", color = "gray40", size = 0.3) +
  geom_sf(data = bp_sf_proj, aes(size = incidence, color = incidence), alpha = 0.8) +
  geom_sf_text(data = bp_sf_proj, aes(label = county), size = 3, nudge_y = 20000) +
  scale_color_viridis_c(name = "BP Incidence (%)") +
  scale_size_continuous(name = "BP Incidence (%)") +
    labs(
    title = "Blood Pressure Incidence by City in Kenya",
    subtitle = "In Selected Counties",
    caption = "Source: UonHealthInform Dataset"
  ) +
  theme_minimal()