#import libraries
#install.packages("openxlsx") # run once
library(openxlsx)
## Warning: package 'openxlsx' was built under R version 4.5.2
library(survival)
## Warning: package 'survival' was built under R version 4.5.2
#read the dataset uonhealthinform.xlsx file # Ensure both the dataset and the notebook file are in the same directory
df <- read.xlsx(file.choose())
attach(df)
#view data
View(df)
#check structure of data
str(df)
## 'data.frame': 11 obs. of 12 variables:
## $ Time : num 10 15 20 25 30 35 40 45 50 55 ...
## $ Event : num 1 1 0 1 1 0 1 1 0 0 ...
## $ Patient_ID : num 2356 4589 3 37 1213 ...
## $ Age : num 23 68 97 45 63 47 89 90 27 34 ...
## $ Gender : chr "M" "F" "F" "M" ...
## $ Height_cm : num 234 123 345 267 876 356 321 768 986 246 ...
## $ Weight_kg : num 56 76 86 79 34 56 29 55 89 43 ...
## $ Blood_Pressure : num 124 89 100 78 129 117 115 111 85 120 ...
## $ Disease_category: chr "Stroke" "TB" "Malaria" "Malaria" ...
## $ Region : chr "Urban" "Urban" "Rural" "Urban" ...
## $ County : chr "Nakuru" "Isiolo" "Mombasa" "Kisumu" ...
## $ Education_level : chr "Primary" "Secondary" "College" "University" ...
#check first variables rows
head(df)
#create A simple linear regression model without predictors/covariances
#we use ~1 if there is no predictor variables/covariance
model0 =lm(Blood_Pressure~1, data=df)
#check model summary
summary(model0)
##
## Call:
## lm(formula = Blood_Pressure ~ 1, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -27.727 -13.727 5.273 12.773 23.273
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 105.727 5.175 20.43 1.74e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 17.16 on 10 degrees of freedom
#Create multiple linear regression Model with predictors/covariances
#we use ~1 if there is no predictor variables/covariance
model1 =lm(Blood_Pressure~Age+factor(Gender)+Weight_kg)
#check model summary
summary(model1)
##
## Call:
## lm(formula = Blood_Pressure ~ Age + factor(Gender) + Weight_kg)
##
## Residuals:
## Min 1Q Median 3Q Max
## -16.116 -7.252 1.834 6.130 14.699
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 149.46407 17.62397 8.481 6.27e-05 ***
## Age -0.05405 0.14994 -0.361 0.72910
## factor(Gender)M 1.20910 7.53639 0.160 0.87707
## Weight_kg -0.68512 0.19272 -3.555 0.00928 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 11.94 on 7 degrees of freedom
## Multiple R-squared: 0.6615, Adjusted R-squared: 0.5164
## F-statistic: 4.559 on 3 and 7 DF, p-value: 0.04508
#observation: A multiple linear regression model showed that weight was significantly associated with blood pressure (coefficient = −0.69, p = 0.009), while age and gender were not significant predictors having a p=0.73. The model explained approximately 66%(0.6615) of the variance in blood pressure (adjusted R-squared = 0.52% or (0.5164)) and was statistically significant overall (F(3,7) = 4.56, p = 0.045).
The multiple linear regression model perfomed better than the simple linear regression model, as indicated by the higher R-squared value (0.6615 vs. 0.5234) and significant F-statistic (p = 0.045).
#checking the 2 Model comparison (ANOVA)
anova(model0, model1)
#observation: f-statistic = 4.56, p-value = 0.045 The ANOVA results indicate that the multiple linear regression model (model1) is significantly better at explaining the variance in blood pressure compared to the simple model (model0) that includes only the intercept. The p-value of 0.045 suggests that adding the predictors; age, gender and Weight_kg, significantly improves the model fit.
#Checking for linear models
#1. Linearity we assume each predictor to have a lienear relationship with the outcome variable(blood-pressure)
plot(model1, which = 1)
#observation No clear pattern or curves, we escecpt residual to be
scattered around zero.However this is a small dataset so we may not see
a clear pattern.
#2. Normality of Residuals
#plot
plot(model1, which = 2)
#shapiro-wilk test
shapiro.test(residuals(model1))
##
## Shapiro-Wilk normality test
##
## data: residuals(model1)
## W = 0.96933, p-value = 0.8797
#observation * The Q-Q plot shows that the residuals generally follow a straight line, this is acceptable normality. * The Shapiro-Wilk test yields a p-value of 0.8, this is >0.5, we fail to reject the null hypothesis and conclude that the residuals are normally distributed.
#3. checking for Homoscedasticity
plot(model1, which = 3)
observation: There’s no funnel shape, this means the variance of
residuals is constant across all levels of fitted values, indicating
homoscedasticity.
#4. Checking for independence of Residuals Data were collected from different individuals, so we assume independence of residuals.
#5. Checking for Multicollinearity
#Checking for multicollinearity
library(car)
## Warning: package 'car' was built under R version 4.5.2
## Loading required package: carData
vif(model1)
## Age factor(Gender) Weight_kg
## 1.055340 1.087127 1.073281
#observation: All VIF values are below 5, indicating no significant multicollinearity among the predictors.
# conclusion
Compared to the model0, the multiple linear regression model1 including age, gender, and weight provided a significantly better fit to the data (F(3,7) = 4.56, p = 0.045). Among the predictors, weight was significantly associated with blood pressure, while age and gender were not. Model diagnostics indicated no major violations of linear regression assumptions, though interpretation is limited by the small sample size.
Survival Analysis #create Kplan-Meier survival curves
#call survival libary
library(survival)
#create survival object
surv_object <- Surv(Time, Event)
#fit Kaplan-Meier survival model
km_fit <- survfit(surv_object ~ 1)
#plot survival values
#plot survival values
km_fit$surv
## [1] 0.9090909 0.8181818 0.8181818 0.7159091 0.6136364 0.6136364 0.4909091
## [8] 0.3681818 0.3681818 0.3681818 0.0000000
#plot seurval
plot(km_fit, xlab = "Time (months)", ylab = "Survival Probability",
main = "Kaplan-Meier Survival Curve", col = "blue", lwd = 2)
observation * The Kaplan–Meier survival curve demonstrates a progressive
decline in survival over time, with stepwise decreases corresponding to
observed events. The estimated median survival time is approximately 40
months. Confidence intervals widen at later follow-up periods due to
decreasing numbers at risk, indicating increased uncertainty in
long-term survival estimates.
#Fit the cox model
#create survival object
surv_object <- Surv(Time, Event)
#fit cox proportional hazards model
cox_model2 <- coxph(surv_object ~ Age + factor(Gender) + Weight_kg)
#view model summary
summary(cox_model2)
## Call:
## coxph(formula = surv_object ~ Age + factor(Gender) + Weight_kg)
##
## n= 11, number of events= 7
##
## coef exp(coef) se(coef) z Pr(>|z|)
## Age 0.004015 1.004023 0.019940 0.201 0.840
## factor(Gender)M 0.547043 1.728136 0.903612 0.605 0.545
## Weight_kg -0.005610 0.994406 0.026076 -0.215 0.830
##
## exp(coef) exp(-coef) lower .95 upper .95
## Age 1.0040 0.9960 0.9655 1.044
## factor(Gender)M 1.7281 0.5787 0.2941 10.156
## Weight_kg 0.9944 1.0056 0.9449 1.047
##
## Concordance= 0.538 (se = 0.136 )
## Likelihood ratio test= 0.65 on 3 df, p=0.9
## Wald test = 0.59 on 3 df, p=0.9
## Score (logrank) test = 0.61 on 3 df, p=0.9
#observation: The fit assesses how Age, Gender, and Weight affect the hazard (risk) of experiencing the event over time.
Age: exp(coef) = Hazard Ratio (HR), HR= 1.004 at 95% CI (0.980-1.029) for Age, indicating each additional year of age is associated with a 0.4% increase in hazard, adjusting for gender and weight, but not statistically significant (p=0.84); This means Age is not significantly associated with the risk of the event.
Gender (Male vs Female): HR= 1.73 at 95% CI: 0.29 – 10.16,p-value: 0.545, indicating that Males have an estimated 73% higher hazard of experiencing the event compared to females, controlling for age and weight. However, this is not statistically significant (p=0.545). wide CI substantial uncertainty.
Weight (kg): HR = 0.994, 95% CI: 0.94 – 1.05, p-value: 0.830, Each 1 kg increase in weight is associated with a 0.6% reduction in hazard, holding age and gender constant. This means Weight is not a significant predictor of event risk.
Overal lmodel performance
Likelihood ratio test: p = 0.90, Wald test: p = 0.90, Score (log-rank) test: p = 0.90. This means the model1 does not provide a significantly better fit than a model0 with no covariates.
Model discrimination: Concordance (C-index): 0.538, indicating poor ability to distinguish between individuals who experience the event earlier versus later.
Standard error: 0.136: indicates some uncertainty in the C-index estimate.The model has poor discriminatory ability, performing only slightly better than random chance (0.5).
#Checking proportional hazards assumption in a Cox regression model
#check proportional hazards assumption
cox.zph(cox_model2)
## chisq df p
## Age 4.5664 1 0.033
## factor(Gender) 0.0123 1 0.912
## Weight_kg 1.9451 1 0.163
## GLOBAL 5.6990 3 0.127
#observation:
Cox-model assumes the effect of each covariate on the hazard is constant over time.
*GLOBAL:p_value= 0.127 this is >0.05, we fail to reject the null hypothesis and conclude that overall, the proportional hazards assumption holds for the model.Taken together, the model does not show strong evidence of a global violation.
#Visualizing Blood Pressure Incidence on Kenya Map
#install and call the libraries
#install.packages("sf") # run once
library(sf) # for maps
## Warning: package 'sf' was built under R version 4.5.2
## Linking to GEOS 3.13.1, GDAL 3.11.4, PROJ 9.7.0; sf_use_s2() is TRUE
library(ggrepel)# for boundaries
## Warning: package 'ggrepel' was built under R version 4.5.2
## Loading required package: ggplot2
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
##
## recode
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(geodata)
## Warning: package 'geodata' was built under R version 4.5.2
## Loading required package: terra
## Warning: package 'terra' was built under R version 4.5.2
## terra 1.8.86
library(terra)
library(rnaturalearth)
## Warning: package 'rnaturalearth' was built under R version 4.5.2
# Load Kenya counties
kenya_counties <- geodata::gadm(country = "KEN", level = 1, path = tempdir())
kenya_counties <- st_as_sf(kenya_counties)
# pick a projected CRS for Kenya
kenya_counties_proj <- st_transform(kenya_counties, 32737) # WGS84 / UTM zone 37S
# BP data
bp_data <- data.frame(
county = c("Nairobi", "Mombasa", "Kisumu", "Nakuru", "Eldoret"),
lon = c(36.8219, 39.6682, 34.7617, 36.0667, 35.2698),
lat = c(-1.2921, -4.0435, -0.0917, -0.3031, 0.5143),
incidence = c(28.4, 25.1, 30.2, 27.0, 26.3)
)
# BP points
bp_sf <- st_as_sf(bp_data, coords = c("lon", "lat"), crs = 4326)
bp_sf_proj <- st_transform(bp_sf, 32737)
# Plot map with BP incidence points
coords <- st_coordinates(bp_sf)
bp_sf$X <- coords[,1] # prevent overlapping
bp_sf$Y <- coords[,2]
ggplot() +
geom_sf(data = kenya_counties_proj, fill = "gray95", color = "gray40", size = 0.3) +
geom_sf(data = bp_sf_proj, aes(size = incidence, color = incidence), alpha = 0.8) +
geom_sf_text(data = bp_sf_proj, aes(label = county), size = 3, nudge_y = 20000) +
scale_color_viridis_c(name = "BP Incidence (%)") +
scale_size_continuous(name = "BP Incidence (%)") +
labs(
title = "Blood Pressure Incidence by City in Kenya",
subtitle = "In Selected Counties",
caption = "Source: UonHealthInform Dataset"
) +
theme_minimal()