#import libraries
#install.packages("openxlsx")   # run once
library(openxlsx)
## Warning: package 'openxlsx' was built under R version 4.5.2
library(survival)
## Warning: package 'survival' was built under R version 4.5.2

Read the dataset

#read the dataset uonhealthinform.xlsx file # Ensure both the dataset  and the notebook file are in the same directory
df <- read.xlsx(file.choose())
attach(df)

#view data
View(df)

#check structure of data
str(df)
## 'data.frame':    11 obs. of  12 variables:
##  $ Time            : num  10 15 20 25 30 35 40 45 50 55 ...
##  $ Event           : num  1 1 0 1 1 0 1 1 0 0 ...
##  $ Patient_ID      : num  2356 4589 3 37 1213 ...
##  $ Age             : num  23 68 97 45 63 47 89 90 27 34 ...
##  $ Gender          : chr  "M" "F" "F" "M" ...
##  $ Height_cm       : num  234 123 345 267 876 356 321 768 986 246 ...
##  $ Weight_kg       : num  56 76 86 79 34 56 29 55 89 43 ...
##  $ Blood_Pressure  : num  124 89 100 78 129 117 115 111 85 120 ...
##  $ Disease_category: chr  "Stroke" "TB" "Malaria" "Malaria" ...
##  $ Region          : chr  "Urban" "Urban" "Rural" "Urban" ...
##  $ County          : chr  "Nakuru" "Isiolo" "Mombasa" "Kisumu" ...
##  $ Education_level : chr  "Primary" "Secondary" "College" "University" ...
#check first variables rows
head(df)
##   Time Event Patient_ID Age Gender Height_cm Weight_kg Blood_Pressure
## 1   10     1       2356  23      M       234        56            124
## 2   15     1       4589  68      F       123        76             89
## 3   20     0          3  97      F       345        86            100
## 4   25     1         37  45      M       267        79             78
## 5   30     1       1213  63      M       876        34            129
## 6   35     0       2390  47      M       356        56            117
##   Disease_category Region   County Education_level
## 1           Stroke  Urban   Nakuru         Primary
## 2               TB  Urban   Isiolo       Secondary
## 3          Malaria  Rural  Mombasa         College
## 4          Malaria  Urban   Kisumu      University
## 5              HIV  Rural Machakos            None
## 6          Cholera  Urban  Garissa         Primary

#create A simple linear regression model without predictors/covariances

#we use ~1 if there is no predictor variables/covariance
model0 =lm(Blood_Pressure~1, data=df)

#check model summary
summary(model0)
## 
## Call:
## lm(formula = Blood_Pressure ~ 1, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -27.727 -13.727   5.273  12.773  23.273 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  105.727      5.175   20.43 1.74e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 17.16 on 10 degrees of freedom

#Create multiple linear regression Model with predictors/covariances

#we use ~1 if there is no predictor variables/covariance
model1 =lm(Blood_Pressure~Age+factor(Gender)+Weight_kg)

#check model summary
summary(model1)
## 
## Call:
## lm(formula = Blood_Pressure ~ Age + factor(Gender) + Weight_kg)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -16.116  -7.252   1.834   6.130  14.699 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     149.46407   17.62397   8.481 6.27e-05 ***
## Age              -0.05405    0.14994  -0.361  0.72910    
## factor(Gender)M   1.20910    7.53639   0.160  0.87707    
## Weight_kg        -0.68512    0.19272  -3.555  0.00928 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11.94 on 7 degrees of freedom
## Multiple R-squared:  0.6615, Adjusted R-squared:  0.5164 
## F-statistic: 4.559 on 3 and 7 DF,  p-value: 0.04508

#observation: A multiple linear regression model showed that weight was significantly associated with blood pressure (coefficient = −0.69, p = 0.009), while age and gender were not significant predictors having a p=0.73. The model explained approximately 66%(0.6615) of the variance in blood pressure (adjusted R-squared = 0.52% or (0.5164)) and was statistically significant overall (F(3,7) = 4.56, p = 0.045).

The multiple linear regression model perfomed better than the simple linear regression model, as indicated by the higher R-squared value (0.6615 vs. 0.5234) and significant F-statistic (p = 0.045).

#checking the 2 Model comparison (ANOVA)

anova(model0, model1)
## Analysis of Variance Table
## 
## Model 1: Blood_Pressure ~ 1
## Model 2: Blood_Pressure ~ Age + factor(Gender) + Weight_kg
##   Res.Df     RSS Df Sum of Sq     F  Pr(>F)  
## 1     10 2946.18                             
## 2      7  997.41  3    1948.8 4.559 0.04508 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

#observation: f-statistic = 4.56, p-value = 0.045 The ANOVA results indicate that the multiple linear regression model (model1) is significantly better at explaining the variance in blood pressure compared to the simple model (model0) that includes only the intercept. The p-value of 0.045 suggests that adding the predictors; age, gender and Weight_kg, significantly improves the model fit.

#Checking for linear models

#1. Linearity we assume each predictor to have a lienear relationship with the outcome variable(blood-pressure)

plot(model1, which = 1)

#observation No clear pattern or curves, we escecpt residual to be scattered around zero.However this is a small dataset so we may not see a clear pattern.

#2. Normality of Residuals

#plot
plot(model1, which = 2)

#shapiro-wilk test
shapiro.test(residuals(model1))
## 
##  Shapiro-Wilk normality test
## 
## data:  residuals(model1)
## W = 0.96933, p-value = 0.8797

#observation * The Q-Q plot shows that the residuals generally follow a straight line, this is acceptable normality. * The Shapiro-Wilk test yields a p-value of 0.8, this is >0.5, we fail to reject the null hypothesis and conclude that the residuals are normally distributed.

#3. checking for Homoscedasticity

plot(model1, which = 3)

observation: There’s no funnel shape, this means the variance of residuals is constant across all levels of fitted values, indicating homoscedasticity.

#4. Checking for independence of Residuals Data were collected from different individuals, so we assume independence of residuals.

#5. Checking for Multicollinearity

#Checking for multicollinearity
library(car)
## Warning: package 'car' was built under R version 4.5.2
## Loading required package: carData
vif(model1)
##            Age factor(Gender)      Weight_kg 
##       1.055340       1.087127       1.073281

#observation: All VIF values are below 5, indicating no significant multicollinearity among the predictors.

# conclusion

Compared to the model0, the multiple linear regression model1 including age, gender, and weight provided a significantly better fit to the data (F(3,7) = 4.56, p = 0.045). Among the predictors, weight was significantly associated with blood pressure, while age and gender were not. Model diagnostics indicated no major violations of linear regression assumptions, though interpretation is limited by the small sample size.

Survival Analysis

#Fit the cox model

#call survival libary
library(survival)

#create survival object
surv_object <- Surv(Time, Event)

#fit cox proportional hazards model
cox_model2 <- coxph(surv_object ~ Age + factor(Gender) + Weight_kg)

#view model summary
summary(cox_model2)
## Call:
## coxph(formula = surv_object ~ Age + factor(Gender) + Weight_kg)
## 
##   n= 11, number of events= 7 
## 
##                      coef exp(coef)  se(coef)      z Pr(>|z|)
## Age              0.004015  1.004023  0.019940  0.201    0.840
## factor(Gender)M  0.547043  1.728136  0.903612  0.605    0.545
## Weight_kg       -0.005610  0.994406  0.026076 -0.215    0.830
## 
##                 exp(coef) exp(-coef) lower .95 upper .95
## Age                1.0040     0.9960    0.9655     1.044
## factor(Gender)M    1.7281     0.5787    0.2941    10.156
## Weight_kg          0.9944     1.0056    0.9449     1.047
## 
## Concordance= 0.538  (se = 0.136 )
## Likelihood ratio test= 0.65  on 3 df,   p=0.9
## Wald test            = 0.59  on 3 df,   p=0.9
## Score (logrank) test = 0.61  on 3 df,   p=0.9

#observation: The fit assesses how Age, Gender, and Weight affect the hazard (risk) of experiencing the event over time.

Weight (kg): HR = 0.994, 95% CI: 0.94 – 1.05, p-value: 0.830, Each 1 kg increase in weight is associated with a 0.6% reduction in hazard, holding age and gender constant. This means Weight is not a significant predictor of event risk.

Overal lmodel performance

#Checking proportional hazards assumption in a Cox regression model

#check proportional hazards assumption 
cox.zph(cox_model2)
##                 chisq df     p
## Age            4.5664  1 0.033
## factor(Gender) 0.0123  1 0.912
## Weight_kg      1.9451  1 0.163
## GLOBAL         5.6990  3 0.127

#observation:

Cox-model assumes the effect of each covariate on the hazard is constant over time.

*GLOBAL:p_value= 0.127 this is >0.05, we fail to reject the null hypothesis and conclude that overall, the proportional hazards assumption holds for the model.Taken together, the model does not show strong evidence of a global violation.

#Visualizing Blood Pressure Incidence on Kenya Map

#load kenya counties shapefile
#install.packages("sf")  # run once
library(sf)
## Warning: package 'sf' was built under R version 4.5.2
## Linking to GEOS 3.13.1, GDAL 3.11.4, PROJ 9.7.0; sf_use_s2() is TRUE
library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
## 
##     recode
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(rnaturalearth)
## Warning: package 'rnaturalearth' was built under R version 4.5.2
#geretate .sh file
Sys.setenv(SHAPE_RESTORE_SHX = "YES")

#read the shapefile # dowload and make sure it is in the same directory as this notebook
kenya_counties <- st_read("County.shp")
## Reading layer `County' from data source 
##   `C:\Users\DELL\OneDrive\Desktop\msc_year2\Health_informatics\County.shp' 
##   using driver `ESRI Shapefile'
## Simple feature collection with 47 features and 0 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: 33.91182 ymin: -4.702271 xmax: 41.90626 ymax: 5.430648
## CRS:           NA
names(kenya_counties)
## [1] "geometry"
#create the dataframe to match the counties with blood pressure incidence data
#Sample blood pressure incidence data for selected cities in Kenya
bp_data <- data.frame(
  county = c("Nairobi", "Mombasa", "Kisumu", "Nakuru", "Eldoret"),
  lon = c(36.8219, 39.6682, 34.7617, 36.0667, 35.2698),
  lat = c(-1.2921, -4.0435, -0.0917, -0.3031, 0.5143),
  incidence = c(28.4, 25.1, 30.2, 27.0, 26.3)
)

bp_sf <- st_as_sf(
  bp_data,
  coords = c("lon", "lat"),
  crs = 4326
)
# Load Kenya boundary (sf object)
kenya <- ne_countries(country = "Kenya", returnclass = "sf")

# Plot map with BP incidence points
ggplot() +
  geom_sf(data = kenya, fill = "gray95", color = "gray40") +
  geom_sf(
    data = bp_sf,
    aes(size = incidence, color = incidence),
    alpha = 0.8
  ) +
  scale_color_viridis_c(name = "BP Incidence (%)") +
  scale_size_continuous(name = "BP Incidence (%)") +
  labs(
    title = "Blood Pressure Incidence by City in Kenya",
    subtitle = "Selected Counties",
    caption = "Source: Your dataset"
  ) +
  theme_minimal()