1 1. load and read the CSV file

# Global knitr chunk options for the chunks that follow: echo the code in the
# rendered report and suppress warnings and messages.
knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE)

library(tidyverse)
## Warning: package 'tidyr' was built under R version 4.5.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.2.0     ✔ readr     2.2.0
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.2     ✔ tibble    3.3.0
## ✔ lubridate 1.9.5     ✔ tidyr     1.3.2
## ✔ purrr     1.2.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(knitr)

# Load the survey responses from the CSV file (path is relative to the
# working directory).
water_data <- read.csv("water_access_and_waiting_time.csv")

# Preview the first rows in the rendered output.
head(water_data)

# View() opens an interactive data viewer, so only call it when a person is
# driving the session; a knitted (non-interactive) run has nothing to show it
# in.
if (interactive()) {
  View(water_data)
}

2 2. Data Cleaning

# Replace the raw CSV headers with consistent, analysis-friendly names.
# The names are assigned purely by position, so fail fast if the file does
# not have the expected number of columns instead of mislabelling the data.
column_names <- c("Sex", "Resident_type", "Water_source_type",
                  "Waiting_minutes", "Trips_per_day", "Water_shortage_days",
                  "Monthly_cost_UGX", "Age", "Household_size",
                  "Distance_meters")
stopifnot(
  "water_data does not have the expected number of columns" =
    ncol(water_data) == length(column_names)
)
names(water_data) <- column_names
names(water_data)
##  [1] "Sex"                 "Resident_type"       "Water_source_type"  
##  [4] "Waiting_minutes"     "Trips_per_day"       "Water_shortage_days"
##  [7] "Monthly_cost_UGX"    "Age"                 "Household_size"     
## [10] "Distance_meters"
# Convert the categorical survey answers to factors.
categorical_columns <- c("Resident_type", "Sex", "Water_source_type")
water_data[categorical_columns] <- lapply(
  water_data[categorical_columns],
  factor
)

# Map the lower-case spellings onto the capitalised labels so each category
# has a single level.
water_data$Resident_type <- recode(water_data$Resident_type,
                                   "hostel" = "Hostel",
                                   "rent" = "Rent",
                                   "home" = "Home")
water_data$Water_source_type <- recode(water_data$Water_source_type,
                                       "tank" = "Tank",
                                       "river" = "River",
                                       "pond" = "Pond",
                                       "tap" = "Tap",
                                       "well" = "Well")

# Shortage indicator: "Yes" when at least one shortage day was reported.
water_data$Shortage <- ifelse(water_data$Water_shortage_days > 0, "Yes", "No")

# View() opens an interactive data viewer, so skip it when the document is
# knitted or the script is run non-interactively.
if (interactive()) {
  View(water_data)
}

# Inspect the column types of the cleaned data frame.
str(water_data)
## 'data.frame':    49 obs. of  11 variables:
##  $ Sex                : Factor w/ 2 levels "F","M": 1 1 2 1 2 2 2 1 2 1 ...
##  $ Resident_type      : Factor w/ 3 levels "Home","Hostel",..: 2 3 3 2 2 2 2 2 2 1 ...
##  $ Water_source_type  : Factor w/ 5 levels "Pond","River",..: 3 3 3 3 3 3 3 3 3 4 ...
##  $ Waiting_minutes    : int  12 13 10 18 15 10 13 11 16 6 ...
##  $ Trips_per_day      : int  1 1 1 4 4 3 4 5 2 4 ...
##  $ Water_shortage_days: int  3 0 5 2 0 4 0 3 1 2 ...
##  $ Monthly_cost_UGX   : int  10 8 8 12 12 8 9 14 15 160 ...
##  $ Age                : int  24 23 23 21 25 32 31 23 23 40 ...
##  $ Household_size     : int  5 4 3 3 19 3 20 8 3 2 ...
##  $ Distance_meters    : int  50 0 200 80 10 60 0 150 15 35 ...
##  $ Shortage           : chr  "Yes" "No" "Yes" "Yes" ...
# Count the missing values in each column.
colSums(is.na(water_data))
##                 Sex       Resident_type   Water_source_type     Waiting_minutes 
##                   0                   0                   0                   0 
##       Trips_per_day Water_shortage_days    Monthly_cost_UGX                 Age 
##                   0                   0                   0                   0 
##      Household_size     Distance_meters            Shortage 
##                   0                   0                   0

3 3. Summarising columns

# Quick numeric summaries of the two key variables.
summary(water_data[["Waiting_minutes"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    6.00    9.00   12.00   13.63   15.00   40.00
summary(water_data[["Trips_per_day"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   1.000   2.000   2.653   4.000   6.000

# Collect the same statistics into one table, one row per variable.
summary_table <- with(
  water_data,
  data.frame(
    Variable = c("Waiting Time (minutes)", "Trips per Day"),
    Min = c(min(Waiting_minutes), min(Trips_per_day)),
    Q1 = c(
      quantile(Waiting_minutes, 0.25, names = FALSE),
      quantile(Trips_per_day, 0.25, names = FALSE)
    ),
    Median = c(median(Waiting_minutes), median(Trips_per_day)),
    Mean = c(mean(Waiting_minutes), mean(Trips_per_day)),
    Q3 = c(
      quantile(Waiting_minutes, 0.75, names = FALSE),
      quantile(Trips_per_day, 0.75, names = FALSE)
    ),
    Max = c(max(Waiting_minutes), max(Trips_per_day))
  )
)

# Render the table for the R Markdown report.
kable(summary_table, caption = "Summary Statistics for Waiting Time and Trips")
Summary Statistics for Waiting Time and Trips
Variable Min Q1 Median Mean Q3 Max
Waiting Time (minutes) 6 9 12 13.632653 15 40
Trips per Day 1 1 2 2.653061 4 6

4 4. Comparison

# Per-residence-type averages, shortage rate (% of respondents with at least
# one shortage day) and group size. The statistics are computed first and
# rounded to one decimal place in a separate step.
comparison <- water_data |>
  group_by(Resident_type) |>
  summarise(
    Avg_Waiting = mean(Waiting_minutes),
    Avg_Trips = mean(Trips_per_day),
    Avg_Cost = mean(Monthly_cost_UGX),
    Shortage_Rate = 100 * mean(Water_shortage_days > 0),
    Count = n()
  ) |>
  mutate(
    across(
      c(Avg_Waiting, Avg_Trips, Avg_Cost, Shortage_Rate),
      \(stat) round(stat, 1)
    )
  )

kable(comparison, caption = "Water Access by Residence Type")
Water Access by Residence Type
Resident_type Avg_Waiting Avg_Trips Avg_Cost Shortage_Rate Count
Home 11.8 3.8 114.7 53.8 13
Hostel 15.3 2.5 14.0 68.4 19
Rent 13.2 1.9 11.9 76.5 17

5 5. proportion reporting shortages

# Share of all respondents with ("Yes") and without ("No") a shortage.
overall_shortage <- proportions(table(water_data[["Shortage"]]))
overall_shortage |>
  kable(caption = "Overall Shortage Proportion")
Overall Shortage Proportion
Var1 Freq
No 0.3265306
Yes 0.6734694
# Cross-tabulate residence type (rows) against the shortage indicator
# (columns).
shortage_by_type <- table(
  water_data[["Resident_type"]],
  water_data[["Shortage"]]
)
shortage_by_type |>
  kable(caption = "Shortage by Residence Type")
Shortage by Residence Type
No Yes
Home 6 7
Hostel 6 13
Rent 4 13
# Row percentages: within each residence type, the share of respondents with
# and without a shortage. Built from the existing residence-by-shortage
# counts.
shortage_percent <- 100 * proportions(shortage_by_type, margin = 1)
shortage_percent |>
  round(1) |>
  kable(caption = "Shortage Percentage by Residence Type")
Shortage Percentage by Residence Type
No Yes
Home 46.2 53.8
Hostel 31.6 68.4
Rent 23.5 76.5

6 6. visualisation

# Histogram of waiting times in 5-minute bins.
water_data |>
  ggplot(aes(x = Waiting_minutes)) +
  geom_histogram(binwidth = 5, fill = "lightyellow", color = "maroon") +
  ggtitle("Distribution of Water Waiting Time") +
  xlab("Waiting Time (minutes)") +
  ylab("Number of Students") +
  theme_bw()

# Box plots of waiting time, one box per residence type.
water_data |>
  ggplot(
    aes(x = Resident_type, y = Waiting_minutes, fill = Resident_type)
  ) +
  geom_boxplot() +
  ggtitle("Waiting Time by Residence Type") +
  xlab("Residence Type") +
  ylab("Waiting Time (minutes)") +
  theme_classic()

# Stacked bars scaled to 1, showing the Yes/No shortage split within each
# residence type.
water_data |>
  ggplot(aes(x = Resident_type, fill = Shortage)) +
  geom_bar(position = "fill") +
  ggtitle("Proportion of Students Experiencing Water Shortage") +
  xlab("Residence Type") +
  ylab("Proportion") +
  labs(fill = "Shortage") +
  theme_classic()

# Scatter plot of waiting time against monthly water cost, coloured by
# residence type.
water_data |>
  ggplot(
    aes(x = Monthly_cost_UGX, y = Waiting_minutes, color = Resident_type)
  ) +
  geom_point(size = 3) +
  ggtitle("Waiting Time vs Monthly Water Cost") +
  xlab("Monthly Cost (UGX 000)") +
  ylab("Waiting Time (minutes)") +
  labs(color = "Residence Type") +
  theme_minimal()

# Scatter plot of waiting time against distance to the water source,
# coloured by residence type.
water_data |>
  ggplot(
    aes(x = Distance_meters, y = Waiting_minutes, color = Resident_type)
  ) +
  geom_point(size = 3) +
  ggtitle("Waiting Time vs Distance to Water Source") +
  xlab("Distance (meters)") +
  ylab("Waiting Time (minutes)") +
  labs(color = "Residence Type") +
  theme_minimal()

7 7. correlations

# Correlations (cor() defaults) between waiting time and its two candidate
# drivers: monthly cost and distance to the source.
cor_wait_cost <- cor(water_data$Waiting_minutes, water_data$Monthly_cost_UGX)

cor_dist_wait <- cor(water_data$Distance_meters, water_data$Waiting_minutes)

# Label correlation coefficients by magnitude and direction so the "Strength"
# column always agrees with the computed values instead of being typed in by
# hand. Cut-offs (|r| < 0.3 weak, < 0.7 moderate, otherwise strong) are a
# rule of thumb -- adjust if the report follows a different convention.
describe_correlation <- function(r) {
  magnitude <- cut(
    abs(r),
    breaks = c(0, 0.3, 0.7, 1),
    labels = c("Weak", "Moderate", "Strong"),
    right = FALSE,
    include.lowest = TRUE
  )
  label <- paste(magnitude, ifelse(r < 0, "Negative", "Positive"))
  label[is.na(r)] <- NA_character_      # undefined correlation stays missing
  label[!is.na(r) & r == 0] <- "None"   # exactly zero has no direction
  label
}

correlations <- c(cor_wait_cost, cor_dist_wait)

# NOTE(review): the Interpretation text is authored for the current results
# (weak negative / weak positive); revisit it if the Strength labels change.
correlation_table <- data.frame(
  Relationship = c("Waiting Time vs Monthly Cost", "Distance vs Waiting Time"),
  Correlation = round(correlations, 2),
  Strength = describe_correlation(correlations),
  Interpretation = c("Higher cost = Slight less waiting", "Farther distance = Slight longer waiting")
)

kable(correlation_table, caption = "Key Relationships")
Key Relationships
Relationship Correlation Strength Interpretation
Waiting Time vs Monthly Cost -0.17 Weak Negative Higher cost = Slight less waiting
Distance vs Waiting Time 0.17 Weak Positive Farther distance = Slight longer waiting

8 8. Interpretations and Recommendations

8.0.1 a. Residence Type Matters

Hostel students wait the longest (15.3 minutes per trip). Home students wait the least (11.8 minutes per trip) but pay much more. Rent students fall in between (13.2 minutes per trip).

8.0.2 b. Shortages Affect Hostel and Rent Students Most

  • 68.4% of Hostel students experienced shortages
  • 76.5% of Rent students experienced shortages
  • Only 53.8% of Home students experienced shortages

8.0.3 c. Money Drives Waiting Time

  • Correlation -0.17 between waiting time and monthly cost
  • Students who pay more for water wait slightly less, but the relationship is weak

8.0.4 d. Distance Drives Waiting Time

  • Correlation +0.17 between distance and waiting time
  • Students who walk farther wait slightly longer, but the relationship is weak

8.1 Implications for Student Welfare

Hostel students lose approximately 30 minutes per day fetching water, which leaves less time for study and may hurt academic performance. Rent students face both long waits (13.2 min) and high shortage rates (76.5%). Home students pay more for water than the others but still experience shortages.

8.2 Infrastructure Recommendations

  1. Install piped water in university hostels and rentals to reduce waiting time and shortages
  2. Reduce overcrowding at existing water points to reduce waiting time for all
  3. Establish other nearby water access points for Renting students to reduce waiting time and shortages
  4. Consider water subsidies for students in Hostel and Rent accommodations

9 9. Challenges Faced

  1. Data collection: Some students were not willing to participate in the survey
  2. Data cleaning: Fixing inconsistent residence type and water source type entries
  3. Cost reporting: Some students did not know exact monthly water costs
  4. Time constraints: Limited time to collect sufficient data

10 10. Conclusion

This study shows that water access differs noticeably by residence type. Hostel and Rent students face longer waiting times and higher shortage rates than Home students, who in turn pay far more for water. Students therefore trade off time, money and reliability of supply when accessing water. Improving water infrastructure in hostels and rented accommodation would greatly benefit student welfare and academic performance.