1. Load and read the
CSV file
# Chunk defaults for the report: echo the code, but hide warnings and messages.
# NOTE(review): the tidyverse startup banner and a package-version warning are
# still printed right after this call, so these options presumably only take
# effect in later chunks — confirm, and consider a dedicated setup chunk.
knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE)
# Attach the tidyverse packages (dplyr, ggplot2, forcats, ...) used below.
library(tidyverse)
## Warning: package 'tidyr' was built under R version 4.5.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.2.0 ✔ readr 2.2.0
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.2 ✔ tibble 3.3.0
## ✔ lubridate 1.9.5 ✔ tidyr 1.3.2
## ✔ purrr 1.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(knitr)
# Load the survey responses; the relative path is resolved against the
# working directory of the R session.
water_data <- read.csv("water_access_and_waiting_time.csv")
# Print the first rows so the raw layout is visible in the report.
head(water_data)
# View() opens a GUI data viewer, which only makes sense in an interactive
# session; skip it when the document is knitted or run as a script.
if (interactive()) {
  View(water_data)
}
2. Data Cleaning
# Replace the raw CSV headers with short, consistent analysis names.
# The names are assigned by position, one per column.
water_data <- setNames(
  water_data,
  c(
    "Sex", "Resident_type", "Water_source_type", "Waiting_minutes",
    "Trips_per_day", "Water_shortage_days", "Monthly_cost_UGX", "Age",
    "Household_size", "Distance_meters"
  )
)
# Print the names to confirm the rename.
colnames(water_data)
## [1] "Sex" "Resident_type" "Water_source_type"
## [4] "Waiting_minutes" "Trips_per_day" "Water_shortage_days"
## [7] "Monthly_cost_UGX" "Age" "Household_size"
## [10] "Distance_meters"
# Convert the categorical answers to factors, then fold the lower-case
# spellings of residence type and water source into their capitalised form.
water_data <- water_data |>
  mutate(
    across(c(Resident_type, Sex, Water_source_type), factor),
    Resident_type = recode(
      Resident_type,
      "hostel" = "Hostel",
      "rent" = "Rent",
      "home" = "Home"
    ),
    Water_source_type = recode(
      Water_source_type,
      "tank" = "Tank",
      "river" = "River",
      "pond" = "Pond",
      "tap" = "Tap",
      "well" = "Well"
    )
  )
# Create a shortage indicator: "Yes" when at least one shortage day was
# reported, "No" otherwise.
water_data$Shortage <- ifelse(water_data$Water_shortage_days > 0, "Yes", "No")
# View() opens a GUI data viewer, which only makes sense in an interactive
# session; skip it when the document is knitted or run as a script.
if (interactive()) {
  View(water_data)
}
# Print the column types to confirm the cleaning steps.
str(water_data)
## 'data.frame': 49 obs. of 11 variables:
## $ Sex : Factor w/ 2 levels "F","M": 1 1 2 1 2 2 2 1 2 1 ...
## $ Resident_type : Factor w/ 3 levels "Home","Hostel",..: 2 3 3 2 2 2 2 2 2 1 ...
## $ Water_source_type : Factor w/ 5 levels "Pond","River",..: 3 3 3 3 3 3 3 3 3 4 ...
## $ Waiting_minutes : int 12 13 10 18 15 10 13 11 16 6 ...
## $ Trips_per_day : int 1 1 1 4 4 3 4 5 2 4 ...
## $ Water_shortage_days: int 3 0 5 2 0 4 0 3 1 2 ...
## $ Monthly_cost_UGX : int 10 8 8 12 12 8 9 14 15 160 ...
## $ Age : int 24 23 23 21 25 32 31 23 23 40 ...
## $ Household_size : int 5 4 3 3 19 3 20 8 3 2 ...
## $ Distance_meters : int 50 0 200 80 10 60 0 150 15 35 ...
## $ Shortage : chr "Yes" "No" "Yes" "Yes" ...
# Count the missing values in every column.
water_data |>
  is.na() |>
  colSums()
## Sex Resident_type Water_source_type Waiting_minutes
## 0 0 0 0
## Trips_per_day Water_shortage_days Monthly_cost_UGX Age
## 0 0 0 0
## Household_size Distance_meters Shortage
## 0 0 0
3. Summarising
columns
# Min, quartiles, median, mean and max of the waiting time (minutes).
summary(water_data[["Waiting_minutes"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 6.00 9.00 12.00 13.63 15.00 40.00
# The same summary for the number of water trips per day.
summary(water_data[["Trips_per_day"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 1.000 2.000 2.653 4.000 6.000
# Descriptive statistics for waiting time and daily trips, one row per
# variable, evaluated directly against the columns of water_data.
summary_table <- with(
  water_data,
  data.frame(
    Variable = c("Waiting Time (minutes)", "Trips per Day"),
    Min = c(min(Waiting_minutes), min(Trips_per_day)),
    Q1 = c(
      quantile(Waiting_minutes, probs = 0.25),
      quantile(Trips_per_day, probs = 0.25)
    ),
    Median = c(median(Waiting_minutes), median(Trips_per_day)),
    Mean = c(mean(Waiting_minutes), mean(Trips_per_day)),
    Q3 = c(
      quantile(Waiting_minutes, probs = 0.75),
      quantile(Trips_per_day, probs = 0.75)
    ),
    Max = c(max(Waiting_minutes), max(Trips_per_day))
  )
)
# Render the statistics as a captioned table in the knitted report.
kable(summary_table, caption = "Summary Statistics for Waiting Time and Trips")
Summary Statistics for Waiting Time and Trips
| Variable | Min | Q1 | Median | Mean | Q3 | Max |
|:--|--:|--:|--:|--:|--:|--:|
| Waiting Time (minutes) | 6 | 9 | 12 | 13.632653 | 15 | 40 |
| Trips per Day | 1 | 1 | 2 | 2.653061 | 4 | 6 |
4. Comparison
# Per-residence-type averages (rounded to one decimal), the percentage of
# respondents with at least one shortage day, and the group size.
comparison <- summarise(
  group_by(water_data, Resident_type),
  Avg_Waiting = round(mean(Waiting_minutes), digits = 1),
  Avg_Trips = round(mean(Trips_per_day), digits = 1),
  Avg_Cost = round(mean(Monthly_cost_UGX), digits = 1),
  Shortage_Rate = round(100 * mean(Water_shortage_days > 0), digits = 1),
  Count = n()
)
# Render the comparison as a captioned table.
kable(comparison, caption = "Water Access by Residence Type")
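Water Access by Residence Type
| Resident_type | Avg_Waiting | Avg_Trips | Avg_Cost | Shortage_Rate | Count |
|:--|--:|--:|--:|--:|--:|
| Home | 11.8 | 3.8 | 114.7 | 53.8 | 13 |
| Hostel | 15.3 | 2.5 | 14.0 | 68.4 | 19 |
| Rent | 13.2 | 1.9 | 11.9 | 76.5 | 17 |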
5. proportion reporting
shortages
# Share of all respondents in each category of the shortage indicator.
overall_shortage <- water_data$Shortage |>
  table() |>
  prop.table()
kable(overall_shortage, caption = "Overall Shortage Proportion")
Overall Shortage Proportion
| Var1 | Freq |
|:--|--:|
| No | 0.3265306 |
| Yes | 0.6734694 |
# Cross-tabulate residence type (rows) against the shortage indicator (columns).
shortage_by_type <- table(
  water_data[["Resident_type"]],
  water_data[["Shortage"]]
)
kable(shortage_by_type, caption = "Shortage by Residence Type")
Shortage by Residence Type
|  | No | Yes |
|:--|--:|--:|
| Home | 6 | 7 |
| Hostel | 6 | 13 |
| Rent | 4 | 13 |
# Row-wise percentages: within each residence type, the share of respondents
# without and with a shortage (each row sums to 100).
shortage_percent <- prop.table(
  table(water_data[["Resident_type"]], water_data[["Shortage"]]),
  margin = 1
) * 100
kable(
  round(shortage_percent, digits = 1),
  caption = "Shortage Percentage by Residence Type"
)
Shortage Percentage by Residence Type
|  | No | Yes |
|:--|--:|--:|
| Home | 46.2 | 53.8 |
| Hostel | 31.6 | 68.4 |
| Rent | 23.5 | 76.5 |
6. visualisation
# Histogram of waiting times using 5-minute-wide bins.
ggplot(data = water_data, mapping = aes(x = Waiting_minutes)) +
  geom_histogram(binwidth = 5, fill = "lightyellow", colour = "maroon") +
  theme_bw() +
  labs(
    x = "Waiting Time (minutes)",
    y = "Number of Students",
    title = "Distribution of Water Waiting Time"
  )

# Box plots of waiting time, one box (and fill colour) per residence type.
ggplot(
  data = water_data,
  mapping = aes(x = Resident_type, y = Waiting_minutes, fill = Resident_type)
) +
  geom_boxplot() +
  theme_classic() +
  labs(
    x = "Residence Type",
    y = "Waiting Time (minutes)",
    title = "Waiting Time by Residence Type"
  )

# Stacked bars scaled to 1 (position = "fill"), so each bar shows the
# Yes/No shortage split within a residence type.
ggplot(
  data = water_data,
  mapping = aes(x = Resident_type, fill = Shortage)
) +
  geom_bar(position = "fill") +
  theme_classic() +
  labs(
    x = "Residence Type",
    y = "Proportion",
    fill = "Shortage",
    title = "Proportion of Students Experiencing Water Shortage"
  )

# Scatter plot of waiting time against monthly water cost, coloured by
# residence type.
ggplot(
  data = water_data,
  mapping = aes(
    x = Monthly_cost_UGX,
    y = Waiting_minutes,
    colour = Resident_type
  )
) +
  geom_point(size = 3) +
  theme_minimal() +
  labs(
    x = "Monthly Cost (UGX 000)",
    y = "Waiting Time (minutes)",
    colour = "Residence Type",
    title = "Waiting Time vs Monthly Water Cost"
  )

# Scatter plot of waiting time against distance to the water source,
# coloured by residence type.
ggplot(
  data = water_data,
  mapping = aes(
    x = Distance_meters,
    y = Waiting_minutes,
    colour = Resident_type
  )
) +
  geom_point(size = 3) +
  theme_minimal() +
  labs(
    x = "Distance (meters)",
    y = "Waiting Time (minutes)",
    colour = "Residence Type",
    title = "Waiting Time vs Distance to Water Source"
  )

7. correlations
# Correlation of waiting time with monthly cost and with distance
# (cor() defaults to the Pearson coefficient).
cor_wait_cost <- cor(water_data$Waiting_minutes, water_data$Monthly_cost_UGX)
cor_dist_wait <- cor(water_data$Distance_meters, water_data$Waiting_minutes)

# Label a coefficient by magnitude and sign so the table cannot contradict the
# values computed above if the data change.
# Rule of thumb used here: |r| < 0.3 weak, 0.3 to < 0.7 moderate, >= 0.7 strong.
# Returns NA for a missing coefficient.
describe_correlation <- function(r) {
  magnitude <- cut(
    abs(r),
    breaks = c(0, 0.3, 0.7, Inf),
    labels = c("Weak", "Moderate", "Strong"),
    right = FALSE,
    include.lowest = TRUE
  )
  direction <- ifelse(r < 0, "Negative", "Positive")
  ifelse(is.na(r), NA_character_, paste(magnitude, direction))
}

correlation_table <- data.frame(
  Relationship = c("Waiting Time vs Monthly Cost", "Distance vs Waiting Time"),
  Correlation = round(c(cor_wait_cost, cor_dist_wait), 2),
  Strength = describe_correlation(c(cor_wait_cost, cor_dist_wait)),
  # NOTE(review): the wording below is written by hand for a weak negative and
  # a weak positive result; revisit it if the Strength column changes.
  Interpretation = c("Higher cost = Slight less waiting", "Farther distance = Slight longer waiting")
)
# Render the relationships as a captioned table.
kable(correlation_table, caption = "Key Relationships")
Key Relationships
| Relationship | Correlation | Strength | Interpretation |
|:--|--:|:--|:--|
| Waiting Time vs Monthly Cost | -0.17 | Weak Negative | Higher cost = Slight less waiting |
| Distance vs Waiting Time | 0.17 | Weak Positive | Farther distance = Slight longer waiting |
8. Interpretations and
Recommendations
a. Residence Type
Matters
Hostel students wait the longest (15.3 minutes per
trip). Home students wait the least (11.8 minutes) but
pay much more. Rent students fall in between (13.2
minutes per trip).
b. Shortages Affect
Hostel and Rent Students Most
- 68.4% of Hostel students experienced shortages
- 76.5% of Rent students experienced shortages
- Only 53.8% of Home students experienced shortages
c. Money Drives
Waiting Time
- Correlation of -0.17 between waiting time and monthly
cost
- Students who pay more for water tend to wait slightly less
d. Distance Drives
Waiting Time
- Correlation of +0.17 between distance and waiting
time
- Students walking farther tend to wait slightly longer
Implications of
Student Welfare
Hostel students lose approximately 30–40 minutes
fetching water per day (about 15.3 minutes of waiting per trip × 2.5 trips),
leaving less time to study, which may lead to poorer performance.
Rent students face both long waits (13.2 min) and high
shortage rates (76.5%). Home students pay more for water
than others but still experience shortages.
Infrastructure
Recommendations
- Install piped water in university hostels and
rentals to reduce waiting time and shortages
- Reduce overcrowding at existing water points to
reduce waiting time for all
- Establish other nearby water access points for
Renting students to reduce waiting time and shortages
- Consider water subsidies for students in Hostel and
Rent accommodations
9. Challenges
Faced
- Data collection: Some students were not willing to
participate in the survey
- Data cleaning: Fixing inconsistent residence type
and water source type entries
- Cost reporting: Some students did not know their exact
monthly water costs
- Time constraints: Limited time to collect sufficient
data
10. Conclusion
This study shows that residence type is clearly associated with differences in
water access for students. Hostel and Rent students face longer
waiting times and higher shortage rates compared to Home students. There
is a clear trade-off between time, money and shortages
in water access. Improving water infrastructure in hostels and rented
accommodations would greatly benefit student welfare and academic
performance.