library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(reshape2)
# Preview the first rows of the data frame to confirm the columns loaded.
df %>%
  head() %>%
  print()
## period county Total.Dewormed Acute.Malnutrition
## 1 Jan-23 Baringo County 3659 8
## 2 Jan-23 Bomet County 1580 NA
## 3 Jan-23 Bungoma County 6590 24
## 4 Jan-23 Busia County 7564 NA
## 5 Jan-23 Elgeyo Marakwet County 1407 NA
## 6 Jan-23 Embu County 3241 72
## stunted.6.23.months stunted.0..6.months stunted.24.59.months diarrhoea.cases
## 1 471 34 380 2620
## 2 1 3 NA 1984
## 3 98 154 23 4576
## 4 396 143 111 2239
## 5 92 71 5 2739
## 6 326 86 24 1376
## Underweight.0..6.months Underweight.6.23.months Underweight.24.59.Months
## 1 85 739 731
## 2 41 86 16
## 3 231 315 120
## 4 251 608 125
## 5 57 104 21
## 6 141 544 160
# Per-column summary statistics for the data frame.
df %>%
  summary() %>%
  print()
## period county Total.Dewormed Acute.Malnutrition
## Length:1410 Length:1410 Min. : 97 Min. : 1.0
## Class :character Class :character 1st Qu.: 2454 1st Qu.: 15.0
## Mode :character Mode :character Median : 4564 Median : 39.0
## Mean : 11458 Mean : 125.4
## 3rd Qu.: 8222 3rd Qu.: 143.5
## Max. :392800 Max. :4123.0
## NA's :355
## stunted.6.23.months stunted.0..6.months stunted.24.59.months diarrhoea.cases
## Min. : 1.0 Min. : 1.0 Min. : 1.0 Min. : 198
## 1st Qu.: 69.5 1st Qu.: 36.5 1st Qu.: 22.0 1st Qu.: 1464
## Median : 159.0 Median : 84.0 Median : 50.0 Median : 2158
## Mean : 280.2 Mean : 139.8 Mean : 110.8 Mean : 2813
## 3rd Qu.: 328.5 3rd Qu.: 157.0 3rd Qu.: 114.2 3rd Qu.: 3335
## Max. :4398.0 Max. :7900.0 Max. :3169.0 Max. :15795
## NA's :11 NA's :19 NA's :14
## Underweight.0..6.months Underweight.6.23.months Underweight.24.59.Months
## Min. : 6.0 Min. : 16.0 Min. : 1.00
## 1st Qu.: 87.0 1st Qu.: 249.0 1st Qu.: 51.25
## Median : 162.5 Median : 456.0 Median : 120.50
## Mean : 223.5 Mean : 652.3 Mean : 305.74
## 3rd Qu.: 272.8 3rd Qu.: 791.8 3rd Qu.: 311.00
## Max. :1937.0 Max. :5348.0 Max. :4680.00
##
Checking for missing values (numeric columns are first imputed with the column median)
# Impute missing values: every numeric column has its NAs replaced by that
# column's median, then the remaining NA count per column is printed.
# vapply() always yields a logical vector; sapply() would return an empty
# list for a zero-column data frame and break the subsetting below.
numeric_columns <- names(df)[vapply(df, is.numeric, logical(1))]
for (col in numeric_columns) {
  # Compute the NA mask once and reuse it for both the test and the fill.
  is_missing <- is.na(df[[col]])
  if (any(is_missing)) {
    df[[col]][is_missing] <- median(df[[col]], na.rm = TRUE)
  }
}
# After imputation this should report zero for every numeric column.
missing_values_count <- colSums(is.na(df))
print(missing_values_count)
## period county Total.Dewormed
## 0 0 0
## Acute.Malnutrition stunted.6.23.months stunted.0..6.months
## 0 0 0
## stunted.24.59.months diarrhoea.cases Underweight.0..6.months
## 0 0 0
## Underweight.6.23.months Underweight.24.59.Months
## 0 0
# Sort the data frame in ascending order of 'Total.Dewormed'. The sorted copy
# is kept because the following county plots read from df_sorted.
df_sorted <- df[order(df$Total.Dewormed), ]
# Collapse to one row per county. Plotting the raw rows stacks one segment per
# (county, period) record, colours each segment by its own monthly value, and
# lets reorder() rank counties by their mean instead of the plotted total.
county_dewormed <- df_sorted %>%
  group_by(county) %>%
  summarise(Total.Dewormed = sum(Total.Dewormed))
# Horizontal bar chart: bar length, bar order and fill all show the county
# total.
ggplot(
  county_dewormed,
  aes(
    x = Total.Dewormed,
    y = reorder(county, Total.Dewormed),
    fill = Total.Dewormed
  )
) +
  geom_col() +
  scale_fill_gradient(low = "blue", high = "red") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    x = "Total Dewormed",
    y = "County",
    title = "Total Dewormed per County Bar charts"
  )
Nairobi County has the largest number of people dewormed, followed by Kwale County.
Lamu County has the smallest number of people dewormed.
# Total diarrhoea cases per county, read from df_sorted (built earlier).
# Collapse to one row per county so that bar length, bar order and fill all
# describe the county total rather than individual monthly records stacked on
# top of each other.
county_diarrhoea <- df_sorted %>%
  group_by(county) %>%
  summarise(diarrhoea.cases = sum(diarrhoea.cases))
ggplot(
  county_diarrhoea,
  aes(
    x = diarrhoea.cases,
    y = reorder(county, diarrhoea.cases),
    fill = diarrhoea.cases
  )
) +
  geom_col() +
  scale_fill_gradient(low = "blue", high = "red") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    x = "No. of Diarrhoea cases",
    y = "County",
    title = "Total cases of Diarrhoea per County"
  )
Nairobi County and Kilifi County have the highest numbers of diarrhoea cases.
Vihiga County has the lowest number of diarrhoea cases.
# Total acute malnutrition cases per county, read from df_sorted (built
# earlier). Collapse to one row per county so that bar length, bar order and
# fill all describe the county total rather than stacked monthly records.
county_malnutrition <- df_sorted %>%
  group_by(county) %>%
  summarise(Acute.Malnutrition = sum(Acute.Malnutrition))
ggplot(
  county_malnutrition,
  aes(
    x = Acute.Malnutrition,
    y = reorder(county, Acute.Malnutrition),
    fill = Acute.Malnutrition
  )
) +
  geom_col() +
  scale_fill_gradient(low = "blue", high = "red") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  # The title previously referred to "Total Dewormed" (copy-paste slip); it
  # now names the variable that is actually plotted.
  labs(
    x = "No. of people with Acute Malnutrition",
    y = "County",
    title = "Total cases of Acute Malnutrition per County"
  )
#### Observation
Wajir County and Turkana County have the highest and second-highest numbers of acute malnutrition cases, respectively, compared to other counties.
Baringo County has the lowest number of acute malnutrition cases.
# Overall totals for the three stunting age bands.
t <- sum(df[["stunted.6.23.months"]])
m <- sum(df[["stunted.0..6.months"]])
w <- sum(df[["stunted.24.59.months"]])
# Bar heights and their axis labels, listed in the same order.
sums <- c(t, m, w)
categories <- c("6-23 months", "0-<6 months", "24-59 months")
# Base-graphics bar chart of the three totals.
barplot(
  height = sums,
  names.arg = categories,
  col = "skyblue",
  main = "Total Stunted Children in Different Age Groups",
  xlab = "Age Group",
  ylab = "Total Stunted"
)
# Overall totals for the three underweight age bands.
p <- sum(df[["Underweight.0..6.months"]])
o <- sum(df[["Underweight.6.23.months"]])
l <- sum(df[["Underweight.24.59.Months"]])
# Bar heights and their axis labels, listed in the same order.
sums <- c(p, o, l)
categories <- c("0-<6 months", "6-23 months", "24-59 months")
# Base-graphics bar chart of the three totals.
barplot(
  height = sums,
  names.arg = categories,
  col = "skyblue",
  main = "Total Underweight Children in Different Age Groups",
  xlab = "Age Group",
  ylab = "Total Underweight Children"
)
# Sort the data frame in ascending order of 'Total.Dewormed'.
df_sorted <- df[order(df$Total.Dewormed), ]
# Collapse to one row per period. Plotting the raw rows stacks one segment per
# (county, period) record, colours each segment by its own value, and lets
# reorder() rank periods by their mean instead of the plotted total.
period_dewormed <- df_sorted %>%
  group_by(period) %>%
  summarise(Total.Dewormed = sum(Total.Dewormed))
# Horizontal bar chart ranked by the period total (not chronological).
ggplot(
  period_dewormed,
  aes(
    x = Total.Dewormed,
    y = reorder(period, Total.Dewormed),
    fill = Total.Dewormed
  )
) +
  geom_col() +
  scale_fill_gradient(low = "blue", high = "red") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    x = "Total Dewormed",
    y = "Period",
    title = "Total Dewormed per Period Bar charts"
  )
#### Observation
Nov-22 is the month with the largest number of people dewormed, followed by Jun-22.
Jan-21 is the month with the smallest number of people dewormed.
# Extract the month and year from the 'period' column (e.g. "Jan-23").
# The column is parsed once and both parts are taken from the same result.
parsed_period <- parse_date_time(df$period, "my")
df <- df %>%
  mutate(
    month = month(parsed_period),
    year = year(parsed_period)
  )
# Rewrite 'period' as "<year>-<month name>", e.g. "2023-January".
df$period <- paste(df$year, month.name[df$month], sep = "-")
# Sum 'diarrhoea.cases' for each period and attach a real Date (first day of
# the month). The date is built from the numeric year/month columns, so it
# does not depend on the session locale the way parsing "%B" does.
df_sum <- df %>%
  group_by(period) %>%
  summarise(
    total_cases = sum(`diarrhoea.cases`),
    date = make_date(first(year), first(month), 1L)
  )
# Order chronologically by the full date (year and month), not by month
# number alone.
df_sum <- df_sum[order(df_sum$date), ]
# 'period' is text, so ggplot would lay it out alphabetically; fixing the
# factor levels to the chronological row order keeps the line in time order.
ggplot(
  df_sum,
  aes(
    x = factor(period, levels = period),
    y = total_cases,
    group = 1,
    color = total_cases
  )
) +
  geom_line() +
  scale_color_gradient(low = "blue", high = "red") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    x = "Month",
    y = "Total Diarrhoea cases",
    title = "Total cases of Diarrhoea per period"
  )
#### Observation
2023-May has the largest number of diarrhoea cases.
2021-January has the smallest number of diarrhoea cases.
# Stacked bar chart of stunting by age group and county (flipped so county
# names read horizontally).
# Load the necessary library
library(ggplot2)
# Map each legend label to its source column.
stunting_cols <- c(
  "Stunted 6-23 months" = "stunted.6.23.months",
  "Stunted 0-<6 months" = "stunted.0..6.months",
  "Stunted 24-59 months" = "stunted.24.59.months"
)
# Reshape to long format: one row per (record, age group). Separate geom_bar
# layers each start at zero and therefore overlap instead of stacking, so a
# single layer with a fill variable is needed for a true stacked chart.
stunting_long <- melt(
  df[, c("county", unname(stunting_cols))],
  id.vars = "county",
  variable.name = "age_group",
  value.name = "stunted"
)
stunting_long$age_group <- factor(
  stunting_long$age_group,
  levels = unname(stunting_cols),
  labels = names(stunting_cols)
)
# One total per county and age group.
stunting_totals <- stunting_long %>%
  group_by(county, age_group) %>%
  summarise(stunted = sum(stunted), .groups = "drop")
ggplot(stunting_totals, aes(x = county, y = stunted, fill = age_group)) +
  geom_col(position = "stack") +
  # Colours are matched to groups by name, so legend labels cannot drift away
  # from the data they describe (positional labels did exactly that).
  scale_fill_manual(
    values = c(
      "Stunted 6-23 months" = "red",
      "Stunted 0-<6 months" = "blue",
      "Stunted 24-59 months" = "green"
    )
  ) +
  labs(
    y = "Number of Stunted Children",
    x = "County ",
    title = "Stacked Bar Chart: Stunting by Age Group and County",
    fill = "Age group"
  ) +
  theme_minimal() +
  theme(axis.text.y = element_text(angle = 0, hjust = 1)) +
  coord_flip() +
  guides(fill = guide_legend(reverse = TRUE))
#### Observations
# Line chart of 'Total Dewormed' and 'diarrhoea cases' over time.
# Relies on the numeric 'year' and 'month' columns added to df earlier in the
# script when 'period' was parsed.
# Aggregate to one total per month for each measure and attach a real Date so
# the x axis is chronological.
trend <- df %>%
  group_by(year, month) %>%
  summarise(
    `Total Dewormed` = sum(`Total.Dewormed`),
    `Diarrhoea Cases` = sum(`diarrhoea.cases`),
    .groups = "drop"
  ) %>%
  mutate(date = make_date(year, month, 1L))
# Long format: one row per (month, series) so colour can be a mapped
# aesthetic. Constant colours set outside aes() never produce a legend and
# are ignored by scale_color_manual().
trend_long <- melt(
  as.data.frame(trend),
  id.vars = "date",
  measure.vars = c("Total Dewormed", "Diarrhoea Cases"),
  variable.name = "series",
  value.name = "count"
)
ggplot(trend_long, aes(x = date, y = count, colour = series)) +
  # 'linewidth' replaces the 'size' aesthetic for lines (ggplot2 >= 3.4.0).
  geom_line(linewidth = 1) +
  scale_colour_manual(
    values = c("Total Dewormed" = "skyblue", "Diarrhoea Cases" = "orange"),
    name = NULL
  ) +
  scale_x_date(date_breaks = "3 months", date_labels = "%b-%y") +
  # Add labels and title
  labs(
    x = "Period",
    y = "Number of Cases",
    title = "Line Chart: Total Dewormed and Diarrhoea Cases over Time"
  ) +
  # The complete theme goes first; placed last it would reset the tweaks
  # below.
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.position = "top"
  )
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
### Plotting correlation map
# Select only the numeric columns for correlation calculation.
# vapply() guarantees a logical selector; drop = FALSE keeps a data frame even
# if a single column remains.
numeric_cols <- df[, vapply(df, is.numeric, logical(1)), drop = FALSE]
# Exclude non-indicator columns. 'month' and 'year' are numeric helper columns
# derived from 'period' earlier in the script; left in, they would appear in
# the heatmap as if they were health indicators.
excluded_cols <- c("period", "county", "month", "year")
numeric_cols <- numeric_cols[
  , !(colnames(numeric_cols) %in% excluded_cols),
  drop = FALSE
]
# Calculate the correlation matrix
correlation_matrix <- cor(numeric_cols)
# Reshape the correlation matrix to long format (columns Var1, Var2, value)
correlation_data <- melt(correlation_matrix)
# Create the heatmap using ggplot and geom_tile()
ggplot(correlation_data, aes(x = Var1, y = Var2, fill = value)) +
  geom_tile() +
  scale_fill_gradient(low = "skyblue", high = "red") +
  labs(
    x = "Variable 1",
    y = "Variable 2",
    title = "Heatmap: Correlation Matrix of Numeric Variables"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
### Observation