Overview

Business Question

Importing Required Libraries

library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(reshape2)

Displaying the first few rows of the dataset

# Preview the first six rows (head()'s default) to inspect the column layout
print(head(df, n = 6L))
##   period                 county Total.Dewormed Acute.Malnutrition
## 1 Jan-23         Baringo County           3659                  8
## 2 Jan-23           Bomet County           1580                 NA
## 3 Jan-23         Bungoma County           6590                 24
## 4 Jan-23           Busia County           7564                 NA
## 5 Jan-23 Elgeyo Marakwet County           1407                 NA
## 6 Jan-23            Embu County           3241                 72
##   stunted.6.23.months stunted.0..6.months stunted.24.59.months diarrhoea.cases
## 1                 471                  34                  380            2620
## 2                   1                   3                   NA            1984
## 3                  98                 154                   23            4576
## 4                 396                 143                  111            2239
## 5                  92                  71                    5            2739
## 6                 326                  86                   24            1376
##   Underweight.0..6.months Underweight.6.23.months Underweight.24.59.Months
## 1                      85                     739                      731
## 2                      41                      86                       16
## 3                     231                     315                      120
## 4                     251                     608                      125
## 5                      57                     104                       21
## 6                     141                     544                      160

Summary of the Dataset

# Per-column summary: type information for character columns, quartiles,
# mean and NA counts for numeric ones
print(summary(object = df))
##     period             county          Total.Dewormed   Acute.Malnutrition
##  Length:1410        Length:1410        Min.   :    97   Min.   :   1.0    
##  Class :character   Class :character   1st Qu.:  2454   1st Qu.:  15.0    
##  Mode  :character   Mode  :character   Median :  4564   Median :  39.0    
##                                        Mean   : 11458   Mean   : 125.4    
##                                        3rd Qu.:  8222   3rd Qu.: 143.5    
##                                        Max.   :392800   Max.   :4123.0    
##                                                         NA's   :355       
##  stunted.6.23.months stunted.0..6.months stunted.24.59.months diarrhoea.cases
##  Min.   :   1.0      Min.   :   1.0      Min.   :   1.0       Min.   :  198  
##  1st Qu.:  69.5      1st Qu.:  36.5      1st Qu.:  22.0       1st Qu.: 1464  
##  Median : 159.0      Median :  84.0      Median :  50.0       Median : 2158  
##  Mean   : 280.2      Mean   : 139.8      Mean   : 110.8       Mean   : 2813  
##  3rd Qu.: 328.5      3rd Qu.: 157.0      3rd Qu.: 114.2       3rd Qu.: 3335  
##  Max.   :4398.0      Max.   :7900.0      Max.   :3169.0       Max.   :15795  
##  NA's   :11          NA's   :19          NA's   :14                          
##  Underweight.0..6.months Underweight.6.23.months Underweight.24.59.Months
##  Min.   :   6.0          Min.   :  16.0          Min.   :   1.00         
##  1st Qu.:  87.0          1st Qu.: 249.0          1st Qu.:  51.25         
##  Median : 162.5          Median : 456.0          Median : 120.50         
##  Mean   : 223.5          Mean   : 652.3          Mean   : 305.74         
##  3rd Qu.: 272.8          3rd Qu.: 791.8          3rd Qu.: 311.00         
##  Max.   :1937.0          Max.   :5348.0          Max.   :4680.00         
## 

Imputing and checking for missing values

# Impute missing values: every NA in a numeric column is replaced by that
# column's median, computed over the non-missing entries. Non-numeric columns
# are left untouched.
# vapply() always yields a logical mask; sapply() returns an empty list for a
# zero-column data frame, which would make the names() subscript below fail.
is_numeric_col <- vapply(df, is.numeric, logical(1))
numeric_columns <- names(df)[is_numeric_col]
for (col in numeric_columns) {
  # Compute the NA mask once and reuse it for both the test and the assignment
  is_missing <- is.na(df[[col]])
  if (any(is_missing)) {
    df[[col]][is_missing] <- median(df[[col]], na.rm = TRUE)
  }
}

# Verify the imputation: number of NAs remaining in each column
missing_values_count <- colSums(is.na(df))
print(missing_values_count)
##                   period                   county           Total.Dewormed 
##                        0                        0                        0 
##       Acute.Malnutrition      stunted.6.23.months      stunted.0..6.months 
##                        0                        0                        0 
##     stunted.24.59.months          diarrhoea.cases  Underweight.0..6.months 
##                        0                        0                        0 
##  Underweight.6.23.months Underweight.24.59.Months 
##                        0                        0

Exploratory Data Analysis

Univariate Analysis

a) Visualization according to counties

1.0 Total Dewormed per County

# Sort the data frame in ascending order of 'Total Dewormed'.
# `df_sorted` is also the input of the other per-county plots below.
df_sorted <- df[order(df$`Total.Dewormed`), ]

# Collapse the data to one total per county before plotting. Feeding the raw
# rows to geom_bar(stat = "identity") stacks one segment per row, colours each
# segment by its own value, and orders the bars by the per-row mean (the
# default summary used by reorder()) rather than by the total the bar shows.
county_dewormed <- df_sorted %>%
  group_by(county) %>%
  summarise(total_dewormed = sum(`Total.Dewormed`, na.rm = TRUE))

# Horizontal bar chart: one bar per county, ordered and coloured by its total
ggplot(county_dewormed,
       aes(x = total_dewormed,
           y = reorder(county, total_dewormed),
           fill = total_dewormed)) +
  geom_col() +
  scale_fill_gradient(low = "blue", high = "red") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(x = "Total Dewormed", y = "County", fill = "Total Dewormed",
       title = "Total Dewormed per County Bar charts")

Observation

  • Nairobi County has the largest number of people who have been dewormed, followed by Kwale county.

  • Lamu county has the least number of people who have been dewormed.

1.1 Total cases of Diarrhoea per County

# Collapse the data to one diarrhoea total per county before plotting.
# Plotting the raw rows stacks one segment per row, colours each segment by
# its own value, and orders the bars by the per-row mean (reorder()'s default)
# rather than by the total the bar shows.
county_diarrhoea <- df_sorted %>%
  group_by(county) %>%
  summarise(total_cases = sum(`diarrhoea.cases`, na.rm = TRUE))

# Horizontal bar chart: one bar per county, ordered and coloured by its total
ggplot(county_diarrhoea,
       aes(x = total_cases,
           y = reorder(county, total_cases),
           fill = total_cases)) +
  geom_col() +
  scale_fill_gradient(low = "blue", high = "red") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(x = "No. of Diarrhoea cases", y = "County", fill = "Diarrhoea cases",
       title = "Total cases of Diarrhoea per County")

Observation

  • Nairobi County and Kilifi County have the highest numbers of Diarrhoea cases.

  • Vihiga county has the least number of cases of people with Diarrhoea.

1.2 No. of people with Acute Malnutrition per County

# Collapse the data to one acute-malnutrition total per county before
# plotting. Plotting the raw rows stacks one segment per row, colours each
# segment by its own value, and orders the bars by the per-row mean
# (reorder()'s default) rather than by the total the bar shows.
county_malnutrition <- df_sorted %>%
  group_by(county) %>%
  summarise(total_cases = sum(`Acute.Malnutrition`, na.rm = TRUE))

# Horizontal bar chart: one bar per county, ordered and coloured by its total.
# The title previously read "Counties by Total Dewormed with Color Gradient",
# a leftover from the deworming plot; it now describes this chart.
ggplot(county_malnutrition,
       aes(x = total_cases,
           y = reorder(county, total_cases),
           fill = total_cases)) +
  geom_col() +
  scale_fill_gradient(low = "blue", high = "red") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(x = "No. of people with Acute Malnutrition", y = "County",
       fill = "Acute Malnutrition",
       title = "No. of people with Acute Malnutrition per County")

#### Observation

  • Wajir County and Turkana County have the highest numbers of Acute Malnutrition cases compared to other counties.

  • Baringo county has the least number of cases of Acute Malnutrition.

b) Visualization According to Age Group

2.0 Total Stunted Children in Different Age Groups

# Total number of stunted children in each age band, summed over all rows.
# The sums are listed in the same order as the axis labels in `categories`.
sums <- c(
  sum(df[["stunted.6.23.months"]]),
  sum(df[["stunted.0..6.months"]]),
  sum(df[["stunted.24.59.months"]])
)
categories <- c('6-23 months', '0-<6 months', '24-59 months')

# One bar per age band
barplot(
  height = sums,
  names.arg = categories,
  col = "skyblue",
  xlab = "Age Group",
  ylab = "Total Stunted",
  main = "Total Stunted Children in Different Age Groups"
)

Observation

  • The majority of stunted children are in the 6-23 months age group, while the 24-59 months age group has the fewest.

2.1 Total Underweight Children in Different Age Groups

# Columns holding the underweight counts, in the order the bars are drawn,
# and the axis label that goes with each of them
underweight_columns <- c(
  "Underweight.0..6.months",
  "Underweight.6.23.months",
  "Underweight.24.59.Months"
)
categories <- c('0-<6 months', '6-23 months', '24-59 months')

# Total number of underweight children per age band, summed over all rows
sums <- vapply(
  underweight_columns,
  function(column) sum(df[[column]]),
  numeric(1),
  USE.NAMES = FALSE
)

# One bar per age band
barplot(
  height = sums,
  names.arg = categories,
  col = "skyblue",
  xlab = "Age Group",
  ylab = "Total Underweight Children",
  main = "Total Underweight Children in Different Age Groups"
)

Observation

  • The majority of underweight children are in the 6-23 months age group, while the 0-<6 months age group has the fewest.

c) Visualization according to period

3.0 Total Dewormed per period

# Sort the data frame in ascending order of 'Total Dewormed'
df_sorted <- df[order(df$`Total.Dewormed`), ]

# Collapse the data to one total per period before plotting. Feeding the raw
# rows to geom_bar(stat = "identity") stacks one segment per row, colours each
# segment by its own value, and orders the bars by the per-row mean (the
# default summary used by reorder()) rather than by the total the bar shows.
period_dewormed <- df_sorted %>%
  group_by(period) %>%
  summarise(total_dewormed = sum(`Total.Dewormed`, na.rm = TRUE))

# Horizontal bar chart: one bar per period, ordered and coloured by its total
ggplot(period_dewormed,
       aes(x = total_dewormed,
           y = reorder(period, total_dewormed),
           fill = total_dewormed)) +
  geom_col() +
  scale_fill_gradient(low = "blue", high = "red") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(x = "Total Dewormed", y = "Period", fill = "Total Dewormed",
       title = "Total Dewormed per Period Bar charts")

#### Observation

  • Nov-22 is the month with largest number of people who have been dewormed, followed by Jun-22.

  • Jan-21 has the least number of people who have been dewormed.

3.1 Total cases of Diarrhoea per period

# Parse the month-year labels (e.g. "Jan-23") once and derive the numeric
# month and year columns from the parsed dates
parsed_period <- parse_date_time(df$period, orders = "my")
df <- df %>%
  mutate(month = month(parsed_period),
         year = year(parsed_period))

# Rewrite the label with the year before the month, e.g. "2023-January"
df$period <- paste(df$year, month.name[df$month], sep = "-")

# Sum the diarrhoea cases for each period. `year` and `month` are carried
# along so the first-of-month date is built from numbers with make_date();
# parsing the month name back with as.Date(format = "%B") depends on the
# session locale.
df_sum <- df %>%
  group_by(period, year, month) %>%
  summarise(total_cases = sum(`diarrhoea.cases`), .groups = "drop") %>%
  mutate(date = make_date(year, month, 1L)) %>%
  arrange(date)

# Freeze the chronological order in the factor levels. A character x-axis is
# sorted alphabetically by ggplot, which previously scrambled the time line
# regardless of the row order of `df_sum`.
df_sum$period <- factor(df_sum$period, levels = df_sum$period)

# Line plot of the period totals in chronological order
ggplot(df_sum, aes(x = period, y = total_cases, group = 1, color = total_cases)) +
  geom_line() +
  scale_color_gradient(low = "blue", high = "red") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(x = "Month", y = "Total Diarrhoea cases", title = "Total cases of Diarrhoea per period")

#### Observation

  • 2023-May has the largest number of Diarrhoea cases.

  • 2021-January has the least number of people with cases of Diarrhea.

Bivariate Analysis

# Stacked bar chart of stunting by age group and county
library(ggplot2)

# Reshape the three stunting columns into long format (one row per county,
# row of `df` and age group) so that a single layer can stack them. Three
# separate geom_bar() layers each start at zero and are drawn on top of one
# another instead of being stacked.
stunting_columns <- c(
  "stunted.6.23.months",
  "stunted.0..6.months",
  "stunted.24.59.months"
)
stunting_long <- melt(
  as.data.frame(df[, c("county", stunting_columns)]),
  id.vars = "county",
  measure.vars = stunting_columns,
  variable.name = "age_group",
  value.name = "stunted"
)

# Attach the display label to each column explicitly. Passing `labels =`
# positionally to scale_fill_manual() applies them to the alphabetically
# sorted levels, which put the wrong label on every legend entry.
stunting_long$age_group <- factor(
  stunting_long$age_group,
  levels = stunting_columns,
  labels = c("Stunted 6-23 months", "Stunted 0-<6 months", "Stunted 24-59 months")
)

ggplot(stunting_long, aes(x = county, y = stunted, fill = age_group)) +
  geom_col(position = "stack") +
  # Named values tie each colour to its age group regardless of level order
  scale_fill_manual(values = c("Stunted 6-23 months" = "red",
                               "Stunted 0-<6 months" = "blue",
                               "Stunted 24-59 months" = "green"),
                    name = "Age group") +
  labs(y = "Number of Stunted Children", x = "County ", title = "Stacked Bar Chart: Stunting by Age Group and County") +
  theme_minimal() +
  theme(axis.text.y = element_text(angle = 0, hjust = 1)) +
  coord_flip() +
  guides(fill = guide_legend(reverse = TRUE))

#### Observations

Line Chart: Total Dewormed and Diarrhoea Cases over Time

# Line chart of 'Total Dewormed' and 'diarrhoea cases' per period.
#
# Total both measures per period first: `df` holds many rows per period, so
# drawing the raw rows produces a zig-zag rather than one point per period.
# `year` and `month` (derived from `period` earlier in the analysis) give the
# chronological order.
period_totals <- df %>%
  group_by(year, month, period) %>%
  summarise(total_dewormed = sum(`Total.Dewormed`, na.rm = TRUE),
            diarrhoea_cases = sum(`diarrhoea.cases`, na.rm = TRUE),
            .groups = "drop") %>%
  arrange(year, month)

# Freeze the chronological order; a character axis would be sorted
# alphabetically by ggplot
period_totals$period <- factor(period_totals$period,
                               levels = unique(period_totals$period))

# Long format: one row per period and series, so colour can be mapped inside
# aes() and ggplot builds the legend itself
period_long <- melt(
  as.data.frame(period_totals),
  id.vars = "period",
  measure.vars = c("total_dewormed", "diarrhoea_cases"),
  variable.name = "series",
  value.name = "count"
)
period_long$series <- factor(
  period_long$series,
  levels = c("total_dewormed", "diarrhoea_cases"),
  labels = c("Total Dewormed", "Diarrhoea Cases")
)

# Period on the x-axis and counts on the y-axis, matching the axis labels
ggplot(period_long, aes(x = period, y = count, color = series, group = series)) +
  # `linewidth` replaces the `size` aesthetic deprecated for lines in
  # ggplot2 3.4.0
  geom_line(linewidth = 1) +

  # Named values tie each colour to its series; no legend title
  scale_color_manual(values = c("Total Dewormed" = "skyblue",
                                "Diarrhoea Cases" = "orange"),
                     name = NULL) +

  # Add labels and title
  labs(x = 'Period', y = 'Number of Cases', title = 'Line Chart: Total Dewormed and Diarrhoea Cases over Time') +

  # The complete theme goes first: adding theme_minimal() last replaces any
  # theme() settings added before it
  theme_minimal() +

  # Rotate x-axis labels and show the legend above the plot
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        legend.position = 'top')
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

Observations

  • May-2023 is the month with the largest number of Total Dewormed and Diarrhoea Cases compared with the other months, while August-2022 has the least.
### Plotting correlation map
# Identify the numeric columns. vapply() always returns a logical vector,
# whereas sapply() changes its return type with the input.
is_numeric_col <- vapply(df, is.numeric, logical(1))

# Exclude the identifier columns and the `month`/`year` columns derived from
# `period`: they are numeric but are calendar fields, not health indicators,
# and would otherwise appear in the matrix.
excluded_columns <- c("period", "county", "month", "year")
indicator_columns <- setdiff(names(df)[is_numeric_col], excluded_columns)
numeric_cols <- df[, indicator_columns, drop = FALSE]

# Calculate the correlation matrix
correlation_matrix <- cor(numeric_cols)

# Reshape the correlation matrix to long format (columns Var1, Var2, value)
correlation_data <- melt(correlation_matrix)

# Heatmap of the correlations. The diverging scale is centred on 0 and fixed
# to [-1, 1], so a colour means the same correlation whatever range the data
# happen to cover.
ggplot(correlation_data, aes(x = Var1, y = Var2, fill = value)) +
  geom_tile() +
  scale_fill_gradient2(low = "skyblue", mid = "white", high = "red",
                       midpoint = 0, limits = c(-1, 1)) +
  labs(x = "Variable 1", y = "Variable 2", title = "Heatmap: Correlation Matrix of Numeric Variables") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

### Observation

  • Diarrhoea cases is more correlated with Underweight 0-6 months and Underweight 6-23 months

Conclusion