# Install / load needed packages
#install.packages(c("outbreaks", "tidyverse", "lubridate"))
#library(outbreaks)
#library(tidyverse)
#library(lubridate)
# IMPORTING DATASETS
# Read the Ebola line list; the relative path is resolved against the
# current working directory.
df <- read.csv(file = "ebola_sierra_leone.csv")
# Preview the first six records
head(x = df, n = 6L)
## id age sex status date_of_onset date_of_sample district
## 1 92 6 M confirmed 2014-06-10 2014-06-15 Kailahun
## 2 51 46 F confirmed 2014-05-30 2014-06-04 Kailahun
## 3 230 NA M confirmed 2014-06-26 2014-06-30 Kenema
## 4 139 25 F confirmed 2014-06-13 2014-06-18 Kailahun
## 5 8 8 F confirmed 2014-05-22 2014-05-27 Kailahun
## 6 215 49 M confirmed 2014-06-24 2014-06-29 Kailahun
# Per-column summary: quartiles/mean (and NA count) for numeric columns,
# length/class/mode for character columns
summary(object = df)
## id age sex status
## Min. : 1.00 Min. : 1.80 Length:200 Length:200
## 1st Qu.: 62.75 1st Qu.:20.00 Class :character Class :character
## Median :131.50 Median :35.00 Mode :character Mode :character
## Mean :136.72 Mean :33.85
## 3rd Qu.:208.25 3rd Qu.:45.00
## Max. :285.00 Max. :80.00
## NA's :4
## date_of_onset date_of_sample district
## Length:200 Length:200 Length:200
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
# Compact overview of the data frame: dimensions, column types and the
# first few values of each column
str(object = df)
## 'data.frame': 200 obs. of 7 variables:
## $ id : int 92 51 230 139 8 215 189 115 218 159 ...
## $ age : num 6 46 NA 25 8 49 13 50 35 38 ...
## $ sex : chr "M" "F" "M" "F" ...
## $ status : chr "confirmed" "confirmed" "confirmed" "confirmed" ...
## $ date_of_onset : chr "2014-06-10" "2014-05-30" "2014-06-26" "2014-06-13" ...
## $ date_of_sample: chr "2014-06-15" "2014-06-04" "2014-06-30" "2014-06-18" ...
## $ district : chr "Kailahun" "Kailahun" "Kenema" "Kailahun" ...
#BASIC DATA CLEANING
# Drop every row that contains an NA in any column. The check is row-wise
# across all columns, so a record that is missing only `age` is also left
# out of the status, sex and district tabulations further down.
clean_data <- na.omit(object = df)
library(dplyr) # or library(tidyverse) - the library to call
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Select specific columns (example, not run)
#subset_data <- clean_data %>% select(1:3)
# Filter rows (example, not run) - `age` replaces the placeholder column
# `Sepal.Length`, which this dataset does not have
#filtered_data <- clean_data %>% filter(age > 5)
#EXPLORATORY DATA ANALYSIS (EDA)
library(ggplot2)
# Tabulate the number of cases per status value
count(clean_data, status)
## status n
## 1 confirmed 179
## 2 suspected 17
# Histogram of patient age (30 bins); records without an age are excluded
# before plotting
ggplot(
  data = filter(clean_data, !is.na(age)),
  mapping = aes(x = age)
) +
  geom_histogram(bins = 30, fill = "steelblue", color = "white") +
  labs(
    title = "Age Distribution of Ebola Cases (Sierra Leone, 2014)",
    x = "Age",
    y = "Count"
  )
# Cases by sex: one bar per sex, bar height = number of cases
clean_data %>%
  count(sex) %>%
  ggplot(aes(x = sex, y = n, fill = sex)) +
  # geom_col() draws bars from the precomputed counts in `n`; it is the
  # dedicated shorthand for geom_bar(stat = "identity")
  geom_col() +
  labs(title = "Ebola Cases by Sex", x = "Sex", y = "Number of Cases")
# Ten districts with the most cases, largest count first
clean_data %>%
  count(district, sort = TRUE) %>%
  head(n = 10)
## district n
## 1 Kailahun 152
## 2 Kenema 33
## 3 Western Urban 4
## 4 Bo 2
## 5 Kono 2
## 6 Port Loko 2
## 7 Kambia 1