# Install / load needed packages
#install.packages(c("outbreaks", "tidyverse", "lubridate"))
#library(outbreaks)
#library(tidyverse)
#library(lubridate)

#IMPORTING DATASETS

# Read the Ebola case data (CSV path is relative to the working directory)
df <- read.csv(file = "ebola_sierra_leone.csv")

# Preview the first six records
head(x = df, n = 6L)
##    id age sex    status date_of_onset date_of_sample district
## 1  92   6   M confirmed    2014-06-10     2014-06-15 Kailahun
## 2  51  46   F confirmed    2014-05-30     2014-06-04 Kailahun
## 3 230  NA   M confirmed    2014-06-26     2014-06-30   Kenema
## 4 139  25   F confirmed    2014-06-13     2014-06-18 Kailahun
## 5   8   8   F confirmed    2014-05-22     2014-05-27 Kailahun
## 6 215  49   M confirmed    2014-06-24     2014-06-29 Kailahun
# Per-column summary: quartiles/mean for numeric columns, length and class
# for character columns
summary(object = df)
##        id              age            sex               status         
##  Min.   :  1.00   Min.   : 1.80   Length:200         Length:200        
##  1st Qu.: 62.75   1st Qu.:20.00   Class :character   Class :character  
##  Median :131.50   Median :35.00   Mode  :character   Mode  :character  
##  Mean   :136.72   Mean   :33.85                                        
##  3rd Qu.:208.25   3rd Qu.:45.00                                        
##  Max.   :285.00   Max.   :80.00                                        
##                   NA's   :4                                            
##  date_of_onset      date_of_sample       district        
##  Length:200         Length:200         Length:200        
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
## 
# Compact overview of the data frame: column names, types and leading values
str(object = df)
## 'data.frame':    200 obs. of  7 variables:
##  $ id            : int  92 51 230 139 8 215 189 115 218 159 ...
##  $ age           : num  6 46 NA 25 8 49 13 50 35 38 ...
##  $ sex           : chr  "M" "F" "M" "F" ...
##  $ status        : chr  "confirmed" "confirmed" "confirmed" "confirmed" ...
##  $ date_of_onset : chr  "2014-06-10" "2014-05-30" "2014-06-26" "2014-06-13" ...
##  $ date_of_sample: chr  "2014-06-15" "2014-06-04" "2014-06-30" "2014-06-18" ...
##  $ district      : chr  "Kailahun" "Kailahun" "Kenema" "Kailahun" ...

#BASIC DATA CLEANING

# Keep only complete rows: na.omit() drops every row that has an NA in any
# column, not just in the column being analysed.
# NOTE(review): the summary() output recorded above reports 4 NAs in `age`;
# every later count and plot is computed on clean_data, so rows dropped here
# are excluded from all of them - confirm this is intended.
clean_data <- stats::na.omit(df)

library(dplyr)   # provides %>%, count(), filter(), arrange() used below; library(tidyverse) also attaches dplyr
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Select specific columns (example, not run):
# subset_data <- clean_data %>% select(1:3)

# Filter rows (example, not run):
# filtered_data <- clean_data %>% filter(age > 5)

#EXPLORATORY DATA ANALYSIS (EDA)

library(ggplot2)
# Number of cases in each status category
count(clean_data, status)
##      status   n
## 1 confirmed 179
## 2 suspected  17
# Histogram of patient age (30 bins); rows with a missing age are excluded
ggplot(
  data = filter(clean_data, !is.na(age)),
  mapping = aes(x = age)
) +
  geom_histogram(bins = 30, fill = "steelblue", color = "white") +
  labs(
    title = "Age Distribution of Ebola Cases (Sierra Leone, 2014)",
    x = "Age",
    y = "Count"
  )

# Bar chart of case counts by sex.
# The counts are pre-computed with count(), so geom_col() draws the `n`
# values directly as bar heights; it is the idiomatic equivalent of
# geom_bar(stat = "identity").
clean_data %>%
  count(sex) %>%
  ggplot(aes(x = sex, y = n, fill = sex)) +
  geom_col() +
  labs(title = "Ebola Cases by Sex", x = "Sex", y = "Number of Cases")

# Districts ranked by number of cases, largest first (at most 10 rows).
# count(sort = TRUE) orders the result by descending n, so no separate
# arrange(desc(n)) step is needed.
clean_data %>%
  count(district, sort = TRUE) %>%
  head(10)
##        district   n
## 1      Kailahun 152
## 2        Kenema  33
## 3 Western Urban   4
## 4            Bo   2
## 5          Kono   2
## 6     Port Loko   2
## 7        Kambia   1