library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(readxl)
library(readr)
library(knitr)
library(kableExtra)
## 
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
## 
##     group_rows
library(janitor)
## 
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
library(tidyverse)    # Data manipulation & ggplot2 visualisation
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ lubridate 1.9.5     ✔ tibble    3.3.1
## ✔ purrr     1.2.2     ✔ tidyr     1.3.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter()          masks stats::filter()
## ✖ kableExtra::group_rows() masks dplyr::group_rows()
## ✖ dplyr::lag()             masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lubridate)    # Date handling
library(scales)       # Axis formatting
## 
## Attaching package: 'scales'
## 
## The following object is masked from 'package:purrr':
## 
##     discard
## 
## The following object is masked from 'package:readr':
## 
##     col_factor
library(epitools)     # Epidemiological tools (OR, RR)
library(patchwork)    # Combine ggplot2 panels
library(lmtest)       # Regression diagnostics
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(car)          # VIF, Anova
## Loading required package: carData
## 
## Attaching package: 'car'
## 
## The following object is masked from 'package:purrr':
## 
##     some
## 
## The following object is masked from 'package:dplyr':
## 
##     recode
library(broom)        # Tidy model output
# ---- Load the raw data ----
# data_1 comes from the cholera Excel workbook, data_2 from the bacteria CSV.
# NOTE(review): both paths are absolute and machine-specific; the script only
# runs as-is on a machine that has this exact folder layout.
data_1 <- readxl::read_xlsx(
  path = "C:/Users/ELOHOME/Documents/AUCA Doc/R Programming/Final_Exam/cholera.xlsx"
)
data_2 <- utils::read.csv(
  file = "C:/Users/ELOHOME/Documents/AUCA Doc/R Programming/Final_Exam/bacteria.csv"
)

# Open both tables in the data viewer for a quick visual check.
view(data_1)
view(data_2)
# ---- First look at data_1 ----

# Preview the first two and the last two records.
head(data_1, n = 2)
## # A tibble: 2 × 7
##   Case_ID Province   District    Date_Onset            Age Sex    Outcome  
##   <chr>   <chr>      <chr>       <dttm>              <dbl> <chr>  <chr>    
## 1 <NA>    <NA>       <NA>        NA                     NA <NA>   <NA>     
## 2 CH-0002 Province B District B1 2024-03-12 00:00:00    38 Female Recovered
tail(data_1, n = 2)
## # A tibble: 2 × 7
##   Case_ID Province   District    Date_Onset            Age Sex    Outcome  
##   <chr>   <chr>      <chr>       <dttm>              <dbl> <chr>  <chr>    
## 1 CH-0399 Province A District A2 2024-03-27 00:00:00     0 male   Recovered
## 2 CH-0400 Province A District A3 2024-04-02 00:00:00     2 female Recovered

# Column names.
colnames(data_1)
## [1] "Case_ID"    "Province"   "District"   "Date_Onset" "Age"       
## [6] "Sex"        "Outcome"

# Size of the table: row tally, dimensions, then labelled row/column counts.
data_1 %>% count()
## # A tibble: 1 × 1
##       n
##   <int>
## 1   400
dim(data_1)
## [1] 400   7
cat("This is the Number of Rows", nrow(data_1), "\n")
## This is the Number of Rows 400
cat("This is the Number of Column", ncol(data_1), "\n")
## This is the Number of Column 7

# Column types and a per-column summary (the NA counts appear here as well).
str(data_1)
## tibble [400 × 7] (S3: tbl_df/tbl/data.frame)
##  $ Case_ID   : chr [1:400] NA "CH-0002" "CH-0003" "CH-0004" ...
##  $ Province  : chr [1:400] NA "Province B" "Province A" "Province B" ...
##  $ District  : chr [1:400] NA "District B1" "District A1" "District B2" ...
##  $ Date_Onset: POSIXct[1:400], format: NA "2024-03-12" ...
##  $ Age       : num [1:400] NA 38 0 9 45 34 1 12 30 19 ...
##  $ Sex       : chr [1:400] NA "Female" "F" "Male" ...
##  $ Outcome   : chr [1:400] NA "Recovered" "Recovered" "Recovered" ...
summary(data_1)
##       Case_ID         Province        District     Date_Onset                 
##  Length   :400   Length   :400   Length   :400   Min.   :2024-03-01 00:00:00  
##  N.unique :399   N.unique :  2   N.unique :  5   1st Qu.:2024-03-20 00:00:00  
##  N.blank  :  0   N.blank  :  0   N.blank  :  0   Median :2024-03-31 00:00:00  
##  Min.nchar:  7   Min.nchar: 10   Min.nchar: 11   Mean   :2024-04-01 15:23:54  
##  Max.nchar:  7   Max.nchar: 10   Max.nchar: 11   3rd Qu.:2024-04-13 12:00:00  
##  NAs      :  1   NAs      :  1   NAs      :  1   Max.   :2024-04-25 00:00:00  
##                                                  NAs    :1                    
##       Age              Sex           Outcome   
##  Min.   : 0.0   Length   :400   Length   :400  
##  1st Qu.: 7.0   N.unique : 10   N.unique :  2  
##  Median :21.0   N.blank  :  0   N.blank  :  0  
##  Mean   :26.5   Min.nchar:  1   Min.nchar:  4  
##  3rd Qu.:41.0   Max.nchar:  6   Max.nchar:  9  
##  Max.   :84.0   NAs      :  1   NAs      :  8  
##  NAs    :12
# Small hand-built data frame: two names with one age value each.
# `<-` is used for assignment (the R convention); `=` is kept for arguments.
# NOTE: this top-level `Age` vector is a separate object from `data_1$Age`.
Names <- c("Dell", "Shema")
Age <- c(2, 4)
info <- data.frame(Names, Age)
# ---- Data-quality checks on data_1 ----

# Number of missing values in each column.
colSums(is.na(data_1))
##    Case_ID   Province   District Date_Onset        Age        Sex    Outcome 
##          1          1          1          1         12          1          8

# Rows in which every column is NA, i.e. completely empty records.
blank_rows <- data_1 %>% filter(if_all(everything(), is.na))
# Each message ends with "\n" so consecutive messages do not run together
# on one line when the script is sourced.
cat("The Blank Rows", nrow(blank_rows), "\n")
## The Blank Rows 1

# Despite its name, `blank_rows1` holds the data with the all-NA rows removed;
# the message reports how many rows remain.
blank_rows1 <- data_1 %>% filter(!if_all(everything(), is.na))
cat("Remove blank space", nrow(blank_rows1), "\n")
## Remove blank space 399

# Count repeated Case_ID values. NA is declared incomparable, so missing IDs
# are never counted as duplicates of one another.
cat(
  "The Duplicated ID",
  sum(duplicated(data_1$Case_ID, incomparables = NA)),
  "\n"
)
## The Duplicated ID 0
# ---- Distribution of Age and Outcome ----

# Frequency of every recorded age; useNA = "always" adds an explicit <NA> cell.
table(data_1[["Age"]], useNA = "always")
## 
##    0    1    2    3    4    5    6    7    8    9   10   11   12   13   14   15 
##   21   17   12   17   10    4    8   11    8    6    6    6    6    4    9    9 
##   16   17   18   19   20   21   22   23   24   25   26   27   28   29   30   31 
##    7    4    7   12    7    5    6   10    5    8    4    2    3    2    7    3 
##   32   33   34   35   36   37   38   39   40   41   42   43   44   45   46   47 
##    6    6    3    5    4    5    5    6    4    3    7    4    2    5    1    6 
##   48   49   50   51   52   53   54   55   56   57   58   59   60   61   62   63 
##    2    4    1    3    1    4    1    2    3    2    2    1    2    1    4    1 
##   64   65   67   68   69   70   71   72   74   75   77   78   79   80   83   84 
##    4    2    1    1    1    4    3    2    3    3    3    2    2    2    2    1 
## <NA> 
##   12

# Frequency of each outcome, again with missing values counted.
table(data_1[["Outcome"]], useNA = "always")
## 
##      Died Recovered      <NA> 
##        26       366         8

# Smallest and largest recorded age, ignoring missing values.
range(data_1[["Age"]], na.rm = TRUE)
## [1]  0 84