library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(readxl)
library(readr)
library(knitr)
library(kableExtra)
##
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
library(janitor)
##
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
library(tidyverse) # Data manipulation & ggplot2 visualisation
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ lubridate 1.9.5 ✔ tibble 3.3.1
## ✔ purrr 1.2.2 ✔ tidyr 1.3.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ kableExtra::group_rows() masks dplyr::group_rows()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lubridate) # Date handling
library(scales) # Axis formatting
##
## Attaching package: 'scales'
##
## The following object is masked from 'package:purrr':
##
## discard
##
## The following object is masked from 'package:readr':
##
## col_factor
library(epitools) # Epidemiological tools (OR, RR)
library(patchwork) # Combine ggplot2 panels
library(lmtest) # Regression diagnostics
## Loading required package: zoo
##
## Attaching package: 'zoo'
##
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(car) # VIF, Anova
## Loading required package: carData
##
## Attaching package: 'car'
##
## The following object is masked from 'package:purrr':
##
## some
##
## The following object is masked from 'package:dplyr':
##
## recode
library(broom) # Tidy model output
# Read the two raw inputs: the Excel workbook through readxl's read_xlsx()
# and the CSV through base R's read.csv().
data_1 <- read_xlsx(
  path = "C:/Users/ELOHOME/Documents/AUCA Doc/R Programming/Final_Exam/cholera.xlsx"
)
data_2 <- read.csv(
  file = "C:/Users/ELOHOME/Documents/AUCA Doc/R Programming/Final_Exam/bacteria.csv"
)

# Hand each freshly loaded table to the data viewer for a visual check.
view(data_1)
view(data_2)
# Peek at both ends of data_1: the first two rows, then the last two.
# The printed output shows that the very first row is entirely NA.
data_1 %>% head(n = 2)
## # A tibble: 2 × 7
## Case_ID Province District Date_Onset Age Sex Outcome
## <chr> <chr> <chr> <dttm> <dbl> <chr> <chr>
## 1 <NA> <NA> <NA> NA NA <NA> <NA>
## 2 CH-0002 Province B District B1 2024-03-12 00:00:00 38 Female Recovered
data_1 %>% tail(n = 2)
## # A tibble: 2 × 7
## Case_ID Province District Date_Onset Age Sex Outcome
## <chr> <chr> <chr> <dttm> <dbl> <chr> <chr>
## 1 CH-0399 Province A District A2 2024-03-27 00:00:00 0 male Recovered
## 2 CH-0400 Province A District A3 2024-04-02 00:00:00 2 female Recovered
# Column names of data_1.
colnames(data_1)
## [1] "Case_ID" "Province" "District" "Date_Onset" "Age"
## [6] "Sex" "Outcome"
# Total number of rows, returned as a one-cell tibble.
data_1 %>% count()
## # A tibble: 1 × 1
## n
## <int>
## 1 400
# Dimensions as a (rows, columns) pair.
dim(data_1)
## [1] 400 7
# The same two figures again, printed as labelled lines.
cat("This is the Number of Rows", NROW(data_1), "\n")
## This is the Number of Rows 400
cat("This is the Number of Column", NCOL(data_1), "\n")
## This is the Number of Column 7
# Compact structure listing: one line per column with its type and the
# first few values.
data_1 %>% str()
## tibble [400 × 7] (S3: tbl_df/tbl/data.frame)
## $ Case_ID : chr [1:400] NA "CH-0002" "CH-0003" "CH-0004" ...
## $ Province : chr [1:400] NA "Province B" "Province A" "Province B" ...
## $ District : chr [1:400] NA "District B1" "District A1" "District B2" ...
## $ Date_Onset: POSIXct[1:400], format: NA "2024-03-12" ...
## $ Age : num [1:400] NA 38 0 9 45 34 1 12 30 19 ...
## $ Sex : chr [1:400] NA "Female" "F" "Male" ...
## $ Outcome : chr [1:400] NA "Recovered" "Recovered" "Recovered" ...
# Per-column summary statistics, including the NA count of each column.
data_1 %>% summary()
## Case_ID Province District Date_Onset
## Length :400 Length :400 Length :400 Min. :2024-03-01 00:00:00
## N.unique :399 N.unique : 2 N.unique : 5 1st Qu.:2024-03-20 00:00:00
## N.blank : 0 N.blank : 0 N.blank : 0 Median :2024-03-31 00:00:00
## Min.nchar: 7 Min.nchar: 10 Min.nchar: 11 Mean :2024-04-01 15:23:54
## Max.nchar: 7 Max.nchar: 10 Max.nchar: 11 3rd Qu.:2024-04-13 12:00:00
## NAs : 1 NAs : 1 NAs : 1 Max. :2024-04-25 00:00:00
## NAs :1
## Age Sex Outcome
## Min. : 0.0 Length :400 Length :400
## 1st Qu.: 7.0 N.unique : 10 N.unique : 2
## Median :21.0 N.blank : 0 N.blank : 0
## Mean :26.5 Min.nchar: 1 Min.nchar: 4
## 3rd Qu.:41.0 Max.nchar: 6 Max.nchar: 9
## Max. :84.0 NAs : 1 NAs : 8
## NAs :12
# Small two-row data frame built from literal values. Nothing in the visible
# part of the script reads `info`, `Names` or `Age` again; all three are kept
# as top-level objects in case code further down relies on them.
# Assignment uses `<-` (not `=`), the idiomatic form for top-level bindings.
Names <- c("Dell", "Shema")
Age <- c(2, 4)
info <- data.frame(Names, Age)
# Number of missing values in each column of data_1.
colSums(is.na(data_1))
## Case_ID Province District Date_Onset Age Sex Outcome
## 1 1 1 1 12 1 8

# Rows in which every single column is NA (completely blank records).
blank_rows <- data_1 %>% filter(if_all(everything(), is.na))
cat("The Blank Rows", nrow(blank_rows), "\n")
## The Blank Rows 1

# Complement of the above: every row that has at least one non-NA value.
# Despite its name, `blank_rows1` holds the NON-blank rows; the name is kept
# so any later reference keeps working. `data_1` itself is not modified here.
blank_rows1 <- data_1 %>% filter(!if_all(everything(), is.na))
cat("Rows remaining after removing blank rows", nrow(blank_rows1), "\n")
## Rows remaining after removing blank rows 399

# Count repeated Case_ID values. `incomparables = NA` stops missing IDs from
# being treated as duplicates of one another.
cat(
  "The Duplicated ID",
  sum(duplicated(data_1$Case_ID, incomparables = NA)),
  "\n"
)
## The Duplicated ID 0
# Frequency of every distinct Age value, with an explicit <NA> bucket.
# The column is extracted with `[[` rather than piped into table(), so the
# printed table keeps its unnamed header line.
table(data_1[["Age"]], useNA = "always")
##
## 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
## 21 17 12 17 10 4 8 11 8 6 6 6 6 4 9 9
## 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
## 7 4 7 12 7 5 6 10 5 8 4 2 3 2 7 3
## 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
## 6 6 3 5 4 5 5 6 4 3 7 4 2 5 1 6
## 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63
## 2 4 1 3 1 4 1 2 3 2 2 1 2 1 4 1
## 64 65 67 68 69 70 71 72 74 75 77 78 79 80 83 84
## 4 2 1 1 1 4 3 2 3 3 3 2 2 2 2 1
## <NA>
## 12
# Frequency of each Outcome value, again counting missing entries.
table(data_1[["Outcome"]], useNA = "always")
##
## Died Recovered <NA>
## 26 366 8
# Smallest and largest recorded Age, ignoring missing values.
range(data_1[["Age"]], na.rm = TRUE)
## [1] 0 84