Read in the two datasets ‘person_info.csv’ and ‘lab_info.csv’ (or ‘person_info1.csv’ and ‘lab_info1.csv’) into two objects ‘person_info’ and ‘lab_info’ (Note that ‘person_info.csv’ and ‘lab_info.csv’ use semicolons as separators and ‘person_info1.csv’ and ‘lab_info1.csv’ use commas as separators).
Short description of the two datasets:
#read.csv() : commas (‘,’) used as separators, periods (‘.’) used as decimals
lab_info <- read.csv('C:/Users/thuyb/OneDrive - Universiteit Antwerpen/University of Antwerp/Semester 1/Data Management/Practice/Practical session/Practical session2_ThuyNgan/3_PS2_datasets_for_exercise/lab_info1.csv')
#read.csv2(): semicolons (‘;’) used as separators, periods (‘,’) used as decimals
person_info <- read.csv2('C:/Users/thuyb/OneDrive - Universiteit Antwerpen/University of Antwerp/Semester 1/Data Management/Practice/Practical session/Practical session2_ThuyNgan/3_PS2_datasets_for_exercise/person_info.csv')
How many observations, how many variables, type of each variable, how much missing data
# Give all the above information
skimr::skim(person_info)
| Name | person_info |
| Number of rows | 303 |
| Number of columns | 6 |
| _______________________ | |
| Column type frequency: | |
| character | 3 |
| numeric | 3 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| ID | 0 | 1 | 9 | 9 | 0 | 299 | 0 |
| date_result | 0 | 1 | 10 | 10 | 0 | 10 | 0 |
| date_post | 0 | 1 | 10 | 10 | 0 | 5 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| age | 33 | 0.89 | 62.28 | 7.65 | 50 | 56 | 62 | 70.0 | 74 | ▅▇▂▆▇ |
| gender | 0 | 1.00 | 1.51 | 0.50 | 1 | 1 | 2 | 2.0 | 2 | ▇▁▁▁▇ |
| result | 0 | 1.00 | 39.36 | 120.22 | 3 | 14 | 15 | 19.5 | 1000 | ▇▁▁▁▁ |
# Show the first 6 rows
head(person_info)
## ID age date_result date_post gender result
## 1 07h03kaab 50 10/02/2021 23/03/2021 1 116
## 2 07h03kaab 50 10/02/2021 23/03/2021 1 116
## 3 44do6oaab 50 12/02/2021 23/03/2021 1 15
## 4 82do9jaab 50 12/02/2021 23/03/2021 1 15
## 5 48dotoaab 50 12/02/2021 23/03/2021 1 15
## 6 94dp1eaab 50 12/02/2021 23/03/2021 1 15
# Give main summaries of dataset (min/max/mean/median/first and third quartiles for numeric variables, number of observations in each level for factor variables)
summary(person_info)
## ID age date_result date_post
## Length:303 Min. :50.00 Length:303 Length:303
## Class :character 1st Qu.:56.00 Class :character Class :character
## Mode :character Median :62.00 Mode :character Mode :character
## Mean :62.28
## 3rd Qu.:70.00
## Max. :74.00
## NA's :33
## gender result
## Min. :1.000 Min. : 3.00
## 1st Qu.:1.000 1st Qu.: 14.00
## Median :2.000 Median : 15.00
## Mean :1.512 Mean : 39.36
## 3rd Qu.:2.000 3rd Qu.: 19.50
## Max. :2.000 Max. :1000.00
##
# Show structure of the dataset (does not show the amount of missing data)
str(person_info)
## 'data.frame': 303 obs. of 6 variables:
## $ ID : chr "07h03kaab" "07h03kaab" "44do6oaab" "82do9jaab" ...
## $ age : int 50 50 50 50 50 50 50 50 50 51 ...
## $ date_result: chr "10/02/2021" "10/02/2021" "12/02/2021" "12/02/2021" ...
## $ date_post : chr "23/03/2021" "23/03/2021" "23/03/2021" "23/03/2021" ...
## $ gender : int 1 1 1 1 1 1 1 1 1 1 ...
## $ result : int 116 116 15 15 15 15 15 15 15 20 ...
#Count missing value
sapply(person_info, function(x) sum(is.na(x)))
## ID age date_result date_post gender result
## 0 33 0 0 0 0
#Total number of duplicates
sum(duplicated(person_info))
## [1] 4
#Show what are the duplicated record
person_info[duplicated(person_info),]
## ID age date_result date_post gender result
## 2 07h03kaab 50 10/02/2021 23/03/2021 1 116
## 9 79dptkaab 50 12/02/2021 23/03/2021 1 15
## 31 50dpdpaab 55 15/02/2021 24/03/2021 2 15
## 59 77ds5laab 59 16/02/2021 24/03/2021 1 15
#Keep the unique row
person_info_unique <- unique(person_info)
names(person_info)
## [1] "ID" "age" "date_result" "date_post" "gender"
## [6] "result"
person_info_ordered <- person_info[, c('ID', 'gender', 'age', 'result', 'date_result', 'date_post')]
names(person_info_ordered)
## [1] "ID" "gender" "age" "result" "date_result"
## [6] "date_post"
str(person_info)
## 'data.frame': 303 obs. of 6 variables:
## $ ID : chr "07h03kaab" "07h03kaab" "44do6oaab" "82do9jaab" ...
## $ age : int 50 50 50 50 50 50 50 50 50 51 ...
## $ date_result: chr "10/02/2021" "10/02/2021" "12/02/2021" "12/02/2021" ...
## $ date_post : chr "23/03/2021" "23/03/2021" "23/03/2021" "23/03/2021" ...
## $ gender : int 1 1 1 1 1 1 1 1 1 1 ...
## $ result : int 116 116 15 15 15 15 15 15 15 20 ...
person_info$date_post <- as.Date(person_info$date_post, format='%d/%m/%Y')
person_info$date_result <- as.Date(person_info$date_result, format = '%d/%m/%Y')
person_info$gender <- as.factor(person_info$gender)
library(tidyverse)
## Warning: package 'forcats' was built under R version 4.3.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
summary(person_info$age)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 50.00 56.00 62.00 62.28 70.00 74.00 33
person_info <- person_info %>%
mutate(age_group = case_when (
age >= 50 & age <= 54 ~ '50-54',
age >= 55 & age <= 59 ~ '55-59',
age >= 60 & age <= 64 ~ '60-64',
age >= 65 & age <= 69 ~ '65-69',
age >= 70 & age <= 74 ~ '70-74'
))
table(person_info$age_group, useNA = 'alway')
##
## 50-54 55-59 60-64 65-69 70-74 <NA>
## 44 81 21 55 69 33
desired_order <- c('70-74', '50-54', '55-59', '60-64', '65-69')
age_group_table <- table(factor(person_info$age_group, levels = desired_order))
print(age_group_table)
##
## 70-74 50-54 55-59 60-64 65-69
## 69 44 81 21 55
person_info <- person_info %>%
mutate(gender_label= case_when(
gender==1 ~ 'male',
gender==2 ~ 'female'
))
person_info <- person_info %>%
mutate(result_binary= case_when(
result<15 ~ "negative",
result>=15 ~ "positive"
))
person_info <- person_info %>%
mutate(date_diff = date_post - date_result)
Join the ‘person_info’ and the ‘lab_info’ datasets to get the buffer_lot info for each person, following the steps below:
person_info <- person_info %>%
mutate(ID_last_4_characters = substr(ID, nchar(ID) - 4 + 1, nchar(ID)))
left_join_result <- merge(person_info, lab_info, by='ID_last_4_characters', all.x = T)
table(left_join_result$result_binary, left_join_result$buffer_lot)
##
## a6 j4 m1 o8 p9
## negative 11 29 9 12 23
## positive 27 74 37 23 58