1. Read dataset

Read in the two datasets ‘person_info.csv’ and ‘lab_info.csv’ (or ‘person_info1.csv’ and ‘lab_info1.csv’) into two objects ‘person_info’ and ‘lab_info’ (Note that ‘person_info.csv’ and ‘lab_info.csv’ use semicolons as separators and ‘person_info1.csv’ and ‘lab_info1.csv’ use commas as separators).

Short description of the two datasets:

The dataset ‘person_info.csv’ contains information on the result of a specific test for a number of patients. The dataset has 6 variables:
- ID: ID of each person, which is unique
- age: age of each person
- gender: gender of each person
- date_result: the date when a person’s result is available
- date_post: the date when the test result is sent to the person by post
- result: test result
The ‘lab_info.csv’ dataset contains information about the lot of the buffer used to analyse the result for each person in the ‘person_info’ dataset: the ‘buffer_lot’ corresponds to the last four characters of each person’s ID.This dataset has two variables:
- ID_last_4_characters: the last four characters of each person’s ID
- buffer_lot: the lot of the buffer used to analyse the result for each person

#read.csv() : commas (‘,’) used as separators, periods (‘.’) used as decimals
lab_info <- read.csv('C:/Users/thuyb/OneDrive - Universiteit Antwerpen/University of Antwerp/Semester 1/Data Management/Practice/Practical session/Practical session2_ThuyNgan/3_PS2_datasets_for_exercise/lab_info1.csv')
#read.csv2(): semicolons (‘;’) used as separators, periods (‘,’) used as decimals
person_info <- read.csv2('C:/Users/thuyb/OneDrive - Universiteit Antwerpen/University of Antwerp/Semester 1/Data Management/Practice/Practical session/Practical session2_ThuyNgan/3_PS2_datasets_for_exercise/person_info.csv')

2. Explore the two datasets

How many observations, how many variables, type of each variable, how much missing data

# Give all the above information
skimr::skim(person_info)

Data summary
Name	person_info
Number of rows	303
Number of columns	6
_______________________
Column type frequency:
character	3
numeric	3
________________________
Group variables	None

Variable type: character

skim_variable	complete_rate	min	max	n_unique
ID	1	9	9	299
date_result	1	10	10	10
date_post	1	10	10	5

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
age	33	0.89	62.28	7.65	50	56	62	70.0	74	▅▇▂▆▇
gender	0	1.00	1.51	0.50	1	1	2	2.0	2	▇▁▁▁▇
result	0	1.00	39.36	120.22	3	14	15	19.5	1000	▇▁▁▁▁

# Show the first 6 rows
head(person_info)

##          ID age date_result  date_post gender result
## 1 07h03kaab  50  10/02/2021 23/03/2021      1    116
## 2 07h03kaab  50  10/02/2021 23/03/2021      1    116
## 3 44do6oaab  50  12/02/2021 23/03/2021      1     15
## 4 82do9jaab  50  12/02/2021 23/03/2021      1     15
## 5 48dotoaab  50  12/02/2021 23/03/2021      1     15
## 6 94dp1eaab  50  12/02/2021 23/03/2021      1     15

# Give main summaries of dataset (min/max/mean/median/first and third quartiles for numeric variables, number of observations in each level for factor variables)
summary(person_info)

##       ID                 age        date_result         date_post        
##  Length:303         Min.   :50.00   Length:303         Length:303        
##  Class :character   1st Qu.:56.00   Class :character   Class :character  
##  Mode  :character   Median :62.00   Mode  :character   Mode  :character  
##                     Mean   :62.28                                        
##                     3rd Qu.:70.00                                        
##                     Max.   :74.00                                        
##                     NA's   :33                                           
##      gender          result       
##  Min.   :1.000   Min.   :   3.00  
##  1st Qu.:1.000   1st Qu.:  14.00  
##  Median :2.000   Median :  15.00  
##  Mean   :1.512   Mean   :  39.36  
##  3rd Qu.:2.000   3rd Qu.:  19.50  
##  Max.   :2.000   Max.   :1000.00  
##

# Show structure of the dataset (does not show the amount of missing data)
str(person_info)

## 'data.frame':    303 obs. of  6 variables:
##  $ ID         : chr  "07h03kaab" "07h03kaab" "44do6oaab" "82do9jaab" ...
##  $ age        : int  50 50 50 50 50 50 50 50 50 51 ...
##  $ date_result: chr  "10/02/2021" "10/02/2021" "12/02/2021" "12/02/2021" ...
##  $ date_post  : chr  "23/03/2021" "23/03/2021" "23/03/2021" "23/03/2021" ...
##  $ gender     : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ result     : int  116 116 15 15 15 15 15 15 15 20 ...

#Count missing value
sapply(person_info, function(x) sum(is.na(x)))

##          ID         age date_result   date_post      gender      result 
##           0          33           0           0           0           0

3. For the ‘person_info’ dataset

3.1 How many duplicates are there in the dataset, remove duplicated rows and keep only unique rows

#Total number of duplicates
sum(duplicated(person_info))

## [1] 4

#Show what are the duplicated record
person_info[duplicated(person_info),]

##           ID age date_result  date_post gender result
## 2  07h03kaab  50  10/02/2021 23/03/2021      1    116
## 9  79dptkaab  50  12/02/2021 23/03/2021      1     15
## 31 50dpdpaab  55  15/02/2021 24/03/2021      2     15
## 59 77ds5laab  59  16/02/2021 24/03/2021      1     15

#Keep the unique row
person_info_unique <- unique(person_info)

3.2 Re-order the columns into: ‘ID’, ‘gender’, ‘age’, ‘result’, ‘date_result’, ‘date_post’

names(person_info)

## [1] "ID"          "age"         "date_result" "date_post"   "gender"     
## [6] "result"

person_info_ordered <- person_info[, c('ID', 'gender', 'age', 'result', 'date_result', 'date_post')]
names(person_info_ordered)

## [1] "ID"          "gender"      "age"         "result"      "date_result"
## [6] "date_post"

3.3 Re-format the columns if needed so that they are in the correct format: ‘ID’ as character, ‘age’ as numeric/integer, ‘date_result’ and ‘date_post’ as date (dd/mm/yyyy), ‘gender’ as factor and ‘result’ as numeric/integer

str(person_info)

## 'data.frame':    303 obs. of  6 variables:
##  $ ID         : chr  "07h03kaab" "07h03kaab" "44do6oaab" "82do9jaab" ...
##  $ age        : int  50 50 50 50 50 50 50 50 50 51 ...
##  $ date_result: chr  "10/02/2021" "10/02/2021" "12/02/2021" "12/02/2021" ...
##  $ date_post  : chr  "23/03/2021" "23/03/2021" "23/03/2021" "23/03/2021" ...
##  $ gender     : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ result     : int  116 116 15 15 15 15 15 15 15 20 ...

person_info$date_post <- as.Date(person_info$date_post, format='%d/%m/%Y')
person_info$date_result <- as.Date(person_info$date_result, format = '%d/%m/%Y')
person_info$gender <- as.factor(person_info$gender)

3.4 Create a column named ‘age_group’ (‘50-54’, ‘55-59’, ‘60-64’, ‘65-69’, ‘70-74’) based on the values in column ‘age’

library(tidyverse)

## Warning: package 'forcats' was built under R version 4.3.2

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

summary(person_info$age)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   50.00   56.00   62.00   62.28   70.00   74.00      33

person_info <-  person_info %>% 
  mutate(age_group = case_when (
    age >= 50 & age <= 54 ~ '50-54',
    age >= 55 & age <= 59 ~ '55-59',
    age >= 60 & age <= 64 ~ '60-64',
    age >= 65 & age <= 69 ~ '65-69',
    age >= 70 & age <= 74 ~ '70-74'
  ))
table(person_info$age_group, useNA = 'alway')

## 
## 50-54 55-59 60-64 65-69 70-74  <NA> 
##    44    81    21    55    69    33

3.5 Create a table with the number of people in each age group, with the following order ‘70-74’, ‘50-54’, ‘55-59’, ‘60-64’, 65-69’ since the audience is most interested in the oldest and youngest age groups.

desired_order <- c('70-74', '50-54', '55-59', '60-64', '65-69')
age_group_table <- table(factor(person_info$age_group, levels = desired_order))
print(age_group_table)

## 
## 70-74 50-54 55-59 60-64 65-69 
##    69    44    81    21    55

3.6 Create a column named ‘gender_label’ based on the values in the column ‘gender’: 1 -> ‘male’, 2 -> ‘female’

person_info <- person_info %>% 
  mutate(gender_label= case_when(
    gender==1 ~ 'male',
    gender==2 ~ 'female'
  ))

3.7 Create a column named ‘result_binary’ based on the values in the column ‘result’: < 15 -> ‘negative’, >= 15 -> ‘positive’

person_info <- person_info %>% 
  mutate(result_binary= case_when(
    result<15 ~ "negative",
    result>=15 ~ "positive"
  ))

3.8 Create a column named ‘date_diff’ to show the difference between ‘date_post’ and ‘date_result’ in days

person_info <- person_info %>% 
  mutate(date_diff = date_post - date_result)

4. Join data

Join the ‘person_info’ and the ‘lab_info’ datasets to get the buffer_lot info for each person, following the steps below:

4.1 Create a column named ‘ID_last_4_characters’ which contains the last four character of each person’s ID

person_info <- person_info %>%
  mutate(ID_last_4_characters = substr(ID, nchar(ID) - 4 + 1, nchar(ID)))

4.2 Join the ‘person_info’ dataset with the ‘lab_info’ dataset by column ‘ID_last_4_characters’, keeping all the rows in the ‘person_info’ dataset (‘person_info’ dataset as the base dataset)

left_join_result <- merge(person_info, lab_info, by='ID_last_4_characters', all.x = T)

4.3 Create a table that presents how many ‘negative’ and ‘positive’ cases corresponding to each buffer_lot

table(left_join_result$result_binary, left_join_result$buffer_lot)

##           
##            a6 j4 m1 o8 p9
##   negative 11 29  9 12 23
##   positive 27 74 37 23 58

Practical_2: Data management

Thanhthuy