Data Wrangling Report

Michał Malinowski, Julia Koza, Tomasz Błaszczak

2024-03-20

#GATHER
library(tidyr)

## Warning: package 'tidyr' was built under R version 4.3.3

set.seed(1)

# Data
student_data <- data.frame(
  student_id = 1:50,
  name = paste0("Student", 1:50),
  midterm1 = sample(60:100, 50, replace = TRUE),
  midterm2 = sample(60:100, 50, replace = TRUE),
  final = sample(60:100, 50, replace = TRUE)
)

student_data_long <- student_data %>% pivot_longer(
               cols = c(midterm1, midterm2, final),
               names_to = "exam_type",
               values_to = "score"
               )

# Print first few rows:
head(student_data_long)

## # A tibble: 6 × 4
##   student_id name     exam_type score
##        <int> <chr>    <chr>     <int>
## 1          1 Student1 midterm1     63
## 2          1 Student1 midterm2     82
## 3          1 Student1 final        60
## 4          2 Student2 midterm1     98
## 5          2 Student2 midterm2     65
## 6          2 Student2 final        88

#SEPARATE
library(tidyr)
set.seed(1)

# Data
student_data <- data.frame(student_id = 1:50,  
name_age = c("John_21", "Alice_20", "Bob_22", "Emily_23", "Michael_22"),  
exam_scores = c("midterm1_80,midterm2_85,final_75", "midterm1_75,midterm2_78,final_80","midterm1_82,midterm2_80,final_85", "midterm1_88,midterm2_90,final_92", "midterm1_85,midterm2_86,final_88"))

# Transform data to long format: 
# This is already long format and we don't have to gather ?

pom <- separate(student_data, name_age, c("name", "age"), "_")
pom2 <- separate_rows(pom, exam_scores, sep=",")
pom3 <- separate(pom2, exam_scores, c("exam", "score"), '_')

# Yes I understand that naming variables like that is a bad practise but I am in a hurry
pom3

## # A tibble: 150 × 5
##    student_id name  age   exam     score
##         <int> <chr> <chr> <chr>    <chr>
##  1          1 John  21    midterm1 80   
##  2          1 John  21    midterm2 85   
##  3          1 John  21    final    75   
##  4          2 Alice 20    midterm1 75   
##  5          2 Alice 20    midterm2 78   
##  6          2 Alice 20    final    80   
##  7          3 Bob   22    midterm1 82   
##  8          3 Bob   22    midterm2 80   
##  9          3 Bob   22    final    85   
## 10          4 Emily 23    midterm1 88   
## # ℹ 140 more rows

# COMPLETE
library(tidyr)
set.seed(1)

# Data
student_data <- data.frame(
  student_id = 1:50,
  name_age = c("John_21", "Alice_20", "Bob_22", "Emily_23", "Michael_22"),
  exam_scores = c("midterm1_80,midterm2_85,final_75", "midterm1_75,midterm2_78,final_80", "midterm1_82,midterm2_80,final_85", "midterm1_88,midterm2_90,final_92", "midterm1_85,midterm2_86,final_88")
)

pom <- separate(student_data, name_age, c("name", "age"), "_")
pom2 <- separate_rows(pom, exam_scores, sep=",")
pom3 <- separate(pom2, exam_scores, c("exam", "score"), '_')

complete(pom3, student_id, name, age, exam, score, fill = list(score = 0))

## # A tibble: 27,000 × 5
##    student_id name  age   exam     score
##         <int> <chr> <chr> <chr>    <chr>
##  1          1 Alice 20    final    75   
##  2          1 Alice 20    final    78   
##  3          1 Alice 20    final    80   
##  4          1 Alice 20    final    82   
##  5          1 Alice 20    final    85   
##  6          1 Alice 20    final    86   
##  7          1 Alice 20    final    88   
##  8          1 Alice 20    final    90   
##  9          1 Alice 20    final    92   
## 10          1 Alice 20    midterm1 75   
## # ℹ 26,990 more rows

# SPREAD
library(tidyr)
set.seed(1)

# Data
student_data <- data.frame(
  student_id = 1:50,
  name_age = c("John_21", "Alice_20", "Bob_22", "Emily_23", "Michael_22"),
  exam_scores = c("midterm1_80,midterm2_85,final_75", "midterm1_75,midterm2_78,final_80", "midterm1_82,midterm2_80,final_85", "midterm1_88,midterm2_90,final_92", "midterm1_85,midterm2_86,final_88")
)

pom <- separate(student_data, name_age, c("name", "age"), "_")
pom2 <- separate_rows(pom, exam_scores, sep=",")
pom3 <- separate(pom2, exam_scores, c("exam", "score"), '_')

# Transform data to wide format
spread(pom3, key=exam, value=score)

## # A tibble: 50 × 6
##    student_id name    age   final midterm1 midterm2
##         <int> <chr>   <chr> <chr> <chr>    <chr>   
##  1          1 John    21    75    80       85      
##  2          2 Alice   20    80    75       78      
##  3          3 Bob     22    85    82       80      
##  4          4 Emily   23    92    88       90      
##  5          5 Michael 22    88    85       86      
##  6          6 John    21    75    80       85      
##  7          7 Alice   20    80    75       78      
##  8          8 Bob     22    85    82       80      
##  9          9 Emily   23    92    88       90      
## 10         10 Michael 22    88    85       86      
## # ℹ 40 more rows

# UNITE
library(tidyr)
set.seed(1)

# Data
student_data <- data.frame(
  student_id = 1:50,
  name_age = c("John_21", "Alice_20", "Bob_22", "Emily_23", "Michael_22"),
  exam_scores = c("midterm1_80,midterm2_85,final_75", "midterm1_75,midterm2_78,final_80", "midterm1_82,midterm2_80,final_85", "midterm1_88,midterm2_90,final_92", "midterm1_85,midterm2_86,final_88")
)

pom <- separate(student_data, name_age, c("name", "age"), "_")
pom2 <- separate_rows(pom, exam_scores, sep=",")
pom3 <- separate(pom2, exam_scores, c("exam", "score"), '_')

unite(pom3, name_age, c("name", "age"))

## # A tibble: 150 × 4
##    student_id name_age exam     score
##         <int> <chr>    <chr>    <chr>
##  1          1 John_21  midterm1 80   
##  2          1 John_21  midterm2 85   
##  3          1 John_21  final    75   
##  4          2 Alice_20 midterm1 75   
##  5          2 Alice_20 midterm2 78   
##  6          2 Alice_20 final    80   
##  7          3 Bob_22   midterm1 82   
##  8          3 Bob_22   midterm2 80   
##  9          3 Bob_22   final    85   
## 10          4 Emily_23 midterm1 88   
## # ℹ 140 more rows