Data Wrangling Report

Michał Malinowski, Julia Koza, Tomasz Błaszczak

2024-03-20

#GATHER
library(tidyr)
## Warning: package 'tidyr' was built under R version 4.3.3
set.seed(1)

# Data
student_data <- data.frame(
  student_id = 1:50,
  name = paste0("Student", 1:50),
  midterm1 = sample(60:100, 50, replace = TRUE),
  midterm2 = sample(60:100, 50, replace = TRUE),
  final = sample(60:100, 50, replace = TRUE)
)

student_data_long <- student_data %>% pivot_longer(
               cols = c(midterm1, midterm2, final),
               names_to = "exam_type",
               values_to = "score"
               )

# Print first few rows:
head(student_data_long)
## # A tibble: 6 × 4
##   student_id name     exam_type score
##        <int> <chr>    <chr>     <int>
## 1          1 Student1 midterm1     63
## 2          1 Student1 midterm2     82
## 3          1 Student1 final        60
## 4          2 Student2 midterm1     98
## 5          2 Student2 midterm2     65
## 6          2 Student2 final        88
#SEPARATE
library(tidyr)
set.seed(1)

# Data
student_data <- data.frame(student_id = 1:50,  
name_age = c("John_21", "Alice_20", "Bob_22", "Emily_23", "Michael_22"),  
exam_scores = c("midterm1_80,midterm2_85,final_75", "midterm1_75,midterm2_78,final_80","midterm1_82,midterm2_80,final_85", "midterm1_88,midterm2_90,final_92", "midterm1_85,midterm2_86,final_88"))

# Transform data to long format: 
# This is already long format and we don't have to gather ?

pom <- separate(student_data, name_age, c("name", "age"), "_")
# Yes I understand that naming variables like that is a bad practise but I am in a hurry

# Split variables name_age to 2 separate columns: name and age
pom
##    student_id    name age                      exam_scores
## 1           1    John  21 midterm1_80,midterm2_85,final_75
## 2           2   Alice  20 midterm1_75,midterm2_78,final_80
## 3           3     Bob  22 midterm1_82,midterm2_80,final_85
## 4           4   Emily  23 midterm1_88,midterm2_90,final_92
## 5           5 Michael  22 midterm1_85,midterm2_86,final_88
## 6           6    John  21 midterm1_80,midterm2_85,final_75
## 7           7   Alice  20 midterm1_75,midterm2_78,final_80
## 8           8     Bob  22 midterm1_82,midterm2_80,final_85
## 9           9   Emily  23 midterm1_88,midterm2_90,final_92
## 10         10 Michael  22 midterm1_85,midterm2_86,final_88
## 11         11    John  21 midterm1_80,midterm2_85,final_75
## 12         12   Alice  20 midterm1_75,midterm2_78,final_80
## 13         13     Bob  22 midterm1_82,midterm2_80,final_85
## 14         14   Emily  23 midterm1_88,midterm2_90,final_92
## 15         15 Michael  22 midterm1_85,midterm2_86,final_88
## 16         16    John  21 midterm1_80,midterm2_85,final_75
## 17         17   Alice  20 midterm1_75,midterm2_78,final_80
## 18         18     Bob  22 midterm1_82,midterm2_80,final_85
## 19         19   Emily  23 midterm1_88,midterm2_90,final_92
## 20         20 Michael  22 midterm1_85,midterm2_86,final_88
## 21         21    John  21 midterm1_80,midterm2_85,final_75
## 22         22   Alice  20 midterm1_75,midterm2_78,final_80
## 23         23     Bob  22 midterm1_82,midterm2_80,final_85
## 24         24   Emily  23 midterm1_88,midterm2_90,final_92
## 25         25 Michael  22 midterm1_85,midterm2_86,final_88
## 26         26    John  21 midterm1_80,midterm2_85,final_75
## 27         27   Alice  20 midterm1_75,midterm2_78,final_80
## 28         28     Bob  22 midterm1_82,midterm2_80,final_85
## 29         29   Emily  23 midterm1_88,midterm2_90,final_92
## 30         30 Michael  22 midterm1_85,midterm2_86,final_88
## 31         31    John  21 midterm1_80,midterm2_85,final_75
## 32         32   Alice  20 midterm1_75,midterm2_78,final_80
## 33         33     Bob  22 midterm1_82,midterm2_80,final_85
## 34         34   Emily  23 midterm1_88,midterm2_90,final_92
## 35         35 Michael  22 midterm1_85,midterm2_86,final_88
## 36         36    John  21 midterm1_80,midterm2_85,final_75
## 37         37   Alice  20 midterm1_75,midterm2_78,final_80
## 38         38     Bob  22 midterm1_82,midterm2_80,final_85
## 39         39   Emily  23 midterm1_88,midterm2_90,final_92
## 40         40 Michael  22 midterm1_85,midterm2_86,final_88
## 41         41    John  21 midterm1_80,midterm2_85,final_75
## 42         42   Alice  20 midterm1_75,midterm2_78,final_80
## 43         43     Bob  22 midterm1_82,midterm2_80,final_85
## 44         44   Emily  23 midterm1_88,midterm2_90,final_92
## 45         45 Michael  22 midterm1_85,midterm2_86,final_88
## 46         46    John  21 midterm1_80,midterm2_85,final_75
## 47         47   Alice  20 midterm1_75,midterm2_78,final_80
## 48         48     Bob  22 midterm1_82,midterm2_80,final_85
## 49         49   Emily  23 midterm1_88,midterm2_90,final_92
## 50         50 Michael  22 midterm1_85,midterm2_86,final_88
# Split variables exam_scores to separate columns for each exam type
a <- separate(pom, exam_scores, c("midterm1", "midterm2", "final"), sep = ",")
a$midterm1 <- gsub("midterm1_", "", a$midterm1)
a$midterm2 <- gsub("midterm2_", "", a$midterm2)
a$final <- gsub("final_", "", a$final)
a
##    student_id    name age midterm1 midterm2 final
## 1           1    John  21       80       85    75
## 2           2   Alice  20       75       78    80
## 3           3     Bob  22       82       80    85
## 4           4   Emily  23       88       90    92
## 5           5 Michael  22       85       86    88
## 6           6    John  21       80       85    75
## 7           7   Alice  20       75       78    80
## 8           8     Bob  22       82       80    85
## 9           9   Emily  23       88       90    92
## 10         10 Michael  22       85       86    88
## 11         11    John  21       80       85    75
## 12         12   Alice  20       75       78    80
## 13         13     Bob  22       82       80    85
## 14         14   Emily  23       88       90    92
## 15         15 Michael  22       85       86    88
## 16         16    John  21       80       85    75
## 17         17   Alice  20       75       78    80
## 18         18     Bob  22       82       80    85
## 19         19   Emily  23       88       90    92
## 20         20 Michael  22       85       86    88
## 21         21    John  21       80       85    75
## 22         22   Alice  20       75       78    80
## 23         23     Bob  22       82       80    85
## 24         24   Emily  23       88       90    92
## 25         25 Michael  22       85       86    88
## 26         26    John  21       80       85    75
## 27         27   Alice  20       75       78    80
## 28         28     Bob  22       82       80    85
## 29         29   Emily  23       88       90    92
## 30         30 Michael  22       85       86    88
## 31         31    John  21       80       85    75
## 32         32   Alice  20       75       78    80
## 33         33     Bob  22       82       80    85
## 34         34   Emily  23       88       90    92
## 35         35 Michael  22       85       86    88
## 36         36    John  21       80       85    75
## 37         37   Alice  20       75       78    80
## 38         38     Bob  22       82       80    85
## 39         39   Emily  23       88       90    92
## 40         40 Michael  22       85       86    88
## 41         41    John  21       80       85    75
## 42         42   Alice  20       75       78    80
## 43         43     Bob  22       82       80    85
## 44         44   Emily  23       88       90    92
## 45         45 Michael  22       85       86    88
## 46         46    John  21       80       85    75
## 47         47   Alice  20       75       78    80
## 48         48     Bob  22       82       80    85
## 49         49   Emily  23       88       90    92
## 50         50 Michael  22       85       86    88
# FINAL RESULT Transform student_data2 to long format
#pom2 <- separate_rows(pom, exam_scores, sep=",")
#pom3 <- separate(pom2, exam_scores, c("exam", "score"), '_')
#pom3

student_data_long <- a %>% pivot_longer(
               cols = c(midterm1, midterm2, final),
               names_to = "exam_type",
               values_to = "score"
               )
student_data_long
## # A tibble: 150 × 5
##    student_id name  age   exam_type score
##         <int> <chr> <chr> <chr>     <chr>
##  1          1 John  21    midterm1  80   
##  2          1 John  21    midterm2  85   
##  3          1 John  21    final     75   
##  4          2 Alice 20    midterm1  75   
##  5          2 Alice 20    midterm2  78   
##  6          2 Alice 20    final     80   
##  7          3 Bob   22    midterm1  82   
##  8          3 Bob   22    midterm2  80   
##  9          3 Bob   22    final     85   
## 10          4 Emily 23    midterm1  88   
## # ℹ 140 more rows
# COMPLETE
library(tidyr)
set.seed(1)

# Data
student_data <- data.frame(
  student_id = 1:50,
  name_age = c("John_21", "Alice_20", "Bob_22", "Emily_23", "Michael_22"),
  exam_scores = c("midterm1_80,midterm2_85,final_75", "midterm1_75,midterm2_78,final_80", "midterm1_82,midterm2_80,final_85", "midterm1_88,midterm2_90,final_92", "midterm1_85,midterm2_86,final_88")
)

pom <- separate(student_data, name_age, c("name", "age"), "_")
pom2 <- separate_rows(pom, exam_scores, sep=",")
pom3 <- separate(pom2, exam_scores, c("exam", "score"), '_')

complete(pom3, student_id, name, age, exam, score, fill = list(score = 0))
## # A tibble: 27,000 × 5
##    student_id name  age   exam     score
##         <int> <chr> <chr> <chr>    <chr>
##  1          1 Alice 20    final    75   
##  2          1 Alice 20    final    78   
##  3          1 Alice 20    final    80   
##  4          1 Alice 20    final    82   
##  5          1 Alice 20    final    85   
##  6          1 Alice 20    final    86   
##  7          1 Alice 20    final    88   
##  8          1 Alice 20    final    90   
##  9          1 Alice 20    final    92   
## 10          1 Alice 20    midterm1 75   
## # ℹ 26,990 more rows
# SPREAD
library(tidyr)
set.seed(1)

# Data
student_data <- data.frame(
  student_id = 1:50,
  name_age = c("John_21", "Alice_20", "Bob_22", "Emily_23", "Michael_22"),
  exam_scores = c("midterm1_80,midterm2_85,final_75", "midterm1_75,midterm2_78,final_80", "midterm1_82,midterm2_80,final_85", "midterm1_88,midterm2_90,final_92", "midterm1_85,midterm2_86,final_88")
)

pom <- separate(student_data, name_age, c("name", "age"), "_")
pom2 <- separate_rows(pom, exam_scores, sep=",")
pom3 <- separate(pom2, exam_scores, c("exam", "score"), '_')

# Transform data to wide format
spread(pom3, key=exam, value=score)
## # A tibble: 50 × 6
##    student_id name    age   final midterm1 midterm2
##         <int> <chr>   <chr> <chr> <chr>    <chr>   
##  1          1 John    21    75    80       85      
##  2          2 Alice   20    80    75       78      
##  3          3 Bob     22    85    82       80      
##  4          4 Emily   23    92    88       90      
##  5          5 Michael 22    88    85       86      
##  6          6 John    21    75    80       85      
##  7          7 Alice   20    80    75       78      
##  8          8 Bob     22    85    82       80      
##  9          9 Emily   23    92    88       90      
## 10         10 Michael 22    88    85       86      
## # ℹ 40 more rows
# UNITE
library(tidyr)
set.seed(1)
# Data
student_data <- data.frame(
  student_id = 1:50,
  name_age = c("John_21", "Alice_20", "Bob_22", "Emily_23", "Michael_22"),
  exam_scores = c("midterm1_80,midterm2_85,final_75", "midterm1_75,midterm2_78,final_80", "midterm1_82,midterm2_80,final_85", "midterm1_88,midterm2_90,final_92", "midterm1_85,midterm2_86,final_88")
)

pom <- separate(student_data, name_age, c("name", "age"), "_")
pom2 <- separate_rows(pom, exam_scores, sep=",")
pom3 <- separate(pom2, exam_scores, c("exam", "score"), '_')

unite(pom3, name_age, c("name", "age"))
## # A tibble: 150 × 4
##    student_id name_age exam     score
##         <int> <chr>    <chr>    <chr>
##  1          1 John_21  midterm1 80   
##  2          1 John_21  midterm2 85   
##  3          1 John_21  final    75   
##  4          2 Alice_20 midterm1 75   
##  5          2 Alice_20 midterm2 78   
##  6          2 Alice_20 final    80   
##  7          3 Bob_22   midterm1 82   
##  8          3 Bob_22   midterm2 80   
##  9          3 Bob_22   final    85   
## 10          4 Emily_23 midterm1 88   
## # ℹ 140 more rows