Data Wrangling Report

Aleksandra Templin & Ksawery Raupuk

2024-03-13

Exercise 1 (Gather)

library(tidyr)
set.seed(7)
# Data
student_data <- data.frame(
  student_id = 1:50,
  name = paste0("Student", 1:50),
  midterm1 = sample(60:100, 50, replace = TRUE),
  midterm2 = sample(60:100, 50, replace = TRUE),
  final = sample(60:100, 50, replace = TRUE)
)

# Transform data to long format using gather:
student_data_long <- gather(student_data, key = "exam", value = "score", -student_id, -name)  

# Print first few rows:
head(student_data_long)
##   student_id     name     exam score
## 1          1 Student1 midterm1    78
## 2          2 Student2 midterm1    90
## 3          3 Student3 midterm1    87
## 4          4 Student4 midterm1    98
## 5          5 Student5 midterm1    61
## 6          6 Student6 midterm1    74

Exercise 2 (Separate) Part 1

library(tidyr)

# Data
student_data2 <- data.frame(
  student_id = 1:50,
  name_age = c("John_21", "Alice_20", "Bob_22", "Emily_23", "Michael_22"),
  exam_scores = c("midterm1_80,midterm2_85,final_75", "midterm1_75,midterm2_78,final_80", "midterm1_82,midterm2_80,final_85", "midterm1_88,midterm2_90,final_92", "midterm1_85,midterm2_86,final_88")
)

# Split variables name_age to 2 separate columns: name and age
seperated<-separate(student_data2, name_age, c('name', 'age'), sep = '_')

head(seperated)
##   student_id    name age                      exam_scores
## 1          1    John  21 midterm1_80,midterm2_85,final_75
## 2          2   Alice  20 midterm1_75,midterm2_78,final_80
## 3          3     Bob  22 midterm1_82,midterm2_80,final_85
## 4          4   Emily  23 midterm1_88,midterm2_90,final_92
## 5          5 Michael  22 midterm1_85,midterm2_86,final_88
## 6          6    John  21 midterm1_80,midterm2_85,final_75

Exercise 2 (Separate) Part 2

# Split variables exam_scores to separate columns for each exam type
seperated_exams<-separate(student_data2, exam_scores, c("midterm1", "midterm2", "final"), sep = ",")
head(seperated_exams)
##   student_id   name_age    midterm1    midterm2    final
## 1          1    John_21 midterm1_80 midterm2_85 final_75
## 2          2   Alice_20 midterm1_75 midterm2_78 final_80
## 3          3     Bob_22 midterm1_82 midterm2_80 final_85
## 4          4   Emily_23 midterm1_88 midterm2_90 final_92
## 5          5 Michael_22 midterm1_85 midterm2_86 final_88
## 6          6    John_21 midterm1_80 midterm2_85 final_75
# Transform student_data2 to long format: data is already in long format

Exercise 3 (Complete)

library(tidyr)

# Data
student_data <- data.frame(
  student_id = 1:50,
  name_age = c("John_21", "Alice_20", "Bob_22", "Emily_23", "Michael_22"),
  exam_scores = c("midterm1_80,midterm2_85,final_75", "midterm1_75,midterm2_78,final_80", 
                  "midterm1_82,midterm2_80,final_85", "midterm1_88,midterm2_90,final_92", 
                  "midterm1_85,midterm2_86,final_88")
)

# Complete missing combinations for students
completed_student_data <- complete(student_data, name_age, exam_scores, fill = list(), explicit = TRUE)

head(completed_student_data)
## # A tibble: 6 × 3
##   name_age exam_scores                      student_id
##   <chr>    <chr>                                 <int>
## 1 Alice_20 midterm1_75,midterm2_78,final_80          2
## 2 Alice_20 midterm1_75,midterm2_78,final_80          7
## 3 Alice_20 midterm1_75,midterm2_78,final_80         12
## 4 Alice_20 midterm1_75,midterm2_78,final_80         17
## 5 Alice_20 midterm1_75,midterm2_78,final_80         22
## 6 Alice_20 midterm1_75,midterm2_78,final_80         27

Exercise 4 (Spread)

library(tidyr)

# Data
student_data <- data.frame(
  student_id = 1:50,
  name_age = c("John_21", "Alice_20", "Bob_22", "Emily_23", "Michael_22"),
  exam_scores = c("midterm1_80,midterm2_85,final_75", "midterm1_75,midterm2_78,final_80", "midterm1_82,midterm2_80,final_85", "midterm1_88,midterm2_90,final_92", "midterm1_85,midterm2_86,final_88")
)


# Transform data to wide format
student_data_wide<-separate(student_data, exam_scores, into = c("midterm1", "midterm2", "final"), sep = ",")
student_data_wide<-separate(student_data_wide, name_age, into = c('name', 'age'), sep = '_')

head(student_data_wide)
##   student_id    name age    midterm1    midterm2    final
## 1          1    John  21 midterm1_80 midterm2_85 final_75
## 2          2   Alice  20 midterm1_75 midterm2_78 final_80
## 3          3     Bob  22 midterm1_82 midterm2_80 final_85
## 4          4   Emily  23 midterm1_88 midterm2_90 final_92
## 5          5 Michael  22 midterm1_85 midterm2_86 final_88
## 6          6    John  21 midterm1_80 midterm2_85 final_75

Exercise 5 (Unite)

library(tidyr)

# Student and age were already merged, so I changed source code so we can merge them ourselves
student_data <- data.frame(
  student_id = 1:50,
  name = c("John", "Alice", "Bob", "Emily", "Michael"),
  age = c("21", "20", "22", "23", "22"),
  exam_scores = c("midterm1_80,midterm2_85,final_75", "midterm1_75,midterm2_78,final_80", "midterm1_82,midterm2_80,final_85", "midterm1_88,midterm2_90,final_92", "midterm1_85,midterm2_86,final_88")
)

# Merge variables name and age to one column called name_age
student_data_united <- unite(student_data, name, age, col="name_age", sep="_")

head(student_data_united)
##   student_id   name_age                      exam_scores
## 1          1    John_21 midterm1_80,midterm2_85,final_75
## 2          2   Alice_20 midterm1_75,midterm2_78,final_80
## 3          3     Bob_22 midterm1_82,midterm2_80,final_85
## 4          4   Emily_23 midterm1_88,midterm2_90,final_92
## 5          5 Michael_22 midterm1_85,midterm2_86,final_88
## 6          6    John_21 midterm1_80,midterm2_85,final_75