# GATHER
# Reshape the per-exam score columns (wide) into one row per student and exam
# (long).
library(tidyr)
## Warning: package 'tidyr' was built under R version 4.3.3
set.seed(1)

# Data: 50 students, three exam scores each, drawn with replacement from
# 60..100. The three sample() calls run in this order (midterm1, midterm2,
# final), which fixes the values obtained for the seed above.
student_data <- data.frame(
  student_id = 1:50,
  name       = paste0("Student", 1:50),
  midterm1   = sample(60:100, 50, replace = TRUE),
  midterm2   = sample(60:100, 50, replace = TRUE),
  final      = sample(60:100, 50, replace = TRUE)
)

# Stack the three exam columns: the former column name goes to `exam_type`,
# the cell value to `score`.
student_data_long <- pivot_longer(
  student_data,
  cols      = c(midterm1, midterm2, final),
  names_to  = "exam_type",
  values_to = "score"
)

# Show the first six rows.
head(student_data_long, n = 6)
## # A tibble: 6 × 4
## student_id name exam_type score
## <int> <chr> <chr> <int>
## 1 1 Student1 midterm1 63
## 2 1 Student1 midterm2 82
## 3 1 Student1 final 60
## 4 2 Student2 midterm1 98
## 5 2 Student2 midterm2 65
## 6 2 Student2 final 88
# SEPARATE
# Split the packed "name_age" and "exam_scores" columns into tidy columns.
library(tidyr)
set.seed(1)

# Data: the five name_age / exam_scores values are recycled over the
# 50 student ids.
student_data <- data.frame(
  student_id = 1:50,
  name_age = c("John_21", "Alice_20", "Bob_22", "Emily_23", "Michael_22"),
  exam_scores = c(
    "midterm1_80,midterm2_85,final_75",
    "midterm1_75,midterm2_78,final_80",
    "midterm1_82,midterm2_80,final_85",
    "midterm1_88,midterm2_90,final_92",
    "midterm1_85,midterm2_86,final_88"
  )
)

# Transform data to long format.
# Once the comma-separated exams are split into rows, the data is already long
# (one row per student and exam), so no extra gather/pivot step is needed.

# Step 1: "John_21" -> name = "John", age = "21".
pom <- student_data %>%
  separate(col = name_age, into = c("name", "age"), sep = "_")

# Step 2: one row per comma-separated exam entry.
pom2 <- pom %>%
  separate_rows(exam_scores, sep = ",")

# Step 3: "midterm1_80" -> exam = "midterm1", score = "80".
pom3 <- pom2 %>%
  separate(col = exam_scores, into = c("exam", "score"), sep = "_")

# NOTE: pom / pom2 / pom3 are throwaway names kept from the original draft.
pom3
## # A tibble: 150 × 5
## student_id name age exam score
## <int> <chr> <chr> <chr> <chr>
## 1 1 John 21 midterm1 80
## 2 1 John 21 midterm2 85
## 3 1 John 21 final 75
## 4 2 Alice 20 midterm1 75
## 5 2 Alice 20 midterm2 78
## 6 2 Alice 20 final 80
## 7 3 Bob 22 midterm1 82
## 8 3 Bob 22 midterm2 80
## 9 3 Bob 22 final 85
## 10 4 Emily 23 midterm1 88
## # ℹ 140 more rows
# COMPLETE
# Ensure every student has one row for each exam type; a student/exam
# combination that is absent from the data gets a score of 0.
library(tidyr)
set.seed(1)

# Data: the five name_age / exam_scores values are recycled over the
# 50 student ids.
student_data <- data.frame(
  student_id = 1:50,
  name_age = c("John_21", "Alice_20", "Bob_22", "Emily_23", "Michael_22"),
  exam_scores = c(
    "midterm1_80,midterm2_85,final_75",
    "midterm1_75,midterm2_78,final_80",
    "midterm1_82,midterm2_80,final_85",
    "midterm1_88,midterm2_90,final_92",
    "midterm1_85,midterm2_86,final_88"
  )
)

pom <- separate(student_data, name_age, into = c("name", "age"), sep = "_")
pom2 <- separate_rows(pom, exam_scores, sep = ",")
# convert = TRUE parses the score text into integers, so that the numeric
# fill value below matches the column type.
pom3 <- separate(
  pom2, exam_scores,
  into = c("exam", "score"), sep = "_", convert = TRUE
)

# nesting() keeps only the student_id/name/age combinations that actually
# occur and crosses them with every exam type. `score` is a measured value,
# not a key: expanding over it as well (as an earlier version did) multiplies
# every student by every distinct score and leaves nothing for `fill` to fill.
complete(pom3, nesting(student_id, name, age), exam, fill = list(score = 0L))
## # A tibble: 150 × 5
## student_id name age exam score
## <int> <chr> <chr> <chr> <int>
## 1 1 John 21 final 75
## 2 1 John 21 midterm1 80
## 3 1 John 21 midterm2 85
## 4 2 Alice 20 final 80
## 5 2 Alice 20 midterm1 75
## 6 2 Alice 20 midterm2 78
## 7 3 Bob 22 final 85
## 8 3 Bob 22 midterm1 82
## 9 3 Bob 22 midterm2 80
## 10 4 Emily 23 final 92
## # ℹ 140 more rows
# SPREAD
# Reshape the long exam table back to wide format: one row per student, one
# column per exam type.
library(tidyr)
set.seed(1)

# Data: the five name_age / exam_scores values are recycled over the
# 50 student ids.
student_data <- data.frame(
  student_id = 1:50,
  name_age = c("John_21", "Alice_20", "Bob_22", "Emily_23", "Michael_22"),
  exam_scores = c(
    "midterm1_80,midterm2_85,final_75",
    "midterm1_75,midterm2_78,final_80",
    "midterm1_82,midterm2_80,final_85",
    "midterm1_88,midterm2_90,final_92",
    "midterm1_85,midterm2_86,final_88"
  )
)

pom <- separate(student_data, name_age, into = c("name", "age"), sep = "_")
pom2 <- separate_rows(pom, exam_scores, sep = ",")
pom3 <- separate(pom2, exam_scores, into = c("exam", "score"), sep = "_")

# Transform data to wide format.
# pivot_wider() is the maintained replacement for the superseded spread().
# names_sort = TRUE orders the new columns alphabetically (final, midterm1,
# midterm2), which is the column order spread() produced.
pivot_wider(pom3, names_from = exam, values_from = score, names_sort = TRUE)
## # A tibble: 50 × 6
## student_id name age final midterm1 midterm2
## <int> <chr> <chr> <chr> <chr> <chr>
## 1 1 John 21 75 80 85
## 2 2 Alice 20 80 75 78
## 3 3 Bob 22 85 82 80
## 4 4 Emily 23 92 88 90
## 5 5 Michael 22 88 85 86
## 6 6 John 21 75 80 85
## 7 7 Alice 20 80 75 78
## 8 8 Bob 22 85 82 80
## 9 9 Emily 23 92 88 90
## 10 10 Michael 22 88 85 86
## # ℹ 40 more rows
# UNITE
# Split the packed columns, then glue `name` and `age` back into a single
# `name_age` column.
library(tidyr)
set.seed(1)

# Data: the five name_age / exam_scores values are recycled over the
# 50 student ids.
student_data <- data.frame(
  student_id = 1:50,
  name_age = c("John_21", "Alice_20", "Bob_22", "Emily_23", "Michael_22"),
  exam_scores = c(
    "midterm1_80,midterm2_85,final_75",
    "midterm1_75,midterm2_78,final_80",
    "midterm1_82,midterm2_80,final_85",
    "midterm1_88,midterm2_90,final_92",
    "midterm1_85,midterm2_86,final_88"
  )
)

# "John_21" -> name = "John", age = "21".
pom <- student_data %>%
  separate(col = name_age, into = c("name", "age"), sep = "_")

# One row per comma-separated exam entry.
pom2 <- pom %>%
  separate_rows(exam_scores, sep = ",")

# "midterm1_80" -> exam = "midterm1", score = "80".
pom3 <- pom2 %>%
  separate(col = exam_scores, into = c("exam", "score"), sep = "_")

# Rejoin name and age with "_" (unite()'s default separator, spelled out here);
# the two source columns are dropped from the result.
unite(pom3, col = name_age, name, age, sep = "_")
## # A tibble: 150 × 4
## student_id name_age exam score
## <int> <chr> <chr> <chr>
## 1 1 John_21 midterm1 80
## 2 1 John_21 midterm2 85
## 3 1 John_21 final 75
## 4 2 Alice_20 midterm1 75
## 5 2 Alice_20 midterm2 78
## 6 2 Alice_20 final 80
## 7 3 Bob_22 midterm1 82
## 8 3 Bob_22 midterm2 80
## 9 3 Bob_22 final 85
## 10 4 Emily_23 midterm1 88
## # ℹ 140 more rows