#GATHER
library(tidyr)
## Warning: package 'tidyr' was built under R version 4.3.3
set.seed(1)
# Data
student_data <- data.frame(
student_id = 1:50,
name = paste0("Student", 1:50),
midterm1 = sample(60:100, 50, replace = TRUE),
midterm2 = sample(60:100, 50, replace = TRUE),
final = sample(60:100, 50, replace = TRUE)
)
student_data_long <- student_data %>% pivot_longer(
cols = c(midterm1, midterm2, final),
names_to = "exam_type",
values_to = "score"
)
# Print first few rows:
head(student_data_long)
## # A tibble: 6 × 4
## student_id name exam_type score
## <int> <chr> <chr> <int>
## 1 1 Student1 midterm1 63
## 2 1 Student1 midterm2 82
## 3 1 Student1 final 60
## 4 2 Student2 midterm1 98
## 5 2 Student2 midterm2 65
## 6 2 Student2 final 88
#SEPARATE
library(tidyr)
set.seed(1)
# Data
student_data <- data.frame(student_id = 1:50,
name_age = c("John_21", "Alice_20", "Bob_22", "Emily_23", "Michael_22"),
exam_scores = c("midterm1_80,midterm2_85,final_75", "midterm1_75,midterm2_78,final_80","midterm1_82,midterm2_80,final_85", "midterm1_88,midterm2_90,final_92", "midterm1_85,midterm2_86,final_88"))
# Transform data to long format:
# This is already long format and we don't have to gather ?
pom <- separate(student_data, name_age, c("name", "age"), "_")
# Yes I understand that naming variables like that is a bad practise but I am in a hurry
# Split variables name_age to 2 separate columns: name and age
pom
## student_id name age exam_scores
## 1 1 John 21 midterm1_80,midterm2_85,final_75
## 2 2 Alice 20 midterm1_75,midterm2_78,final_80
## 3 3 Bob 22 midterm1_82,midterm2_80,final_85
## 4 4 Emily 23 midterm1_88,midterm2_90,final_92
## 5 5 Michael 22 midterm1_85,midterm2_86,final_88
## 6 6 John 21 midterm1_80,midterm2_85,final_75
## 7 7 Alice 20 midterm1_75,midterm2_78,final_80
## 8 8 Bob 22 midterm1_82,midterm2_80,final_85
## 9 9 Emily 23 midterm1_88,midterm2_90,final_92
## 10 10 Michael 22 midterm1_85,midterm2_86,final_88
## 11 11 John 21 midterm1_80,midterm2_85,final_75
## 12 12 Alice 20 midterm1_75,midterm2_78,final_80
## 13 13 Bob 22 midterm1_82,midterm2_80,final_85
## 14 14 Emily 23 midterm1_88,midterm2_90,final_92
## 15 15 Michael 22 midterm1_85,midterm2_86,final_88
## 16 16 John 21 midterm1_80,midterm2_85,final_75
## 17 17 Alice 20 midterm1_75,midterm2_78,final_80
## 18 18 Bob 22 midterm1_82,midterm2_80,final_85
## 19 19 Emily 23 midterm1_88,midterm2_90,final_92
## 20 20 Michael 22 midterm1_85,midterm2_86,final_88
## 21 21 John 21 midterm1_80,midterm2_85,final_75
## 22 22 Alice 20 midterm1_75,midterm2_78,final_80
## 23 23 Bob 22 midterm1_82,midterm2_80,final_85
## 24 24 Emily 23 midterm1_88,midterm2_90,final_92
## 25 25 Michael 22 midterm1_85,midterm2_86,final_88
## 26 26 John 21 midterm1_80,midterm2_85,final_75
## 27 27 Alice 20 midterm1_75,midterm2_78,final_80
## 28 28 Bob 22 midterm1_82,midterm2_80,final_85
## 29 29 Emily 23 midterm1_88,midterm2_90,final_92
## 30 30 Michael 22 midterm1_85,midterm2_86,final_88
## 31 31 John 21 midterm1_80,midterm2_85,final_75
## 32 32 Alice 20 midterm1_75,midterm2_78,final_80
## 33 33 Bob 22 midterm1_82,midterm2_80,final_85
## 34 34 Emily 23 midterm1_88,midterm2_90,final_92
## 35 35 Michael 22 midterm1_85,midterm2_86,final_88
## 36 36 John 21 midterm1_80,midterm2_85,final_75
## 37 37 Alice 20 midterm1_75,midterm2_78,final_80
## 38 38 Bob 22 midterm1_82,midterm2_80,final_85
## 39 39 Emily 23 midterm1_88,midterm2_90,final_92
## 40 40 Michael 22 midterm1_85,midterm2_86,final_88
## 41 41 John 21 midterm1_80,midterm2_85,final_75
## 42 42 Alice 20 midterm1_75,midterm2_78,final_80
## 43 43 Bob 22 midterm1_82,midterm2_80,final_85
## 44 44 Emily 23 midterm1_88,midterm2_90,final_92
## 45 45 Michael 22 midterm1_85,midterm2_86,final_88
## 46 46 John 21 midterm1_80,midterm2_85,final_75
## 47 47 Alice 20 midterm1_75,midterm2_78,final_80
## 48 48 Bob 22 midterm1_82,midterm2_80,final_85
## 49 49 Emily 23 midterm1_88,midterm2_90,final_92
## 50 50 Michael 22 midterm1_85,midterm2_86,final_88
# Split variables exam_scores to separate columns for each exam type
a <- separate(pom, exam_scores, c("midterm1", "midterm2", "final"), sep = ",")
a$midterm1 <- gsub("midterm1_", "", a$midterm1)
a$midterm2 <- gsub("midterm2_", "", a$midterm2)
a$final <- gsub("final_", "", a$final)
a
## student_id name age midterm1 midterm2 final
## 1 1 John 21 80 85 75
## 2 2 Alice 20 75 78 80
## 3 3 Bob 22 82 80 85
## 4 4 Emily 23 88 90 92
## 5 5 Michael 22 85 86 88
## 6 6 John 21 80 85 75
## 7 7 Alice 20 75 78 80
## 8 8 Bob 22 82 80 85
## 9 9 Emily 23 88 90 92
## 10 10 Michael 22 85 86 88
## 11 11 John 21 80 85 75
## 12 12 Alice 20 75 78 80
## 13 13 Bob 22 82 80 85
## 14 14 Emily 23 88 90 92
## 15 15 Michael 22 85 86 88
## 16 16 John 21 80 85 75
## 17 17 Alice 20 75 78 80
## 18 18 Bob 22 82 80 85
## 19 19 Emily 23 88 90 92
## 20 20 Michael 22 85 86 88
## 21 21 John 21 80 85 75
## 22 22 Alice 20 75 78 80
## 23 23 Bob 22 82 80 85
## 24 24 Emily 23 88 90 92
## 25 25 Michael 22 85 86 88
## 26 26 John 21 80 85 75
## 27 27 Alice 20 75 78 80
## 28 28 Bob 22 82 80 85
## 29 29 Emily 23 88 90 92
## 30 30 Michael 22 85 86 88
## 31 31 John 21 80 85 75
## 32 32 Alice 20 75 78 80
## 33 33 Bob 22 82 80 85
## 34 34 Emily 23 88 90 92
## 35 35 Michael 22 85 86 88
## 36 36 John 21 80 85 75
## 37 37 Alice 20 75 78 80
## 38 38 Bob 22 82 80 85
## 39 39 Emily 23 88 90 92
## 40 40 Michael 22 85 86 88
## 41 41 John 21 80 85 75
## 42 42 Alice 20 75 78 80
## 43 43 Bob 22 82 80 85
## 44 44 Emily 23 88 90 92
## 45 45 Michael 22 85 86 88
## 46 46 John 21 80 85 75
## 47 47 Alice 20 75 78 80
## 48 48 Bob 22 82 80 85
## 49 49 Emily 23 88 90 92
## 50 50 Michael 22 85 86 88
# FINAL RESULT Transform student_data2 to long format
#pom2 <- separate_rows(pom, exam_scores, sep=",")
#pom3 <- separate(pom2, exam_scores, c("exam", "score"), '_')
#pom3
student_data_long <- a %>% pivot_longer(
cols = c(midterm1, midterm2, final),
names_to = "exam_type",
values_to = "score"
)
student_data_long
## # A tibble: 150 × 5
## student_id name age exam_type score
## <int> <chr> <chr> <chr> <chr>
## 1 1 John 21 midterm1 80
## 2 1 John 21 midterm2 85
## 3 1 John 21 final 75
## 4 2 Alice 20 midterm1 75
## 5 2 Alice 20 midterm2 78
## 6 2 Alice 20 final 80
## 7 3 Bob 22 midterm1 82
## 8 3 Bob 22 midterm2 80
## 9 3 Bob 22 final 85
## 10 4 Emily 23 midterm1 88
## # ℹ 140 more rows
# COMPLETE
library(tidyr)
set.seed(1)
# Data
student_data <- data.frame(
student_id = 1:50,
name_age = c("John_21", "Alice_20", "Bob_22", "Emily_23", "Michael_22"),
exam_scores = c("midterm1_80,midterm2_85,final_75", "midterm1_75,midterm2_78,final_80", "midterm1_82,midterm2_80,final_85", "midterm1_88,midterm2_90,final_92", "midterm1_85,midterm2_86,final_88")
)
pom <- separate(student_data, name_age, c("name", "age"), "_")
pom2 <- separate_rows(pom, exam_scores, sep=",")
pom3 <- separate(pom2, exam_scores, c("exam", "score"), '_')
complete(pom3, student_id, name, age, exam, score, fill = list(score = 0))
## # A tibble: 27,000 × 5
## student_id name age exam score
## <int> <chr> <chr> <chr> <chr>
## 1 1 Alice 20 final 75
## 2 1 Alice 20 final 78
## 3 1 Alice 20 final 80
## 4 1 Alice 20 final 82
## 5 1 Alice 20 final 85
## 6 1 Alice 20 final 86
## 7 1 Alice 20 final 88
## 8 1 Alice 20 final 90
## 9 1 Alice 20 final 92
## 10 1 Alice 20 midterm1 75
## # ℹ 26,990 more rows
# SPREAD
library(tidyr)
set.seed(1)
# Data
student_data <- data.frame(
student_id = 1:50,
name_age = c("John_21", "Alice_20", "Bob_22", "Emily_23", "Michael_22"),
exam_scores = c("midterm1_80,midterm2_85,final_75", "midterm1_75,midterm2_78,final_80", "midterm1_82,midterm2_80,final_85", "midterm1_88,midterm2_90,final_92", "midterm1_85,midterm2_86,final_88")
)
pom <- separate(student_data, name_age, c("name", "age"), "_")
pom2 <- separate_rows(pom, exam_scores, sep=",")
pom3 <- separate(pom2, exam_scores, c("exam", "score"), '_')
# Transform data to wide format
spread(pom3, key=exam, value=score)
## # A tibble: 50 × 6
## student_id name age final midterm1 midterm2
## <int> <chr> <chr> <chr> <chr> <chr>
## 1 1 John 21 75 80 85
## 2 2 Alice 20 80 75 78
## 3 3 Bob 22 85 82 80
## 4 4 Emily 23 92 88 90
## 5 5 Michael 22 88 85 86
## 6 6 John 21 75 80 85
## 7 7 Alice 20 80 75 78
## 8 8 Bob 22 85 82 80
## 9 9 Emily 23 92 88 90
## 10 10 Michael 22 88 85 86
## # ℹ 40 more rows
# UNITE
library(tidyr)
set.seed(1)
# Data
student_data <- data.frame(
student_id = 1:50,
name_age = c("John_21", "Alice_20", "Bob_22", "Emily_23", "Michael_22"),
exam_scores = c("midterm1_80,midterm2_85,final_75", "midterm1_75,midterm2_78,final_80", "midterm1_82,midterm2_80,final_85", "midterm1_88,midterm2_90,final_92", "midterm1_85,midterm2_86,final_88")
)
pom <- separate(student_data, name_age, c("name", "age"), "_")
pom2 <- separate_rows(pom, exam_scores, sep=",")
pom3 <- separate(pom2, exam_scores, c("exam", "score"), '_')
unite(pom3, name_age, c("name", "age"))
## # A tibble: 150 × 4
## student_id name_age exam score
## <int> <chr> <chr> <chr>
## 1 1 John_21 midterm1 80
## 2 1 John_21 midterm2 85
## 3 1 John_21 final 75
## 4 2 Alice_20 midterm1 75
## 5 2 Alice_20 midterm2 78
## 6 2 Alice_20 final 80
## 7 3 Bob_22 midterm1 82
## 8 3 Bob_22 midterm2 80
## 9 3 Bob_22 final 85
## 10 4 Emily_23 midterm1 88
## # ℹ 140 more rows