Data wrangling group report
R Markdown
Gather
# Data
student_data <- data.frame(
student_id = 1:50,
name = paste0("Student", 1:50),
midterm1 = sample(60:100, 50, replace = TRUE),
midterm2 = sample(60:100, 50, replace = TRUE),
final = sample(60:100, 50, replace = TRUE)
)
# Transform data to long format using gather:
student_data_long <- gather(student_data, key = "exam", value = "score", 3:5)
# Print first few rows:
head(student_data_long)## student_id name exam score
## 1 1 Student1 midterm1 64
## 2 2 Student2 midterm1 71
## 3 3 Student3 midterm1 98
## 4 4 Student4 midterm1 95
## 5 5 Student5 midterm1 99
## 6 6 Student6 midterm1 90
Separate
# Data
student_data2 <- data.frame(
student_id = 1:50,
name_age = c("John_21", "Alice_20", "Bob_22", "Emily_23", "Michael_22"),
exam_scores = c("midterm1_80,midterm2_85,final_75", "midterm1_75,midterm2_78,final_80", "midterm1_82,midterm2_80,final_85", "midterm1_88,midterm2_90,final_92", "midterm1_85,midterm2_86,final_88")
)
# Split variables name_age to 2 separate columns: name and age
student_data2<-separate(student_data2, name_age, into = c("name", "age"), sep = "_")
# Split variables exam_scores to separate columns for each exam type
student_data2<-separate(student_data2, exam_scores, into = c("midterm1", "midterm2", "final"), sep = ",")
head(student_data2)## student_id name age midterm1 midterm2 final
## 1 1 John 21 midterm1_80 midterm2_85 final_75
## 2 2 Alice 20 midterm1_75 midterm2_78 final_80
## 3 3 Bob 22 midterm1_82 midterm2_80 final_85
## 4 4 Emily 23 midterm1_88 midterm2_90 final_92
## 5 5 Michael 22 midterm1_85 midterm2_86 final_88
## 6 6 John 21 midterm1_80 midterm2_85 final_75
# Transform data to long format:
student_data2 <- gather(student_data2, key = "exam", value = "score", 4:6) %>%
separate(score, into = c("exam","score"), sep = "_")
head(student_data2)## student_id name age exam score
## 1 1 John 21 midterm1 80
## 2 2 Alice 20 midterm1 75
## 3 3 Bob 22 midterm1 82
## 4 4 Emily 23 midterm1 88
## 5 5 Michael 22 midterm1 85
## 6 6 John 21 midterm1 80
Complete
# Data
student_data <- data.frame(
student_id = 1:50,
name_age = c("John_21", "Alice_20", "Bob_22", "Emily_23", "Michael_22"),
exam_scores = c("midterm1_80,midterm2_85,final_75", "midterm1_75,midterm2_78,final_80", "midterm1_82,midterm2_80,final_85", "midterm1_88,midterm2_90,final_92", "midterm1_85,midterm2_86,final_88")
)
# Complete missing combinations for students
student_data_completed<-complete(student_data, student_id, nesting(name_age, exam_scores), fill = list())
head(student_data_completed)## # A tibble: 6 × 3
## student_id name_age exam_scores
## <int> <chr> <chr>
## 1 1 Alice_20 midterm1_75,midterm2_78,final_80
## 2 1 Bob_22 midterm1_82,midterm2_80,final_85
## 3 1 Emily_23 midterm1_88,midterm2_90,final_92
## 4 1 John_21 midterm1_80,midterm2_85,final_75
## 5 1 Michael_22 midterm1_85,midterm2_86,final_88
## 6 2 Alice_20 midterm1_75,midterm2_78,final_80
Spread
# Data
student_data <- data.frame(
student_id = 1:50,
name_age = c("John_21", "Alice_20", "Bob_22", "Emily_23", "Michael_22"),
exam_scores = c("midterm1_80,midterm2_85,final_75", "midterm1_75,midterm2_78,final_80", "midterm1_82,midterm2_80,final_85", "midterm1_88,midterm2_90,final_92", "midterm1_85,midterm2_86,final_88")
)
# Transform data to wide format
student_data_wide <- separate(student_data, name_age, into = c("name", "age"), sep = "_") %>%
separate(exam_scores, into = c("midterm1", "midterm2", "final"), sep = ",") %>%
gather( key = "exam", value = "score", 4:6) %>%
separate(score, into = c("exam","score"), sep = "_") %>%
spread(key = "exam", value = "score")
head(student_data_wide)## student_id name age final midterm1 midterm2
## 1 1 John 21 75 80 85
## 2 2 Alice 20 80 75 78
## 3 3 Bob 22 85 82 80
## 4 4 Emily 23 92 88 90
## 5 5 Michael 22 88 85 86
## 6 6 John 21 75 80 85
Unite
# Data
student_data <- data.frame(
student_id = 1:50,
name_age = c("John_21", "Alice_20", "Bob_22", "Emily_23", "Michael_22"),
exam_scores = c("midterm1_80,midterm2_85,final_75", "midterm1_75,midterm2_78,final_80", "midterm1_82,midterm2_80,final_85", "midterm1_88,midterm2_90,final_92", "midterm1_85,midterm2_86,final_88")
)
# When name and age are in one column
student_data <- separate(student_data, name_age, into = c("name", "age"), sep = "_")
head(student_data)## student_id name age exam_scores
## 1 1 John 21 midterm1_80,midterm2_85,final_75
## 2 2 Alice 20 midterm1_75,midterm2_78,final_80
## 3 3 Bob 22 midterm1_82,midterm2_80,final_85
## 4 4 Emily 23 midterm1_88,midterm2_90,final_92
## 5 5 Michael 22 midterm1_85,midterm2_86,final_88
## 6 6 John 21 midterm1_80,midterm2_85,final_75
# Merge variables name and age to one column called name_age
student_data <- unite(student_data, name_age, name, age, sep = "_")
head(student_data)## student_id name_age exam_scores
## 1 1 John_21 midterm1_80,midterm2_85,final_75
## 2 2 Alice_20 midterm1_75,midterm2_78,final_80
## 3 3 Bob_22 midterm1_82,midterm2_80,final_85
## 4 4 Emily_23 midterm1_88,midterm2_90,final_92
## 5 5 Michael_22 midterm1_85,midterm2_86,final_88
## 6 6 John_21 midterm1_80,midterm2_85,final_75