Data wrangling group report

R Markdown

Gather

# Data
student_data <- data.frame(
  student_id = 1:50,
  name = paste0("Student", 1:50),
  midterm1 = sample(60:100, 50, replace = TRUE),
  midterm2 = sample(60:100, 50, replace = TRUE),
  final = sample(60:100, 50, replace = TRUE)
)

# Transform data to long format using gather:
student_data_long <- gather(student_data, key = "exam", value = "score", 3:5)

# Print first few rows:
head(student_data_long)
##   student_id     name     exam score
## 1          1 Student1 midterm1    64
## 2          2 Student2 midterm1    71
## 3          3 Student3 midterm1    98
## 4          4 Student4 midterm1    95
## 5          5 Student5 midterm1    99
## 6          6 Student6 midterm1    90

Separate

# Data
student_data2 <- data.frame(
  student_id = 1:50,
  name_age = c("John_21", "Alice_20", "Bob_22", "Emily_23", "Michael_22"),
  exam_scores = c("midterm1_80,midterm2_85,final_75", "midterm1_75,midterm2_78,final_80", "midterm1_82,midterm2_80,final_85", "midterm1_88,midterm2_90,final_92", "midterm1_85,midterm2_86,final_88")
)

# Split variables name_age to 2 separate columns: name and age
student_data2<-separate(student_data2, name_age, into = c("name", "age"), sep = "_")

# Split variables exam_scores to separate columns for each exam type
student_data2<-separate(student_data2, exam_scores, into = c("midterm1", "midterm2", "final"), sep = ",")

head(student_data2)
##   student_id    name age    midterm1    midterm2    final
## 1          1    John  21 midterm1_80 midterm2_85 final_75
## 2          2   Alice  20 midterm1_75 midterm2_78 final_80
## 3          3     Bob  22 midterm1_82 midterm2_80 final_85
## 4          4   Emily  23 midterm1_88 midterm2_90 final_92
## 5          5 Michael  22 midterm1_85 midterm2_86 final_88
## 6          6    John  21 midterm1_80 midterm2_85 final_75
# Transform data to long format: 
student_data2 <- gather(student_data2, key = "exam", value = "score", 4:6) %>%
  separate(score, into = c("exam","score"), sep = "_")
 
head(student_data2)
##   student_id    name age     exam score
## 1          1    John  21 midterm1    80
## 2          2   Alice  20 midterm1    75
## 3          3     Bob  22 midterm1    82
## 4          4   Emily  23 midterm1    88
## 5          5 Michael  22 midterm1    85
## 6          6    John  21 midterm1    80

Complete

# Data
student_data <- data.frame(
  student_id = 1:50,
  name_age = c("John_21", "Alice_20", "Bob_22", "Emily_23", "Michael_22"),
  exam_scores = c("midterm1_80,midterm2_85,final_75", "midterm1_75,midterm2_78,final_80", "midterm1_82,midterm2_80,final_85", "midterm1_88,midterm2_90,final_92", "midterm1_85,midterm2_86,final_88")
)

# Complete missing combinations for students
student_data_completed<-complete(student_data, student_id, nesting(name_age, exam_scores), fill = list())
head(student_data_completed)
## # A tibble: 6 × 3
##   student_id name_age   exam_scores                     
##        <int> <chr>      <chr>                           
## 1          1 Alice_20   midterm1_75,midterm2_78,final_80
## 2          1 Bob_22     midterm1_82,midterm2_80,final_85
## 3          1 Emily_23   midterm1_88,midterm2_90,final_92
## 4          1 John_21    midterm1_80,midterm2_85,final_75
## 5          1 Michael_22 midterm1_85,midterm2_86,final_88
## 6          2 Alice_20   midterm1_75,midterm2_78,final_80

Spread

# Data
student_data <- data.frame(
  student_id = 1:50,
  name_age = c("John_21", "Alice_20", "Bob_22", "Emily_23", "Michael_22"),
  exam_scores = c("midterm1_80,midterm2_85,final_75", "midterm1_75,midterm2_78,final_80", "midterm1_82,midterm2_80,final_85", "midterm1_88,midterm2_90,final_92", "midterm1_85,midterm2_86,final_88")
)

# Transform data to wide format
student_data_wide <- separate(student_data, name_age, into = c("name", "age"), sep = "_") %>% 
  separate(exam_scores, into = c("midterm1", "midterm2", "final"), sep = ",") %>%  
    gather( key = "exam", value = "score", 4:6) %>% 
      separate(score, into = c("exam","score"), sep = "_") %>% 
        spread(key = "exam", value = "score")
head(student_data_wide)
##   student_id    name age final midterm1 midterm2
## 1          1    John  21    75       80       85
## 2          2   Alice  20    80       75       78
## 3          3     Bob  22    85       82       80
## 4          4   Emily  23    92       88       90
## 5          5 Michael  22    88       85       86
## 6          6    John  21    75       80       85

Unite

# Data
student_data <- data.frame(
  student_id = 1:50,
  name_age = c("John_21", "Alice_20", "Bob_22", "Emily_23", "Michael_22"),
  exam_scores = c("midterm1_80,midterm2_85,final_75", "midterm1_75,midterm2_78,final_80", "midterm1_82,midterm2_80,final_85", "midterm1_88,midterm2_90,final_92", "midterm1_85,midterm2_86,final_88")
)
# When name and age are in one column
student_data <- separate(student_data, name_age, into = c("name", "age"), sep = "_")
head(student_data)
##   student_id    name age                      exam_scores
## 1          1    John  21 midterm1_80,midterm2_85,final_75
## 2          2   Alice  20 midterm1_75,midterm2_78,final_80
## 3          3     Bob  22 midterm1_82,midterm2_80,final_85
## 4          4   Emily  23 midterm1_88,midterm2_90,final_92
## 5          5 Michael  22 midterm1_85,midterm2_86,final_88
## 6          6    John  21 midterm1_80,midterm2_85,final_75
# Merge variables name and age to one column called name_age
student_data <- unite(student_data, name_age, name, age, sep = "_")
head(student_data)
##   student_id   name_age                      exam_scores
## 1          1    John_21 midterm1_80,midterm2_85,final_75
## 2          2   Alice_20 midterm1_75,midterm2_78,final_80
## 3          3     Bob_22 midterm1_82,midterm2_80,final_85
## 4          4   Emily_23 midterm1_88,midterm2_90,final_92
## 5          5 Michael_22 midterm1_85,midterm2_86,final_88
## 6          6    John_21 midterm1_80,midterm2_85,final_75