DataWrangling

set.seed(10)
library(tidyr)
library(dplyr)
## 
## Dołączanie pakietu: 'dplyr'
## Następujące obiekty zostały zakryte z 'package:stats':
## 
##     filter, lag
## Następujące obiekty zostały zakryte z 'package:base':
## 
##     intersect, setdiff, setequal, union
# Data 1 Import
student_data <- data.frame(
  student_id = 1:50,
  name = paste0("Student", 1:50),
  midterm1 = sample(60:100, 50, replace = TRUE),
  midterm2 = sample(60:100, 50, replace = TRUE),
  final = sample(60:100, 50, replace = TRUE)
)
# Transform data to long format using gather
 student_data_long <- gather(student_data,
         key= "Exam_Type",
         value = "Score",
         -student_id,
         -name,
        )
head(student_data_long)
##   student_id     name Exam_Type Score
## 1          1 Student1  midterm1    68
## 2          2 Student2  midterm1    69
## 3          3 Student3  midterm1    71
## 4          4 Student4  midterm1    67
## 5          5 Student5  midterm1    98
## 6          6 Student6  midterm1    78
# Data 2 Import

student_data_2 <- data.frame(
  student_id = 1:50,
  name_age = c("John_21", "Alice_20", "Bob_22", "Emily_23", "Michael_22"),
  exam_scores = c("midterm1_80,midterm2_85,final_75", "midterm1_75,midterm2_78,final_80", "midterm1_82,midterm2_80,final_85", "midterm1_88,midterm2_90,final_92", "midterm1_85,midterm2_86,final_88")
)
#Split variables name_age to 2 separate columns: name and age
student_data_2 <- separate(student_data_2,
         name_age,
         into=c("name","age"),
         sep = "_",
         remove = TRUE,
         convert = FALSE,
         )
# Split variables exam_scores to separate columns for each exam type
student_data_2 <- separate(student_data_2,
                            exam_scores,
                           into = c("midterm1", "midterm2", "final"),
                           sep = ",")
student_data_2 <- gather(student_data_2,
                              key="Exam_Type",
                              value="Score",
                              -student_id,
                              -name,
                              -age)
student_data_2 <- separate(student_data_2,
                                Score,
                                into=c("Exam_Type_2", "Score"),
                                sep = "_",
                                remove = TRUE,
                                convert = FALSE)
student_data_2$Exam_Type_2 <- NULL

head(student_data_2)
##   student_id    name age Exam_Type Score
## 1          1    John  21  midterm1    80
## 2          2   Alice  20  midterm1    75
## 3          3     Bob  22  midterm1    82
## 4          4   Emily  23  midterm1    88
## 5          5 Michael  22  midterm1    85
## 6          6    John  21  midterm1    80
# Data 3 Import
student_data_3 <- data.frame(
  student_id = 1:50,
  name_age = c("John_21", "Alice_20", "Bob_22", "Emily_23", "Michael_22"),
  exam_scores = c("midterm1_80,midterm2_85,final_75", "midterm1_75,midterm2_78,final_80", "midterm1_82,midterm2_80,final_85", "midterm1_88,midterm2_90,final_92", "midterm1_85,midterm2_86,final_88")
  )

student_data_3 <- separate(student_data_3,
                           name_age,
                           into = c("name", "age"),
                           sep = "_")

student_data_3 <- separate(student_data_3,
                           exam_scores, 
                           into = c("midterm1_name", "midterm1_score", "midterm2_name", "midterm2_score", "final_name", "final_score"),
                           sep = "[_,]")

colnames(student_data_3) <- c("student_id", "name", "age", "exam_name", "exam_score", "exam_name2", "exam_score2", "exam_name3", "exam_score3")
# Complete missing combinations for students
student_data_3 <- complete(student_data_3, student_id, nesting(name, age),
                           fill = list(exam_name = NA, exam_score = NA, exam_name2 = NA,
                                       exam_score2 = NA, exam_name3 = NA, exam_score3 = NA))
head(student_data_3)
## # A tibble: 6 × 9
##   student_id name   age   exam_name exam_score exam_name2 exam_score2 exam_name3
##        <int> <chr>  <chr> <chr>     <chr>      <chr>      <chr>       <chr>     
## 1          1 Alice  20    <NA>      <NA>       <NA>       <NA>        <NA>      
## 2          1 Bob    22    <NA>      <NA>       <NA>       <NA>        <NA>      
## 3          1 Emily  23    <NA>      <NA>       <NA>       <NA>        <NA>      
## 4          1 John   21    midterm1  80         midterm2   85          final     
## 5          1 Micha… 22    <NA>      <NA>       <NA>       <NA>        <NA>      
## 6          2 Alice  20    midterm1  75         midterm2   78          final     
## # ℹ 1 more variable: exam_score3 <chr>
# Data 4 Import
student_data_4 <- data.frame(
  student_id = 1:50,
  name_age = c("John_21", "Alice_20", "Bob_22", "Emily_23", "Michael_22"),
  exam_scores = c("midterm1_80,midterm2_85,final_75", "midterm1_75,midterm2_78,final_80", "midterm1_82,midterm2_80,final_85", "midterm1_88,midterm2_90,final_92", "midterm1_85,midterm2_86,final_88")
)

student_data_4 <- separate(student_data_4, name_age,
                           into = c("name", "age"),
                           sep = "_") %>%
  separate_rows(exam_scores, sep = ",") %>%
  separate(exam_scores, into = c("exam", "score"), sep = "_") %>%
  mutate(score = as.numeric(score))
# Transform data to wide format
student_data_4_wide <- spread(student_data_4, key = exam, value = score)

head(student_data_4)
## # A tibble: 6 × 5
##   student_id name  age   exam     score
##        <int> <chr> <chr> <chr>    <dbl>
## 1          1 John  21    midterm1    80
## 2          1 John  21    midterm2    85
## 3          1 John  21    final       75
## 4          2 Alice 20    midterm1    75
## 5          2 Alice 20    midterm2    78
## 6          2 Alice 20    final       80
head(student_data_4_wide)
## # A tibble: 6 × 6
##   student_id name    age   final midterm1 midterm2
##        <int> <chr>   <chr> <dbl>    <dbl>    <dbl>
## 1          1 John    21       75       80       85
## 2          2 Alice   20       80       75       78
## 3          3 Bob     22       85       82       80
## 4          4 Emily   23       92       88       90
## 5          5 Michael 22       88       85       86
## 6          6 John    21       75       80       85
#Merge variables name and age to one column called name_age
student_data_5 <- unite(student_data_2, "name_age", "name", "age", sep = "_")
head(student_data_5)
##   student_id   name_age Exam_Type Score
## 1          1    John_21  midterm1    80
## 2          2   Alice_20  midterm1    75
## 3          3     Bob_22  midterm1    82
## 4          4   Emily_23  midterm1    88
## 5          5 Michael_22  midterm1    85
## 6          6    John_21  midterm1    80