set.seed(10)
library(tidyr)
library(dplyr)
##
## Dołączanie pakietu: 'dplyr'
## Następujące obiekty zostały zakryte z 'package:stats':
##
## filter, lag
## Następujące obiekty zostały zakryte z 'package:base':
##
## intersect, setdiff, setequal, union
# Data 1 Import
student_data <- data.frame(
student_id = 1:50,
name = paste0("Student", 1:50),
midterm1 = sample(60:100, 50, replace = TRUE),
midterm2 = sample(60:100, 50, replace = TRUE),
final = sample(60:100, 50, replace = TRUE)
)
# Transform data to long format using gather
student_data_long <- gather(student_data,
key= "Exam_Type",
value = "Score",
-student_id,
-name,
)
## student_id name Exam_Type Score
## 1 1 Student1 midterm1 68
## 2 2 Student2 midterm1 69
## 3 3 Student3 midterm1 71
## 4 4 Student4 midterm1 67
## 5 5 Student5 midterm1 98
## 6 6 Student6 midterm1 78
# Data 2 Import
student_data_2 <- data.frame(
student_id = 1:50,
name_age = c("John_21", "Alice_20", "Bob_22", "Emily_23", "Michael_22"),
exam_scores = c("midterm1_80,midterm2_85,final_75", "midterm1_75,midterm2_78,final_80", "midterm1_82,midterm2_80,final_85", "midterm1_88,midterm2_90,final_92", "midterm1_85,midterm2_86,final_88")
)
#Split variables name_age to 2 separate columns: name and age
student_data_2 <- separate(student_data_2,
name_age,
into=c("name","age"),
sep = "_",
remove = TRUE,
convert = FALSE,
)
# Split variables exam_scores to separate columns for each exam type
student_data_2 <- separate(student_data_2,
exam_scores,
into = c("midterm1", "midterm2", "final"),
sep = ",")
student_data_2 <- gather(student_data_2,
key="Exam_Type",
value="Score",
-student_id,
-name,
-age)
student_data_2 <- separate(student_data_2,
Score,
into=c("Exam_Type_2", "Score"),
sep = "_",
remove = TRUE,
convert = FALSE)
student_data_2$Exam_Type_2 <- NULL
head(student_data_2)
## student_id name age Exam_Type Score
## 1 1 John 21 midterm1 80
## 2 2 Alice 20 midterm1 75
## 3 3 Bob 22 midterm1 82
## 4 4 Emily 23 midterm1 88
## 5 5 Michael 22 midterm1 85
## 6 6 John 21 midterm1 80
# Data 3 Import
student_data_3 <- data.frame(
student_id = 1:50,
name_age = c("John_21", "Alice_20", "Bob_22", "Emily_23", "Michael_22"),
exam_scores = c("midterm1_80,midterm2_85,final_75", "midterm1_75,midterm2_78,final_80", "midterm1_82,midterm2_80,final_85", "midterm1_88,midterm2_90,final_92", "midterm1_85,midterm2_86,final_88")
)
student_data_3 <- separate(student_data_3,
name_age,
into = c("name", "age"),
sep = "_")
student_data_3 <- separate(student_data_3,
exam_scores,
into = c("midterm1_name", "midterm1_score", "midterm2_name", "midterm2_score", "final_name", "final_score"),
sep = "[_,]")
colnames(student_data_3) <- c("student_id", "name", "age", "exam_name", "exam_score", "exam_name2", "exam_score2", "exam_name3", "exam_score3")
# Complete missing combinations for students
student_data_3 <- complete(student_data_3, student_id, nesting(name, age),
fill = list(exam_name = NA, exam_score = NA, exam_name2 = NA,
exam_score2 = NA, exam_name3 = NA, exam_score3 = NA))
head(student_data_3)
## # A tibble: 6 × 9
## student_id name age exam_name exam_score exam_name2 exam_score2 exam_name3
## <int> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 1 Alice 20 <NA> <NA> <NA> <NA> <NA>
## 2 1 Bob 22 <NA> <NA> <NA> <NA> <NA>
## 3 1 Emily 23 <NA> <NA> <NA> <NA> <NA>
## 4 1 John 21 midterm1 80 midterm2 85 final
## 5 1 Micha… 22 <NA> <NA> <NA> <NA> <NA>
## 6 2 Alice 20 midterm1 75 midterm2 78 final
## # ℹ 1 more variable: exam_score3 <chr>
# Data 4 Import
student_data_4 <- data.frame(
student_id = 1:50,
name_age = c("John_21", "Alice_20", "Bob_22", "Emily_23", "Michael_22"),
exam_scores = c("midterm1_80,midterm2_85,final_75", "midterm1_75,midterm2_78,final_80", "midterm1_82,midterm2_80,final_85", "midterm1_88,midterm2_90,final_92", "midterm1_85,midterm2_86,final_88")
)
student_data_4 <- separate(student_data_4, name_age,
into = c("name", "age"),
sep = "_") %>%
separate_rows(exam_scores, sep = ",") %>%
separate(exam_scores, into = c("exam", "score"), sep = "_") %>%
mutate(score = as.numeric(score))
# Transform data to wide format
student_data_4_wide <- spread(student_data_4, key = exam, value = score)
head(student_data_4)
## # A tibble: 6 × 5
## student_id name age exam score
## <int> <chr> <chr> <chr> <dbl>
## 1 1 John 21 midterm1 80
## 2 1 John 21 midterm2 85
## 3 1 John 21 final 75
## 4 2 Alice 20 midterm1 75
## 5 2 Alice 20 midterm2 78
## 6 2 Alice 20 final 80
head(student_data_4_wide)
## # A tibble: 6 × 6
## student_id name age final midterm1 midterm2
## <int> <chr> <chr> <dbl> <dbl> <dbl>
## 1 1 John 21 75 80 85
## 2 2 Alice 20 80 75 78
## 3 3 Bob 22 85 82 80
## 4 4 Emily 23 92 88 90
## 5 5 Michael 22 88 85 86
## 6 6 John 21 75 80 85
#Merge variables name and age to one column called name_age
student_data_5 <- unite(student_data_2, "name_age", "name", "age", sep = "_")
head(student_data_5)
## student_id name_age Exam_Type Score
## 1 1 John_21 midterm1 80
## 2 2 Alice_20 midterm1 75
## 3 3 Bob_22 midterm1 82
## 4 4 Emily_23 midterm1 88
## 5 5 Michael_22 midterm1 85
## 6 6 John_21 midterm1 80