Data_Wrangling

Evgenei Khodar

First task:

set.seed(8419)
library(tidyr)

# Data
student_data <- data.frame(
  student_id = 1:50,
  name = paste0("Student", 1:50),
  midterm1 = sample(60:100, 50, replace = TRUE),
  midterm2 = sample(60:100, 50, replace = TRUE),
  final = sample(60:100, 50, replace = TRUE)
)

# Transform data to long format using gather:
student_data_1 <- gather(student_data,
key = "Exam_type",
value= "result",
na.rm = FALSE,
convert = FALSE,
factor_key = FALSE,
-name, -student_id
)
# Print first few rows:
head(student_data_1)
##   student_id     name Exam_type result
## 1          1 Student1  midterm1     77
## 2          2 Student2  midterm1     81
## 3          3 Student3  midterm1     88
## 4          4 Student4  midterm1     90
## 5          5 Student5  midterm1     72
## 6          6 Student6  midterm1     92

Second task:

library(tidyr)
library(dplyr)

# Data
student_data2 <- data.frame(
  student_id = 1:50,
  name_age = c("John_21", "Alice_20", "Bob_22", "Emily_23", "Michael_22"),
  exam_scores = c("midterm1_80,midterm2_85,final_75", "midterm1_75,midterm2_78,final_80", "midterm1_82,midterm2_80,final_85", "midterm1_88,midterm2_90,final_92", "midterm1_85,midterm2_86,final_88")
)

# Split variables name_age to 2 separate columns: name and age
student_data2<- separate(
 student_data2,
  name_age,
  into = c("name","age"),
  sep = "_",
  remove = TRUE,
  convert = FALSE,
  extra = "warn",
  fill = "warn",
)

# Split variables exam_scores to separate columns for each exam type
student_data2 <- separate(
 student_data2,
  exam_scores,
  into = c("midterm1", "midterm2", "final"),
  sep = ",",
  remove = TRUE,
  convert = FALSE,
  extra = "drop",
  fill = "warn",
) 
student_data2 <- student_data2%>%
  mutate(
    midterm1 = sub(".*_", "", midterm1),
    midterm2 = sub(".*_", "", midterm2),
    final = sub(".*_", "", final)
  )
# Transform student_data2 to long format

student_data2 <- gather(student_data2,
key = "Exam_type",
value= "result",
na.rm = FALSE,
convert = FALSE,
factor_key = FALSE,
 -student_id, -name, -age
)
student_data2
##     student_id    name age Exam_type result
## 1            1    John  21  midterm1     80
## 2            2   Alice  20  midterm1     75
## 3            3     Bob  22  midterm1     82
## 4            4   Emily  23  midterm1     88
## 5            5 Michael  22  midterm1     85
## 6            6    John  21  midterm1     80
## 7            7   Alice  20  midterm1     75
## 8            8     Bob  22  midterm1     82
## 9            9   Emily  23  midterm1     88
## 10          10 Michael  22  midterm1     85
## 11          11    John  21  midterm1     80
## 12          12   Alice  20  midterm1     75
## 13          13     Bob  22  midterm1     82
## 14          14   Emily  23  midterm1     88
## 15          15 Michael  22  midterm1     85
## 16          16    John  21  midterm1     80
## 17          17   Alice  20  midterm1     75
## 18          18     Bob  22  midterm1     82
## 19          19   Emily  23  midterm1     88
## 20          20 Michael  22  midterm1     85
## 21          21    John  21  midterm1     80
## 22          22   Alice  20  midterm1     75
## 23          23     Bob  22  midterm1     82
## 24          24   Emily  23  midterm1     88
## 25          25 Michael  22  midterm1     85
## 26          26    John  21  midterm1     80
## 27          27   Alice  20  midterm1     75
## 28          28     Bob  22  midterm1     82
## 29          29   Emily  23  midterm1     88
## 30          30 Michael  22  midterm1     85
## 31          31    John  21  midterm1     80
## 32          32   Alice  20  midterm1     75
## 33          33     Bob  22  midterm1     82
## 34          34   Emily  23  midterm1     88
## 35          35 Michael  22  midterm1     85
## 36          36    John  21  midterm1     80
## 37          37   Alice  20  midterm1     75
## 38          38     Bob  22  midterm1     82
## 39          39   Emily  23  midterm1     88
## 40          40 Michael  22  midterm1     85
## 41          41    John  21  midterm1     80
## 42          42   Alice  20  midterm1     75
## 43          43     Bob  22  midterm1     82
## 44          44   Emily  23  midterm1     88
## 45          45 Michael  22  midterm1     85
## 46          46    John  21  midterm1     80
## 47          47   Alice  20  midterm1     75
## 48          48     Bob  22  midterm1     82
## 49          49   Emily  23  midterm1     88
## 50          50 Michael  22  midterm1     85
## 51           1    John  21  midterm2     85
## 52           2   Alice  20  midterm2     78
## 53           3     Bob  22  midterm2     80
## 54           4   Emily  23  midterm2     90
## 55           5 Michael  22  midterm2     86
## 56           6    John  21  midterm2     85
## 57           7   Alice  20  midterm2     78
## 58           8     Bob  22  midterm2     80
## 59           9   Emily  23  midterm2     90
## 60          10 Michael  22  midterm2     86
## 61          11    John  21  midterm2     85
## 62          12   Alice  20  midterm2     78
## 63          13     Bob  22  midterm2     80
## 64          14   Emily  23  midterm2     90
## 65          15 Michael  22  midterm2     86
## 66          16    John  21  midterm2     85
## 67          17   Alice  20  midterm2     78
## 68          18     Bob  22  midterm2     80
## 69          19   Emily  23  midterm2     90
## 70          20 Michael  22  midterm2     86
## 71          21    John  21  midterm2     85
## 72          22   Alice  20  midterm2     78
## 73          23     Bob  22  midterm2     80
## 74          24   Emily  23  midterm2     90
## 75          25 Michael  22  midterm2     86
## 76          26    John  21  midterm2     85
## 77          27   Alice  20  midterm2     78
## 78          28     Bob  22  midterm2     80
## 79          29   Emily  23  midterm2     90
## 80          30 Michael  22  midterm2     86
## 81          31    John  21  midterm2     85
## 82          32   Alice  20  midterm2     78
## 83          33     Bob  22  midterm2     80
## 84          34   Emily  23  midterm2     90
## 85          35 Michael  22  midterm2     86
## 86          36    John  21  midterm2     85
## 87          37   Alice  20  midterm2     78
## 88          38     Bob  22  midterm2     80
## 89          39   Emily  23  midterm2     90
## 90          40 Michael  22  midterm2     86
## 91          41    John  21  midterm2     85
## 92          42   Alice  20  midterm2     78
## 93          43     Bob  22  midterm2     80
## 94          44   Emily  23  midterm2     90
## 95          45 Michael  22  midterm2     86
## 96          46    John  21  midterm2     85
## 97          47   Alice  20  midterm2     78
## 98          48     Bob  22  midterm2     80
## 99          49   Emily  23  midterm2     90
## 100         50 Michael  22  midterm2     86
## 101          1    John  21     final     75
## 102          2   Alice  20     final     80
## 103          3     Bob  22     final     85
## 104          4   Emily  23     final     92
## 105          5 Michael  22     final     88
## 106          6    John  21     final     75
## 107          7   Alice  20     final     80
## 108          8     Bob  22     final     85
## 109          9   Emily  23     final     92
## 110         10 Michael  22     final     88
## 111         11    John  21     final     75
## 112         12   Alice  20     final     80
## 113         13     Bob  22     final     85
## 114         14   Emily  23     final     92
## 115         15 Michael  22     final     88
## 116         16    John  21     final     75
## 117         17   Alice  20     final     80
## 118         18     Bob  22     final     85
## 119         19   Emily  23     final     92
## 120         20 Michael  22     final     88
## 121         21    John  21     final     75
## 122         22   Alice  20     final     80
## 123         23     Bob  22     final     85
## 124         24   Emily  23     final     92
## 125         25 Michael  22     final     88
## 126         26    John  21     final     75
## 127         27   Alice  20     final     80
## 128         28     Bob  22     final     85
## 129         29   Emily  23     final     92
## 130         30 Michael  22     final     88
## 131         31    John  21     final     75
## 132         32   Alice  20     final     80
## 133         33     Bob  22     final     85
## 134         34   Emily  23     final     92
## 135         35 Michael  22     final     88
## 136         36    John  21     final     75
## 137         37   Alice  20     final     80
## 138         38     Bob  22     final     85
## 139         39   Emily  23     final     92
## 140         40 Michael  22     final     88
## 141         41    John  21     final     75
## 142         42   Alice  20     final     80
## 143         43     Bob  22     final     85
## 144         44   Emily  23     final     92
## 145         45 Michael  22     final     88
## 146         46    John  21     final     75
## 147         47   Alice  20     final     80
## 148         48     Bob  22     final     85
## 149         49   Emily  23     final     92
## 150         50 Michael  22     final     88

Third task:

library(tidyr)


# Complete missing combinations for students
 complete (student_data2, Exam_type,result, fill = list(name="-"), explicit = TRUE)
## # A tibble: 162 × 5
##    Exam_type result student_id name  age  
##    <chr>     <chr>       <int> <chr> <chr>
##  1 final     75              1 John  21   
##  2 final     75              6 John  21   
##  3 final     75             11 John  21   
##  4 final     75             16 John  21   
##  5 final     75             21 John  21   
##  6 final     75             26 John  21   
##  7 final     75             31 John  21   
##  8 final     75             36 John  21   
##  9 final     75             41 John  21   
## 10 final     75             46 John  21   
## # ℹ 152 more rows

Fourth task:

library(tidyr)

# Data
student_data <- data.frame(
  student_id = 1:50,
  name_age = c("John_21", "Alice_20", "Bob_22", "Emily_23", "Michael_22"),
  exam_scores = c("midterm1_80,midterm2_85,final_75", "midterm1_75,midterm2_78,final_80", "midterm1_82,midterm2_80,final_85", "midterm1_88,midterm2_90,final_92", "midterm1_85,midterm2_86,final_88")
)


# Transform data to wide format

student_data <- spread(student_data_1, key="Exam_type", value="result", fill = NA, convert = FALSE, drop = TRUE, sep = NULL)
student_data
##    student_id      name final midterm1 midterm2
## 1           1  Student1    67       77       68
## 2           2  Student2   100       81       65
## 3           3  Student3    99       88       89
## 4           4  Student4    85       90       81
## 5           5  Student5    89       72       85
## 6           6  Student6    91       92       83
## 7           7  Student7    83       80       79
## 8           8  Student8    99       68       96
## 9           9  Student9    80       85       60
## 10         10 Student10    93       63       66
## 11         11 Student11    66       61       83
## 12         12 Student12    88       93       63
## 13         13 Student13    96       79       96
## 14         14 Student14    90       68       69
## 15         15 Student15    96       69       89
## 16         16 Student16    89       70       70
## 17         17 Student17    98       75      100
## 18         18 Student18    98       65       65
## 19         19 Student19    99       96       71
## 20         20 Student20    87       89       66
## 21         21 Student21    60       66       82
## 22         22 Student22    71       79       75
## 23         23 Student23    66       89       67
## 24         24 Student24    80       84       74
## 25         25 Student25    97       93       97
## 26         26 Student26    60       62       60
## 27         27 Student27    82       96       84
## 28         28 Student28    72      100       95
## 29         29 Student29    69       66       79
## 30         30 Student30    71       61       63
## 31         31 Student31    97      100       90
## 32         32 Student32    71       85       86
## 33         33 Student33    91       83       67
## 34         34 Student34    65       92       76
## 35         35 Student35    69       79       72
## 36         36 Student36    97       76       79
## 37         37 Student37    99       99       79
## 38         38 Student38    91       76       64
## 39         39 Student39    88       63       64
## 40         40 Student40    81       62       65
## 41         41 Student41    93       82       97
## 42         42 Student42    69       82       62
## 43         43 Student43    63       69       80
## 44         44 Student44    77       95       81
## 45         45 Student45    68       84       96
## 46         46 Student46    99       61       95
## 47         47 Student47    71       60       82
## 48         48 Student48    72       98       66
## 49         49 Student49    88       64       72
## 50         50 Student50    88       74       61

Fifth task:

unite(student_data2, "name_age", c("name","age"), sep = "_", remove = TRUE, na.rm = FALSE)
##     student_id   name_age Exam_type result
## 1            1    John_21  midterm1     80
## 2            2   Alice_20  midterm1     75
## 3            3     Bob_22  midterm1     82
## 4            4   Emily_23  midterm1     88
## 5            5 Michael_22  midterm1     85
## 6            6    John_21  midterm1     80
## 7            7   Alice_20  midterm1     75
## 8            8     Bob_22  midterm1     82
## 9            9   Emily_23  midterm1     88
## 10          10 Michael_22  midterm1     85
## 11          11    John_21  midterm1     80
## 12          12   Alice_20  midterm1     75
## 13          13     Bob_22  midterm1     82
## 14          14   Emily_23  midterm1     88
## 15          15 Michael_22  midterm1     85
## 16          16    John_21  midterm1     80
## 17          17   Alice_20  midterm1     75
## 18          18     Bob_22  midterm1     82
## 19          19   Emily_23  midterm1     88
## 20          20 Michael_22  midterm1     85
## 21          21    John_21  midterm1     80
## 22          22   Alice_20  midterm1     75
## 23          23     Bob_22  midterm1     82
## 24          24   Emily_23  midterm1     88
## 25          25 Michael_22  midterm1     85
## 26          26    John_21  midterm1     80
## 27          27   Alice_20  midterm1     75
## 28          28     Bob_22  midterm1     82
## 29          29   Emily_23  midterm1     88
## 30          30 Michael_22  midterm1     85
## 31          31    John_21  midterm1     80
## 32          32   Alice_20  midterm1     75
## 33          33     Bob_22  midterm1     82
## 34          34   Emily_23  midterm1     88
## 35          35 Michael_22  midterm1     85
## 36          36    John_21  midterm1     80
## 37          37   Alice_20  midterm1     75
## 38          38     Bob_22  midterm1     82
## 39          39   Emily_23  midterm1     88
## 40          40 Michael_22  midterm1     85
## 41          41    John_21  midterm1     80
## 42          42   Alice_20  midterm1     75
## 43          43     Bob_22  midterm1     82
## 44          44   Emily_23  midterm1     88
## 45          45 Michael_22  midterm1     85
## 46          46    John_21  midterm1     80
## 47          47   Alice_20  midterm1     75
## 48          48     Bob_22  midterm1     82
## 49          49   Emily_23  midterm1     88
## 50          50 Michael_22  midterm1     85
## 51           1    John_21  midterm2     85
## 52           2   Alice_20  midterm2     78
## 53           3     Bob_22  midterm2     80
## 54           4   Emily_23  midterm2     90
## 55           5 Michael_22  midterm2     86
## 56           6    John_21  midterm2     85
## 57           7   Alice_20  midterm2     78
## 58           8     Bob_22  midterm2     80
## 59           9   Emily_23  midterm2     90
## 60          10 Michael_22  midterm2     86
## 61          11    John_21  midterm2     85
## 62          12   Alice_20  midterm2     78
## 63          13     Bob_22  midterm2     80
## 64          14   Emily_23  midterm2     90
## 65          15 Michael_22  midterm2     86
## 66          16    John_21  midterm2     85
## 67          17   Alice_20  midterm2     78
## 68          18     Bob_22  midterm2     80
## 69          19   Emily_23  midterm2     90
## 70          20 Michael_22  midterm2     86
## 71          21    John_21  midterm2     85
## 72          22   Alice_20  midterm2     78
## 73          23     Bob_22  midterm2     80
## 74          24   Emily_23  midterm2     90
## 75          25 Michael_22  midterm2     86
## 76          26    John_21  midterm2     85
## 77          27   Alice_20  midterm2     78
## 78          28     Bob_22  midterm2     80
## 79          29   Emily_23  midterm2     90
## 80          30 Michael_22  midterm2     86
## 81          31    John_21  midterm2     85
## 82          32   Alice_20  midterm2     78
## 83          33     Bob_22  midterm2     80
## 84          34   Emily_23  midterm2     90
## 85          35 Michael_22  midterm2     86
## 86          36    John_21  midterm2     85
## 87          37   Alice_20  midterm2     78
## 88          38     Bob_22  midterm2     80
## 89          39   Emily_23  midterm2     90
## 90          40 Michael_22  midterm2     86
## 91          41    John_21  midterm2     85
## 92          42   Alice_20  midterm2     78
## 93          43     Bob_22  midterm2     80
## 94          44   Emily_23  midterm2     90
## 95          45 Michael_22  midterm2     86
## 96          46    John_21  midterm2     85
## 97          47   Alice_20  midterm2     78
## 98          48     Bob_22  midterm2     80
## 99          49   Emily_23  midterm2     90
## 100         50 Michael_22  midterm2     86
## 101          1    John_21     final     75
## 102          2   Alice_20     final     80
## 103          3     Bob_22     final     85
## 104          4   Emily_23     final     92
## 105          5 Michael_22     final     88
## 106          6    John_21     final     75
## 107          7   Alice_20     final     80
## 108          8     Bob_22     final     85
## 109          9   Emily_23     final     92
## 110         10 Michael_22     final     88
## 111         11    John_21     final     75
## 112         12   Alice_20     final     80
## 113         13     Bob_22     final     85
## 114         14   Emily_23     final     92
## 115         15 Michael_22     final     88
## 116         16    John_21     final     75
## 117         17   Alice_20     final     80
## 118         18     Bob_22     final     85
## 119         19   Emily_23     final     92
## 120         20 Michael_22     final     88
## 121         21    John_21     final     75
## 122         22   Alice_20     final     80
## 123         23     Bob_22     final     85
## 124         24   Emily_23     final     92
## 125         25 Michael_22     final     88
## 126         26    John_21     final     75
## 127         27   Alice_20     final     80
## 128         28     Bob_22     final     85
## 129         29   Emily_23     final     92
## 130         30 Michael_22     final     88
## 131         31    John_21     final     75
## 132         32   Alice_20     final     80
## 133         33     Bob_22     final     85
## 134         34   Emily_23     final     92
## 135         35 Michael_22     final     88
## 136         36    John_21     final     75
## 137         37   Alice_20     final     80
## 138         38     Bob_22     final     85
## 139         39   Emily_23     final     92
## 140         40 Michael_22     final     88
## 141         41    John_21     final     75
## 142         42   Alice_20     final     80
## 143         43     Bob_22     final     85
## 144         44   Emily_23     final     92
## 145         45 Michael_22     final     88
## 146         46    John_21     final     75
## 147         47   Alice_20     final     80
## 148         48     Bob_22     final     85
## 149         49   Emily_23     final     92
## 150         50 Michael_22     final     88