To clean and manupulate the data in R, we use tidyr and dplyr. This lesson is prepared to equip students with the ability to manage all data before analysis.

# Import the library
library(tidyr) # Clean the data
library(dplyr) # Manipulate the data
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

1 How to create the data in R

# Option 1
## First, create each variable
ID = 1:10
Age = c(25, 30, 22, 35, 28, 40, 19, 33, 26, 37)
Gender = c("Male", "Female", "Female", "Male", "Female", "Male", "Male", "Female", "Female", "Male")

## Second, create dataframe
data <- data.frame(ID, Age, Gender)
# Show the data
data
##    ID Age Gender
## 1   1  25   Male
## 2   2  30 Female
## 3   3  22 Female
## 4   4  35   Male
## 5   5  28 Female
## 6   6  40   Male
## 7   7  19   Male
## 8   8  33 Female
## 9   9  26 Female
## 10 10  37   Male
# Option 2

data <- tibble(
  ID = 1:10,
  Age = c(25, 30, 22, 35, 28, 40, 19, 33, 26, 37),
  Gender = c("Male", "Female", "Female", "Male", "Female", 
             "Male", "Male", "Female", "Female", "Male")
)

# Show the data
data
## # A tibble: 10 × 3
##       ID   Age Gender
##    <int> <dbl> <chr> 
##  1     1    25 Male  
##  2     2    30 Female
##  3     3    22 Female
##  4     4    35 Male  
##  5     5    28 Female
##  6     6    40 Male  
##  7     7    19 Male  
##  8     8    33 Female
##  9     9    26 Female
## 10    10    37 Male

2. Filter the data (filter)

2.1 Filter rows where Age is greater than 30

# Filter rows where Age > 30
df_age <- data %>% filter(Age > 30)
print(df_age)
## # A tibble: 4 × 3
##      ID   Age Gender
##   <int> <dbl> <chr> 
## 1     4    35 Male  
## 2     6    40 Male  
## 3     8    33 Female
## 4    10    37 Male

2.2 Filter rows where Gender is “Female”

# Filter rows where Gender is "Female"
df_female<- data %>% filter(Gender == "Female")
print(df_female)
## # A tibble: 5 × 3
##      ID   Age Gender
##   <int> <dbl> <chr> 
## 1     2    30 Female
## 2     3    22 Female
## 3     5    28 Female
## 4     8    33 Female
## 5     9    26 Female

2.3 Filter rows where Gender, deleting “Female”

# Filter rows where Gender is "Female"
exc_female<- data %>% filter(!Gender == "Female")
print(exc_female)
## # A tibble: 5 × 3
##      ID   Age Gender
##   <int> <dbl> <chr> 
## 1     1    25 Male  
## 2     4    35 Male  
## 3     6    40 Male  
## 4     7    19 Male  
## 5    10    37 Male

2.4 Filter rows where Age is between 25 and 35 (inclusive)

# Filter rows where Age is between 25 and 35
age_range <- data %>% filter(Age >= 25 & Age <= 35)
print(age_range)
## # A tibble: 6 × 3
##      ID   Age Gender
##   <int> <dbl> <chr> 
## 1     1    25 Male  
## 2     2    30 Female
## 3     4    35 Male  
## 4     5    28 Female
## 5     8    33 Female
## 6     9    26 Female

2.5 Filter rows with multiple conditions (e.g., Age > 25 and Gender is “Male”)

# Filter rows where Age > 25 and Gender is "Male"
age_male <- data %>% filter(Age > 25, Gender == "Male")
print(age_male)
## # A tibble: 3 × 3
##      ID   Age Gender
##   <int> <dbl> <chr> 
## 1     4    35 Male  
## 2     6    40 Male  
## 3    10    37 Male

3. Select the data (select)

Example Dataset

# Example dataset
library(dplyr)

data <- tibble(
  ID = 1:10,
  Age = c(25, 30, 22, 35, 28, 40, 19, 33, 26, 37),
  Gender = c("Male", "Female", "Female", "Male", "Female", 
             "Male", "Male", "Female", "Female", "Male"),
  Score = c(85, 90, 78, 88, 76, 95, 65, 84, 82, 91)
)

print(data)
## # A tibble: 10 × 4
##       ID   Age Gender Score
##    <int> <dbl> <chr>  <dbl>
##  1     1    25 Male      85
##  2     2    30 Female    90
##  3     3    22 Female    78
##  4     4    35 Male      88
##  5     5    28 Female    76
##  6     6    40 Male      95
##  7     7    19 Male      65
##  8     8    33 Female    84
##  9     9    26 Female    82
## 10    10    37 Male      91

3.1 Select Specific Columns

# Select only ID and Age columns
selected_data <- data %>% select(ID, Age)
print(selected_data)
## # A tibble: 10 × 2
##       ID   Age
##    <int> <dbl>
##  1     1    25
##  2     2    30
##  3     3    22
##  4     4    35
##  5     5    28
##  6     6    40
##  7     7    19
##  8     8    33
##  9     9    26
## 10    10    37

3.2 Select Specific Columns (using column number)

# Select only ID and Age columns
df <- data %>% select(1, 3)
print(df)
## # A tibble: 10 × 2
##       ID Gender
##    <int> <chr> 
##  1     1 Male  
##  2     2 Female
##  3     3 Female
##  4     4 Male  
##  5     5 Female
##  6     6 Male  
##  7     7 Male  
##  8     8 Female
##  9     9 Female
## 10    10 Male

3.3 Delect Columns

# Delete ID and Age columns by using "-" 
df1 <- data %>% select(-c(ID, Age))
print(df1)
## # A tibble: 10 × 2
##    Gender Score
##    <chr>  <dbl>
##  1 Male      85
##  2 Female    90
##  3 Female    78
##  4 Male      88
##  5 Female    76
##  6 Male      95
##  7 Male      65
##  8 Female    84
##  9 Female    82
## 10 Male      91

4. Rename Columns

# Rename column 'Score' to 'Exam_Score'
renamed_data <- data %>% rename(Exam_Score = Score)
print(renamed_data)
## # A tibble: 10 × 4
##       ID   Age Gender Exam_Score
##    <int> <dbl> <chr>       <dbl>
##  1     1    25 Male           85
##  2     2    30 Female         90
##  3     3    22 Female         78
##  4     4    35 Male           88
##  5     5    28 Female         76
##  6     6    40 Male           95
##  7     7    19 Male           65
##  8     8    33 Female         84
##  9     9    26 Female         82
## 10    10    37 Male           91

5. Create or Modify Columns (Mutate)

# Add a new column 'Age_Group' based on Age
mutated_data1 <- data %>% mutate(half_score = Score/2)
print(mutated_data1)
## # A tibble: 10 × 5
##       ID   Age Gender Score half_score
##    <int> <dbl> <chr>  <dbl>      <dbl>
##  1     1    25 Male      85       42.5
##  2     2    30 Female    90       45  
##  3     3    22 Female    78       39  
##  4     4    35 Male      88       44  
##  5     5    28 Female    76       38  
##  6     6    40 Male      95       47.5
##  7     7    19 Male      65       32.5
##  8     8    33 Female    84       42  
##  9     9    26 Female    82       41  
## 10    10    37 Male      91       45.5
# Add a new column 'Age_Group' based on Age
mutated_data <- data %>% mutate(Age_Group = if_else(Age > 30, "Older", "Younger"))
print(mutated_data)
## # A tibble: 10 × 5
##       ID   Age Gender Score Age_Group
##    <int> <dbl> <chr>  <dbl> <chr>    
##  1     1    25 Male      85 Younger  
##  2     2    30 Female    90 Younger  
##  3     3    22 Female    78 Younger  
##  4     4    35 Male      88 Older    
##  5     5    28 Female    76 Younger  
##  6     6    40 Male      95 Older    
##  7     7    19 Male      65 Younger  
##  8     8    33 Female    84 Older    
##  9     9    26 Female    82 Younger  
## 10    10    37 Male      91 Older

6. Arrange the data (arrange)

6.1 Arrange Rows by a Column (large to small)

# Arrange rows by Age in descending order
arranged_data <- data %>% arrange(desc(Age))
print(arranged_data)
## # A tibble: 10 × 4
##       ID   Age Gender Score
##    <int> <dbl> <chr>  <dbl>
##  1     6    40 Male      95
##  2    10    37 Male      91
##  3     4    35 Male      88
##  4     8    33 Female    84
##  5     2    30 Female    90
##  6     5    28 Female    76
##  7     9    26 Female    82
##  8     1    25 Male      85
##  9     3    22 Female    78
## 10     7    19 Male      65

6.2 Arrange Rows by a Column (small to large)

# Arrange rows by Age in descending order
arranged_data1 <- data %>% arrange(Age)
print(arranged_data1)
## # A tibble: 10 × 4
##       ID   Age Gender Score
##    <int> <dbl> <chr>  <dbl>
##  1     7    19 Male      65
##  2     3    22 Female    78
##  3     1    25 Male      85
##  4     9    26 Female    82
##  5     5    28 Female    76
##  6     2    30 Female    90
##  7     8    33 Female    84
##  8     4    35 Male      88
##  9    10    37 Male      91
## 10     6    40 Male      95

6.3 Arrange Rows by a Column (categorical variable)

# Arrange rows by Age in descending order
arranged_data <- data %>% arrange(Gender)
print(arranged_data)
## # A tibble: 10 × 4
##       ID   Age Gender Score
##    <int> <dbl> <chr>  <dbl>
##  1     2    30 Female    90
##  2     3    22 Female    78
##  3     5    28 Female    76
##  4     8    33 Female    84
##  5     9    26 Female    82
##  6     1    25 Male      85
##  7     4    35 Male      88
##  8     6    40 Male      95
##  9     7    19 Male      65
## 10    10    37 Male      91

7. Transform Data from Wide to Long Format (pivot_longer)

# Convert dataset to a long format with Gender and Score as value columns
library(tidyr)
data
## # A tibble: 10 × 4
##       ID   Age Gender Score
##    <int> <dbl> <chr>  <dbl>
##  1     1    25 Male      85
##  2     2    30 Female    90
##  3     3    22 Female    78
##  4     4    35 Male      88
##  5     5    28 Female    76
##  6     6    40 Male      95
##  7     7    19 Male      65
##  8     8    33 Female    84
##  9     9    26 Female    82
## 10    10    37 Male      91
long_data <- data %>% pivot_longer(names_to = "Variable", values_to = "Value", c(2,4))
print(long_data)
## # A tibble: 20 × 4
##       ID Gender Variable Value
##    <int> <chr>  <chr>    <dbl>
##  1     1 Male   Age         25
##  2     1 Male   Score       85
##  3     2 Female Age         30
##  4     2 Female Score       90
##  5     3 Female Age         22
##  6     3 Female Score       78
##  7     4 Male   Age         35
##  8     4 Male   Score       88
##  9     5 Female Age         28
## 10     5 Female Score       76
## 11     6 Male   Age         40
## 12     6 Male   Score       95
## 13     7 Male   Age         19
## 14     7 Male   Score       65
## 15     8 Female Age         33
## 16     8 Female Score       84
## 17     9 Female Age         26
## 18     9 Female Score       82
## 19    10 Male   Age         37
## 20    10 Male   Score       91

8. Transform Data from Long to Wide Format (pivot_wider)

# Convert back to wide format
wide_data <- long_data %>% pivot_wider(names_from = Variable, values_from = Value)
print(wide_data)
## # A tibble: 10 × 4
##       ID Gender   Age Score
##    <int> <chr>  <dbl> <dbl>
##  1     1 Male      25    85
##  2     2 Female    30    90
##  3     3 Female    22    78
##  4     4 Male      35    88
##  5     5 Female    28    76
##  6     6 Male      40    95
##  7     7 Male      19    65
##  8     8 Female    33    84
##  9     9 Female    26    82
## 10    10 Male      37    91
    1. Using replace_na and ifelse in R

Example Dataset

# Example dataset
library(dplyr)
library(tidyr)
data <- tibble(
  ID = 1:10,
  Age = c(25, 30, NA, 35, 28, 40, 19, NA, 26, 37),
  Score = c(85, 90, 78, NA, 76, 95, NA, 84, 82, 91)
)

print(data)
## # A tibble: 10 × 3
##       ID   Age Score
##    <int> <dbl> <dbl>
##  1     1    25    85
##  2     2    30    90
##  3     3    NA    78
##  4     4    35    NA
##  5     5    28    76
##  6     6    40    95
##  7     7    19    NA
##  8     8    NA    84
##  9     9    26    82
## 10    10    37    91

9.1 Replace Missing Values (replace_na)

# Replace NA in Age and Score columns with specific values
replaced_data <- data %>% 
  replace_na(list(Age = 0, Score = 50))
print(replaced_data)
## # A tibble: 10 × 3
##       ID   Age Score
##    <int> <dbl> <dbl>
##  1     1    25    85
##  2     2    30    90
##  3     3     0    78
##  4     4    35    50
##  5     5    28    76
##  6     6    40    95
##  7     7    19    50
##  8     8     0    84
##  9     9    26    82
## 10    10    37    91

9.2 Conditional Replacement (ifelse)

# Create a new column 'Pass' based on Score
data
## # A tibble: 10 × 3
##       ID   Age Score
##    <int> <dbl> <dbl>
##  1     1    25    85
##  2     2    30    90
##  3     3    NA    78
##  4     4    35    NA
##  5     5    28    76
##  6     6    40    95
##  7     7    19    NA
##  8     8    NA    84
##  9     9    26    82
## 10    10    37    91
conditional_data <- data %>% 
  mutate(Pass = ifelse(Score >= 80, "Yes", "No"))
print(conditional_data)
## # A tibble: 10 × 4
##       ID   Age Score Pass 
##    <int> <dbl> <dbl> <chr>
##  1     1    25    85 Yes  
##  2     2    30    90 Yes  
##  3     3    NA    78 No   
##  4     4    35    NA <NA> 
##  5     5    28    76 No   
##  6     6    40    95 Yes  
##  7     7    19    NA <NA> 
##  8     8    NA    84 Yes  
##  9     9    26    82 Yes  
## 10    10    37    91 Yes

9.3 Combine replace_na and ifelse

# Replace NA in Score with 50 and then create 'Pass' column
combined_data <- data %>% 
  replace_na(list(Score = 50)) %>% 
  mutate(Pass = ifelse(Score >= 80, "Yes", "No"))
print(combined_data)
## # A tibble: 10 × 4
##       ID   Age Score Pass 
##    <int> <dbl> <dbl> <chr>
##  1     1    25    85 Yes  
##  2     2    30    90 Yes  
##  3     3    NA    78 No   
##  4     4    35    50 No   
##  5     5    28    76 No   
##  6     6    40    95 Yes  
##  7     7    19    50 No   
##  8     8    NA    84 Yes  
##  9     9    26    82 Yes  
## 10    10    37    91 Yes