To clean and manupulate the data in R, we use tidyr and dplyr. This
lesson is prepared to equip students with the ability to manage all data
before analysis.
# Import the library
library(tidyr) # Clean the data
library(dplyr) # Manipulate the data
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
1 How to create the data in R
# Option 1
## First, create each variable
ID = 1:10
Age = c(25, 30, 22, 35, 28, 40, 19, 33, 26, 37)
Gender = c("Male", "Female", "Female", "Male", "Female", "Male", "Male", "Female", "Female", "Male")
## Second, create dataframe
data <- data.frame(ID, Age, Gender)
# Show the data
data
## ID Age Gender
## 1 1 25 Male
## 2 2 30 Female
## 3 3 22 Female
## 4 4 35 Male
## 5 5 28 Female
## 6 6 40 Male
## 7 7 19 Male
## 8 8 33 Female
## 9 9 26 Female
## 10 10 37 Male
# Option 2
data <- tibble(
ID = 1:10,
Age = c(25, 30, 22, 35, 28, 40, 19, 33, 26, 37),
Gender = c("Male", "Female", "Female", "Male", "Female",
"Male", "Male", "Female", "Female", "Male")
)
# Show the data
data
## # A tibble: 10 × 3
## ID Age Gender
## <int> <dbl> <chr>
## 1 1 25 Male
## 2 2 30 Female
## 3 3 22 Female
## 4 4 35 Male
## 5 5 28 Female
## 6 6 40 Male
## 7 7 19 Male
## 8 8 33 Female
## 9 9 26 Female
## 10 10 37 Male
2. Filter the data (filter
)
2.1 Filter rows where Age is greater than 30
# Filter rows where Age > 30
df_age <- data %>% filter(Age > 30)
print(df_age)
## # A tibble: 4 × 3
## ID Age Gender
## <int> <dbl> <chr>
## 1 4 35 Male
## 2 6 40 Male
## 3 8 33 Female
## 4 10 37 Male
2.2 Filter rows where Gender is “Female”
# Filter rows where Gender is "Female"
df_female<- data %>% filter(Gender == "Female")
print(df_female)
## # A tibble: 5 × 3
## ID Age Gender
## <int> <dbl> <chr>
## 1 2 30 Female
## 2 3 22 Female
## 3 5 28 Female
## 4 8 33 Female
## 5 9 26 Female
2.3 Filter rows where Gender, deleting “Female”
# Filter rows where Gender is "Female"
exc_female<- data %>% filter(!Gender == "Female")
print(exc_female)
## # A tibble: 5 × 3
## ID Age Gender
## <int> <dbl> <chr>
## 1 1 25 Male
## 2 4 35 Male
## 3 6 40 Male
## 4 7 19 Male
## 5 10 37 Male
2.4 Filter rows where Age is between 25 and 35 (inclusive)
# Filter rows where Age is between 25 and 35
age_range <- data %>% filter(Age >= 25 & Age <= 35)
print(age_range)
## # A tibble: 6 × 3
## ID Age Gender
## <int> <dbl> <chr>
## 1 1 25 Male
## 2 2 30 Female
## 3 4 35 Male
## 4 5 28 Female
## 5 8 33 Female
## 6 9 26 Female
2.5 Filter rows with multiple conditions (e.g., Age > 25 and
Gender is “Male”)
# Filter rows where Age > 25 and Gender is "Male"
age_male <- data %>% filter(Age > 25, Gender == "Male")
print(age_male)
## # A tibble: 3 × 3
## ID Age Gender
## <int> <dbl> <chr>
## 1 4 35 Male
## 2 6 40 Male
## 3 10 37 Male
3. Select the data (select
)
Example Dataset
# Example dataset
library(dplyr)
data <- tibble(
ID = 1:10,
Age = c(25, 30, 22, 35, 28, 40, 19, 33, 26, 37),
Gender = c("Male", "Female", "Female", "Male", "Female",
"Male", "Male", "Female", "Female", "Male"),
Score = c(85, 90, 78, 88, 76, 95, 65, 84, 82, 91)
)
print(data)
## # A tibble: 10 × 4
## ID Age Gender Score
## <int> <dbl> <chr> <dbl>
## 1 1 25 Male 85
## 2 2 30 Female 90
## 3 3 22 Female 78
## 4 4 35 Male 88
## 5 5 28 Female 76
## 6 6 40 Male 95
## 7 7 19 Male 65
## 8 8 33 Female 84
## 9 9 26 Female 82
## 10 10 37 Male 91
3.1 Select Specific Columns
# Select only ID and Age columns
selected_data <- data %>% select(ID, Age)
print(selected_data)
## # A tibble: 10 × 2
## ID Age
## <int> <dbl>
## 1 1 25
## 2 2 30
## 3 3 22
## 4 4 35
## 5 5 28
## 6 6 40
## 7 7 19
## 8 8 33
## 9 9 26
## 10 10 37
3.2 Select Specific Columns (using column number)
# Select only ID and Age columns
df <- data %>% select(1, 3)
print(df)
## # A tibble: 10 × 2
## ID Gender
## <int> <chr>
## 1 1 Male
## 2 2 Female
## 3 3 Female
## 4 4 Male
## 5 5 Female
## 6 6 Male
## 7 7 Male
## 8 8 Female
## 9 9 Female
## 10 10 Male
3.3 Delect Columns
# Delete ID and Age columns by using "-"
df1 <- data %>% select(-c(ID, Age))
print(df1)
## # A tibble: 10 × 2
## Gender Score
## <chr> <dbl>
## 1 Male 85
## 2 Female 90
## 3 Female 78
## 4 Male 88
## 5 Female 76
## 6 Male 95
## 7 Male 65
## 8 Female 84
## 9 Female 82
## 10 Male 91
4. Rename Columns
# Rename column 'Score' to 'Exam_Score'
renamed_data <- data %>% rename(Exam_Score = Score)
print(renamed_data)
## # A tibble: 10 × 4
## ID Age Gender Exam_Score
## <int> <dbl> <chr> <dbl>
## 1 1 25 Male 85
## 2 2 30 Female 90
## 3 3 22 Female 78
## 4 4 35 Male 88
## 5 5 28 Female 76
## 6 6 40 Male 95
## 7 7 19 Male 65
## 8 8 33 Female 84
## 9 9 26 Female 82
## 10 10 37 Male 91
5. Create or Modify Columns (Mutate
)
# Add a new column 'Age_Group' based on Age
mutated_data1 <- data %>% mutate(half_score = Score/2)
print(mutated_data1)
## # A tibble: 10 × 5
## ID Age Gender Score half_score
## <int> <dbl> <chr> <dbl> <dbl>
## 1 1 25 Male 85 42.5
## 2 2 30 Female 90 45
## 3 3 22 Female 78 39
## 4 4 35 Male 88 44
## 5 5 28 Female 76 38
## 6 6 40 Male 95 47.5
## 7 7 19 Male 65 32.5
## 8 8 33 Female 84 42
## 9 9 26 Female 82 41
## 10 10 37 Male 91 45.5
# Add a new column 'Age_Group' based on Age
mutated_data <- data %>% mutate(Age_Group = if_else(Age > 30, "Older", "Younger"))
print(mutated_data)
## # A tibble: 10 × 5
## ID Age Gender Score Age_Group
## <int> <dbl> <chr> <dbl> <chr>
## 1 1 25 Male 85 Younger
## 2 2 30 Female 90 Younger
## 3 3 22 Female 78 Younger
## 4 4 35 Male 88 Older
## 5 5 28 Female 76 Younger
## 6 6 40 Male 95 Older
## 7 7 19 Male 65 Younger
## 8 8 33 Female 84 Older
## 9 9 26 Female 82 Younger
## 10 10 37 Male 91 Older
6. Arrange the data (arrange
)
6.1 Arrange Rows by a Column (large to small)
# Arrange rows by Age in descending order
arranged_data <- data %>% arrange(desc(Age))
print(arranged_data)
## # A tibble: 10 × 4
## ID Age Gender Score
## <int> <dbl> <chr> <dbl>
## 1 6 40 Male 95
## 2 10 37 Male 91
## 3 4 35 Male 88
## 4 8 33 Female 84
## 5 2 30 Female 90
## 6 5 28 Female 76
## 7 9 26 Female 82
## 8 1 25 Male 85
## 9 3 22 Female 78
## 10 7 19 Male 65
6.2 Arrange Rows by a Column (small to large)
# Arrange rows by Age in descending order
arranged_data1 <- data %>% arrange(Age)
print(arranged_data1)
## # A tibble: 10 × 4
## ID Age Gender Score
## <int> <dbl> <chr> <dbl>
## 1 7 19 Male 65
## 2 3 22 Female 78
## 3 1 25 Male 85
## 4 9 26 Female 82
## 5 5 28 Female 76
## 6 2 30 Female 90
## 7 8 33 Female 84
## 8 4 35 Male 88
## 9 10 37 Male 91
## 10 6 40 Male 95
6.3 Arrange Rows by a Column (categorical variable)
# Arrange rows by Age in descending order
arranged_data <- data %>% arrange(Gender)
print(arranged_data)
## # A tibble: 10 × 4
## ID Age Gender Score
## <int> <dbl> <chr> <dbl>
## 1 2 30 Female 90
## 2 3 22 Female 78
## 3 5 28 Female 76
## 4 8 33 Female 84
## 5 9 26 Female 82
## 6 1 25 Male 85
## 7 4 35 Male 88
## 8 6 40 Male 95
## 9 7 19 Male 65
## 10 10 37 Male 91
Example Dataset
# Example dataset
library(dplyr)
library(tidyr)
data <- tibble(
ID = 1:10,
Age = c(25, 30, NA, 35, 28, 40, 19, NA, 26, 37),
Score = c(85, 90, 78, NA, 76, 95, NA, 84, 82, 91)
)
print(data)
## # A tibble: 10 × 3
## ID Age Score
## <int> <dbl> <dbl>
## 1 1 25 85
## 2 2 30 90
## 3 3 NA 78
## 4 4 35 NA
## 5 5 28 76
## 6 6 40 95
## 7 7 19 NA
## 8 8 NA 84
## 9 9 26 82
## 10 10 37 91
9.1 Replace Missing Values (replace_na
)
# Replace NA in Age and Score columns with specific values
replaced_data <- data %>%
replace_na(list(Age = 0, Score = 50))
print(replaced_data)
## # A tibble: 10 × 3
## ID Age Score
## <int> <dbl> <dbl>
## 1 1 25 85
## 2 2 30 90
## 3 3 0 78
## 4 4 35 50
## 5 5 28 76
## 6 6 40 95
## 7 7 19 50
## 8 8 0 84
## 9 9 26 82
## 10 10 37 91
9.2 Conditional Replacement (ifelse
)
# Create a new column 'Pass' based on Score
data
## # A tibble: 10 × 3
## ID Age Score
## <int> <dbl> <dbl>
## 1 1 25 85
## 2 2 30 90
## 3 3 NA 78
## 4 4 35 NA
## 5 5 28 76
## 6 6 40 95
## 7 7 19 NA
## 8 8 NA 84
## 9 9 26 82
## 10 10 37 91
conditional_data <- data %>%
mutate(Pass = ifelse(Score >= 80, "Yes", "No"))
print(conditional_data)
## # A tibble: 10 × 4
## ID Age Score Pass
## <int> <dbl> <dbl> <chr>
## 1 1 25 85 Yes
## 2 2 30 90 Yes
## 3 3 NA 78 No
## 4 4 35 NA <NA>
## 5 5 28 76 No
## 6 6 40 95 Yes
## 7 7 19 NA <NA>
## 8 8 NA 84 Yes
## 9 9 26 82 Yes
## 10 10 37 91 Yes
9.3 Combine replace_na
and ifelse
# Replace NA in Score with 50 and then create 'Pass' column
combined_data <- data %>%
replace_na(list(Score = 50)) %>%
mutate(Pass = ifelse(Score >= 80, "Yes", "No"))
print(combined_data)
## # A tibble: 10 × 4
## ID Age Score Pass
## <int> <dbl> <dbl> <chr>
## 1 1 25 85 Yes
## 2 2 30 90 Yes
## 3 3 NA 78 No
## 4 4 35 50 No
## 5 5 28 76 No
## 6 6 40 95 Yes
## 7 7 19 50 No
## 8 8 NA 84 Yes
## 9 9 26 82 Yes
## 10 10 37 91 Yes