R dplyr: select()

knit("2. dplyr.Rmd")

## 
## 
## processing file: 2. dplyr.Rmd

## 
  |                                                      
  |                                                |   0%
  |                                                      
  |...                                             |   6%                   
  |                                                      
  |.....                                           |  11% [unnamed-chunk-20]
  |                                                      
  |........                                        |  17%                   
  |                                                      
  |...........                                     |  22% [unnamed-chunk-21]
  |                                                      
  |.............                                   |  28%                   
  |                                                      
  |................                                |  33% [unnamed-chunk-22]
  |                                                      
  |...................                             |  39%                   
  |                                                      
  |.....................                           |  44% [unnamed-chunk-23]
  |                                                      
  |........................                        |  50%                   
  |                                                      
  |...........................                     |  56% [unnamed-chunk-24]
  |                                                      
  |.............................                   |  61%                   
  |                                                      
  |................................                |  67% [unnamed-chunk-25]
  |                                                      
  |...................................             |  72%                   
  |                                                      
  |.....................................           |  78% [unnamed-chunk-26]
  |                                                      
  |........................................        |  83%                   
  |                                                      
  |...........................................     |  89% [unnamed-chunk-27]
  |                                                      
  |.............................................   |  94%                   
  |                                                      
  |................................................| 100% [unnamed-chunk-28]

## output file: 2. dplyr.md

## [1] "2. dplyr.md"

#dplyr: select()

# Sample data set: one row per person with a few descriptive attributes.
data <- data.frame(
  name       = c("Alice", "Bob", "Charlie", "David"),
  age        = c(28, 35, 22, 29),
  gender     = c("Female", "Male", "Male", "Male"),
  occupation = c("Engineer", "Teacher", "Student", "Doctor")
)
print(data)

# Keep only the name and occupation columns with select().
selected_data <- data %>%
  select(name, occupation)
print(selected_data)

# Load the built-in iris data set and preview its first rows.
# data() is resolved as a function call here even when a variable named
# `data` exists, because R skips non-function objects when looking up a call.
data("iris")
head(iris)


# Example 1: use filter() to keep only the flowers whose petal length
# (Petal.Length) is greater than 4.

filtered_data <- filter(iris, Petal.Length > 4)

head(filtered_data)

# Example 2: use select() to keep only the Sepal.Length and Species columns.

selected_data <- select(iris, Sepal.Length, Species)

head(selected_data)

# Example 3: use mutate() to add a column holding the ratio of petal length
# to sepal length (Petal.Length / Sepal.Length).

mutated_data <- mutate(
  iris,
  Petal_Sepal_Ratio = Petal.Length / Sepal.Length
)

head(mutated_data)

# Example 4: use arrange() to sort the rows by petal length (Petal.Length)
# in descending order.

arranged_data <- arrange(iris, desc(Petal.Length))

head(arranged_data)

# Example 5: use summarize() to compute the mean petal length (Petal.Length)
# for each species (Species).

summary_data <- summarize(
  group_by(iris, Species),
  Avg_Petal_Length = mean(Petal.Length)
)

print(summary_data)

# Use distinct() to get the unique values of a given column, here Species.

unique_species <- distinct(iris, Species)

print(unique_species)

# Use inner_join() to combine two data sets on their shared Species column.

# Lookup table with one Sepal_Length value per species.
# Species is built as a factor with the same levels as iris$Species so the
# join key has the same type on both sides. Joining a factor key to a
# character key coerces the key to character in the result (older dplyr
# releases also warn about it), and a bare character vector here would be
# stored differently depending on the R version's stringsAsFactors default.
sepal_data <- data.frame(
  Species = factor(
    c("setosa", "versicolor", "virginica"),
    levels = levels(iris$Species)
  ),
  Sepal_Length = c(5.1, 5.5, 6.3)
)

# inner_join() keeps the rows whose Species value appears in both tables and
# appends Sepal_Length, which is a separate column from iris's Sepal.Length.
merged_data <- iris %>%
  inner_join(sepal_data, by = "Species")

head(merged_data)

R dplyr: select()

Ho Duc Duy

2023-08-17