DA_SHK44

Câu 2: Vẽ biểu đồ hộp và biểu diễn giá trị nhỏ nhất

library(readr)
library(ggplot2)
library(tidyverse)

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──

## ✓ tibble  3.1.6     ✓ dplyr   1.0.7
## ✓ tidyr   1.1.4     ✓ stringr 1.4.0
## ✓ purrr   0.3.4     ✓ forcats 0.5.1

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

# Download the iris CSV straight into a tibble
df1 <- read_csv("https://raw.githubusercontent.com/ngocdlu/K44_test/master/iris.csv")

## Rows: 150 Columns: 5

## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): species
## dbl (4): sepal_length, sepal_width, petal_length, petal_width

## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Preview the first six rows of the data
head(df1, n = 6L)

## # A tibble: 6 × 5
##   sepal_length sepal_width petal_length petal_width species
##          <dbl>       <dbl>        <dbl>       <dbl> <chr>  
## 1          5.1         3.5          1.4         0.2 setosa 
## 2          4.9         3            1.4         0.2 setosa 
## 3          4.7         3.2          1.3         0.2 setosa 
## 4          4.6         3.1          1.5         0.2 setosa 
## 5          5           3.6          1.4         0.2 setosa 
## 6          5.4         3.9          1.7         0.4 setosa

Vẽ biểu đồ hộp chiều dài đài hoa theo loài:

# Boxplot of sepal length per species, one fill colour per species
ggplot(
  data = df1,
  mapping = aes(x = species, y = sepal_length, fill = species)
) +
  geom_boxplot()

Tính giá trị nhỏ nhất chiều dài đài hoa theo loài:

# Minimum sepal length within each species, returned as a plain data.frame
min_sepal_by_species <- as.data.frame(
  summarise(
    group_by(df1, species),
    min_sepal = min(sepal_length)
  )
)
min_sepal_by_species

##      species min_sepal
## 1     setosa       4.3
## 2 versicolor       4.9
## 3  virginica       4.9

Biểu diễn giá trị nhỏ nhất lên biểu đồ hộp:

# Boxplot of sepal length per species with each species' minimum overlaid:
# a red point marks the minimum and a text label prints its value above it.
ggplot(df1, aes(x = species, y = sepal_length, fill = species)) +
  geom_boxplot() +
  # labs() sets axis titles through `x` / `y`; `xlab` / `ylab` are not
  # recognised there and would leave the default axis titles in place.
  labs(
    title = "Biểu đồ so sánh chiều dài đài hoa",
    x = "Loài thực vật",
    y = "Chiều dài đài hoa (Cm)"
  ) +
  # Plot the point at the exact minimum; rounding to a whole number would
  # move it away from the value shown in the label.
  geom_point(
    data = min_sepal_by_species,
    aes(x = species, y = min_sepal),
    colour = "red"
  ) +
  geom_text(
    data = min_sepal_by_species,
    aes(x = species, y = min_sepal, label = round(min_sepal, 1)),
    check_overlap = TRUE,
    vjust = -0.5
  )

Xong bước này lưu biểu đồ dưới dạng pdf với tên file “Plot_1”

Câu 3: Phân tích phương sai và phân tích hậu định:

# One-way ANOVA of sepal length across species, followed by Tukey's HSD
# post-hoc comparison and a plot of the pairwise confidence intervals.
# The data frame is supplied through `data =` rather than attach(), so no
# columns are placed on the search path where they could mask other objects.
data(iris)
av <- aov(Sepal.Length ~ Species, data = iris)
t <- TukeyHSD(av)
plot(t)

Xong lưu biểu đồ dạng pdf với tên “Plot_2”

Câu 4: Phân tích tương quan và phân tích thống kê

library(ggpubr)
# Download the bidoupensis leaf measurements straight into a tibble
df4 <- read_csv("https://raw.githubusercontent.com/ngocdlu/data_analysis/main/bidoupensis.csv")

## Rows: 65 Columns: 6

## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): species
## dbl (5): pet, len, wid, rat, cir

## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Scatter plot of blade width against blade length with a regression line,
# its confidence band, and the Pearson correlation coefficient
ggscatter(
  df4,
  x = "len",
  y = "wid",
  xlab = "Blade Length",
  ylab = "Blade width",
  add = "reg.line",
  conf.int = TRUE,
  cor.coef = TRUE,
  cor.method = "pearson"
)

## `geom_smooth()` using formula 'y ~ x'

Xong ý này lưu biểu đồ tương quan với tên “Plot_3”

Vẽ biểu đồ hộp so sánh chiều rộng phiến lá (wid) của 3 loài thực vật trong data set bidoupensis.csv với package ggstatsplot

library(ggstatsplot)

## You can cite this package as:
##      Patil, I. (2021). Visualizations with statistical details: The 'ggstatsplot' approach.
##      Journal of Open Source Software, 6(61), 3167, doi:10.21105/joss.03167

# Box plot comparing blade width (wid) between species, with the statistical
# annotations that ggbetweenstats adds
ggbetweenstats(
  data = df4,
  x = species,
  y = wid,
  plot.type = "box",
  title = "Biểu đồ hộp so sánh chiều rộng phiến lá",
  xlab = "Loài thực vật",
  ylab = "Chiều rộng phiến lá (Cm)"
)

Xong bước này lưu biểu đồ dạng pdf và lưu tên file là “Plot_4”

Câu 5: Phân tích thành phần chính PCA

library(FactoMineR)
library(factoextra)

## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa

# Download the bidoupensis leaf measurements for the PCA
df5 <- read_csv("https://raw.githubusercontent.com/ngocdlu/data_analysis/main/bidoupensis.csv")

## Rows: 65 Columns: 6

## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): species
## dbl (5): pet, len, wid, rat, cir

## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Tách các biến định lượng

# Keep only the first five columns of df5 and preview the result
df6 <- df5[, seq_len(5)]
head(df6, n = 6L)

## # A tibble: 6 × 5
##     pet   len   wid   rat   cir
##   <dbl> <dbl> <dbl> <dbl> <dbl>
## 1  0.61 11.0   5.21  2.12  0.75
## 2  0.58 11.6   5.03  2.3   0.74
## 3  0.35  9.1   3.72  2.45  0.64
## 4  0.32  9.72  4.26  2.28  0.71
## 5  0.35  8.99  4.16  2.16  0.67
## 6  0.43 10.0   4.93  2.04  0.74

# attach(df6) removed: the PCA steps below receive df6 / pca explicitly, so
# its columns do not need to be placed on the search path.

Tính PCA

# Run the principal component analysis without drawing FactoMineR's default
# plots, then list the components of the result object
pca <- PCA(X = df6, graph = FALSE)
print(pca)

## **Results for the Principal Component Analysis (PCA)**
## The analysis was performed on 65 individuals, described by 5 variables
## *The results are available in the following objects:
## 
##    name               description                          
## 1  "$eig"             "eigenvalues"                        
## 2  "$var"             "results for the variables"          
## 3  "$var$coord"       "coord. for the variables"           
## 4  "$var$cor"         "correlations variables - dimensions"
## 5  "$var$cos2"        "cos2 for the variables"             
## 6  "$var$contrib"     "contributions of the variables"     
## 7  "$ind"             "results for the individuals"        
## 8  "$ind$coord"       "coord. for the individuals"         
## 9  "$ind$cos2"        "cos2 for the individuals"           
## 10 "$ind$contrib"     "contributions of the individuals"   
## 11 "$call"            "summary statistics"                 
## 12 "$call$centre"     "mean of the variables"              
## 13 "$call$ecart.type" "standard error of the variables"    
## 14 "$call$row.w"      "weights for the individuals"        
## 15 "$call$col.w"      "weights for the variables"

Trích xuất giá trị eigenvalue

# Eigenvalue table of the PCA result
eig.val <- get_eigenvalue(pca)
print(eig.val)

##        eigenvalue variance.percent cumulative.variance.percent
## Dim.1 2.974304748       59.4860950                    59.48609
## Dim.2 1.661036931       33.2207386                    92.70683
## Dim.3 0.299409050        5.9881810                    98.69501
## Dim.4 0.058451339        1.1690268                    99.86404
## Dim.5 0.006797931        0.1359586                   100.00000

Vẽ biểu đồ scree

# Scree plot with value labels on each dimension; y axis fixed to 0-100
fviz_eig(
  pca,
  addlabels = TRUE,
  ylim = c(0, 100)
)

Xong bước này lưu biểu đồ dạng pdf với tên “Plot_5”

Vẽ biểu đồ thành phần chính PCA

# Plot of the individuals from the PCA, coloured by species
fviz_pca_ind(
  pca,
  geom.ind = "point",                            # draw points only, no text labels
  col.ind = df5[["species"]],                    # colour by the species column
  palette = c("#00AFBB", "#E7B800", "#FC4E07"),
  addEllipses = TRUE,                            # add concentration ellipses
  legend.title = "Groups"
)

Xong bước này lưu biểu đồ dạng pdf với tên “Plot_6”

—– Hết —–

DA_SHK44

Ngoc Nguyen

1/14/2022

Câu 2: Vẽ biểu đồ hộp và biểu diễn giá trị nhỏ nhất

Câu 3: Phân tích phương sai và phân tích hậu định:

Câu 4: Phân tích tương quan và phân tích thống kê

Câu 5: Phân tích thành phần chính PCA