#THỰC HÀNH NGÀY 23-10-25

##VIỆC 1. PHÂN TÍCH PHƯƠNG SAI (ANOVA)

###1.1 Nhập dữ liệu về cân nặng của 4 nhóm A, B, C và D

# Weights recorded for each of the four groups; the groups differ in size.
A <- c(8, 9, 11, 4, 7, 8, 5)
B <- c(7, 17, 10, 14, 12, 24, 11, 22)
C <- c(28, 21, 26, 11, 24, 19)
D <- c(26, 16, 13, 12, 9, 10, 11, 17, 15)

# Long format: one row per measurement, with the label of the group it
# came from. Each label is repeated once per element of its vector so the
# labels stay aligned with the concatenated weights.
wt <- c(A, B, C, D)
group <- rep(
  c("A", "B", "C", "D"),
  times = c(length(A), length(B), length(C), length(D))
)
data <- data.frame(wt, group)

# Show the assembled data frame.
data
##    wt group
## 1   8     A
## 2   9     A
## 3  11     A
## 4   4     A
## 5   7     A
## 6   8     A
## 7   5     A
## 8   7     B
## 9  17     B
## 10 10     B
## 11 14     B
## 12 12     B
## 13 24     B
## 14 11     B
## 15 22     B
## 16 28     C
## 17 21     C
## 18 26     C
## 19 11     C
## 20 24     C
## 21 19     C
## 22 26     D
## 23 16     D
## 24 13     D
## 25 12     D
## 26  9     D
## 27 10     D
## 28 11     D
## 29 17     D
## 30 15     D

###1.2 Mô tả cân nặng giữa 4 nhóm.

library(table1)
## 
## Attaching package: 'table1'
## The following objects are masked from 'package:base':
## 
##     units, units<-
# Descriptive table of wt split by group, with two summary rows per
# variable: mean (SD) and median with first/third quartiles.
table1(
  ~ wt | group,
  data = data,
  render.continuous = c(. = "Mean (SD)", . = "Median [Q1, Q3]")
)
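|                 | A (N=7)           | B (N=8)           | C (N=6)           | D (N=9)           | Overall (N=30)    |
|-----------------|-------------------|-------------------|-------------------|-------------------|-------------------|
| **wt**          |                   |                   |                   |                   |                   |
| Mean (SD)       | 7.43 (2.37)       | 14.6 (5.95)       | 21.5 (6.09)       | 14.3 (5.15)       | 14.2 (6.75)       |
| Median [Q1, Q3] | 8.00 [6.00, 8.50] | 13.0 [10.8, 18.3] | 22.5 [19.5, 25.5] | 13.0 [11.0, 16.0] | 12.0 [9.25, 18.5] |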

###1.3 Phân tích sự khác biệt về cân nặng giữa 4 nhóm

# One-way analysis of variance of wt across the levels of group, followed
# by the ANOVA table (F test for the group effect).
av <- aov(formula = wt ~ group, data = data)
summary(av)
##             Df Sum Sq Mean Sq F value   Pr(>F)    
## group        3  642.3  214.09   8.197 0.000528 ***
## Residuals   26  679.1   26.12                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

###1.4 Thực hiện phân tích hậu định

# Post-hoc pairwise comparisons of the group means from the fitted ANOVA,
# using Tukey's method at the (default) 95% family-wise confidence level.
TukeyHSD(av, conf.level = 0.95)
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = wt ~ group, data = data)
## 
## $group
##           diff          lwr        upr     p adj
## B-A  7.1964286  -0.05969765 14.4525548 0.0525014
## C-A 14.0714286   6.27132726 21.8715299 0.0002134
## D-A  6.9047619  -0.16073856 13.9702624 0.0571911
## C-B  6.8750000  -0.69675602 14.4467560 0.0850381
## D-B -0.2916667  -7.10424368  6.5209103 0.9994049
## D-C -7.1666667 -14.55594392  0.2226106 0.0597131

##VIỆC 2. PHÂN TÍCH TƯƠNG QUAN

###2.1 Đọc dữ liệu “Demo data.csv” vào R và gọi dữ liệu là “df”

# Load the demo data set into `df` and preview the first rows.
# The file is read from a fixed path instead of going through
# file.choose(): the interactive dialog's return value was never used and
# it prevents the script from running non-interactively.
# NOTE(review): the absolute path is specific to one machine; adjust it (or
# switch to a path relative to the project) before running elsewhere.
df <- read.csv("C:\\Users\\ASUS\\Desktop\\Tap huan NCKH\\Demo.csv")
head(df)
##   X id age gender weight height pcfat
## 1 1  1  53      F     49    150  37.3
## 2 2  2  65      M     52    165  16.8
## 3 3  3  64      F     57    157  34.0
## 4 4  4  56      F     53    156  33.8
## 5 5  5  54      M     51    160  14.8
## 6 6  6  52      F     47    153  32.2

###2.2 Mô tả đặc điểm cân nặng (weight) và chiều cao (height).

library(table1)
# Descriptive table of weight and height for the whole sample
# (no stratifying variable, so only an overall column is produced).
table1(
  ~ weight + height,
  data = df
)
|                   | Overall (N=1217)  |
|-------------------|-------------------|
| **weight**        |                   |
| Mean (SD)         | 55.1 (9.40)       |
| Median [Min, Max] | 54.0 [34.0, 95.0] |
| **height**        |                   |
| Mean (SD)         | 157 (7.98)        |
| Median [Min, Max] | 155 [136, 185]    |

###2.3 Vẽ biểu đồ tán xạ đánh giá mối liên quan giữa cân nặng (weight) và chiều cao (height).

# Base-graphics scatter plot: height (y) against weight (x).
plot(height ~ weight, data = df)

library(ggplot2)
# The same scatter plot drawn with ggplot2, with a smoothed trend line
# added on top of the points (geom_smooth() picks its own default method).
ggplot(df, aes(x = weight, y = height)) +
  geom_point() +
  geom_smooth()
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

###2.4 Phân tích tương quan định lượng mối liên quan giữa cân nặng (weight) và chiều cao (height).

# Pearson correlation between weight and height, with a test of zero
# correlation and a 95% confidence interval.
cor.test(df$weight, df$height, method = "pearson")
## 
##  Pearson's product-moment correlation
## 
## data:  df$weight and df$height
## t = 25.984, df = 1215, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.5602911 0.6326135
## sample estimates:
##       cor 
## 0.5976667

###2.5 Phân tích tương quan định lượng mối liên quan giữa chiều cao (height) và tỉ trọng mỡ (pcfat).

# Pearson correlation between pcfat and height, with a test of zero
# correlation and a 95% confidence interval.
cor.test(df$pcfat, df$height, method = "pearson")
## 
##  Pearson's product-moment correlation
## 
## data:  df$pcfat and df$height
## t = -19.063, df = 1215, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.5219407 -0.4353664
## sample estimates:
##        cor 
## -0.4798206

##VIỆC 3. HỒI QUY TUYẾN TÍNH

###3.1 Đọc dữ liệu “gapminder” vào R từ gói lệnh gapminder.

library(gapminder)
# Load the gapminder data set and keep only the rows for Vietnam.
data("gapminder")
vn <- subset(gapminder, subset = country == "Vietnam")

# Preview the first six rows of the Vietnam subset.
head(vn, n = 6L)
## # A tibble: 6 × 6
##   country continent  year lifeExp      pop gdpPercap
##   <fct>   <fct>     <int>   <dbl>    <int>     <dbl>
## 1 Vietnam Asia       1952    40.4 26246839      605.
## 2 Vietnam Asia       1957    42.9 28998543      676.
## 3 Vietnam Asia       1962    45.4 33796140      772.
## 4 Vietnam Asia       1967    47.8 39463910      637.
## 5 Vietnam Asia       1972    50.3 44655014      700.
## 6 Vietnam Asia       1977    55.8 50533506      714.
library(lessR)
## 
## lessR 4.4.5                         feedback: gerbing@pdx.edu 
## --------------------------------------------------------------
## > d <- Read("")  Read data file, many formats available, e.g., Excel
##   d is default data frame, data= in analysis routines optional
## 
## Many examples of reading, writing, and manipulating data, 
## graphics, testing means and proportions, regression, factor analysis,
## customization, forecasting, and aggregation from pivot tables
##   Enter: browseVignettes("lessR")
## 
## View lessR updates, now including time series forecasting
##   Enter: news(package="lessR")
## 
## Interactive data analysis
##   Enter: interact()
## 
## Attaching package: 'lessR'
## The following object is masked from 'package:table1':
## 
##     label
# lessR scatter plot of the Vietnam subset; as the echoed output shows,
# Plot() also reports the Pearson correlation between the two variables.
# NOTE(review): lifeExp is placed on the x-axis here, whereas the regression
# later in this file models lifeExp as a function of year -- consider
# swapping the axes (and labels) so the predictor is on x.
Plot(
  lifeExp, year,
  data = vn,
  xlab = "Life expectancy (years)",
  ylab = "Year"
)

## 
## >>> Suggestions  or  enter: style(suggest=FALSE)
## Plot(lifeExp, year, enhance=TRUE)  # many options
## Plot(lifeExp, year, color="red")  # exterior edge color of points
## Plot(lifeExp, year, fit="lm", fit_se=c(.90,.99))  # fit line, stnd errors
## Plot(lifeExp, year, out_cut=.10)  # label top 10% from center as outliers 
## 
## 
## >>> Pearson's product-moment correlation 
##  
## Number of paired values with neither missing, n = 12 
## Sample Correlation of lifeExp and year: r = 0.995 
##   
## Hypothesis Test of 0 Correlation:  t = 30.569,  df = 10,  p-value = 0.000 
## 95% Confidence Interval for Correlation:  0.981 to 0.999 
## 

###3.2 Thực hiện phân tích hồi qui tuyến tính

# Simple linear regression of life expectancy on calendar year for the
# Vietnam subset, followed by the coefficient table and fit statistics.
m.1 <- lm(formula = lifeExp ~ year, data = vn)
summary(m.1)
## 
## Call:
## lm(formula = lifeExp ~ year, data = vn)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.1884 -0.5840  0.1335  0.7396  1.7873 
## 
## Coefficients:
##                Estimate  Std. Error t value        Pr(>|t|)    
## (Intercept) -1271.98315    43.49240  -29.25 0.0000000000510 ***
## year            0.67162     0.02197   30.57 0.0000000000329 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.314 on 10 degrees of freedom
## Multiple R-squared:  0.9894, Adjusted R-squared:  0.9884 
## F-statistic: 934.5 on 1 and 10 DF,  p-value: 0.00000000003289

###3.4 Viết phương trình

$\text{lifeExp} = -1272 + 0.67 \times \text{year}$

###3.5 Sử dụng ChatGPT

# 1️⃣ Nhập dữ liệu

# Hand-entered series: one observation every five years from 1952 to 2007,
# with life expectancy typed to one decimal place.
year <- seq(1952, 2007, by = 5)
lifeExp <- c(
  40.4, 42.9, 45.4, 47.8, 50.3, 55.8,
  58.8, 62.8, 67.7, 70.7, 73.0, 74.2
)

# NOTE(review): this overwrites the gapminder-derived `vn` created earlier
# in the file with the hand-entered values above.
vn <- data.frame(year, lifeExp)

2️⃣ Xem qua dữ liệu

# Print the full data frame to check the hand-entered values.
print(vn)
##    year lifeExp
## 1  1952    40.4
## 2  1957    42.9
## 3  1962    45.4
## 4  1967    47.8
## 5  1972    50.3
## 6  1977    55.8
## 7  1982    58.8
## 8  1987    62.8
## 9  1992    67.7
## 10 1997    70.7
## 11 2002    73.0
## 12 2007    74.2

3️⃣ Phân tích hồi quy tuyến tính

# Fit lifeExp as a linear function of year using the data frame `vn`,
# then report coefficients, residual summary and R-squared.
model <- lm(formula = lifeExp ~ year, data = vn)
summary(model)
## 
## Call:
## lm(formula = lifeExp ~ year, data = vn)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.1494 -0.5944  0.1387  0.7324  1.8268 
## 
## Coefficients:
##                Estimate  Std. Error t value        Pr(>|t|)    
## (Intercept) -1271.13492    43.77173  -29.04 0.0000000000547 ***
## year            0.67119     0.02211   30.35 0.0000000000353 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.322 on 10 degrees of freedom
## Multiple R-squared:  0.9893, Adjusted R-squared:  0.9882 
## F-statistic: 921.4 on 1 and 10 DF,  p-value: 0.00000000003527

##VIỆC 4. HỒI QUY ĐA BIẾN

###4.1 Nhập dữ liệu vào R

# Ten observations of the outcome Y and the two predictors X1 and X2.
Y <- c(12.1, 11.9, 10.2, 8.0, 7.7, 5.3, 7.9, 7.8, 5.5, 2.6)
X1 <- as.numeric(0:9)
X2 <- c(7, 4, 4, 6, 4, 2, 1, 1, 1, 0)

# NOTE(review): `df` is reused here and replaces the data frame read from
# the CSV file earlier in this file.
df <- data.frame(Y = Y, X1 = X1, X2 = X2)

# Preview the first six rows.
head(df, n = 6L)
##      Y X1 X2
## 1 12.1  0  7
## 2 11.9  1  4
## 3 10.2  2  4
## 4  8.0  3  6
## 5  7.7  4  4
## 6  5.3  5  2

###4.2 Đánh giá mối liên quan giữa Y và X1

# Simple regression of Y on X1 alone (unadjusted association).
model1 <- lm(formula = Y ~ X1, data = df)
summary(model1)
## 
## Call:
## lm(formula = Y ~ X1, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.1606 -1.0735  0.1742  0.8621  2.0970 
## 
## Coefficients:
##             Estimate Std. Error t value    Pr(>|t|)    
## (Intercept)  11.8545     0.8283  14.312 0.000000554 ***
## X1           -0.8788     0.1552  -5.664    0.000474 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.409 on 8 degrees of freedom
## Multiple R-squared:  0.8004, Adjusted R-squared:  0.7755 
## F-statistic: 32.08 on 1 and 8 DF,  p-value: 0.0004737

###4.3 Đánh giá mối liên quan giữa Y và X2

# Simple regression of Y on X2 alone (unadjusted association).
model2 <- lm(formula = Y ~ X2, data = df)
summary(model2)
## 
## Call:
## lm(formula = Y ~ X2, data = df)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -2.702 -1.533 -0.034  1.667  3.066 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)   
## (Intercept)   5.0980     1.1222   4.543  0.00189 **
## X2            0.9340     0.2999   3.114  0.01436 * 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.121 on 8 degrees of freedom
## Multiple R-squared:  0.548,  Adjusted R-squared:  0.4915 
## F-statistic: 9.698 on 1 and 8 DF,  p-value: 0.01436

###4.4 Đánh giá mối liên quan độc lập giữa X2 và Y

# Multiple regression of Y on X1 and X2 together, so that the X2
# coefficient is estimated with X1 held in the model.
model3 <- lm(formula = Y ~ X1 + X2, data = df)
summary(model3)
## 
## Call:
## lm(formula = Y ~ X1 + X2, data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.46078 -0.33384  0.00026  0.81856  1.98476 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)   
## (Intercept)  14.7076     2.9785   4.938  0.00168 **
## X1           -1.2042     0.3614  -3.332  0.01255 * 
## X2           -0.4629     0.4642  -0.997  0.35187   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.41 on 7 degrees of freedom
## Multiple R-squared:  0.8252, Adjusted R-squared:  0.7753 
## F-statistic: 16.53 on 2 and 7 DF,  p-value: 0.002232