#THỰC HÀNH NGÀY 23-10-25 ##VIỆC 1. PHÂN TÍCH TƯƠNG QUAN ###1.1 Nhập dữ liệu về cân nặng của 4 nhóm A, B, C và D
A = c( 8, 9, 11, 4, 7, 8, 5)
B = c(7, 17, 10, 14, 12, 24, 11, 22)
C = c(28, 21, 26, 11, 24, 19)
D = c(26, 16, 13, 12, 9, 10, 11, 17, 15)
wt = c(A, B, C, D)
group = c(rep("A", 7), rep("B", 8), rep("C" , 6), rep("D" , 9))
data = data.frame(wt, group)
data
## wt group
## 1 8 A
## 2 9 A
## 3 11 A
## 4 4 A
## 5 7 A
## 6 8 A
## 7 5 A
## 8 7 B
## 9 17 B
## 10 10 B
## 11 14 B
## 12 12 B
## 13 24 B
## 14 11 B
## 15 22 B
## 16 28 C
## 17 21 C
## 18 26 C
## 19 11 C
## 20 24 C
## 21 19 C
## 22 26 D
## 23 16 D
## 24 13 D
## 25 12 D
## 26 9 D
## 27 10 D
## 28 11 D
## 29 17 D
## 30 15 D
###1.2 Mô tả cân nặng giữa 4 nhóm.
library(table1)
##
## Attaching package: 'table1'
## The following objects are masked from 'package:base':
##
## units, units<-
table1(~ wt | group, data = data, render.continuous = c(. = "Mean (SD)", . = "Median [Q1, Q3]"))
| A (N=7) |
B (N=8) |
C (N=6) |
D (N=9) |
Overall (N=30) |
|
|---|---|---|---|---|---|
| wt | |||||
| Mean (SD) | 7.43 (2.37) | 14.6 (5.95) | 21.5 (6.09) | 14.3 (5.15) | 14.2 (6.75) |
| Median [Q1, Q3] | 8.00 [6.00, 8.50] | 13.0 [10.8, 18.3] | 22.5 [19.5, 25.5] | 13.0 [11.0, 16.0] | 12.0 [9.25, 18.5] |
###1.3 Phân tích sự khác biệt về cân nặng giữa 4 nhóm
av = aov(wt ~ group, data = data)
summary(av)
## Df Sum Sq Mean Sq F value Pr(>F)
## group 3 642.3 214.09 8.197 0.000528 ***
## Residuals 26 679.1 26.12
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
###1.4 Thực hiện phân tích hậu định
TukeyHSD(av)
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = wt ~ group, data = data)
##
## $group
## diff lwr upr p adj
## B-A 7.1964286 -0.05969765 14.4525548 0.0525014
## C-A 14.0714286 6.27132726 21.8715299 0.0002134
## D-A 6.9047619 -0.16073856 13.9702624 0.0571911
## C-B 6.8750000 -0.69675602 14.4467560 0.0850381
## D-B -0.2916667 -7.10424368 6.5209103 0.9994049
## D-C -7.1666667 -14.55594392 0.2226106 0.0597131
##VIỆC 2. PHÂN TÍCH TƯƠNG QUAN ###2.1 Đọc dữ liệu “Demo data.csv” vào R và gọi dữ liệu là “df”
file.choose()
## [1] "C:\\Users\\ASUS\\Desktop\\Tap huan NCKH\\thuc hanh 23-10-25 (Phan Ha).Rmd"
df = read.csv( "C:\\Users\\ASUS\\Desktop\\Tap huan NCKH\\Demo.csv")
head(df)
## X id age gender weight height pcfat
## 1 1 1 53 F 49 150 37.3
## 2 2 2 65 M 52 165 16.8
## 3 3 3 64 F 57 157 34.0
## 4 4 4 56 F 53 156 33.8
## 5 5 5 54 M 51 160 14.8
## 6 6 6 52 F 47 153 32.2
###2.2 Mô tả đặc điểm cân nặng (weight) và chiều cao (height).
library(table1)
table1(~ weight + height, data = df)
| Overall (N=1217) |
|
|---|---|
| weight | |
| Mean (SD) | 55.1 (9.40) |
| Median [Min, Max] | 54.0 [34.0, 95.0] |
| height | |
| Mean (SD) | 157 (7.98) |
| Median [Min, Max] | 155 [136, 185] |
###2.3 Vẽ biểu đồ tán xạ đánh giá mối liên quan giữa cân nặng (weight) và chiều cao (height
plot(height ~ weight, data = df)
library(ggplot2)
ggplot(data = df, aes(x = weight, y = height)) + geom_point() + geom_smooth()
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
###2.4 Phân tích tương quan định lượng mối liên quan giữa cân nặng (weight) và chiều cao (height).
cor.test(df$weight, df$height)
##
## Pearson's product-moment correlation
##
## data: df$weight and df$height
## t = 25.984, df = 1215, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.5602911 0.6326135
## sample estimates:
## cor
## 0.5976667
###2.5 Phân tích tương quan định lượng mối liên quan giữa chiều cao (height) và tỉ trọng mỡ (pcfat).
cor.test(df$pcfat, df$height)
##
## Pearson's product-moment correlation
##
## data: df$pcfat and df$height
## t = -19.063, df = 1215, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.5219407 -0.4353664
## sample estimates:
## cor
## -0.4798206
##VIỆC 3. HỒI QUY TUYẾN TÍNH ###3.1 Đọc dữ liệu “gapminder” vào R từ gói lệnh gapminder.
library(gapminder)
data(gapminder)
vn = subset(gapminder, country == "Vietnam")
head(vn)
## # A tibble: 6 × 6
## country continent year lifeExp pop gdpPercap
## <fct> <fct> <int> <dbl> <int> <dbl>
## 1 Vietnam Asia 1952 40.4 26246839 605.
## 2 Vietnam Asia 1957 42.9 28998543 676.
## 3 Vietnam Asia 1962 45.4 33796140 772.
## 4 Vietnam Asia 1967 47.8 39463910 637.
## 5 Vietnam Asia 1972 50.3 44655014 700.
## 6 Vietnam Asia 1977 55.8 50533506 714.
library(lessR)
##
## lessR 4.4.5 feedback: gerbing@pdx.edu
## --------------------------------------------------------------
## > d <- Read("") Read data file, many formats available, e.g., Excel
## d is default data frame, data= in analysis routines optional
##
## Many examples of reading, writing, and manipulating data,
## graphics, testing means and proportions, regression, factor analysis,
## customization, forecasting, and aggregation from pivot tables
## Enter: browseVignettes("lessR")
##
## View lessR updates, now including time series forecasting
## Enter: news(package="lessR")
##
## Interactive data analysis
## Enter: interact()
##
## Attaching package: 'lessR'
## The following object is masked from 'package:table1':
##
## label
Plot(lifeExp, year, data = vn, xlab = "Life expectancy (years)", ylab = "Year")
##
## >>> Suggestions or enter: style(suggest=FALSE)
## Plot(lifeExp, year, enhance=TRUE) # many options
## Plot(lifeExp, year, color="red") # exterior edge color of points
## Plot(lifeExp, year, fit="lm", fit_se=c(.90,.99)) # fit line, stnd errors
## Plot(lifeExp, year, out_cut=.10) # label top 10% from center as outliers
##
##
## >>> Pearson's product-moment correlation
##
## Number of paired values with neither missing, n = 12
## Sample Correlation of lifeExp and year: r = 0.995
##
## Hypothesis Test of 0 Correlation: t = 30.569, df = 10, p-value = 0.000
## 95% Confidence Interval for Correlation: 0.981 to 0.999
##
###3.2 Thực hiện phân tích hồi qui tuyến tính
m.1 = lm(lifeExp ~ year, data = vn)
summary(m.1)
##
## Call:
## lm(formula = lifeExp ~ year, data = vn)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.1884 -0.5840 0.1335 0.7396 1.7873
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1271.98315 43.49240 -29.25 0.0000000000510 ***
## year 0.67162 0.02197 30.57 0.0000000000329 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.314 on 10 degrees of freedom
## Multiple R-squared: 0.9894, Adjusted R-squared: 0.9884
## F-statistic: 934.5 on 1 and 10 DF, p-value: 0.00000000003289
###3.4 Viết phương trình
Life Exp = -1272 + 0.67*year ###3.5 Sử dụng ChatGPT # 1️⃣ Nhập dữ liệu
year <- c(1952, 1957, 1962, 1967, 1972, 1977, 1982, 1987, 1992, 1997, 2002, 2007)
lifeExp <- c(40.4, 42.9, 45.4, 47.8, 50.3, 55.8, 58.8, 62.8, 67.7, 70.7, 73.0, 74.2)
vn <- data.frame(year, lifeExp)
print(vn)
## year lifeExp
## 1 1952 40.4
## 2 1957 42.9
## 3 1962 45.4
## 4 1967 47.8
## 5 1972 50.3
## 6 1977 55.8
## 7 1982 58.8
## 8 1987 62.8
## 9 1992 67.7
## 10 1997 70.7
## 11 2002 73.0
## 12 2007 74.2
model <- lm(lifeExp ~ year, data = vn)
summary(model)
##
## Call:
## lm(formula = lifeExp ~ year, data = vn)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.1494 -0.5944 0.1387 0.7324 1.8268
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1271.13492 43.77173 -29.04 0.0000000000547 ***
## year 0.67119 0.02211 30.35 0.0000000000353 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.322 on 10 degrees of freedom
## Multiple R-squared: 0.9893, Adjusted R-squared: 0.9882
## F-statistic: 921.4 on 1 and 10 DF, p-value: 0.00000000003527
##VIỆC 4. HỒI QUY ĐA BIÊN ###4.1 Nhập dữ liệu vào R
Y = c(12.1, 11.9, 10.2, 8.0, 7.7, 5.3, 7.9, 7.8, 5.5, 2.6)
X1 = c(0, 1, 2, 3, 4, 5, 6, 7, 8, 9)
X2 = c(7, 4, 4, 6, 4, 2, 1, 1, 1, 0)
df = data.frame(Y, X1, X2)
head(df)
## Y X1 X2
## 1 12.1 0 7
## 2 11.9 1 4
## 3 10.2 2 4
## 4 8.0 3 6
## 5 7.7 4 4
## 6 5.3 5 2
###4.2 Đánh giá mối liên quan giữa Y và X1
model1 = lm(Y ~ X1, data = df)
summary(model1)
##
## Call:
## lm(formula = Y ~ X1, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.1606 -1.0735 0.1742 0.8621 2.0970
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 11.8545 0.8283 14.312 0.000000554 ***
## X1 -0.8788 0.1552 -5.664 0.000474 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.409 on 8 degrees of freedom
## Multiple R-squared: 0.8004, Adjusted R-squared: 0.7755
## F-statistic: 32.08 on 1 and 8 DF, p-value: 0.0004737
###4.3 Đánh giá mối liên quan giữa Y và X2
model2 = lm(Y ~ X2, data = df)
summary(model2)
##
## Call:
## lm(formula = Y ~ X2, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.702 -1.533 -0.034 1.667 3.066
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.0980 1.1222 4.543 0.00189 **
## X2 0.9340 0.2999 3.114 0.01436 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.121 on 8 degrees of freedom
## Multiple R-squared: 0.548, Adjusted R-squared: 0.4915
## F-statistic: 9.698 on 1 and 8 DF, p-value: 0.01436
###4.4 Đánh giá mối liên quan độc lập giữa X2 và Y
model3 = lm(Y ~ X1 + X2, data = df)
summary(model3)
##
## Call:
## lm(formula = Y ~ X1 + X2, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.46078 -0.33384 0.00026 0.81856 1.98476
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 14.7076 2.9785 4.938 0.00168 **
## X1 -1.2042 0.3614 -3.332 0.01255 *
## X2 -0.4629 0.4642 -0.997 0.35187
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.41 on 7 degrees of freedom
## Multiple R-squared: 0.8252, Adjusted R-squared: 0.7753
## F-statistic: 16.53 on 2 and 7 DF, p-value: 0.002232