#Việc 1 phân tích phương sai ## 1.1 Tạo dữ liệu

a = file.choose()
df = read.csv("D:\\CONG VIEC 2023 - DAU\\TAP HUAN\\2025.10 - NCKH\\Obesity data.csv")

NhomA <- c(8, 9, 11, 4, 7, 8, 5)
NhomB <- c(7, 17, 10, 14, 12, 24, 11, 22)
NhomC <- c(28, 21, 26, 11, 24, 19)
NhomD <- c(26, 16, 13, 12, 9, 10, 11, 17, 15)

##1.2 Mô tả cân nặng các nhóm

data <- data.frame(can_nang = c(NhomA, NhomB, NhomC, NhomD), nhom = rep(c("A", "B", "C", "D"), times = c(length(NhomA), length(NhomB), length(NhomC), length(NhomD))))
print(data)

##    can_nang nhom
## 1         8    A
## 2         9    A
## 3        11    A
## 4         4    A
## 5         7    A
## 6         8    A
## 7         5    A
## 8         7    B
## 9        17    B
## 10       10    B
## 11       14    B
## 12       12    B
## 13       24    B
## 14       11    B
## 15       22    B
## 16       28    C
## 17       21    C
## 18       26    C
## 19       11    C
## 20       24    C
## 21       19    C
## 22       26    D
## 23       16    D
## 24       13    D
## 25       12    D
## 26        9    D
## 27       10    D
## 28       11    D
## 29       17    D
## 30       15    D

1.3 Phân tích sự khác biệt về cân nặng giữa 4 nhóm. Diễn giải kết quả

tapply(data$can_nang, data$nhom, summary)

## $A
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   4.000   6.000   8.000   7.429   8.500  11.000 
## 
## $B
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    7.00   10.75   13.00   14.62   18.25   24.00 
## 
## $C
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    11.0    19.5    22.5    21.5    25.5    28.0 
## 
## $D
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    9.00   11.00   13.00   14.33   16.00   26.00

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

data %>% group_by(nhom) %>% summarize(so_luong = n(), trung_binh = mean(can_nang),trung_vi = median(can_nang), do_lech_chuan = sd(can_nang))

## # A tibble: 4 × 5
##   nhom  so_luong trung_binh trung_vi do_lech_chuan
##   <chr>    <int>      <dbl>    <dbl>         <dbl>
## 1 A            7       7.43      8            2.37
## 2 B            8      14.6      13            5.95
## 3 C            6      21.5      22.5          6.09
## 4 D            9      14.3      13            5.15

boxplot(can_nang ~ nhom, data = data,main = "Phân phối cân nặng theo 4 nhóm",xlab = "Nhóm",ylab = "Cân nặng")

library(table1)

## 
## Attaching package: 'table1'

## The following objects are masked from 'package:base':
## 
##     units, units<-

1.4 Phân tích hậu định

table1(~ weight + height, data = df)

	Overall (N=1217)
weight
Mean (SD)	55.1 (9.40)
Median [Min, Max]	54.0 [34.0, 95.0]
height
Mean (SD)	157 (7.98)
Median [Min, Max]	155 [136, 185]

plot(height ~ weight, data = df)

library(ggplot2)
ggplot(data = df, aes(x = weight, y = height)) + geom_point() + geom_smooth()

## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

cor.test(df$weight, df$height)

## 
##  Pearson's product-moment correlation
## 
## data:  df$weight and df$height
## t = 25.984, df = 1215, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.5602911 0.6326135
## sample estimates:
##       cor 
## 0.5976667

library(lessR)

## 
## lessR 4.4.5                         feedback: gerbing@pdx.edu 
## --------------------------------------------------------------
## > d <- Read("")  Read data file, many formats available, e.g., Excel
##   d is default data frame, data= in analysis routines optional
## 
## Many examples of reading, writing, and manipulating data, 
## graphics, testing means and proportions, regression, factor analysis,
## customization, forecasting, and aggregation from pivot tables
##   Enter: browseVignettes("lessR")
## 
## View lessR updates, now including time series forecasting
##   Enter: news(package="lessR")
## 
## Interactive data analysis
##   Enter: interact()
## 
## 
## Attaching package: 'lessR'
## 
## The following object is masked from 'package:table1':
## 
##     label
## 
## The following objects are masked from 'package:dplyr':
## 
##     order_by, recode, rename

Plot(height, pcfat, data = df, fit = "lm")

## 
## 
## >>> Suggestions  or  enter: style(suggest=FALSE)
## Plot(height, pcfat, enhance=TRUE)  # many options
## Plot(height, pcfat, fill="skyblue")  # interior fill color of points
## Plot(height, pcfat, MD_cut=6)  # Mahalanobis distance from center > 6 is an outlier 
## 
## 
## >>> Pearson's product-moment correlation 
##  
## Number of paired values with neither missing, n = 1217 
## Sample Correlation of height and pcfat: r = -0.480 
##   
## Hypothesis Test of 0 Correlation:  t = -19.063,  df = 1215,  p-value = 0.000 
## 95% Confidence Interval for Correlation:  -0.522 to -0.435 
##   
## 
##   Line: b0 = 99.311633    b1 = -0.432014    Linear Model MSE = 39.747926   Rsq = 0.230
##

cor.test(df$weight, df$pcfat)

## 
##  Pearson's product-moment correlation
## 
## data:  df$weight and df$pcfat
## t = 1.9745, df = 1215, p-value = 0.04855
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.0003640405 0.1123913870
## sample estimates:
##        cor 
## 0.05655573

Việc 2

2.1 Đọc dữ liệu “Demo data.csv” vào R và gọi dữ liệu là “df”

head(df)

##   id gender height weight  bmi age WBBMC wbbmd   fat  lean pcfat hypertension
## 1  1      F    150     49 21.8  53  1312  0.88 17802 28600  37.3            0
## 2  2      M    165     52 19.1  65  1309  0.84  8381 40229  16.8            1
## 3  3      F    157     57 23.1  64  1230  0.84 19221 36057  34.0            1
## 4  4      F    156     53 21.8  56  1171  0.80 17472 33094  33.8            1
## 5  5      M    160     51 19.9  54  1681  0.98  7336 40621  14.8            0
## 6  6      F    153     47 20.1  52  1358  0.91 14904 30068  32.2            1
##   diabetes
## 1        1
## 2        0
## 3        0
## 4        0
## 5        0
## 6        0

dim(df)

## [1] 1217   13

##2.2 Mô tả đặc điểm cân nặng (weight) và chiều cao (height).

library(table1)
table1(~ weight + height, data = df)

	Overall (N=1217)
weight
Mean (SD)	55.1 (9.40)
Median [Min, Max]	54.0 [34.0, 95.0]
height
Mean (SD)	157 (7.98)
Median [Min, Max]	155 [136, 185]

##2.3 Vẽ biểu đồ tán xạ đánh giá mối liên quan giữa cân nặng (weight) và chiều cao (height

plot(height ~ weight, data = df)

library(ggplot2)
ggplot(data = df, aes(x = weight, y = height)) + geom_point() + geom_smooth()

## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

##2.4 Tiến hành phân tích tương quan định lượng mối liên quan giữa chiều cao (height) và chiều cao (height).

cor.test(df$weight, df$height)

## 
##  Pearson's product-moment correlation
## 
## data:  df$weight and df$height
## t = 25.984, df = 1215, p-value < 0.00000000000000022
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.5602911 0.6326135
## sample estimates:
##       cor 
## 0.5976667

library(lessR)

##2.5 Tiến hành phân tích tương quan định lượng mối liên quan giữa chiều cao (height) và tỉ trọng mỡ (pcfat).

Plot(height, pcfat, data = df, fit = "lm")

## 
## 
## >>> Suggestions  or  enter: style(suggest=FALSE)
## Plot(height, pcfat, enhance=TRUE)  # many options
## Plot(height, pcfat, color="red")  # exterior edge color of points
## Plot(height, pcfat, MD_cut=6)  # Mahalanobis distance from center > 6 is an outlier 
## 
## 
## >>> Pearson's product-moment correlation 
##  
## Number of paired values with neither missing, n = 1217 
## Sample Correlation of height and pcfat: r = -0.480 
##   
## Hypothesis Test of 0 Correlation:  t = -19.063,  df = 1215,  p-value = 0.000 
## 95% Confidence Interval for Correlation:  -0.522 to -0.435 
##   
## 
##   Line: b0 = 99.311633    b1 = -0.432014    Linear Model MSE = 39.747926   Rsq = 0.230
##

cor.test(df$weight, df$pcfat)

## 
##  Pearson's product-moment correlation
## 
## data:  df$weight and df$pcfat
## t = 1.9745, df = 1215, p-value = 0.04855
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.0003640405 0.1123913870
## sample estimates:
##        cor 
## 0.05655573

##Việc 3 Hồi quy tuyến tính ##3.1 Đọc dữ liệu “gapminder” vào R từ gói lệnh gapminder. Tạo bộ dữ liệu “vn” gồm dữ liệu của Việt Nam

library(gapminder)
data(gapminder)
vn = subset(gapminder, country == "Vietnam")
library(lessR)
Plot(lifeExp, year, data = vn, xlab = "Life expectancy (years)", ylab = "Year")

## 
## >>> Suggestions  or  enter: style(suggest=FALSE)
## Plot(lifeExp, year, enhance=TRUE)  # many options
## Plot(lifeExp, year, fill="skyblue")  # interior fill color of points
## Plot(lifeExp, year, fit="lm", fit_se=c(.90,.99))  # fit line, stnd errors
## Plot(lifeExp, year, out_cut=.10)  # label top 10% from center as outliers 
## 
## 
## >>> Pearson's product-moment correlation 
##  
## Number of paired values with neither missing, n = 12 
## Sample Correlation of lifeExp and year: r = 0.995 
##   
## Hypothesis Test of 0 Correlation:  t = 30.569,  df = 10,  p-value = 0.000 
## 95% Confidence Interval for Correlation:  0.981 to 0.999 
##

##3.2 Thực hiện phân tích hồi qui tuyến tính để xác định xem mỗi năm, trong thời gian từ 1952-2007, tuổi thọ của người Việt Nam gia tăng như thế nào?

m.1 = lm(lifeExp ~ year, data = vn)
summary(m.1)

## 
## Call:
## lm(formula = lifeExp ~ year, data = vn)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.1884 -0.5840  0.1335  0.7396  1.7873 
## 
## Coefficients:
##                Estimate  Std. Error t value        Pr(>|t|)    
## (Intercept) -1271.98315    43.49240  -29.25 0.0000000000510 ***
## year            0.67162     0.02197   30.57 0.0000000000329 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.314 on 10 degrees of freedom
## Multiple R-squared:  0.9894, Adjusted R-squared:  0.9884 
## F-statistic: 934.5 on 1 and 10 DF,  p-value: 0.00000000003289

m.2 = Regression(lifeExp ~ year, data = vn)

m.2

## >>> Suggestion
## # Create an R markdown file for interpretative output with  Rmd = "file_name"
## Regression(my_formula=lifeExp ~ year, data=vn, Rmd="eg")  
## 
## 
##   BACKGROUND 
## 
## Data Frame:  vn 
##  
## Response Variable: lifeExp 
## Predictor Variable: year 
##  
## Number of cases (rows) of data:  12 
## Number of cases retained for analysis:  12 
## 
## 
##   BASIC ANALYSIS 
## 
##               Estimate    Std Err  t-value  p-value    Lower 95%    Upper 95% 
## (Intercept) -1271.9832    43.4924  -29.246    0.000   -1368.8903   -1175.0760 
##        year     0.6716     0.0220   30.569    0.000       0.6227       0.7206 
## 
## Standard deviation of lifeExp: 12.17233 
##  
## Standard deviation of residuals:  1.31365 for df=10 
## 95% range of residuals:  5.85399 = 2 * (2.228 * 1.31365) 
##  
## R-squared: 0.989    Adjusted R-squared: 0.988    PRESS R-squared: 0.984 
## 
## Null hypothesis of all 0 population slope coefficients:
##   F-statistic: 934.455     df: 1 and 10     p-value:  0.000 
## 
## -- Analysis of Variance 
##  
##             df     Sum Sq    Mean Sq   F-value   p-value 
## Model        1  1612.5653  1612.5653  934.4554     0.000 
## Residuals   10    17.2567     1.7257 
## lifeExp     11  1629.8221   148.1656 
## 
## 
##   K-FOLD CROSS-VALIDATION 
## 
## 
##   RELATIONS AMONG THE VARIABLES 
## 
##           lifeExp year 
##   lifeExp    1.00 0.99 
##      year    0.99 1.00 
## 
## 
##   RESIDUALS AND INFLUENCE 
## 
## -- Data, Fitted, Residual, Studentized Residual, Dffits, Cook's Distance 
##    [sorted by Cook's Distance] 
##    [n_res_rows = 12, out of 12 ] 
## ------------------------------------------------------------- 
##           year lifeExp  fitted   resid  rstdnt  dffits  cooks 
##   12      2007 74.2490 75.9489 -1.6999 -1.6742 -1.0827 0.4965 
##    1      1952 40.4120 39.0101  1.4019  1.3167  0.8515 0.3377 
##    5      1972 50.2540 52.4424 -2.1884 -2.0016 -0.6637 0.1694 
##    9      1992 67.6620 65.8747  1.7873  1.5563  0.5937 0.1543 
##   10      1997 70.6720 69.2328  1.4392  1.2327  0.5559 0.1469 
##    4      1967 47.8380 49.0843 -1.2463 -1.0172 -0.3880 0.0750 
##    2      1957 42.8870 42.3682  0.5188  0.4300  0.2316 0.0292 
##   11      2002 73.0170 72.5908  0.4262  0.3520  0.1896 0.0197 
##    3      1962 45.3630 45.7262 -0.3632 -0.2891 -0.1304 0.0094 
##    7      1982 58.8160 59.1585 -0.3425 -0.2596 -0.0792 0.0035 
##    8      1987 62.8200 62.5166  0.3034  0.2315  0.0768 0.0032 
##    6      1977 55.7640 55.8005 -0.0365 -0.0275 -0.0084 0.0000 
## 
## 
##   PREDICTION ERROR 
## 
## -- Data, Predicted, Standard Error of Prediction, 95% Prediction Intervals 
##    [sorted by lower bound of prediction interval] 
##  ---------------------------------------------- 
## 
##           year lifeExp    pred s_pred  pi.lwr  pi.upr  width 
##    1      1952 40.4120 39.0101 1.4948 35.6794 42.3408 6.6614 
##    2      1957 42.8870 42.3682 1.4539 39.1286 45.6077 6.4790 
##    3      1962 45.3630 45.7262 1.4203 42.5616 48.8909 6.3293 
##    4      1967 47.8380 49.0843 1.3946 45.9770 52.1917 6.2147 
##    5      1972 50.2540 52.4424 1.3772 49.3738 55.5109 6.1371 
##    6      1977 55.7640 55.8005 1.3684 52.7515 58.8494 6.0979 
##    7      1982 58.8160 59.1585 1.3684 56.1096 62.2075 6.0979 
##    8      1987 62.8200 62.5166 1.3772 59.4481 65.5852 6.1371 
##    9      1992 67.6620 65.8747 1.3946 62.7673 68.9820 6.2147 
##   10      1997 70.6720 69.2328 1.4203 66.0681 72.3974 6.3293 
##   11      2002 73.0170 72.5908 1.4539 69.3513 75.8304 6.4790 
##   12      2007 74.2490 75.9489 1.4948 72.6182 79.2796 6.6614 
## 
## ---------------------------------- 
## Plot 1: Distribution of Residuals 
## Plot 2: Residuals vs Fitted Values 
## ----------------------------------

##3.3 Kiểm tra các giả định của mô hình hồi qui tuyến tính

par(mfrow = c(2, 2))
plot(m.1)

library(ggfortify)
autoplot(m.1)

##3.5 Sử dụng ChatGPT viết code để đánh giá gia tăng tuổi thọ của người Việt Nam trong khoảng 1952-2007 bằng dữ liệu sau

# 1. Nhập dữ liệu
year <- c(1952, 1957, 1962, 1967, 1972, 1977, 1982, 1987, 1992, 1997, 2002, 2007)
lifeExp <- c(40.4, 42.9, 45.4, 47.8, 50.3, 55.8, 58.8, 62.8, 67.7, 70.7, 73.0, 74.2)

data <- data.frame(year, lifeExp)

# 2. Xem dữ liệu
print(data)

##    year lifeExp
## 1  1952    40.4
## 2  1957    42.9
## 3  1962    45.4
## 4  1967    47.8
## 5  1972    50.3
## 6  1977    55.8
## 7  1982    58.8
## 8  1987    62.8
## 9  1992    67.7
## 10 1997    70.7
## 11 2002    73.0
## 12 2007    74.2

# 3. Vẽ biểu đồ xu hướng tuổi thọ qua thời gian
plot(year, lifeExp, type = "b", pch = 19, col = "blue",
     xlab = "Năm", ylab = "Tuổi thọ trung bình",
     main = "Xu hướng gia tăng tuổi thọ (1952–2007)")

# 4. Hồi quy tuyến tính để đánh giá xu hướng
model <- lm(lifeExp ~ year, data = data)
summary(model)

## 
## Call:
## lm(formula = lifeExp ~ year, data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.1494 -0.5944  0.1387  0.7324  1.8268 
## 
## Coefficients:
##                Estimate  Std. Error t value        Pr(>|t|)    
## (Intercept) -1271.13492    43.77173  -29.04 0.0000000000547 ***
## year            0.67119     0.02211   30.35 0.0000000000353 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.322 on 10 degrees of freedom
## Multiple R-squared:  0.9893, Adjusted R-squared:  0.9882 
## F-statistic: 921.4 on 1 and 10 DF,  p-value: 0.00000000003527

# 5. Vẽ đường hồi quy lên biểu đồ
abline(model, col = "red", lwd = 2)

# 6. Dự đoán tuổi thọ trung bình năm 2007 và mức tăng trung bình mỗi năm
coef(model)

##   (Intercept)          year 
## -1271.1349184     0.6711888

increase_per_year <- coef(model)[2]
cat("Mức tăng tuổi thọ trung bình mỗi năm:", round(increase_per_year, 3), "năm\n")

## Mức tăng tuổi thọ trung bình mỗi năm: 0.671 năm

# 7. Tính tổng mức tăng giai đoạn 1952–2007
total_increase <- (2007 - 1952) * increase_per_year
cat("Tổng mức tăng tuổi thọ 1952–2007 (ước lượng):", round(total_increase, 1), "năm\n")

## Tổng mức tăng tuổi thọ 1952–2007 (ước lượng): 36.9 năm

predict(model, newdata = data.frame(year = 2012))

##        1 
## 79.29697

##Việc 4 ##4.1 Nhập liệu

Y  = c(12.1, 11.9, 10.2, 8.0, 7.7, 5.3, 7.9, 7.8, 5.5, 2.6)
X1 = c(0,  1,  2,  3,  4, 5, 6,  7,  8,  9)
X2 = c(7,  4,  4,  6,  4, 2, 1,  1,  1,  0)
df = data.frame(Y, X1, X2)
dim(df)

## [1] 10  3

head(df)

##      Y X1 X2
## 1 12.1  0  7
## 2 11.9  1  4
## 3 10.2  2  4
## 4  8.0  3  6
## 5  7.7  4  4
## 6  5.3  5  2

##4.2 Đánh giá mối liên quan giữa ethylene oxide (Y) và hoạt tính bạc của xúc tác (X1)

m.3 = lm(Y ~ X1, data = df)
summary(m.3)

## 
## Call:
## lm(formula = Y ~ X1, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.1606 -1.0735  0.1742  0.8621  2.0970 
## 
## Coefficients:
##             Estimate Std. Error t value    Pr(>|t|)    
## (Intercept)  11.8545     0.8283  14.312 0.000000554 ***
## X1           -0.8788     0.1552  -5.664    0.000474 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.409 on 8 degrees of freedom
## Multiple R-squared:  0.8004, Adjusted R-squared:  0.7755 
## F-statistic: 32.08 on 1 and 8 DF,  p-value: 0.0004737

##4.3 Đánh giá mối liên quan giữa ethylene oxide (Y) và thời gian lưu (X2)

m.4 = lm(Y ~ X2, data = df)
summary(m.4)

## 
## Call:
## lm(formula = Y ~ X2, data = df)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -2.702 -1.533 -0.034  1.667  3.066 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)   
## (Intercept)   5.0980     1.1222   4.543  0.00189 **
## X2            0.9340     0.2999   3.114  0.01436 * 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.121 on 8 degrees of freedom
## Multiple R-squared:  0.548,  Adjusted R-squared:  0.4915 
## F-statistic: 9.698 on 1 and 8 DF,  p-value: 0.01436

##4.4 Đánh giá mối liên quan độc lập giữa thời gian lưu (X2) và ethylene oxide (Y)

m.5 = lm(Y ~ X1 + X2, data = df)
summary(m.5)

## 
## Call:
## lm(formula = Y ~ X1 + X2, data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.46078 -0.33384  0.00026  0.81856  1.98476 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)   
## (Intercept)  14.7076     2.9785   4.938  0.00168 **
## X1           -1.2042     0.3614  -3.332  0.01255 * 
## X2           -0.4629     0.4642  -0.997  0.35187   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.41 on 7 degrees of freedom
## Multiple R-squared:  0.8252, Adjusted R-squared:  0.7753 
## F-statistic: 16.53 on 2 and 7 DF,  p-value: 0.002232

Bài tập ngày 3

Thaodtt

2025-10-23

1.3 Phân tích sự khác biệt về cân nặng giữa 4 nhóm. Diễn giải kết quả

1.4 Phân tích hậu định

Việc 2

2.1 Đọc dữ liệu “Demo data.csv” vào R và gọi dữ liệu là “df”