library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(reshape2)
weight <- c(8, 9, 11, 4, 7, 8, 5, 7, 17, 10, 14, 12, 24, 11, 22, 28, 21, 26, 11, 24, 19, 26, 16, 13, 12, 9, 10, 11, 17, 15)
group <- factor(c(rep("A", 7), rep("B", 8), rep("C", 6), rep("D", 9)))
data <- data.frame(weight, group)
head(data)
## weight group
## 1 8 A
## 2 9 A
## 3 11 A
## 4 4 A
## 5 7 A
## 6 8 A
ggplot(data, aes(x = group, y = weight, fill = group)) +
geom_boxplot() + # Biểu đồ hộp
geom_jitter(width = 0.1, alpha = 0.6) + # Thêm các điểm dữ liệu thô để thấy sự phân bố
labs(
title = "So sánh Cân nặng giữa 4 nhóm",
x = "Nhóm",
y = "Cân nặng (kg)"
) +
theme_minimal()
library(dplyr)
data %>%
group_by(group) %>%
summarise(
n = n(),
mean_weight = mean(weight),
sd_weight = sd(weight),
min_weight = min(weight),
max_weight = max(weight)
)
## # A tibble: 4 × 6
## group n mean_weight sd_weight min_weight max_weight
## <fct> <int> <dbl> <dbl> <dbl> <dbl>
## 1 A 7 7.43 2.37 4 11
## 2 B 8 14.6 5.95 7 24
## 3 C 6 21.5 6.09 11 28
## 4 D 9 14.3 5.15 9 26
aov_model <- aov(weight ~ group, data = data)
summary(aov_model)
## Df Sum Sq Mean Sq F value Pr(>F)
## group 3 642.3 214.09 8.197 0.000528 ***
## Residuals 26 679.1 26.12
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
TukeyHSD(aov_model)
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = weight ~ group, data = data)
##
## $group
## diff lwr upr p adj
## B-A 7.1964286 -0.05969765 14.4525548 0.0525014
## C-A 14.0714286 6.27132726 21.8715299 0.0002134
## D-A 6.9047619 -0.16073856 13.9702624 0.0571911
## C-B 6.8750000 -0.69675602 14.4467560 0.0850381
## D-B -0.2916667 -7.10424368 6.5209103 0.9994049
## D-C -7.1666667 -14.55594392 0.2226106 0.0597131
df <- read.csv("Obesity data.csv")
str(df)
## 'data.frame': 1217 obs. of 13 variables:
## $ id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ gender : chr "F" "M" "F" "F" ...
## $ height : int 150 165 157 156 160 153 155 167 165 158 ...
## $ weight : int 49 52 57 53 51 47 58 65 54 60 ...
## $ bmi : num 21.8 19.1 23.1 21.8 19.9 20.1 24.1 23.3 19.8 24 ...
## $ age : int 53 65 64 56 54 52 66 50 61 58 ...
## $ WBBMC : int 1312 1309 1230 1171 1681 1358 1546 2276 1778 1404 ...
## $ wbbmd : num 0.88 0.84 0.84 0.8 0.98 0.91 0.96 1.11 0.96 0.86 ...
## $ fat : int 17802 8381 19221 17472 7336 14904 20233 17749 10795 21365 ...
## $ lean : int 28600 40229 36057 33094 40621 30068 35599 43301 38613 35534 ...
## $ pcfat : num 37.3 16.8 34 33.8 14.8 32.2 35.3 28 21.1 36.6 ...
## $ hypertension: int 0 1 1 1 0 1 1 1 0 1 ...
## $ diabetes : int 1 0 0 0 0 0 1 1 0 0 ...
summary(df$weight)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 34.00 49.00 54.00 55.14 61.00 95.00
summary(df$height)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 136.0 151.0 155.0 156.7 162.0 185.0
plot(df$height, df$weight,
main = "Biểu đồ Tán xạ: Cân nặng theo Chiều cao",
xlab = "Chiều cao (height - cm)",
ylab = "Cân nặng (weight - kg)",
pch = 19, col = "blue")
ggplot(df, aes(x = height, y = weight, color = gender)) +
geom_point(alpha = 0.7, size = 3) + # Thêm các điểm dữ liệu, tăng kích thước điểm
geom_smooth(method = "lm", se = FALSE, linetype = "dashed") + # Thêm đường hồi quy cho TỔNG THỂ
# Thêm đường hồi quy riêng cho từng nhóm giới tính
geom_smooth(aes(group = gender), method = "lm", se = FALSE, linetype = "solid") +
labs(
title = "Mối liên quan giữa Cân nặng và Chiều cao theo Giới tính",
x = "Chiều cao (height, cm)",
y = "Cân nặng (weight, kg)",
color = "Giới tính" # Đặt lại tên chú giải màu sắc
) +
scale_color_manual(values = c("F" = "darkorange", "M" = "darkblue")) + # Tự chọn màu sắc
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5)) # Căn giữa tiêu đề
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
cor.test(df$weight, df$height, method = "pearson")
##
## Pearson's product-moment correlation
##
## data: df$weight and df$height
## t = 25.984, df = 1215, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.5602911 0.6326135
## sample estimates:
## cor
## 0.5976667
cor.test(df$height, df$pcfat, method = "pearson")
##
## Pearson's product-moment correlation
##
## data: df$height and df$pcfat
## t = -19.063, df = 1215, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.5219407 -0.4353664
## sample estimates:
## cor
## -0.4798206
vn <- data.frame(
year = c(1952, 1957, 1962, 1967, 1972, 1977, 1982, 1987, 1992, 1997, 2002, 2007),
lifeExp = c(40.4, 42.9, 45.4, 47.8, 50.3, 55.8, 58.8, 62.8, 67.7, 70.7, 73.0, 74.2)
)
head(vn)
## year lifeExp
## 1 1952 40.4
## 2 1957 42.9
## 3 1962 45.4
## 4 1967 47.8
## 5 1972 50.3
## 6 1977 55.8
model_vn <- lm(lifeExp ~ year, data = vn)
summary(model_vn)
##
## Call:
## lm(formula = lifeExp ~ year, data = vn)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.1494 -0.5944 0.1387 0.7324 1.8268
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.271e+03 4.377e+01 -29.04 5.47e-11 ***
## year 6.712e-01 2.211e-02 30.35 3.53e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.322 on 10 degrees of freedom
## Multiple R-squared: 0.9893, Adjusted R-squared: 0.9882
## F-statistic: 921.4 on 1 and 10 DF, p-value: 3.527e-11
par(mfrow = c(2, 2)) # Chia cửa sổ đồ họa thành 2x2
plot(model_vn) # HIỂN THỊ 4 BIỂU ĐỒ CHẨN ĐOÁN
par(mfrow = c(1, 1)) # Đặt lại bố cục mặc định
summary(model_vn)
##
## Call:
## lm(formula = lifeExp ~ year, data = vn)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.1494 -0.5944 0.1387 0.7324 1.8268
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.271e+03 4.377e+01 -29.04 5.47e-11 ***
## year 6.712e-01 2.211e-02 30.35 3.53e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.322 on 10 degrees of freedom
## Multiple R-squared: 0.9893, Adjusted R-squared: 0.9882
## F-statistic: 921.4 on 1 and 10 DF, p-value: 3.527e-11
year_data <- c(1952, 1957, 1962, 1967, 1972, 1977, 1982, 1987, 1992, 1997, 2002, 2007)
lifeExp_data <- c(40.4, 42.9, 45.4, 47.8, 50.3, 55.8, 58.8, 62.8, 67.7, 70.7, 73.0, 74.2)
vn_35 <- data.frame(year = year_data, lifeExp = lifeExp_data)
model_vn_35 <- lm(lifeExp ~ year, data = vn_35)
summary(model_vn_35)
##
## Call:
## lm(formula = lifeExp ~ year, data = vn_35)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.1494 -0.5944 0.1387 0.7324 1.8268
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.271e+03 4.377e+01 -29.04 5.47e-11 ***
## year 6.712e-01 2.211e-02 30.35 3.53e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.322 on 10 degrees of freedom
## Multiple R-squared: 0.9893, Adjusted R-squared: 0.9882
## F-statistic: 921.4 on 1 and 10 DF, p-value: 3.527e-11
df_mreg <- data.frame(
Y = c(12.1, 11.9, 10.2, 8.0, 7.7, 5.3, 7.9, 7.8, 5.5, 2.6), # Ethylene oxide
X1 = c(0, 1, 2, 3, 4, 5, 6, 7, 8, 9), # Hoạt tính bạc
X2 = c(7, 4, 4, 6, 4, 2, 1, 1, 1, 0) # Thời gian lưu
)
cor.test(df_mreg$Y, df_mreg$X1) # Tương quan mạnh và âm
##
## Pearson's product-moment correlation
##
## data: df_mreg$Y and df_mreg$X1
## t = -5.664, df = 8, p-value = 0.0004737
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.9750413 -0.6068801
## sample estimates:
## cor
## -0.8946527
cor.test(df_mreg$Y, df_mreg$X2) # Tương quan mạnh và dương
##
## Pearson's product-moment correlation
##
## data: df_mreg$Y and df_mreg$X2
## t = 3.1141, df = 8, p-value = 0.01436
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.2071808 0.9343783
## sample estimates:
## cor
## 0.7402448
model_mreg <- lm(Y ~ X1 + X2, data = df_mreg)
summary(model_mreg)
##
## Call:
## lm(formula = Y ~ X1 + X2, data = df_mreg)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.46078 -0.33384 0.00026 0.81856 1.98476
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 14.7076 2.9785 4.938 0.00168 **
## X1 -1.2042 0.3614 -3.332 0.01255 *
## X2 -0.4629 0.4642 -0.997 0.35187
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.41 on 7 degrees of freedom
## Multiple R-squared: 0.8252, Adjusted R-squared: 0.7753
## F-statistic: 16.53 on 2 and 7 DF, p-value: 0.002232
cor_matrix <- cor(df_mreg)
melted_cor <- melt(cor_matrix)
ggplot(melted_cor, aes(x = Var1, y = Var2, fill = value)) +
geom_tile(color = "white") +
scale_fill_gradient2(low = "blue", high = "red", mid = "white",
midpoint = 0, limit = c(-1,1), space = "Lab",
name="Hệ số\nTương quan") +
geom_text(aes(label = sprintf("%.2f", value)), color = "black", size = 4) +
labs(title = "Heatmap Ma trận Tương quan (Y, X1, X2)") +
theme_minimal() +
coord_fixed()