library (ggplot2)
ggplot(data= fmh, aes(x= bmi, y= sysbp)) + geom_point()
## Warning: Removed 52 rows containing missing values (geom_point).
## 2.2. Kiểm tra mối tương quan giữa ‘bmi’ và ‘tot.chol’. Thể hiện mối
liên quan bằng biểu đồ tương quan (dùng ggplot).
ggplot(data= fmh, aes(x= bmi, y= tot.chol)) + geom_point()
## Warning: Removed 454 rows containing missing values (geom_point).
# 2.3. Thay đổi nội dung trục tung và trục hoành
ggplot(data= fmh, aes(x= bmi, y= sysbp)) + geom_point() + ylab("Systolic Blood Pressure") + xlab("Body Mass Index")
## Warning: Removed 52 rows containing missing values (geom_point).
# 2.4. Thay đổi màu của points
ggplot(data= fmh, aes(x= bmi, y= sysbp)) + geom_point(colour= 'blue') + ylab("Systolic Blood Pressure") + xlab("Body Mass Index")
## Warning: Removed 52 rows containing missing values (geom_point).
ggplot(data= fmh, aes(x= bmi, y= sysbp)) + geom_point(colour = alpha("blue", 1/5)) + ylab("Systolic Blood Pressure") + xlab("Body Mass Index")
## Warning: Removed 52 rows containing missing values (geom_point).
## 3. Kiểm tra mối tương quan giữa các biến ‘age’, ‘bmi’, ‘sysbp’,
‘diasbp’, ‘tot.chol’, và ‘heart.rate’. Hãy vẽ một biểu đồ tương quan đa
biến giữa các biến trên. #Pearson correlation value được mô tả bên
phải
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
ggpairs(fmh, columns = c(3, 4, 5, 6, 9, 12), aes(alpha = 0.5)) +theme_bw()
## Warning: Removed 409 rows containing non-finite values (stat_density).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 409 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 409 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 409 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 454 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 413 rows containing missing values
## Warning: Removed 409 rows containing missing values (geom_point).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 52 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 6 rows containing missing values
## Warning: Removed 409 rows containing missing values (geom_point).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 52 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 6 rows containing missing values
## Warning: Removed 409 rows containing missing values (geom_point).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 52 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 6 rows containing missing values
## Warning: Removed 454 rows containing missing values (geom_point).
## Warning: Removed 52 rows containing missing values (geom_point).
## Removed 52 rows containing missing values (geom_point).
## Removed 52 rows containing missing values (geom_point).
## Warning: Removed 52 rows containing non-finite values (stat_density).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 54 rows containing missing values
## Warning: Removed 413 rows containing missing values (geom_point).
## Warning: Removed 6 rows containing missing values (geom_point).
## Removed 6 rows containing missing values (geom_point).
## Removed 6 rows containing missing values (geom_point).
## Warning: Removed 54 rows containing missing values (geom_point).
## Warning: Removed 6 rows containing non-finite values (stat_density).
## 4.1 Đọc dữ liệu ‘Insurance dataset.xlsx’ vào R và gọi đối tượng là
‘ins’ (dùng hàm ‘read_excel’ trong package ‘readxl’)
library(readxl)
ins = read_excel("C:\\Users\\Hoang Truc\\Downloads\\Dataset for TDTU workshop 4-2022\\Insurance dataset.xlsx")
dim(ins)
## [1] 1338 7
summary(ins)
## age sex bmi children
## Min. :18.00 Length:1338 Min. :15.96 Min. :0.000
## 1st Qu.:27.00 Class :character 1st Qu.:26.30 1st Qu.:0.000
## Median :39.00 Mode :character Median :30.40 Median :1.000
## Mean :39.21 Mean :30.66 Mean :1.095
## 3rd Qu.:51.00 3rd Qu.:34.69 3rd Qu.:2.000
## Max. :64.00 Max. :53.13 Max. :5.000
## smoker region charge
## Length:1338 Length:1338 Min. : 1122
## Class :character Class :character 1st Qu.: 4740
## Mode :character Mode :character Median : 9382
## Mean :13270
## 3rd Qu.:16640
## Max. :63770
r= lm (ins$charge ~ ins$age)
summary (r)
##
## Call:
## lm(formula = ins$charge ~ ins$age)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8059 -6671 -5939 5440 47829
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3165.9 937.1 3.378 0.000751 ***
## ins$age 257.7 22.5 11.453 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 11560 on 1336 degrees of freedom
## Multiple R-squared: 0.08941, Adjusted R-squared: 0.08872
## F-statistic: 131.2 on 1 and 1336 DF, p-value: < 2.2e-16
mean(r$residuals^2)
## [1] 133440979
par(mfrow = c(2, 2))
plot (r)
## 5.1. Mô phỏng và lấy mẫu. Trong code dưới đây chúng ta sẽ mô phỏng
một dân số (population) gồm 100 000 người
N = 100000
id = 1:N
X = runif(N, min=0, max=10)
u = rnorm(N, mean=0, sd=2)
Y = -2 + 2.5*X + u
pop = data.frame(id, X, Y)
sample1 = dplyr::sample_n(pop, 100)
head(sample1)
## id X Y
## 1 53678 7.178636 18.393641
## 2 69993 4.351198 9.247884
## 3 74251 6.033277 10.750057
## 4 56109 4.083767 8.087592
## 5 58951 1.246673 7.014237
## 6 72023 5.515857 11.432426
summary(lm(Y ~ X, data=sample1))
##
## Call:
## lm(formula = Y ~ X, data = sample1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.6823 -1.5258 -0.1766 1.4296 5.8549
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.95231 0.46726 -4.178 6.39e-05 ***
## X 2.49593 0.07735 32.266 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.132 on 98 degrees of freedom
## Multiple R-squared: 0.914, Adjusted R-squared: 0.9131
## F-statistic: 1041 on 1 and 98 DF, p-value: < 2.2e-16
sample2 = dplyr::sample_n(pop, 500)
head(sample2)
## id X Y
## 1 66294 3.6277061 8.408156
## 2 72116 4.4223439 6.260127
## 3 97319 6.4497038 13.964986
## 4 72241 6.8397538 16.914497
## 5 32026 4.7992161 8.520737
## 6 58575 0.9958578 1.330217
summary(lm(Y ~ X, data=sample2))
##
## Call:
## lm(formula = Y ~ X, data = sample2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.7440 -1.3975 -0.0322 1.4072 5.5625
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.63940 0.17679 -9.273 <2e-16 ***
## X 2.43157 0.03129 77.720 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.025 on 498 degrees of freedom
## Multiple R-squared: 0.9238, Adjusted R-squared: 0.9237
## F-statistic: 6040 on 1 and 498 DF, p-value: < 2.2e-16
sample3 = dplyr::sample_n(pop, 1000)
head(sample3)
## id X Y
## 1 88346 0.1465202 -2.430933
## 2 8332 4.8072976 12.201159
## 3 35886 8.0488420 19.412311
## 4 25244 9.8694906 23.774630
## 5 18267 2.6609952 4.701066
## 6 49950 1.5564529 -1.846860
summary(lm(Y ~ X, data=sample3))
##
## Call:
## lm(formula = Y ~ X, data = sample3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.1710 -1.3762 -0.0063 1.3317 7.0288
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.83804 0.12646 -14.53 <2e-16 ***
## X 2.48585 0.02216 112.18 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.013 on 998 degrees of freedom
## Multiple R-squared: 0.9265, Adjusted R-squared: 0.9265
## F-statistic: 1.258e+04 on 1 and 998 DF, p-value: < 2.2e-16
Note that the echo = FALSE parameter was added to the
code chunk to prevent printing of the R code that generated the
plot.