Thực hành ngày 5
Mô hình hồi qui tuyến tính
Học viên Đặng Bảo Đăng - Mã số R1F019
library(ggplot2); library(tidyverse); library(ggfortify); library(ggthemes)
Việc 1: Đọc dữ liệu “Insurance dataset.xlsx”
ins = readxl::read_excel("D:/Downloads/tailieu/R course/Seminar TDT 2022/Tai lieu/Data set/Insurance dataset.xlsx")
line.data = data.frame(xintercept = c(median(ins$charge), mean(ins$charge)), Lines = factor(c("median", "mean"), levels = c("median", "mean")), color = c("blue", "red"))
ins %>% ggplot(aes(x=charge)) + geom_histogram(aes(y=..density..), col = "black", fill = "white") + geom_density(col="black", fill="blue", alpha=0.25) + geom_vline(data=line.data, aes(xintercept = xintercept, color= Lines), linetype="dashed", size=1) + scale_colour_manual(name="statistics", values = line.data$color)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot (data= ins, aes(x= charge)) + geom_histogram (aes(y= ..density..), binwidth = 1000, color= 'black', fill= 'white') + geom_density(alpha= 0.2, fill= 'blue') +
geom_vline(aes(xintercept= median(ins$charge), color="median"), linetype="dashed", size=1) +
geom_vline(aes(xintercept= mean(ins$charge), color="mean"), linetype="dashed", size=1) +
scale_color_manual(name = "statistics", values = c(median = "blue", mean = "red"))
## Warning: Use of `ins$charge` is discouraged. Use `charge` instead.
## Warning: Use of `ins$charge` is discouraged. Use `charge` instead.

ins %>% ggplot(aes(x=region, y=charge, fill=region)) + geom_bar(stat="identity") + theme(legend.position = "none")

ggplot(data= ins, aes(x= reorder(region, charge), y= charge, fill= region)) + geom_bar(stat = 'identity') + theme(legend.position='none')

ins %>% ggplot((aes(x=region, y=charge, fill = region, col = region))) + xlab("Region") + ylab("Charge") + theme(legend.position = "none") + geom_violin(trim=FALSE) + geom_boxplot(col="black", width=0.1)

fit1 = lm(charge ~ region, data=ins)
summary(fit1)
##
## Call:
## lm(formula = charge ~ region, data = ins)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13614 -8463 -3793 3385 49035
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 13406.4 671.3 19.971 <2e-16 ***
## regionnorthwest -988.8 948.6 -1.042 0.297
## regionsoutheast 1329.0 922.9 1.440 0.150
## regionsouthwest -1059.4 948.6 -1.117 0.264
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 12080 on 1334 degrees of freedom
## Multiple R-squared: 0.006634, Adjusted R-squared: 0.0044
## F-statistic: 2.97 on 3 and 1334 DF, p-value: 0.03089
par(mfrow = c(2, 2))
plot(fit1)

autoplot(fit1)

fit2 = lm(charge ~ smoker, data=ins)
ins %>% ggplot(aes(x=reorder(smoker, charge), y=charge, fill=smoker)) + geom_bar(stat="identity") + theme(legend.position = "none")

ins %>% ggplot((aes(x=smoker, y=charge, fill = smoker, col = smoker))) + xlab("Smoker") + ylab("Charge") + theme(legend.position = "none") + geom_violin(trim=FALSE) + geom_boxplot(col="black", width=0.1)

ins %>% ggplot(aes(x=sex, y=charge, fill = sex, col = sex, alpha=0.001)) + xlab("Sex") + ylab("Charge") + geom_violin(trim=FALSE) + geom_boxplot(col="black", width=0.1, alpha=0.5) + theme_stata() + theme(legend.position = "none")

Việc 2: Phân tích các mô hình sau đây:
model1 = lm(charge ~ sex, data=ins)
summary(model1)
##
## Call:
## lm(formula = charge ~ sex, data = ins)
##
## Residuals:
## Min 1Q Median 3Q Max
## -12835 -8435 -3980 3476 51201
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 12569.6 470.1 26.740 <2e-16 ***
## sexmale 1387.2 661.3 2.098 0.0361 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 12090 on 1336 degrees of freedom
## Multiple R-squared: 0.003282, Adjusted R-squared: 0.002536
## F-statistic: 4.4 on 1 and 1336 DF, p-value: 0.03613
model2 = lm(charge ~ sex + smoker, data=ins)
summary(model2)
##
## Call:
## lm(formula = charge ~ sex + smoker, data = ins)
##
## Residuals:
## Min 1Q Median 3Q Max
## -19193 -5074 -909 3739 31682
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8466.04 303.54 27.89 <2e-16 ***
## sexmale -65.38 409.81 -0.16 0.873
## smokeryes 23622.13 507.74 46.52 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7473 on 1335 degrees of freedom
## Multiple R-squared: 0.6198, Adjusted R-squared: 0.6192
## F-statistic: 1088 on 2 and 1335 DF, p-value: < 2.2e-16
model3 = lm(charge ~ sex*smoker, data=ins)
summary(model3)
##
## Call:
## lm(formula = charge ~ sex * smoker, data = ins)
##
## Residuals:
## Min 1Q Median 3Q Max
## -20213 -5166 -920 3655 33091
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8762.3 318.6 27.504 < 2e-16 ***
## sexmale -675.1 457.0 -1.477 0.13988
## smokeryes 21916.7 764.4 28.673 < 2e-16 ***
## sexmale:smokeryes 3038.1 1020.2 2.978 0.00295 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7451 on 1334 degrees of freedom
## Multiple R-squared: 0.6223, Adjusted R-squared: 0.6214
## F-statistic: 732.6 on 3 and 1334 DF, p-value: < 2.2e-16
Việc 4: Đọc dữ liệu “SAT scores dataset.xlsx”
sat = readxl::read_excel("D:/Downloads/tailieu/R course/Seminar TDT 2022/Tai lieu/Data set/SAT scores dataset.xlsx")
library(GGally); library(gridExtra)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
ggpairs(sat, columns = c(3:7, 2), mapping=aes(alpha=0.5))

library(psych)
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
dat1 = sat %>% select(3:7, 2)
pairs.panels(dat1)

sat$s.hsm = scale(sat$hsm)[,1]
sat$s.hss = scale(sat$hss)[,1]
sat$s.hse = scale(sat$hse)[,1]
sat$s.satm = scale(sat$satm)[,1]
sat$s.satv = scale(sat$satv)[,1]
ggpairs(sat, column = c(9:13, 2), mapping=aes(alpha=0.5))

dat1 = sat %>% select(9:13, 2)
pairs.panels(dat1)

summary(lm(gpa ~ s.hsm, data=sat))
##
## Call:
## lm(formula = gpa ~ s.hsm, data = sat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.12610 -0.36370 0.08651 0.46351 1.63911
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.63522 0.04696 56.119 < 2e-16 ***
## s.hsm 0.34020 0.04706 7.229 7.77e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7028 on 222 degrees of freedom
## Multiple R-squared: 0.1905, Adjusted R-squared: 0.1869
## F-statistic: 52.25 on 1 and 222 DF, p-value: 7.774e-12
summary(lm(gpa ~ s.satm, data=sat))
##
## Call:
## lm(formula = gpa ~ s.satm, data = sat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.59405 -0.37876 0.08366 0.55984 1.39948
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.63522 0.05051 52.170 < 2e-16 ***
## s.satm 0.19618 0.05063 3.875 0.00014 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.756 on 222 degrees of freedom
## Multiple R-squared: 0.06336, Adjusted R-squared: 0.05914
## F-statistic: 15.02 on 1 and 222 DF, p-value: 0.0001402
summary(lm(gpa ~ s.hsm + s.satm, data=sat))
##
## Call:
## lm(formula = gpa ~ s.hsm + s.satm, data = sat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.1435 -0.3292 0.0793 0.4591 1.6169
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.63522 0.04696 56.118 < 2e-16 ***
## s.hsm 0.31628 0.05281 5.990 8.45e-09 ***
## s.satm 0.05275 0.05281 0.999 0.319
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7028 on 221 degrees of freedom
## Multiple R-squared: 0.1942, Adjusted R-squared: 0.1869
## F-statistic: 26.63 on 2 and 221 DF, p-value: 4.365e-11
Việc 5: Tìm mô hình tối ưu
library(BMA)
## Loading required package: survival
## Warning: package 'survival' was built under R version 4.0.5
## Loading required package: leaps
## Loading required package: robustbase
##
## Attaching package: 'robustbase'
## The following object is masked from 'package:survival':
##
## heart
## Loading required package: inline
## Loading required package: rrcov
## Scalable Robust Estimators with High Breakdown Point (version 1.5-5)
x = sat[, c("s.hsm", "s.hss", "s.hse", "s.satm", "s.satv", "sex")]
y = sat$gpa
r = bicreg(x = x, y = y, strict = FALSE, OR = 20)
summary(r)
##
## Call:
## bicreg(x = x, y = y, strict = FALSE, OR = 20)
##
##
## 6 models were selected
## Best 5 models (cumulative posterior probability = 0.9614 ):
##
## p!=0 EV SD model 1 model 2 model 3
## Intercept 100.0 2.6338185 0.054198 2.63522 2.63522 2.63522
## s.hsm 100.0 0.3245713 0.054157 0.34020 0.29932 0.28777
## s.hss 13.2 0.0120537 0.037257 . . 0.09109
## s.hse 17.3 0.0157980 0.040856 . 0.09148 .
## s.satm 6.2 0.0032458 0.018228 . . .
## s.satv 3.9 0.0005734 0.009958 . . .
## sex 3.9 0.0010385 0.020071 . . .
##
## nVar 1 2 2
## r2 0.191 0.202 0.200
## BIC -41.93648 -39.59809 -39.06568
## post prob 0.556 0.173 0.132
## model 4 model 5
## Intercept 2.63522 2.63522
## s.hsm 0.31628 0.33695
## s.hss . .
## s.hse . .
## s.satm 0.05275 .
## s.satv . 0.01473
## sex . .
##
## nVar 2 2
## r2 0.194 0.191
## BIC -37.53438 -36.61894
## post prob 0.062 0.039
imageplot.bma(r)

model = lm (gpa ~ s.hsm + s.hss + s.hse + s.satm + s.satv, data=sat)
library(relaimpo)
## Loading required package: MASS
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
## Loading required package: boot
##
## Attaching package: 'boot'
## The following object is masked from 'package:robustbase':
##
## salinity
## The following object is masked from 'package:survival':
##
## aml
## The following object is masked from 'package:psych':
##
## logit
## Loading required package: survey
## Loading required package: grid
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
##
## Attaching package: 'survey'
## The following object is masked from 'package:graphics':
##
## dotchart
## Loading required package: mitools
## This is the global version of package relaimpo.
## If you are a non-US user, a version with the interesting additional metric pmvd is available
## from Ulrike Groempings web site at prof.beuth-hochschule.de/groemping.
metrics = calc.relimp(model, type=c("lmg"))
metrics
## Response variable: gpa
## Total response variance: 0.6074565
## Analysis based on 224 observations
##
## 5 Regressors:
## s.hsm s.hss s.hse s.satm s.satv
## Proportion of variance explained by model: 21.15%
## Metrics are not normalized (rela=FALSE).
##
## Relative importance metrics:
##
## lmg
## s.hsm 0.105941155
## s.hss 0.040887708
## s.hse 0.033054100
## s.satm 0.027854805
## s.satv 0.003712514
##
## Average coefficients for different model sizes:
##
## 1X 2Xs 3Xs 4Xs 5Xs
## s.hsm 0.34020499 0.31008030 0.28463658 0.26096075 0.23919178
## s.hss 0.25675243 0.18853263 0.13503334 0.09207860 0.06102693
## s.hse 0.22524611 0.15734863 0.11853408 0.09595220 0.08337422
## s.satm 0.19618484 0.14162783 0.11384806 0.09612545 0.08152776
## s.satv 0.08923328 0.01815515 -0.01558741 -0.03133515 -0.03777113
boot = boot.relimp(model, b=1000, type=c("lmg"), fixed=F)
booteval.relimp(boot, typesel=c("lmg"), level=0.9, bty="perc", nodiff=T)
## Response variable: gpa
## Total response variance: 0.6074565
## Analysis based on 224 observations
##
## 5 Regressors:
## s.hsm s.hss s.hse s.satm s.satv
## Proportion of variance explained by model: 21.15%
## Metrics are not normalized (rela=FALSE).
##
## Relative importance metrics:
##
## lmg
## s.hsm 0.105941155
## s.hss 0.040887708
## s.hse 0.033054100
## s.satm 0.027854805
## s.satv 0.003712514
##
## Average coefficients for different model sizes:
##
## 1X 2Xs 3Xs 4Xs 5Xs
## s.hsm 0.34020499 0.31008030 0.28463658 0.26096075 0.23919178
## s.hss 0.25675243 0.18853263 0.13503334 0.09207860 0.06102693
## s.hse 0.22524611 0.15734863 0.11853408 0.09595220 0.08337422
## s.satm 0.19618484 0.14162783 0.11384806 0.09612545 0.08152776
## s.satv 0.08923328 0.01815515 -0.01558741 -0.03133515 -0.03777113
##
##
## Confidence interval information ( 1000 bootstrap replicates, bty= perc ):
## Relative Contributions with confidence intervals:
##
## Lower Upper
## percentage 0.9 0.9 0.9
## s.hsm.lmg 0.1059 AB___ 0.0592 0.1691
## s.hss.lmg 0.0409 _BCD_ 0.0181 0.0811
## s.hse.lmg 0.0331 _BCD_ 0.0134 0.0667
## s.satm.lmg 0.0279 _BCD_ 0.0085 0.0622
## s.satv.lmg 0.0037 ___DE 0.0022 0.0204
##
## Letters indicate the ranks covered by bootstrap CIs.
## (Rank bootstrap confidence intervals always obtained by percentile method)
## CAUTION: Bootstrap confidence intervals can be somewhat liberal.