Thực hành ngày 5

Mô hình hồi qui tuyến tính

Học viên Đặng Bảo Đăng - Mã số R1F019

library(ggplot2); library(tidyverse); library(ggfortify); library(ggthemes)

Việc 1: Đọc dữ liệu “Insurance dataset.xlsx”

ins = readxl::read_excel("D:/Downloads/tailieu/R course/Seminar TDT 2022/Tai lieu/Data set/Insurance dataset.xlsx")

line.data = data.frame(xintercept = c(median(ins$charge), mean(ins$charge)), Lines = factor(c("median", "mean"), levels = c("median", "mean")), color = c("blue", "red"))

ins %>% ggplot(aes(x=charge)) + geom_histogram(aes(y=..density..), col = "black", fill = "white") + geom_density(col="black", fill="blue", alpha=0.25) + geom_vline(data=line.data, aes(xintercept = xintercept, color= Lines), linetype="dashed", size=1) + scale_colour_manual(name="statistics", values = line.data$color)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot (data= ins, aes(x= charge)) + geom_histogram (aes(y= ..density..), binwidth = 1000, color= 'black', fill= 'white') + geom_density(alpha= 0.2, fill= 'blue') + 
  geom_vline(aes(xintercept= median(ins$charge), color="median"), linetype="dashed", size=1) +
  geom_vline(aes(xintercept= mean(ins$charge), color="mean"), linetype="dashed",  size=1) +
  scale_color_manual(name = "statistics", values = c(median = "blue", mean = "red"))
## Warning: Use of `ins$charge` is discouraged. Use `charge` instead.

## Warning: Use of `ins$charge` is discouraged. Use `charge` instead.

ins %>% ggplot(aes(x=region, y=charge, fill=region)) + geom_bar(stat="identity") + theme(legend.position = "none")

ggplot(data= ins, aes(x= reorder(region, charge), y= charge, fill= region)) + geom_bar(stat = 'identity') + theme(legend.position='none')

ins %>% ggplot((aes(x=region, y=charge, fill = region, col = region))) + xlab("Region") + ylab("Charge") + theme(legend.position = "none") + geom_violin(trim=FALSE) + geom_boxplot(col="black", width=0.1)

fit1 = lm(charge ~ region, data=ins)
summary(fit1)
## 
## Call:
## lm(formula = charge ~ region, data = ins)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -13614  -8463  -3793   3385  49035 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      13406.4      671.3  19.971   <2e-16 ***
## regionnorthwest   -988.8      948.6  -1.042    0.297    
## regionsoutheast   1329.0      922.9   1.440    0.150    
## regionsouthwest  -1059.4      948.6  -1.117    0.264    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 12080 on 1334 degrees of freedom
## Multiple R-squared:  0.006634,   Adjusted R-squared:  0.0044 
## F-statistic:  2.97 on 3 and 1334 DF,  p-value: 0.03089
par(mfrow = c(2, 2))
plot(fit1)

autoplot(fit1)

fit2 = lm(charge ~ smoker, data=ins)

ins %>% ggplot(aes(x=reorder(smoker, charge), y=charge, fill=smoker)) + geom_bar(stat="identity") + theme(legend.position = "none")

ins %>% ggplot((aes(x=smoker, y=charge, fill = smoker, col = smoker))) + xlab("Smoker") + ylab("Charge") + theme(legend.position = "none") + geom_violin(trim=FALSE) + geom_boxplot(col="black", width=0.1)

ins %>% ggplot(aes(x=sex, y=charge, fill = sex, col = sex, alpha=0.001)) + xlab("Sex") + ylab("Charge") + geom_violin(trim=FALSE) + geom_boxplot(col="black", width=0.1, alpha=0.5) + theme_stata() + theme(legend.position = "none") 

Việc 2: Phân tích các mô hình sau đây:

model1 = lm(charge ~ sex, data=ins)
summary(model1)
## 
## Call:
## lm(formula = charge ~ sex, data = ins)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -12835  -8435  -3980   3476  51201 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  12569.6      470.1  26.740   <2e-16 ***
## sexmale       1387.2      661.3   2.098   0.0361 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 12090 on 1336 degrees of freedom
## Multiple R-squared:  0.003282,   Adjusted R-squared:  0.002536 
## F-statistic:   4.4 on 1 and 1336 DF,  p-value: 0.03613
model2 = lm(charge ~ sex + smoker, data=ins)
summary(model2)
## 
## Call:
## lm(formula = charge ~ sex + smoker, data = ins)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -19193  -5074   -909   3739  31682 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  8466.04     303.54   27.89   <2e-16 ***
## sexmale       -65.38     409.81   -0.16    0.873    
## smokeryes   23622.13     507.74   46.52   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7473 on 1335 degrees of freedom
## Multiple R-squared:  0.6198, Adjusted R-squared:  0.6192 
## F-statistic:  1088 on 2 and 1335 DF,  p-value: < 2.2e-16
model3 = lm(charge ~ sex*smoker, data=ins)
summary(model3)
## 
## Call:
## lm(formula = charge ~ sex * smoker, data = ins)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -20213  -5166   -920   3655  33091 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         8762.3      318.6  27.504  < 2e-16 ***
## sexmale             -675.1      457.0  -1.477  0.13988    
## smokeryes          21916.7      764.4  28.673  < 2e-16 ***
## sexmale:smokeryes   3038.1     1020.2   2.978  0.00295 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7451 on 1334 degrees of freedom
## Multiple R-squared:  0.6223, Adjusted R-squared:  0.6214 
## F-statistic: 732.6 on 3 and 1334 DF,  p-value: < 2.2e-16

Việc 4: Đọc dữ liệu “SAT scores dataset.xlsx”

sat = readxl::read_excel("D:/Downloads/tailieu/R course/Seminar TDT 2022/Tai lieu/Data set/SAT scores dataset.xlsx")

library(GGally); library(gridExtra)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
ggpairs(sat, columns = c(3:7, 2), mapping=aes(alpha=0.5))

library(psych)
## 
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
dat1 = sat %>% select(3:7, 2)
pairs.panels(dat1)

sat$s.hsm = scale(sat$hsm)[,1]
sat$s.hss = scale(sat$hss)[,1]
sat$s.hse = scale(sat$hse)[,1]
sat$s.satm = scale(sat$satm)[,1]
sat$s.satv = scale(sat$satv)[,1]

ggpairs(sat, column = c(9:13, 2), mapping=aes(alpha=0.5))

dat1 = sat %>% select(9:13, 2)
pairs.panels(dat1)

summary(lm(gpa ~ s.hsm, data=sat))
## 
## Call:
## lm(formula = gpa ~ s.hsm, data = sat)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.12610 -0.36370  0.08651  0.46351  1.63911 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2.63522    0.04696  56.119  < 2e-16 ***
## s.hsm        0.34020    0.04706   7.229 7.77e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7028 on 222 degrees of freedom
## Multiple R-squared:  0.1905, Adjusted R-squared:  0.1869 
## F-statistic: 52.25 on 1 and 222 DF,  p-value: 7.774e-12
summary(lm(gpa ~ s.satm, data=sat))
## 
## Call:
## lm(formula = gpa ~ s.satm, data = sat)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.59405 -0.37876  0.08366  0.55984  1.39948 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2.63522    0.05051  52.170  < 2e-16 ***
## s.satm       0.19618    0.05063   3.875  0.00014 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.756 on 222 degrees of freedom
## Multiple R-squared:  0.06336,    Adjusted R-squared:  0.05914 
## F-statistic: 15.02 on 1 and 222 DF,  p-value: 0.0001402
summary(lm(gpa ~ s.hsm + s.satm, data=sat))
## 
## Call:
## lm(formula = gpa ~ s.hsm + s.satm, data = sat)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.1435 -0.3292  0.0793  0.4591  1.6169 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2.63522    0.04696  56.118  < 2e-16 ***
## s.hsm        0.31628    0.05281   5.990 8.45e-09 ***
## s.satm       0.05275    0.05281   0.999    0.319    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7028 on 221 degrees of freedom
## Multiple R-squared:  0.1942, Adjusted R-squared:  0.1869 
## F-statistic: 26.63 on 2 and 221 DF,  p-value: 4.365e-11

Việc 5: Tìm mô hình tối ưu

library(BMA)
## Loading required package: survival
## Warning: package 'survival' was built under R version 4.0.5
## Loading required package: leaps
## Loading required package: robustbase
## 
## Attaching package: 'robustbase'
## The following object is masked from 'package:survival':
## 
##     heart
## Loading required package: inline
## Loading required package: rrcov
## Scalable Robust Estimators with High Breakdown Point (version 1.5-5)
x = sat[, c("s.hsm", "s.hss", "s.hse", "s.satm", "s.satv", "sex")]
y = sat$gpa
r = bicreg(x = x, y = y, strict = FALSE, OR = 20)
summary(r)
## 
## Call:
## bicreg(x = x, y = y, strict = FALSE, OR = 20)
## 
## 
##   6  models were selected
##  Best  5  models (cumulative posterior probability =  0.9614 ): 
## 
##            p!=0    EV        SD        model 1    model 2    model 3  
## Intercept  100.0  2.6338185  0.054198    2.63522    2.63522    2.63522
## s.hsm      100.0  0.3245713  0.054157    0.34020    0.29932    0.28777
## s.hss       13.2  0.0120537  0.037257      .          .        0.09109
## s.hse       17.3  0.0157980  0.040856      .        0.09148      .    
## s.satm       6.2  0.0032458  0.018228      .          .          .    
## s.satv       3.9  0.0005734  0.009958      .          .          .    
## sex          3.9  0.0010385  0.020071      .          .          .    
##                                                                       
## nVar                                       1          2          2    
## r2                                       0.191      0.202      0.200  
## BIC                                    -41.93648  -39.59809  -39.06568
## post prob                                0.556      0.173      0.132  
##            model 4    model 5  
## Intercept    2.63522    2.63522
## s.hsm        0.31628    0.33695
## s.hss          .          .    
## s.hse          .          .    
## s.satm       0.05275      .    
## s.satv         .        0.01473
## sex            .          .    
##                                
## nVar           2          2    
## r2           0.194      0.191  
## BIC        -37.53438  -36.61894
## post prob    0.062      0.039
imageplot.bma(r)

model = lm (gpa ~ s.hsm + s.hss + s.hse + s.satm + s.satv, data=sat)
library(relaimpo)
## Loading required package: MASS
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
## Loading required package: boot
## 
## Attaching package: 'boot'
## The following object is masked from 'package:robustbase':
## 
##     salinity
## The following object is masked from 'package:survival':
## 
##     aml
## The following object is masked from 'package:psych':
## 
##     logit
## Loading required package: survey
## Loading required package: grid
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## 
## Attaching package: 'survey'
## The following object is masked from 'package:graphics':
## 
##     dotchart
## Loading required package: mitools
## This is the global version of package relaimpo.
## If you are a non-US user, a version with the interesting additional metric pmvd is available
## from Ulrike Groempings web site at prof.beuth-hochschule.de/groemping.
metrics = calc.relimp(model, type=c("lmg"))
metrics
## Response variable: gpa 
## Total response variance: 0.6074565 
## Analysis based on 224 observations 
## 
## 5 Regressors: 
## s.hsm s.hss s.hse s.satm s.satv 
## Proportion of variance explained by model: 21.15%
## Metrics are not normalized (rela=FALSE). 
## 
## Relative importance metrics: 
## 
##                lmg
## s.hsm  0.105941155
## s.hss  0.040887708
## s.hse  0.033054100
## s.satm 0.027854805
## s.satv 0.003712514
## 
## Average coefficients for different model sizes: 
## 
##                1X        2Xs         3Xs         4Xs         5Xs
## s.hsm  0.34020499 0.31008030  0.28463658  0.26096075  0.23919178
## s.hss  0.25675243 0.18853263  0.13503334  0.09207860  0.06102693
## s.hse  0.22524611 0.15734863  0.11853408  0.09595220  0.08337422
## s.satm 0.19618484 0.14162783  0.11384806  0.09612545  0.08152776
## s.satv 0.08923328 0.01815515 -0.01558741 -0.03133515 -0.03777113
boot = boot.relimp(model, b=1000, type=c("lmg"), fixed=F)
booteval.relimp(boot, typesel=c("lmg"), level=0.9, bty="perc", nodiff=T)
## Response variable: gpa 
## Total response variance: 0.6074565 
## Analysis based on 224 observations 
## 
## 5 Regressors: 
## s.hsm s.hss s.hse s.satm s.satv 
## Proportion of variance explained by model: 21.15%
## Metrics are not normalized (rela=FALSE). 
## 
## Relative importance metrics: 
## 
##                lmg
## s.hsm  0.105941155
## s.hss  0.040887708
## s.hse  0.033054100
## s.satm 0.027854805
## s.satv 0.003712514
## 
## Average coefficients for different model sizes: 
## 
##                1X        2Xs         3Xs         4Xs         5Xs
## s.hsm  0.34020499 0.31008030  0.28463658  0.26096075  0.23919178
## s.hss  0.25675243 0.18853263  0.13503334  0.09207860  0.06102693
## s.hse  0.22524611 0.15734863  0.11853408  0.09595220  0.08337422
## s.satm 0.19618484 0.14162783  0.11384806  0.09612545  0.08152776
## s.satv 0.08923328 0.01815515 -0.01558741 -0.03133515 -0.03777113
## 
##  
##  Confidence interval information ( 1000 bootstrap replicates, bty= perc ): 
## Relative Contributions with confidence intervals: 
##  
##                             Lower  Upper
##            percentage 0.9   0.9    0.9   
## s.hsm.lmg  0.1059     AB___ 0.0592 0.1691
## s.hss.lmg  0.0409     _BCD_ 0.0181 0.0811
## s.hse.lmg  0.0331     _BCD_ 0.0134 0.0667
## s.satm.lmg 0.0279     _BCD_ 0.0085 0.0622
## s.satv.lmg 0.0037     ___DE 0.0022 0.0204
## 
## Letters indicate the ranks covered by bootstrap CIs. 
## (Rank bootstrap confidence intervals always obtained by percentile method) 
## CAUTION: Bootstrap confidence intervals can be somewhat liberal.