library(carData)
data(Salaries)
dim(Salaries)
## [1] 397 6
head(Salaries)
## rank discipline yrs.since.phd yrs.service sex salary
## 1 Prof B 19 18 Male 139750
## 2 Prof B 20 16 Male 173200
## 3 AsstProf B 4 3 Male 79750
## 4 Prof B 45 39 Male 115000
## 5 Prof B 40 41 Male 141500
## 6 AssocProf B 6 6 Male 97000
library(ggplot2)
ggplot(data = Salaries, aes(x = salary)) + geom_histogram(fill = "blue", col = "white") + labs(x = "Lương (USD)", y = "Số người", title = "Phân bố lương GS (USD)") + theme_bw()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
p = ggplot(data = Salaries, aes(x = sex, y = salary, col = sex))
p + geom_boxplot()
m.1 = lm(salary ~ sex, data = Salaries)
summary(m.1)
##
## Call:
## lm(formula = salary ~ sex, data = Salaries)
##
## Residuals:
## Min 1Q Median 3Q Max
## -57290 -23502 -6828 19710 116455
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 101002 4809 21.001 < 2e-16 ***
## sexMale 14088 5065 2.782 0.00567 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 30030 on 395 degrees of freedom
## Multiple R-squared: 0.01921, Adjusted R-squared: 0.01673
## F-statistic: 7.738 on 1 and 395 DF, p-value: 0.005667
par(mfrow = c(2,2))
plot(m.1)
# Dùng gói ggfortify:
library(ggfortify)
## Warning: package 'ggfortify' was built under R version 4.3.2
autoplot(m.1)
vars = Salaries[, c("rank", "discipline", "yrs.since.phd", "yrs.service", "sex", "salary")]
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
ggpairs(data = vars, mapping = aes(color = sex))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
m.2 = lm(salary ~ sex + discipline + yrs.service, data = Salaries)
summary(m.2)
##
## Call:
## lm(formula = salary ~ sex + discipline + yrs.service, data = Salaries)
##
## Residuals:
## Min 1Q Median 3Q Max
## -77424 -19404 -4635 15539 105391
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 84361.1 4941.4 17.072 < 2e-16 ***
## sexMale 8423.3 4744.5 1.775 0.0766 .
## disciplineB 13033.8 2840.3 4.589 6.01e-06 ***
## yrs.service 832.2 110.2 7.550 3.07e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 27790 on 393 degrees of freedom
## Multiple R-squared: 0.1646, Adjusted R-squared: 0.1582
## F-statistic: 25.81 on 3 and 393 DF, p-value: 2.928e-15
autoplot((m.2))
library(BMA)
## Loading required package: survival
## Loading required package: leaps
## Loading required package: robustbase
##
## Attaching package: 'robustbase'
## The following object is masked from 'package:survival':
##
## heart
## Loading required package: inline
## Loading required package: rrcov
## Scalable Robust Estimators with High Breakdown Point (version 1.7-4)
yvar = Salaries[, c("salary")]
xvar = Salaries[, c("sex", "rank", "discipline", "yrs.since.phd", "yrs.service")]
m.bma = bicreg(xvar, yvar, strict = FALSE, OR = 20)
summary(m.bma)
##
## Call:
## bicreg(x = xvar, y = yvar, strict = FALSE, OR = 20)
##
##
## 4 models were selected
## Best 4 models (cumulative posterior probability = 1 ):
##
## p!=0 EV SD model 1 model 2 model 3
## Intercept 100.0 71631.552 3429.93 71944.33 68223.53 72253.53
## sexMale 8.1 365.573 1649.61 . 4491.80 .
## rankAssocProf 100.0 13760.698 3984.90 13761.54 13723.42 14483.23
## rankProf 100.0 47809.009 3259.03 47843.84 47403.32 49377.50
## disciplineB 100.0 13759.250 2300.72 13760.96 13708.69 13561.43
## yrs.since.phd 4.8 3.481 31.86 . . .
## yrs.service 5.2 -3.983 30.56 . . -76.33
##
## nVar 3 4 4
## r2 0.445 0.447 0.446
## BIC -215.78 -211.17 -210.28
## post prob 0.818 0.081 0.052
## model 4
## Intercept 71405.40
## sexMale .
## rankAssocProf 13030.16
## rankProf 46211.57
## disciplineB 14028.68
## yrs.since.phd 71.92
## yrs.service .
##
## nVar 4
## r2 0.445
## BIC -210.13
## post prob 0.048
imageplot.bma(m.bma)
m.3 = lm(salary ~ rank + discipline, data = Salaries)
summary(m.3)
##
## Call:
## lm(formula = salary ~ rank + discipline, data = Salaries)
##
## Residuals:
## Min 1Q Median 3Q Max
## -65990 -14049 -1288 10760 97996
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 71944 3135 22.948 < 2e-16 ***
## rankAssocProf 13762 3961 3.475 0.000569 ***
## rankProf 47844 3112 15.376 < 2e-16 ***
## disciplineB 13761 2296 5.993 4.65e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 22650 on 393 degrees of freedom
## Multiple R-squared: 0.445, Adjusted R-squared: 0.4407
## F-statistic: 105 on 3 and 393 DF, p-value: < 2.2e-16
autoplot(m.3)
library(readxl)
df = read_excel("C:\\Thach\\VN trips\\2024_4Dec\\TDT Uni\\Datasets\\Insurance data for reg analysis.xlsx")
dim(df)
## [1] 1338 8
head(df)
## # A tibble: 6 × 8
## id age sex bmi children smoking region charges
## <dbl> <dbl> <chr> <dbl> <dbl> <chr> <chr> <dbl>
## 1 1 19 female 27.9 0 yes southwest 16885.
## 2 2 18 male 33.8 1 no southeast 1726.
## 3 3 28 male 33 3 no southeast 4449.
## 4 4 33 male 22.7 0 no northwest 21984.
## 5 5 32 male 28.9 0 no northwest 3867.
## 6 6 31 female 25.7 0 no southeast 3757.
library(ggplot2)
ggplot(data = df, aes(x = charges)) + geom_histogram(fill = "blue", col = "white") + labs(x = "Phí bảo hiểm (USD)", y = "Số người", title = "Phân bố phí bảo hiểm (USD)") + theme_bw()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Chuyển đổi dữ liệu
df$y = log(df$charges)
ggplot(data = df, aes(x = y)) + geom_histogram(fill = "blue", col = "white") + labs(x = "log(Phí bảo hiểm (USD))", y = "Số người", title = "Phân bố phí bảo hiểm (USD)") + theme_bw()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:survival':
##
## cluster
set.seed(1234)
index = createDataPartition(df$y, p = 0.7, list = FALSE)
training = df[index, ]
dim(training)
## [1] 938 9
testing = df[-index, ]
dim(testing)
## [1] 400 9
fit = lm(y ~ age + sex + bmi + children + smoking + region, data = training)
summary(fit)
##
## Call:
## lm(formula = y ~ age + sex + bmi + children + smoking + region,
## data = training)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.08746 -0.21190 -0.05229 0.06475 2.15423
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.102467 0.088675 80.095 < 2e-16 ***
## age 0.033259 0.001067 31.181 < 2e-16 ***
## sexmale -0.061977 0.029935 -2.070 0.0387 *
## bmi 0.013598 0.002538 5.357 1.07e-07 ***
## children 0.105015 0.012314 8.528 < 2e-16 ***
## smokingyes 1.573470 0.037395 42.077 < 2e-16 ***
## regionnorthwest -0.083466 0.042530 -1.963 0.0500 *
## regionsoutheast -0.214992 0.043499 -4.942 9.15e-07 ***
## regionsouthwest -0.177750 0.043724 -4.065 5.20e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4561 on 929 degrees of freedom
## Multiple R-squared: 0.7557, Adjusted R-squared: 0.7536
## F-statistic: 359.1 on 8 and 929 DF, p-value: < 2.2e-16
testing$predicted = predict(fit, testing)
error = testing$predicted - testing$y
# RMSE:
sqrt(mean(error^2))
## [1] 0.4196563
# R2:
cor(testing$predicted, testing$y)^2
## [1] 0.7935273
fit.2 = train(form = y ~ age + sex + bmi + children + smoking + region, data = df, method = "lm", trContrl = trainControl(method = "cv", number = 10))
## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
## extra argument 'trContrl' will be disregarded
## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
## extra argument 'trContrl' will be disregarded
## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
## extra argument 'trContrl' will be disregarded
## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
## extra argument 'trContrl' will be disregarded
## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
## extra argument 'trContrl' will be disregarded
## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
## extra argument 'trContrl' will be disregarded
## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
## extra argument 'trContrl' will be disregarded
## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
## extra argument 'trContrl' will be disregarded
## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
## extra argument 'trContrl' will be disregarded
## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
## extra argument 'trContrl' will be disregarded
## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
## extra argument 'trContrl' will be disregarded
## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
## extra argument 'trContrl' will be disregarded
## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
## extra argument 'trContrl' will be disregarded
## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
## extra argument 'trContrl' will be disregarded
## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
## extra argument 'trContrl' will be disregarded
## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
## extra argument 'trContrl' will be disregarded
## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
## extra argument 'trContrl' will be disregarded
## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
## extra argument 'trContrl' will be disregarded
## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
## extra argument 'trContrl' will be disregarded
## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
## extra argument 'trContrl' will be disregarded
## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
## extra argument 'trContrl' will be disregarded
## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
## extra argument 'trContrl' will be disregarded
## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
## extra argument 'trContrl' will be disregarded
## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
## extra argument 'trContrl' will be disregarded
## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
## extra argument 'trContrl' will be disregarded
## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
## extra argument 'trContrl' will be disregarded
fit.2
## Linear Regression
##
## 1338 samples
## 6 predictor
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 1338, 1338, 1338, 1338, 1338, 1338, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 0.4458389 0.7619611 0.2814355
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
fit.3 = train(form = y ~ age + sex + bmi + children + smoking + region, data = df, method = "lm", trContrl = trainControl(method = "boot", number = 100))
## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
## extra argument 'trContrl' will be disregarded
## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
## extra argument 'trContrl' will be disregarded
## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
## extra argument 'trContrl' will be disregarded
## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
## extra argument 'trContrl' will be disregarded
## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
## extra argument 'trContrl' will be disregarded
## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
## extra argument 'trContrl' will be disregarded
## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
## extra argument 'trContrl' will be disregarded
## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
## extra argument 'trContrl' will be disregarded
## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
## extra argument 'trContrl' will be disregarded
## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
## extra argument 'trContrl' will be disregarded
## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
## extra argument 'trContrl' will be disregarded
## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
## extra argument 'trContrl' will be disregarded
## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
## extra argument 'trContrl' will be disregarded
## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
## extra argument 'trContrl' will be disregarded
## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
## extra argument 'trContrl' will be disregarded
## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
## extra argument 'trContrl' will be disregarded
## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
## extra argument 'trContrl' will be disregarded
## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
## extra argument 'trContrl' will be disregarded
## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
## extra argument 'trContrl' will be disregarded
## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
## extra argument 'trContrl' will be disregarded
## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
## extra argument 'trContrl' will be disregarded
## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
## extra argument 'trContrl' will be disregarded
## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
## extra argument 'trContrl' will be disregarded
## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
## extra argument 'trContrl' will be disregarded
## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
## extra argument 'trContrl' will be disregarded
## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
## extra argument 'trContrl' will be disregarded
fit.3
## Linear Regression
##
## 1338 samples
## 6 predictor
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 1338, 1338, 1338, 1338, 1338, 1338, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 0.4442467 0.767426 0.2822197
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
library(relaimpo)
## Loading required package: MASS
## Loading required package: boot
##
## Attaching package: 'boot'
## The following object is masked from 'package:lattice':
##
## melanoma
## The following object is masked from 'package:robustbase':
##
## salinity
## The following object is masked from 'package:survival':
##
## aml
## Loading required package: survey
## Loading required package: grid
## Loading required package: Matrix
##
## Attaching package: 'survey'
## The following object is masked from 'package:graphics':
##
## dotchart
## Loading required package: mitools
## This is the global version of package relaimpo.
## If you are a non-US user, a version with the interesting additional metric pmvd is available
## from Ulrike Groempings web site at prof.beuth-hochschule.de/groemping.
df$gender = ifelse(df$sex == "female", 1, 0)
df$smoke = ifelse(df$smoking == "yes", 1, 0)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ tidyr::expand() masks Matrix::expand()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ purrr::lift() masks caret::lift()
## ✖ tidyr::pack() masks Matrix::pack()
## ✖ dplyr::select() masks MASS::select()
## ✖ tidyr::unpack() masks Matrix::unpack()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
df = df %>%
mutate(area = case_when(region == "northeast" ~ 1,
region == "northwest" ~ 2,
region == "southeast" ~ 3,
region == "southwest" ~ 4))
m.rel = lm(y ~ age + gender + bmi + children + smoke + area, data = df)
calc.relimp(m.rel, type = "lmg", rela = TRUE, rank = TRUE)
## Response variable: y
## Total response variance: 0.8455301
## Analysis based on 1338 observations
##
## 6 Regressors:
## age gender bmi children smoke area
## Proportion of variance explained by model: 76.7%
## Metrics are normalized to sum to 100% (rela=TRUE).
##
## Relative importance metrics:
##
## lmg
## age 0.362162950
## gender 0.001258041
## bmi 0.015751896
## children 0.028567714
## smoke 0.588736915
## area 0.003522485
##
## Average coefficients for different model sizes:
##
## 1X 2Xs 3Xs 4Xs 5Xs
## age 0.03454513 0.034587378 0.03462174 0.03464601 0.03465802
## gender -0.01035399 0.007543188 0.02507011 0.04221895 0.05898003
## bmi 0.02000482 0.018466697 0.01692440 0.01537887 0.01383119
## children 0.12306432 0.118758126 0.11454432 0.11041611 0.10636778
## smoke 1.51587709 1.522850531 1.52978138 1.53662037 1.54332089
## area -0.03552783 -0.039673654 -0.04294073 -0.04534090 -0.04688970
## 6Xs
## age 0.03465562
## gender 0.07534136
## bmi 0.01228258
## children 0.10239500
## smoke 1.54983839
## area -0.04760667