sa=read.csv("E:/CONG VIEC/Ky nang ngoai/Xu ly so lieu Bang R/Dataset Van Lang R and Machine Learning 2023/Thuc hanh ngay 1/Salaries.csv")
head(sa)
## ID Rank Discipline Yrs.since.phd Yrs.service Sex Salary
## 1 1 Prof B 19 18 Male 139750
## 2 2 Prof B 20 16 Male 173200
## 3 3 AsstProf B 4 3 Male 79750
## 4 4 Prof B 45 39 Male 115000
## 5 5 Prof B 40 41 Male 141500
## 6 6 AssocProf B 6 6 Male 97000
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
library(table1)
##
## Attaching package: 'table1'
##
## The following objects are masked from 'package:base':
##
## units, units<-
library(compareGroups)
th1=compareGroups(Sex~Rank+Discipline+Yrs.since.phd+Yrs.service+Salary,data=sa)
tb1=createTable(th1)
tb1
##
## --------Summary descriptives table by 'Sex'---------
##
## _____________________________________________________
## Female Male p.overall
## N=39 N=358
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯
## Rank: 0.014
## AssocProf 10 (25.6%) 54 (15.1%)
## AsstProf 11 (28.2%) 56 (15.6%)
## Prof 18 (46.2%) 248 (69.3%)
## Discipline: 1.000
## A 18 (46.2%) 163 (45.5%)
## B 21 (53.8%) 195 (54.5%)
## Yrs.since.phd 16.5 (9.78) 22.9 (13.0) <0.001
## Yrs.service 11.6 (8.81) 18.3 (13.2) <0.001
## Salary 101002 (25952) 115090 (30437) 0.003
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯
export2word(tb1, file="E:/CONG VIEC/Ky nang ngoai/Xu ly so lieu Bang R/Dataset Van Lang R and Machine Learning 2023/Thuc hanh ngay 2/table1.doc", header.labels=c(p.overall="p-value"))
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
vars=sa[, c("Rank","Discipline","Yrs.since.phd","Yrs.service","Sex","Salary")]
ggpairs(data=vars,mapping=aes(color=Sex))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
mh1=lm(Salary~Sex,data=sa)
summary(mh1)
##
## Call:
## lm(formula = Salary ~ Sex, data = sa)
##
## Residuals:
## Min 1Q Median 3Q Max
## -57290 -23502 -6828 19710 116455
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 101002 4809 21.001 < 2e-16 ***
## SexMale 14088 5065 2.782 0.00567 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 30030 on 395 degrees of freedom
## Multiple R-squared: 0.01921, Adjusted R-squared: 0.01673
## F-statistic: 7.738 on 1 and 395 DF, p-value: 0.005667
plot(mh1)
## Nhan xet mh 1: Moi lien quan giua Salary va Sex: ### Std.Error để
tính CI95% cua 14088. bang 14088+1.965065 hoac 14088-1.965065.
Vi R2 nho nen co the noi bien gioi tinh chi giai thich rat it cho su
chenh lech tien luong giua nm=am va nu. Co the dung HQTT de so sanh 2
nhom doc lap thay t-test.
library(rms)
## Loading required package: Hmisc
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:table1':
##
## label, label<-, units
## The following objects are masked from 'package:dplyr':
##
## src, summarize
## The following objects are masked from 'package:base':
##
## format.pval, units
mh2=ols(Salary~Sex, data=sa,x= TRUE,y= TRUE)
validate=validate(mh2,B=100)
validate
## index.orig training test optimism index.corrected
## R-square 1.920000e-02 1.880000e-02 1.490000e-02 3.900000e-03 1.530000e-02
## MSE 8.975331e+08 8.903880e+08 9.014712e+08 -1.108326e+07 9.086163e+08
## g 2.502313e+03 2.346472e+03 2.502313e+03 -1.558410e+02 2.658153e+03
## Intercept 0.000000e+00 0.000000e+00 -3.409729e+04 3.409729e+04 -3.409729e+04
## Slope 1.000000e+00 1.000000e+00 1.297300e+00 -2.973000e-01 1.297300e+00
## n
## R-square 100
## MSE 100
## g 100
## Intercept 100
## Slope 100
mh3=lm(Salary~Sex+Yrs.since.phd,data=sa)
summary(mh3)
##
## Call:
## lm(formula = Salary ~ Sex + Yrs.since.phd, data = sa)
##
## Residuals:
## Min 1Q Median 3Q Max
## -84167 -19735 -2551 15427 102033
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 85181.8 4748.3 17.939 <2e-16 ***
## SexMale 7923.6 4684.1 1.692 0.0915 .
## Yrs.since.phd 958.1 108.3 8.845 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 27470 on 394 degrees of freedom
## Multiple R-squared: 0.1817, Adjusted R-squared: 0.1775
## F-statistic: 43.74 on 2 and 394 DF, p-value: < 2.2e-16
mh4=ols(Salary~Sex+Yrs.since.phd, data=sa,x= TRUE,y= TRUE)
validate2=validate(mh4,B=100)
validate2
## index.orig training test optimism index.corrected
## R-square 1.817000e-01 1.791000e-01 1.751000e-01 0.0040 1.777000e-01
## MSE 7.488405e+08 7.494946e+08 7.548965e+08 -5401855.4167 7.542424e+08
## g 1.480181e+04 1.456507e+04 1.474882e+04 -183.7512 1.498556e+04
## Intercept 0.000000e+00 0.000000e+00 -2.836017e+03 2836.0167 -2.836017e+03
## Slope 1.000000e+00 1.000000e+00 1.026600e+00 -0.0266 1.026600e+00
## n
## R-square 100
## MSE 100
## g 100
## Intercept 100
## Slope 100
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
set.seed(123)
sample=createDataPartition(y=sa$Salary,p=0.7,list=F)
train=sa[sample,]
test=sa[-sample,]
control=trainControl(method="CV",number = 10)
mh5= train(Salary~Rank+Sex+Yrs.since.phd+Yrs.service+Discipline,data=train,method="lm",trcontrol=control,metric="Rsquared")
## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
## extra argument 'trcontrol' will be disregarded
## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
## extra argument 'trcontrol' will be disregarded
## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
## extra argument 'trcontrol' will be disregarded
## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
## extra argument 'trcontrol' will be disregarded
## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
## extra argument 'trcontrol' will be disregarded
## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
## extra argument 'trcontrol' will be disregarded
## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
## extra argument 'trcontrol' will be disregarded
## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
## extra argument 'trcontrol' will be disregarded
## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
## extra argument 'trcontrol' will be disregarded
## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
## extra argument 'trcontrol' will be disregarded
## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
## extra argument 'trcontrol' will be disregarded
## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
## extra argument 'trcontrol' will be disregarded
## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
## extra argument 'trcontrol' will be disregarded
## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
## extra argument 'trcontrol' will be disregarded
## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
## extra argument 'trcontrol' will be disregarded
## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
## extra argument 'trcontrol' will be disregarded
## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
## extra argument 'trcontrol' will be disregarded
## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
## extra argument 'trcontrol' will be disregarded
## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
## extra argument 'trcontrol' will be disregarded
## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
## extra argument 'trcontrol' will be disregarded
## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
## extra argument 'trcontrol' will be disregarded
## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
## extra argument 'trcontrol' will be disregarded
## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
## extra argument 'trcontrol' will be disregarded
## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
## extra argument 'trcontrol' will be disregarded
## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
## extra argument 'trcontrol' will be disregarded
## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
## extra argument 'trcontrol' will be disregarded
summary(mh5)
##
## Call:
## lm(formula = .outcome ~ ., data = dat, trcontrol = ..1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -62588 -13478 -1077 11209 86416
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 75164.4 5521.5 13.613 < 2e-16 ***
## RankAsstProf -11557.3 4718.2 -2.450 0.01493 *
## RankProf 30581.8 4013.5 7.620 4.20e-13 ***
## SexMale 6327.0 4195.1 1.508 0.13266
## Yrs.since.phd 914.3 279.0 3.277 0.00119 **
## Yrs.service -751.3 251.1 -2.992 0.00303 **
## DisciplineB 14401.6 2675.9 5.382 1.59e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 21670 on 273 degrees of freedom
## Multiple R-squared: 0.4845, Adjusted R-squared: 0.4732
## F-statistic: 42.77 on 6 and 273 DF, p-value: < 2.2e-16
pred=predict(mh5,test)
model.value=data.frame(obs=test$Salary,pred)
defaultSummary(model.value)
## RMSE Rsquared MAE
## 2.474092e+04 3.844177e-01 1.779092e+04
plot(pred~test$ Salary, pch=16, col="coral", xlab="gia tri quan sat", ylab="gia tri tien luong")