Problem definition Use salary-train.csv to create a linear model. Use salary-test.csv and predict the salaries.
Data description The variables are:
SX = Sex, coded 1 for female and 0 for male RK = Rank, coded 1 for assistant professor, 2 for associate professor, and 3 for full professor YR = Number of years in current rank DG = Highest degree, coded 1 if doctorate, 0 if masters YD = Number of years since highest degree was earned SL = Academic year salary, in dollars.
Setup
Load libraries
library(ggplot2)
library(corrgram)
library(gridExtra)
library(tidyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:gridExtra':
##
## combine
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(caret)
## Loading required package: lattice
library(pscl)
## Loading required package: MASS
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
## Classes and Methods for R developed in the
## Political Science Computational Laboratory
## Department of Political Science
## Stanford University
## Simon Jackman
## hurdle and zeroinfl functions by Achim Zeileis
Functions
Dataset
dfrModel <- read.csv("D:/R-BA/R-Scripts/data/salary-train.csv", header=T, stringsAsFactors=F)
head(dfrModel)
## SX RK YR DG YD SL
## 1 male full 25 doctorate 35 36350
## 2 male full 13 doctorate 22 35350
## 3 male full 10 doctorate 23 28200
## 4 female full 7 doctorate 27 26775
## 5 male full 19 masters 30 33696
## 6 male full 16 doctorate 21 28516
Change Sex Column
Change Sex Col male = 0 / female = 1
dfrModel$SX <- ifelse(dfrModel$SX=="male",0,1)
head(dfrModel)
## SX RK YR DG YD SL
## 1 0 full 25 doctorate 35 36350
## 2 0 full 13 doctorate 22 35350
## 3 0 full 10 doctorate 23 28200
## 4 1 full 7 doctorate 27 26775
## 5 0 full 19 masters 30 33696
## 6 0 full 16 doctorate 21 28516
Change Degree Column
Change Degree Col masters = 0 /doctorate = 1
dfrModel$DG <- ifelse(dfrModel$DG=="masters",0,1)
head(dfrModel)
## SX RK YR DG YD SL
## 1 0 full 25 1 35 36350
## 2 0 full 13 1 22 35350
## 3 0 full 10 1 23 28200
## 4 1 full 7 1 27 26775
## 5 0 full 19 0 30 33696
## 6 0 full 16 1 21 28516
Change Rank Column
Change Rank Col assistant = 1 / associate = 2 / full = 3
dfrModel$RK <- ifelse(dfrModel$RK=="assistant",1,
ifelse(dfrModel$RK=="associate", 2,3))
head(dfrModel)
## SX RK YR DG YD SL
## 1 0 3 25 1 35 36350
## 2 0 3 13 1 22 35350
## 3 0 3 10 1 23 28200
## 4 1 3 7 1 27 26775
## 5 0 3 19 0 30 33696
## 6 0 3 16 1 21 28516
Check for outliers
lapply(dfrModel, FUN=detect_outliers)
## $SX
## [1] 1 1 1 1 1
##
## $RK
## numeric(0)
##
## $YR
## integer(0)
##
## $DG
## numeric(0)
##
## $YD
## integer(0)
##
## $SL
## integer(0)
Observation Ignore the outliers in SX column.
Missing Data
lapply(dfrModel, FUN=detect_na)
## $SX
## [1] 0
##
## $RK
## [1] 0
##
## $YR
## [1] 0
##
## $DG
## [1] 0
##
## $YD
## [1] 0
##
## $SL
## [1] 0
Correlation
vctCorr = numeric(0)
for (i in names(dfrModel)){
cor.result <- cor(as.numeric(dfrModel$SL), as.numeric(dfrModel[,i]))
vctCorr <- c(vctCorr, cor.result)
}
dfrCorr <- vctCorr
names(dfrCorr) <- names(dfrModel)
dfrCorr
## SX RK YR DG YD
## -0.005754106 0.685295537 0.432201859 0.377699012 0.329367156
## SL
## 1.000000000
Observation Rank and Salary seems to have a decent correlation.
Data For Visualization
dfrGraph <- gather(dfrModel, variable, value, -SL)
head(dfrGraph)
## SL variable value
## 1 36350 SX 0
## 2 35350 SX 0
## 3 28200 SX 0
## 4 26775 SX 1
## 5 33696 SX 0
## 6 28516 SX 0
Data visualization
ggplot(dfrGraph) +
geom_jitter(aes(value,SL, colour=variable)) +
geom_smooth(aes(value,SL, colour=variable), method=lm, se=FALSE) +
facet_wrap(~variable, scales="free_x") +
labs(title="Relation Of Salary With Other Features")
Find Best Multi Linear Model
stpModel=step(lm(data=dfrModel, formula=SL~.), trace=0, steps=100)
summary(stpModel)
##
## Call:
## lm(formula = SL ~ SX + RK + YR, data = dfrModel)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3428.9 -2196.8 165.4 996.2 7827.9
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9830.6 2477.2 3.969 0.000481 ***
## SX 1994.1 1480.1 1.347 0.189102
## RK 4985.7 828.2 6.020 2.01e-06 ***
## YR 429.4 108.4 3.961 0.000491 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2813 on 27 degrees of freedom
## Multiple R-squared: 0.6647, Adjusted R-squared: 0.6274
## F-statistic: 17.84 on 3 and 27 DF, p-value: 1.386e-06
Observation
Best results given by SL ~ SX+ RK + YR
Make Final Multi Linear Model
# make model
Model <- lm(data=dfrModel, formula= SL ~ SX + RK + YR)
# print summary
summary(Model)
##
## Call:
## lm(formula = SL ~ SX + RK + YR, data = dfrModel)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3428.9 -2196.8 165.4 996.2 7827.9
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9830.6 2477.2 3.969 0.000481 ***
## SX 1994.1 1480.1 1.347 0.189102
## RK 4985.7 828.2 6.020 2.01e-06 ***
## YR 429.4 108.4 3.961 0.000491 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2813 on 27 degrees of freedom
## Multiple R-squared: 0.6647, Adjusted R-squared: 0.6274
## F-statistic: 17.84 on 3 and 27 DF, p-value: 1.386e-06
Confusion Matrix
prdVal <- predict(Model, type='response')
Regression Data
dfrPlot <- mutate(dfrModel, PrdVal=prdVal)
head(dfrPlot)
## SX RK YR DG YD SL PrdVal
## 1 0 3 25 1 35 36350 35522.87
## 2 0 3 13 1 22 35350 30370.04
## 3 0 3 10 1 23 28200 29081.84
## 4 1 3 7 1 27 26775 29787.69
## 5 0 3 19 0 30 33696 32946.46
## 6 0 3 16 1 21 28516 31658.25
Regression Visulaization
#dfrPlot
ggplot(dfrPlot, aes(x=PrdVal, y=SL)) +
geom_point(shape=19, colour="red", fill="blue") +
geom_smooth(method="gam", formula=y~s(log(x)), se=FALSE) +
labs(title="Binomial Regression Curve") +
labs(x="") +
labs(y="")
Test Data
dfrTests <- read.csv("D:/R-BA/R-Scripts/data/salary-test.csv", header=T, stringsAsFactors=F)
dfrTests$SX <- ifelse(dfrTests$SX=="male",0,1)
dfrTests$DG <- ifelse(dfrTests$DG=="masters",0,1)
dfrTests$RK<- ifelse(dfrTests$RK=="assistant",1,
ifelse(dfrTests$RK=="associate", 2,3))
head(dfrTests)
## SX RK YR DG YD SL
## 1 0 2 11 0 31 23300
## 2 0 1 9 0 14 23713
## 3 1 2 4 0 33 20690
## 4 1 2 6 0 29 22450
## 5 0 2 1 1 9 20850
## 6 1 1 8 1 14 18304
Observation
Test Data successfully created.
Predict
result <- predict(Model, dfrTests)
print(result)
## 1 2 3 4 5 6 7 8
## 24525.51 18680.97 23513.76 24372.56 20231.49 20245.64 16533.96 16533.96
## 9 10 11 12 13 14 15 16
## 16533.96 16104.56 16104.56 19802.09 18098.63 15675.16 15675.16 17669.22
## 17 18 19 20 21
## 17669.22 15245.76 17239.82 17239.82 16810.42