RBA-19-SrobonaRoychowdhury

Problem definition Use salary-train.csv to create a linear model. Use salary-test.csv and predict the salaries.

Data description The variables are:

SX = Sex, coded 1 for female and 0 for male RK = Rank, coded 1 for assistant professor, 2 for associate professor, and 3 for full professor YR = Number of years in current rank DG = Highest degree, coded 1 if doctorate, 0 if masters YD = Number of years since highest degree was earned SL = Academic year salary, in dollars.

Setup

Load libraries

library(ggplot2)
library(corrgram)
library(gridExtra)
library(tidyr)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following object is masked from 'package:gridExtra':
## 
##     combine

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(caret)

## Loading required package: lattice

library(pscl)

## Loading required package: MASS

## 
## Attaching package: 'MASS'

## The following object is masked from 'package:dplyr':
## 
##     select

## Classes and Methods for R developed in the

## Political Science Computational Laboratory

## Department of Political Science

## Stanford University

## Simon Jackman

## hurdle and zeroinfl functions by Achim Zeileis

Functions

Dataset

dfrModel <- read.csv("D:/R-BA/R-Scripts/data/salary-train.csv", header=T, stringsAsFactors=F)
head(dfrModel)

##       SX   RK YR        DG YD    SL
## 1   male full 25 doctorate 35 36350
## 2   male full 13 doctorate 22 35350
## 3   male full 10 doctorate 23 28200
## 4 female full  7 doctorate 27 26775
## 5   male full 19   masters 30 33696
## 6   male full 16 doctorate 21 28516

Change Sex Column
Change Sex Col male = 0 / female = 1

dfrModel$SX <- ifelse(dfrModel$SX=="male",0,1)
head(dfrModel)

##   SX   RK YR        DG YD    SL
## 1  0 full 25 doctorate 35 36350
## 2  0 full 13 doctorate 22 35350
## 3  0 full 10 doctorate 23 28200
## 4  1 full  7 doctorate 27 26775
## 5  0 full 19   masters 30 33696
## 6  0 full 16 doctorate 21 28516

Change Degree Column
Change Degree Col masters = 0 /doctorate = 1

dfrModel$DG <- ifelse(dfrModel$DG=="masters",0,1)
head(dfrModel)

##   SX   RK YR DG YD    SL
## 1  0 full 25  1 35 36350
## 2  0 full 13  1 22 35350
## 3  0 full 10  1 23 28200
## 4  1 full  7  1 27 26775
## 5  0 full 19  0 30 33696
## 6  0 full 16  1 21 28516

Change Rank Column
Change Rank Col assistant = 1 / associate = 2 / full = 3

dfrModel$RK <- ifelse(dfrModel$RK=="assistant",1,
                      ifelse(dfrModel$RK=="associate", 2,3))
head(dfrModel)

##   SX RK YR DG YD    SL
## 1  0  3 25  1 35 36350
## 2  0  3 13  1 22 35350
## 3  0  3 10  1 23 28200
## 4  1  3  7  1 27 26775
## 5  0  3 19  0 30 33696
## 6  0  3 16  1 21 28516

Check for outliers

lapply(dfrModel, FUN=detect_outliers)

## $SX
## [1] 1 1 1 1 1
## 
## $RK
## numeric(0)
## 
## $YR
## integer(0)
## 
## $DG
## numeric(0)
## 
## $YD
## integer(0)
## 
## $SL
## integer(0)

Observation Ignore the outliers in SX column.

Missing Data

lapply(dfrModel, FUN=detect_na)

## $SX
## [1] 0
## 
## $RK
## [1] 0
## 
## $YR
## [1] 0
## 
## $DG
## [1] 0
## 
## $YD
## [1] 0
## 
## $SL
## [1] 0

Correlation

vctCorr = numeric(0)
for (i in names(dfrModel)){
    cor.result <- cor(as.numeric(dfrModel$SL), as.numeric(dfrModel[,i]))
    vctCorr <- c(vctCorr, cor.result)
}
dfrCorr <- vctCorr
names(dfrCorr) <- names(dfrModel)
dfrCorr

##           SX           RK           YR           DG           YD 
## -0.005754106  0.685295537  0.432201859  0.377699012  0.329367156 
##           SL 
##  1.000000000

Observation Rank and Salary seems to have a decent correlation.

Data For Visualization

dfrGraph <- gather(dfrModel, variable, value, -SL)
head(dfrGraph)

##      SL variable value
## 1 36350       SX     0
## 2 35350       SX     0
## 3 28200       SX     0
## 4 26775       SX     1
## 5 33696       SX     0
## 6 28516       SX     0

Data visualization

ggplot(dfrGraph) +
    geom_jitter(aes(value,SL, colour=variable)) + 
    geom_smooth(aes(value,SL, colour=variable), method=lm, se=FALSE) +
    facet_wrap(~variable, scales="free_x") +
    labs(title="Relation Of Salary With Other Features")

Find Best Multi Linear Model

stpModel=step(lm(data=dfrModel, formula=SL~.), trace=0, steps=100)
summary(stpModel)

## 
## Call:
## lm(formula = SL ~ SX + RK + YR, data = dfrModel)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3428.9 -2196.8   165.4   996.2  7827.9 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   9830.6     2477.2   3.969 0.000481 ***
## SX            1994.1     1480.1   1.347 0.189102    
## RK            4985.7      828.2   6.020 2.01e-06 ***
## YR             429.4      108.4   3.961 0.000491 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2813 on 27 degrees of freedom
## Multiple R-squared:  0.6647, Adjusted R-squared:  0.6274 
## F-statistic: 17.84 on 3 and 27 DF,  p-value: 1.386e-06

Observation
Best results given by SL ~ SX+ RK + YR

Make Final Multi Linear Model

# make model
Model <- lm(data=dfrModel, formula= SL ~ SX + RK + YR)

# print summary
summary(Model)

## 
## Call:
## lm(formula = SL ~ SX + RK + YR, data = dfrModel)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3428.9 -2196.8   165.4   996.2  7827.9 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   9830.6     2477.2   3.969 0.000481 ***
## SX            1994.1     1480.1   1.347 0.189102    
## RK            4985.7      828.2   6.020 2.01e-06 ***
## YR             429.4      108.4   3.961 0.000491 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2813 on 27 degrees of freedom
## Multiple R-squared:  0.6647, Adjusted R-squared:  0.6274 
## F-statistic: 17.84 on 3 and 27 DF,  p-value: 1.386e-06

Confusion Matrix

prdVal <- predict(Model, type='response')

Regression Data

dfrPlot <- mutate(dfrModel, PrdVal=prdVal)
head(dfrPlot)

##   SX RK YR DG YD    SL   PrdVal
## 1  0  3 25  1 35 36350 35522.87
## 2  0  3 13  1 22 35350 30370.04
## 3  0  3 10  1 23 28200 29081.84
## 4  1  3  7  1 27 26775 29787.69
## 5  0  3 19  0 30 33696 32946.46
## 6  0  3 16  1 21 28516 31658.25

Regression Visulaization

#dfrPlot
ggplot(dfrPlot, aes(x=PrdVal, y=SL))  + 
    geom_point(shape=19, colour="red", fill="blue") +
    geom_smooth(method="gam", formula=y~s(log(x)), se=FALSE) +
    labs(title="Binomial Regression Curve") +
    labs(x="") +
    labs(y="")

Test Data

dfrTests <- read.csv("D:/R-BA/R-Scripts/data/salary-test.csv", header=T, stringsAsFactors=F)
dfrTests$SX <- ifelse(dfrTests$SX=="male",0,1)
dfrTests$DG <- ifelse(dfrTests$DG=="masters",0,1)
dfrTests$RK<- ifelse(dfrTests$RK=="assistant",1,
                      ifelse(dfrTests$RK=="associate", 2,3))
head(dfrTests)

##   SX RK YR DG YD    SL
## 1  0  2 11  0 31 23300
## 2  0  1  9  0 14 23713
## 3  1  2  4  0 33 20690
## 4  1  2  6  0 29 22450
## 5  0  2  1  1  9 20850
## 6  1  1  8  1 14 18304

Observation
Test Data successfully created.

Predict

result <-  predict(Model, dfrTests)
print(result)

##        1        2        3        4        5        6        7        8 
## 24525.51 18680.97 23513.76 24372.56 20231.49 20245.64 16533.96 16533.96 
##        9       10       11       12       13       14       15       16 
## 16533.96 16104.56 16104.56 19802.09 18098.63 15675.16 15675.16 17669.22 
##       17       18       19       20       21 
## 17669.22 15245.76 17239.82 17239.82 16810.42

RBA-19-SrobonaRoychowdhury

Srobona Roychowdhury

20 November 2017