library(tidyr)
library(dplyr)
library(ggplot2)
library(corrgram)
library(gridExtra)
library(knitr)
library(heplots)
library(GGally)
library(reshape)   # note: reshape masks dplyr::rename and tidyr::expand, so load order matters


Dataset

Salary <- read.table("http://data.princeton.edu/wws509/datasets/salary.dat",header=TRUE)
head(Salary)
##       sx   rk yr        dg yd    sl
## 1   male full 25 doctorate 35 36350
## 2   male full 13 doctorate 22 35350
## 3   male full 10 doctorate 23 28200
## 4 female full  7 doctorate 27 26775
## 5   male full 19   masters 30 33696
## 6   male full 16 doctorate 21 28516

Observation: There are six columns: sx (sex), rk (rank), yr (years in current rank), dg (highest degree), yd (years since degree), and sl (academic salary), with sl being the dependent variable.

str(Salary)
## 'data.frame':    52 obs. of  6 variables:
##  $ sx: Factor w/ 2 levels "female","male": 2 2 2 1 2 2 1 2 2 2 ...
##  $ rk: Factor w/ 3 levels "assistant","associate",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ yr: int  25 13 10 7 19 16 0 16 13 13 ...
##  $ dg: Factor w/ 2 levels "doctorate","masters": 1 1 1 1 2 1 2 1 2 2 ...
##  $ yd: int  35 22 23 27 30 21 32 18 30 31 ...
##  $ sl: int  36350 35350 28200 26775 33696 28516 24900 31909 31850 32850 ...

Exploratory Analysis

lapply(Salary, FUN=summary)
## $sx
## female   male 
##     14     38 
## 
## $rk
## assistant associate      full 
##        18        14        20 
## 
## $yr
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   3.000   7.000   7.481  11.000  25.000 
## 
## $dg
## doctorate   masters 
##        34        18 
## 
## $yd
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00    6.75   15.50   16.12   23.25   35.00 
## 
## $sl
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   15000   18247   23719   23798   27259   38045

Observation: The data contains no missing values.
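
This can be verified directly (a quick check, not part of the original output):

sapply(Salary, function(x) sum(is.na(x)))  # expect all zeros, consistent with the summaries above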

Functions to detect outliers
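
The detect_outliers() helper is not defined anywhere in this document; a minimal IQR-based sketch that is consistent with the calls and output below (it returns the values lying beyond 1.5 × IQR from the quartiles):

detect_outliers <- function(x) {
  qs  <- quantile(x, probs = c(0.25, 0.75))          # first and third quartiles
  iqr <- qs[2] - qs[1]
  x[x < qs[1] - 1.5 * iqr | x > qs[2] + 1.5 * iqr]   # values outside the Tukey fences
}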

detect_outliers(Salary$yr)
## [1] 25
detect_outliers(Salary$yd)
## integer(0)
detect_outliers(Salary$sl)
## integer(0)

Observation: detect_outliers flags one value in yr (25 years in rank); yd and sl contain no outliers.

Testing Correlation

kable(cor(Salary[,c("yr", "yd", "sl")]), col.names=c("years rank", "years degree", "salary"))
|   | years rank| years degree|    salary|
|:--|----------:|------------:|---------:|
|yr |  1.0000000|    0.6387763| 0.7006690|
|yd |  0.6387763|    1.0000000| 0.6748542|
|sl |  0.7006690|    0.6748542| 1.0000000|

Observation: yr (years in current rank) is more strongly correlated with salary (0.70) than yd (years since highest degree, 0.67).
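
corrgram is loaded but not used so far; a quick visual of the same matrix would be (a sketch, not part of the original output):

corrgram(Salary[, c("yr", "yd", "sl")],
         lower.panel = panel.shade,  # shading encodes sign and strength
         upper.panel = panel.cor)    # numeric correlations in the upper triangle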

Let's check the factor variables.

sx_rk <- table(Salary[,c("sx", "rk")])
sx_rk
##         rk
## sx       assistant associate full
##   female         8         2    4
##   male          10        12   16
fisher.test(x=as.matrix(sx_rk))
## 
##  Fisher's Exact Test for Count Data
## 
## data:  as.matrix(sx_rk)
## p-value = 0.1564
## alternative hypothesis: two.sided
ggplot(Salary) + geom_boxplot(aes(x=sx, y=sl))

ggplot(Salary) + geom_boxplot(aes(x=rk, y=sl))

ggplot(Salary) + geom_boxplot(aes(x=dg, y=sl))

model.aov <- aov(sl ~ sx, data=Salary)
summary(model.aov)
##             Df    Sum Sq   Mean Sq F value Pr(>F)  
## sx           1 1.141e+08 114106220   3.413 0.0706 .
## Residuals   50 1.672e+09  33432473                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
etasq(model.aov, partial = FALSE)
##                eta^2
## sx        0.06389893
## Residuals         NA
model.aov <- aov(sl ~ rk, data=Salary)
summary(model.aov)
##             Df    Sum Sq   Mean Sq F value   Pr(>F)    
## rk           2 1.347e+09 673391900   75.17 1.17e-15 ***
## Residuals   49 4.389e+08   8958083                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
etasq(model.aov, partial = FALSE)
##               eta^2
## rk        0.7541924
## Residuals        NA
model.aov <- aov(sl ~ dg, data=Salary)
summary(model.aov)
##             Df    Sum Sq  Mean Sq F value Pr(>F)
## dg           1 8.682e+06  8681649   0.244  0.623
## Residuals   50 1.777e+09 35540964
etasq(model.aov, partial = FALSE)
##                 eta^2
## dg        0.004861681
## Residuals          NA

Observations: Fisher's exact test (p = 0.156) finds no significant association between sex and rank. From the boxplots it looks like both sx (sex) and rk (rank) are related to salary. The eta-squared values are not correlation coefficients, but they give a similar sense of explained variance: rk accounts for about 75% of the variance in salary, sx for about 6%, and dg for less than 1%, so dg (degree) does not look promising as a predictor of salary.
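
Eta-squared is just the effect sum of squares over the total sum of squares, so the value for rk can be reproduced directly from the ANOVA table above:

1.347e9 / (1.347e9 + 4.389e8)  # SS_rk / SS_total ≈ 0.754, matching etasq()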

**Let's see some visualizations before creating a model**

ggplot(Salary, aes(x = yr, y = sl, col = rk, shape = sx)) + geom_point()

dfrGraph <- gather(Salary, variable, value, -sl)
## Warning: attributes are not identical across measure variables;
## they will be dropped
ggplot(dfrGraph) +
    geom_jitter(aes(value,sl, colour=variable)) + 
    geom_smooth(aes(value,sl, colour=variable), method=lm, se=FALSE) +
    facet_wrap(~variable, scales="free_x") +
    labs(title="Relation Of Salary With Other Features")
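
The warning appears because gather() stacks the factor columns (sx, rk, dg) and the integer columns (yr, yd) into a single value column, so the factor attributes are dropped and the values are coerced to a common type.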

ggpairs(Salary, mapping=aes(col=sx, alpha=0.5), columns=which(colnames(Salary) %in% c("sl", "yd", "yr")))

ggpairs(Salary, mapping=aes(col=rk, alpha=0.5), columns=which(colnames(Salary) %in% c("sl", "yd", "yr")))

Observation: From this plot you can see that rank could be very helpful in determining salary, since the three rank densities don't overlap much.

Let's try fitting a simple model with all variables.

fit1 <- lm(sl ~ sx + rk + yr + dg + yd, data=Salary)
summary(fit1)
## 
## Call:
## lm(formula = sl ~ sx + rk + yr + dg + yd, data = Salary)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4045.2 -1094.7  -361.5   813.2  9193.1 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 16912.42     816.44  20.715  < 2e-16 ***
## sxmale      -1166.37     925.57  -1.260    0.214    
## rkassociate  5292.36    1145.40   4.621 3.22e-05 ***
## rkfull      11118.76    1351.77   8.225 1.62e-10 ***
## yr            476.31      94.91   5.018 8.65e-06 ***
## dgmasters    1388.61    1018.75   1.363    0.180    
## yd           -124.57      77.49  -1.608    0.115    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2398 on 45 degrees of freedom
## Multiple R-squared:  0.855,  Adjusted R-squared:  0.8357 
## F-statistic: 44.24 on 6 and 45 DF,  p-value: < 2.2e-16
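
To read the coefficients: the baseline is a female assistant professor with a doctorate, and each coefficient shifts the prediction from the intercept. As a hypothetical worked example, the predicted salary for a male full professor with a doctorate, 10 years in rank, and 20 years since degree is:

16912.42 - 1166.37 + 11118.76 + 476.31*10 - 124.57*20  # ≈ 29137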

A Better Model Using step()

stpModel <- step(lm(sl ~ ., data=Salary), trace=0, steps=1000)  # stepwise selection minimizing AIC
stpSummary <- summary(stpModel)
stpSummary 
## 
## Call:
## lm(formula = sl ~ rk + yr, data = Salary)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3462.0 -1302.8  -299.2   783.5  9381.6 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 16203.27     638.68  25.370  < 2e-16 ***
## rkassociate  4262.28     882.89   4.828 1.45e-05 ***
## rkfull       9454.52     905.83  10.437 6.12e-14 ***
## yr            375.70      70.92   5.298 2.90e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2402 on 48 degrees of freedom
## Multiple R-squared:  0.8449, Adjusted R-squared:  0.8352 
## F-statistic: 87.15 on 3 and 48 DF,  p-value: < 2.2e-16

Observation

The F-statistic has roughly doubled (from 44.24 to 87.15): step() dropped the non-significant predictors sx, dg, and yd, keeping only rk and yr, while the adjusted R-squared is essentially unchanged.
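
step() selects by minimizing AIC, so the reduced model should also win on that criterion. A quick check, not part of the original output:

AIC(fit1, stpModel)  # the stepped model should show the lower AIC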

Train and Test

set.seed(65848)
train_idx <- sample(1:nrow(Salary), round(nrow(Salary) * 0.75))
train <- Salary[train_idx,]
test <- Salary[-train_idx,] # exclude all that are in train_idx
fit <- lm(sl ~ rk + yr, data=train)

Create predictions

pred_test <- predict(fit, test)  # it automatically picks the right columns
sum((pred_test - test$sl)^2) # residual sum of squares
## [1] 105130932
pred_train <- predict(fit, train) 
sum((pred_train - train$sl)^2) 
## [1] 181652324
sqrt(sum((pred_test - test$sl)^2) / length(pred_test)) # root mean squared error
## [1] 2843.764
sqrt(sum((pred_train - train$sl)^2) / length(pred_train))
## [1] 2158.183

Observations: Root mean squared errors are calculated for both sets; as expected, the error is lower on the training set (≈2158) than on the held-out test set (≈2844).

Automate train and test

get_residual_sum_of_squares <- function(prediction, target) {
  sum((prediction - target)^2)
}
rmsd <- function(prediction, target) {
  # root mean squared error
  sqrt(get_residual_sum_of_squares(prediction, target) / length(prediction))
}
build_model <- function(train, test, features, target) {
  # paste the feature names dynamically into a formula, e.g. sl ~ rk+yr
  form <- as.formula(paste(target, "~", paste(features, collapse="+")))
  fit <- lm(form, data=train)
  train_error <- rmsd(predict(fit, train), train[, target])
  test_error  <- rmsd(predict(fit, test), test[, target])
  list("train_error"=train_error, "test_error"=test_error, "model"=fit)
}
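
For example, applying it to the split created earlier (a sketch, not run in the original):

res <- build_model(train, test, c("rk", "yr"), "sl")
c(res$train_error, res$test_error)  # should reproduce the manual RMSE values above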

Observations: In the leave-one-out results below, the test errors are more widely spread than the training errors, but interestingly the mean test error is below the mean training error. This is because each test set holds a single observation, so its "RMSE" is just an absolute residual, while the training RMSE squares before averaging and is dominated by the few large residuals.

Computing the Errors

train_order <- sample(nrow(Salary))
CV <- list()
errors <- NULL
# leave-one-out cross-validation: each observation serves once as the single-row test set
for (t in train_order) {
  CV[[length(CV) + 1]] <- build_model(Salary[-t,], Salary[t,], c("yd", "sx", "rk", "yr", "dg"), "sl")
  errors <- rbind(errors, data.frame("train" = CV[[length(CV)]]$train_error, "test" = CV[[length(CV)]]$test_error))
}
errors_melt <- melt(errors, id.vars=NULL)
ggplot(errors_melt) + geom_boxplot(aes(x=variable, y=value))

train_err_all <- mean(errors$train)
train_err_all
## [1] 2226.789
test_err_all <- mean(errors$test)
test_err_all
## [1] 1803.707