library(tidyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(corrgram)
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library(knitr)
library(heplots)
## Warning: package 'heplots' was built under R version 3.4.2
## Loading required package: car
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
library(GGally)
## Warning: package 'GGally' was built under R version 3.4.2
##
## Attaching package: 'GGally'
## The following object is masked from 'package:dplyr':
##
## nasa
library(reshape)
## Warning: package 'reshape' was built under R version 3.4.2
##
## Attaching package: 'reshape'
## The following object is masked from 'package:dplyr':
##
## rename
## The following objects are masked from 'package:tidyr':
##
## expand, smiths
Dataset
Salary <- read.table("http://data.princeton.edu/wws509/datasets/salary.dat",header=TRUE)
head(Salary)
## sx rk yr dg yd sl
## 1 male full 25 doctorate 35 36350
## 2 male full 13 doctorate 22 35350
## 3 male full 10 doctorate 23 28200
## 4 female full 7 doctorate 27 26775
## 5 male full 19 masters 30 33696
## 6 male full 16 doctorate 21 28516
Observation There are six columns in the data: sx (sex), rk (rank), yr (years in current rank), dg (highest degree), yd (years since degree), and sl (salary), with sl being the dependent variable.
str(Salary)
## 'data.frame': 52 obs. of 6 variables:
## $ sx: Factor w/ 2 levels "female","male": 2 2 2 1 2 2 1 2 2 2 ...
## $ rk: Factor w/ 3 levels "assistant","associate",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ yr: int 25 13 10 7 19 16 0 16 13 13 ...
## $ dg: Factor w/ 2 levels "doctorate","masters": 1 1 1 1 2 1 2 1 2 2 ...
## $ yd: int 35 22 23 27 30 21 32 18 30 31 ...
## $ sl: int 36350 35350 28200 26775 33696 28516 24900 31909 31850 32850 ...
Exploratory Analysis
lapply(Salary, FUN=summary)
## $sx
## female male
## 14 38
##
## $rk
## assistant associate full
## 18 14 20
##
## $yr
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 3.000 7.000 7.481 11.000 25.000
##
## $dg
## doctorate masters
## 34 18
##
## $yd
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 6.75 15.50 16.12 23.25 35.00
##
## $sl
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 15000 18247 23719 23798 27259 38045
Observation The data contains no missing values.
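The summary output above would list NA counts if any were present; an explicit check (an addition, not part of the original output) makes this concrete:
sum(is.na(Salary))  # total count of missing values across all columns; 0 means none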
Functions to detect outliers
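The definition of detect_outliers() is not included in this report; a minimal sketch using Tukey's 1.5 x IQR fences, consistent with the outputs below, might look like this:
detect_outliers <- function(x, k = 1.5) {
  # flag values outside the fences [Q1 - k*IQR, Q3 + k*IQR]
  qs <- quantile(x, probs = c(0.25, 0.75))
  fence <- k * (qs[2] - qs[1])
  x[x < qs[1] - fence | x > qs[2] + fence]
}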
detect_outliers(Salary$yr)
## [1] 25
detect_outliers(Salary$yd)
## integer(0)
detect_outliers(Salary$sl)
## integer(0)
Observation detect_outliers flags a single mild outlier in yr (25 years in rank); yd and sl contain none.
Testing correlation
kable(cor(Salary[,c("yr", "yd", "sl")]), col.names=c("years rank", "years degree", "salary"))
|    | years rank | years degree | salary |
|----|------------|--------------|--------|
| yr | 1.0000000 | 0.6387763 | 0.7006690 |
| yd | 0.6387763 | 1.0000000 | 0.6748542 |
| sl | 0.7006690 | 0.6748542 | 1.0000000 |
Observation The variable yr (years in current rank) is more strongly correlated with salary (0.70) than yd (years since degree, 0.67).
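Note that the table reports only the coefficients; to formally test whether a correlation differs from zero, base R's cor.test() could be used (a sketch, output omitted):
cor.test(Salary$yr, Salary$sl)  # Pearson correlation test of yr against sl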
Let's check the factor variables.
sx_rk <- table(Salary[,c("sx", "rk")])
sx_rk
## rk
## sx assistant associate full
## female 8 2 4
## male 10 12 16
fisher.test(x=as.matrix(sx_rk))
##
## Fisher's Exact Test for Count Data
##
## data: as.matrix(sx_rk)
## p-value = 0.1564
## alternative hypothesis: two.sided
ggplot(Salary) + geom_boxplot(aes(x=sx, y=sl))
ggplot(Salary) + geom_boxplot(aes(x=rk, y=sl))
ggplot(Salary) + geom_boxplot(aes(x=dg, y=sl))
model.aov <- aov(sl ~ sx, data=Salary)
summary(model.aov)
## Df Sum Sq Mean Sq F value Pr(>F)
## sx 1 1.141e+08 114106220 3.413 0.0706 .
## Residuals 50 1.672e+09 33432473
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
etasq(model.aov, partial = FALSE)
## eta^2
## sx 0.06389893
## Residuals NA
model.aov <- aov(sl ~ rk, data=Salary)
summary(model.aov)
## Df Sum Sq Mean Sq F value Pr(>F)
## rk 2 1.347e+09 673391900 75.17 1.17e-15 ***
## Residuals 49 4.389e+08 8958083
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
etasq(model.aov, partial = FALSE)
## eta^2
## rk 0.7541924
## Residuals NA
model.aov <- aov(sl ~ dg, data=Salary)
summary(model.aov)
## Df Sum Sq Mean Sq F value Pr(>F)
## dg 1 8.682e+06 8681649 0.244 0.623
## Residuals 50 1.777e+09 35540964
etasq(model.aov, partial = FALSE)
## eta^2
## dg 0.004861681
## Residuals NA
Observations From the box plots it looks like both sx (sex) and rk (rank) are associated with salary, though the ANOVA finds only rk clearly significant (p < 0.001); sx is marginal (p = 0.07), and the Fisher test above found no significant association between sex and rank (p = 0.16). The eta-squared values are not correlation coefficients, but they give a similar sense of association strength: rk explains about 75% of the variance in salary, sx about 6%, and dg (degree) under 1%, so dg does not look promising as a predictor.
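As a quick sanity check, eta-squared is simply the effect sum of squares divided by the total sum of squares; recomputing it by hand from the rk ANOVA table above:
ss_rk  <- 1.347e+09  # Sum Sq for rk from the ANOVA table
ss_res <- 4.389e+08  # residual Sum Sq
ss_rk / (ss_rk + ss_res)  # ~0.754, matching etasq() above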
**Let's look at some visualizations before building the model**
ggplot(Salary, aes(x = yr, y = sl, col = rk, shape = sx)) + geom_point()
dfrGraph <- gather(Salary, variable, value, -sl)
## Warning: attributes are not identical across measure variables;
## they will be dropped
ggplot(dfrGraph) +
geom_jitter(aes(value,sl, colour=variable)) +
geom_smooth(aes(value,sl, colour=variable), method=lm, se=FALSE) +
facet_wrap(~variable, scales="free_x") +
labs(title="Relation Of Salary With Other Features")
ggpairs(Salary, mapping=aes(col=sx, alpha=0.5), columns=which(colnames(Salary) %in% c("sl", "yd", "yr")))
ggpairs(Salary, mapping=aes(col=rk, alpha=0.5), columns=which(colnames(Salary) %in% c("sl", "yd", "yr")))
Observation From this plot you can see that rank should be very helpful in predicting salary, since the three density curves overlap only slightly.
Let's try fitting a simple linear model with all the variables.
fit1 <- lm(sl ~ sx + rk + yr + dg + yd,Salary)
summary(fit1)
##
## Call:
## lm(formula = sl ~ sx + rk + yr + dg + yd, data = Salary)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4045.2 -1094.7 -361.5 813.2 9193.1
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 16912.42 816.44 20.715 < 2e-16 ***
## sxmale -1166.37 925.57 -1.260 0.214
## rkassociate 5292.36 1145.40 4.621 3.22e-05 ***
## rkfull 11118.76 1351.77 8.225 1.62e-10 ***
## yr 476.31 94.91 5.018 8.65e-06 ***
## dgmasters 1388.61 1018.75 1.363 0.180
## yd -124.57 77.49 -1.608 0.115
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2398 on 45 degrees of freedom
## Multiple R-squared: 0.855, Adjusted R-squared: 0.8357
## F-statistic: 44.24 on 6 and 45 DF, p-value: < 2.2e-16
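The largest residual (9193) is sizable relative to the residual standard error, so before trusting this fit the standard residual diagnostics could be inspected (a sketch; these plots are not part of the report):
par(mfrow = c(2, 2))  # 2x2 grid for the four standard lm diagnostic plots
plot(fit1)            # residuals vs. fitted, normal Q-Q, scale-location, residuals vs. leverage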
Better model using step()
# see ?step for details on stepwise model selection
stpModel=step(lm(data=Salary, sl~.), trace=0, steps=1000)
stpSummary <- summary(stpModel)
stpSummary
##
## Call:
## lm(formula = sl ~ rk + yr, data = Salary)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3462.0 -1302.8 -299.2 783.5 9381.6
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 16203.27 638.68 25.370 < 2e-16 ***
## rkassociate 4262.28 882.89 4.828 1.45e-05 ***
## rkfull 9454.52 905.83 10.437 6.12e-14 ***
## yr 375.70 70.92 5.298 2.90e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2402 on 48 degrees of freedom
## Multiple R-squared: 0.8449, Adjusted R-squared: 0.8352
## F-statistic: 87.15 on 3 and 48 DF, p-value: < 2.2e-16
Observation
The F statistic has roughly doubled (44.2 to 87.2) while the adjusted R-squared is essentially unchanged (0.8357 vs. 0.8352), so step() dropped sx, dg, and yd at almost no cost, settling on sl ~ rk + yr.
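To inspect the path the search took, step() stores it on the returned model:
stpModel$anova  # the stepwise search path: the term dropped at each step and the resulting AIC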
Train and Test
set.seed(65848)
train_idx <- sample(1:nrow(Salary), round(nrow(Salary) * 0.75))
train <- Salary[train_idx,]
test <- Salary[-train_idx,] # exclude all that are in train_idx
fit <- lm(sl ~ rk + yr, data=train)
Create predictions
pred_test <- predict(fit, test) # it automatically picks the right columns
sum((pred_test - test$sl)^2) # residual sum of squares
## [1] 105130932
pred_train <- predict(fit, train)
sum((pred_train - train$sl)^2)
## [1] 181652324
sqrt(sum((pred_test - test$sl)^2) / length(pred_test)) # root mean squared error
## [1] 2843.764
sqrt(sum((pred_train - train$sl)^2) / length(pred_train))
## [1] 2158.183
Observations The root mean squared errors are about 2844 for the test set and 2158 for the training set; as expected, the model's error is lower on the data it was fit to.
Automate train and test
get_residual_sum_of_squares <- function(prediction, target) {
return (sum((prediction-target)^2))
}
rmsd <- function(prediction, target) {
return(sqrt(get_residual_sum_of_squares(prediction, target)/length(prediction)))
}
build_model <- function (train, test, features, target) {
form <- as.formula(paste(target, "~",paste(features,collapse="+"))) # this is a way to paste feature names dynamically into a formula
fit <- lm(form, data = train)
train_error <- rmsd(predict(fit, train), train[,target])
test_error <- rmsd(predict(fit, test), test[,target])
return(list("train_error"=train_error, "test_error"=test_error, "model"=fit))
}
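For example, the step-selected model can be refit with these helpers on the earlier train/test split; the returned errors should match the RMSEs computed manually above:
m <- build_model(train, test, c("rk", "yr"), "sl")
m$train_error
m$test_error
As a design note, building the formula with paste() works, but reformulate(features, response = target) is the more idiomatic base R helper for the same job.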
Observations (on the cross-validation results below) The test errors are more widely spread than the training errors, but interestingly the mean test error falls below the mean training error. With leave-one-out splits each test error is just the absolute error of a single held-out point, and a mean of absolute errors is generally smaller than a root mean squared error, which weights large residuals more heavily.
Compute errors with leave-one-out cross-validation
train_order <- sample(nrow(Salary))  # every row index in random order: leave-one-out CV
CV <- list()
errors <- NULL
for (t in train_order) {
  # hold out row t as the test set and fit on the remaining rows
  CV[[length(CV) + 1]] <- build_model(Salary[-t,], Salary[t,], c("yd", "sx", "rk", "yr", "dg"), "sl")
  errors <- rbind(errors, data.frame("train" = CV[[length(CV)]]$train_error, "test" = CV[[length(CV)]]$test_error))
}
errors_melt <- melt(errors, id.vars=NULL)
ggplot(errors_melt) + geom_boxplot(aes(x=variable, y=value))
train_err_all <- mean(errors$train)
train_err_all
## [1] 2226.789
test_err_all <- mean(errors$test)
test_err_all
## [1] 1803.707
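A natural follow-up (a sketch, not run in this report) is to repeat the same leave-one-out loop with only the step-selected features and compare the mean errors:
errors_step <- NULL
for (t in seq_len(nrow(Salary))) {
  # same leave-one-out scheme, but only the features chosen by step()
  m <- build_model(Salary[-t, ], Salary[t, ], c("rk", "yr"), "sl")
  errors_step <- rbind(errors_step, data.frame("train" = m$train_error, "test" = m$test_error))
}
mean(errors_step$test)  # compare against test_err_all from the full feature set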