#Loading libraries
library(googleVis)
library(ggplot2)
library(caret)
library(gbm)
library(MASS)
# Point the session at the folder that holds the project CSV files.
# NOTE(review): this path is machine-specific; later reads and the final
# write.csv() rely on it, so adjust it before running on another machine.
setwd("G:\\Downloads\\New folder")
# Read the training data, keeping text columns as character vectors
R_train <- read.csv(file = "hr_train.csv", stringsAsFactors = FALSE)
# Show column types and the first few values of each column
str(R_train)
'data.frame': 10499 obs. of 10 variables:
$ satisfaction_level : num 0.42 0.66 0.55 0.22 0.2 0.83 0.87 0.85 0.89 0.45 ...
$ last_evaluation : num 0.46 0.77 0.49 0.88 0.72 0.84 0.49 0.99 0.92 0.56 ...
$ number_project : int 2 2 5 4 6 4 2 3 5 2 ...
$ average_montly_hours : int 150 171 240 213 224 206 251 208 237 154 ...
$ time_spend_company : int 3 2 3 3 4 2 3 2 5 3 ...
$ Work_accident : int 0 0 0 1 0 0 0 0 0 0 ...
$ left : int 1 0 0 0 1 0 0 0 0 1 ...
$ promotion_last_5years: int 0 0 0 0 0 0 0 0 0 0 ...
$ sales : chr "sales" "technical" "technical" "technical" ...
$ salary : chr "medium" "medium" "high" "medium" ...
# Report, per column, whether the training data contains any missing value.
# vapply() visits each column directly, so the data frame is not coerced to a
# character matrix the way apply(df, 2, ...) does; the result is still a named
# logical vector with one entry per column.
vapply(R_train, anyNA, logical(1))
satisfaction_level last_evaluation number_project
FALSE FALSE FALSE
average_montly_hours time_spend_company Work_accident
FALSE FALSE FALSE
left promotion_last_5years sales
FALSE FALSE FALSE
salary
FALSE
# Read the test data (same layout as the training data but without `left`),
# again keeping text columns as character vectors, and inspect it
R_test <- read.csv(file = "hr_test.csv", stringsAsFactors = FALSE)
str(R_test)
'data.frame': 4500 obs. of 9 variables:
$ satisfaction_level : num 0.38 0.8 0.1 0.45 0.11 0.41 0.38 0.45 0.4 0.4 ...
$ last_evaluation : num 0.53 0.86 0.77 0.54 0.81 0.55 0.54 0.47 0.53 0.49 ...
$ number_project : int 2 5 6 2 6 2 2 2 2 2 ...
$ average_montly_hours : int 157 262 247 135 305 148 143 160 158 135 ...
$ time_spend_company : int 3 6 4 3 4 3 3 3 3 3 ...
$ Work_accident : int 0 0 0 0 0 0 0 0 0 0 ...
$ promotion_last_5years: int 0 0 0 0 0 0 0 0 0 0 ...
$ sales : chr "sales" "sales" "sales" "sales" ...
$ salary : chr "low" "medium" "low" "low" ...
# Add a placeholder `left` column (all zeros) so the test data has the same
# columns as the training data before the two are stacked. The length is taken
# from the data rather than hard-coded, so a different test file still works.
R_test$left <- rep(0, nrow(R_test))
# Inspect the test data with the extra column
str(R_test)
'data.frame': 4500 obs. of 10 variables:
$ satisfaction_level : num 0.38 0.8 0.1 0.45 0.11 0.41 0.38 0.45 0.4 0.4 ...
$ last_evaluation : num 0.53 0.86 0.77 0.54 0.81 0.55 0.54 0.47 0.53 0.49 ...
$ number_project : int 2 5 6 2 6 2 2 2 2 2 ...
$ average_montly_hours : int 157 262 247 135 305 148 143 160 158 135 ...
$ time_spend_company : int 3 6 4 3 4 3 3 3 3 3 ...
$ Work_accident : int 0 0 0 0 0 0 0 0 0 0 ...
$ promotion_last_5years: int 0 0 0 0 0 0 0 0 0 0 ...
$ sales : chr "sales" "sales" "sales" "sales" ...
$ salary : chr "low" "medium" "low" "low" ...
$ left : num 0 0 0 0 0 0 0 0 0 0 ...
# Report, per column, whether the test data contains any missing value.
# vapply() visits each column directly instead of coercing the data frame to a
# character matrix as apply(df, 2, ...) would; the result is a named logical
# vector with one entry per column.
vapply(R_test, anyNA, logical(1))
satisfaction_level last_evaluation number_project
FALSE FALSE FALSE
average_montly_hours time_spend_company Work_accident
FALSE FALSE FALSE
promotion_last_5years sales salary
FALSE FALSE FALSE
left
FALSE
# dplyr supplies bind_rows() here and mutate()/select()/%>% further down
library(dplyr)
# Stack training rows on top of test rows so that the categorical recoding
# below is applied identically to both
R <- bind_rows(R_train, R_test)
str(R)
'data.frame': 14999 obs. of 10 variables:
$ satisfaction_level : num 0.42 0.66 0.55 0.22 0.2 0.83 0.87 0.85 0.89 0.45 ...
$ last_evaluation : num 0.46 0.77 0.49 0.88 0.72 0.84 0.49 0.99 0.92 0.56 ...
$ number_project : int 2 2 5 4 6 4 2 3 5 2 ...
$ average_montly_hours : int 150 171 240 213 224 206 251 208 237 154 ...
$ time_spend_company : int 3 2 3 3 4 2 3 2 5 3 ...
$ Work_accident : int 0 0 0 1 0 0 0 0 0 0 ...
$ left : num 1 0 0 0 1 0 0 0 0 1 ...
$ promotion_last_5years: int 0 0 0 0 0 0 0 0 0 0 ...
$ sales : chr "sales" "technical" "technical" "technical" ...
$ salary : chr "medium" "medium" "high" "medium" ...
# Report, per column, whether the combined data contains any missing value.
# vapply() visits each column directly instead of coercing the data frame to a
# character matrix as apply(df, 2, ...) would; the result is a named logical
# vector with one entry per column.
vapply(R, anyNA, logical(1))
satisfaction_level last_evaluation number_project
FALSE FALSE FALSE
average_montly_hours time_spend_company Work_accident
FALSE FALSE FALSE
left promotion_last_5years sales
FALSE FALSE FALSE
salary
FALSE
# dplyr is needed for the recoding below (attaching it again is harmless)
library(dplyr)
# Number of rows in each salary band
table(R[["salary"]])
high low medium
1237 7316 6446
# Row-wise proportions of left = 0 / left = 1 within each salary band, rounded
# to 2 decimal places. Only the training rows are used: in the combined data
# the test rows carry a placeholder left = 0, which would push every band's
# proportion towards 0.
round(prop.table(table(R_train$salary, R_train$left), 1), 2)
0 1
high 0.90 0.10
low 0.76 0.24
medium 0.81 0.19
# Turn salary into 0/1 indicator columns and drop the text column.
# Figures are the left == 0 proportions shown in the table above:
#   sal_1 = 1 for "high"   (0.90)
#   sal_2 = 1 for "medium" (0.81)
# "low" gets no indicator of its own, so it is the baseline (both columns 0).
R <- mutate(
  R,
  sal_1 = as.numeric(salary %in% "high"),
  sal_2 = as.numeric(salary %in% "medium")
)
R <- dplyr::select(R, -salary)
# Number of rows in each department (the column is named `sales` in the data)
table(R[["sales"]])
accounting hr IT management marketing product_mng
767 739 1227 630 858 902
RandD sales support technical
787 4140 2229 2720
# Row-wise proportions of left = 0 / left = 1 within each department, rounded
# to 2 decimal places. Only the training rows are used: in the combined data
# the test rows carry a placeholder left = 0, which would push every
# department's proportion towards 0.
round(prop.table(table(R_train$sales, R_train$left), 1), 2)
0 1
accounting 0.76 0.24
hr 0.76 0.24
IT 0.81 0.19
management 0.84 0.16
marketing 0.77 0.23
product_mng 0.80 0.20
RandD 0.83 0.17
sales 0.79 0.21
support 0.80 0.20
technical 0.79 0.21
# Turn the department column (`sales`) into 0/1 indicator columns, grouping
# departments whose left == 0 proportions in the table above are close:
#   s_1 : management            (0.84)
#   s_2 : RandD                 (0.83)
#   s_3 : IT                    (0.81)
#   s_4 : product_mng, support  (0.80)
#   s_5 : sales, technical      (0.79)
#   s_6 : accounting, hr        (0.76)
# "marketing" gets no indicator of its own, so it is the baseline (all zero).
R <- mutate(
  R,
  s_1 = as.numeric(sales %in% "management"),
  s_2 = as.numeric(sales %in% "RandD"),
  s_3 = as.numeric(sales %in% "IT"),
  s_4 = as.numeric(sales %in% c("product_mng", "support")),
  s_5 = as.numeric(sales %in% c("sales", "technical")),
  s_6 = as.numeric(sales %in% c("accounting", "hr"))
)
R <- dplyr::select(R, -sales)
# Inspect the fully recoded data set
str(R)
'data.frame': 14999 obs. of 16 variables:
$ satisfaction_level : num 0.42 0.66 0.55 0.22 0.2 0.83 0.87 0.85 0.89 0.45 ...
$ last_evaluation : num 0.46 0.77 0.49 0.88 0.72 0.84 0.49 0.99 0.92 0.56 ...
$ number_project : int 2 2 5 4 6 4 2 3 5 2 ...
$ average_montly_hours : int 150 171 240 213 224 206 251 208 237 154 ...
$ time_spend_company : int 3 2 3 3 4 2 3 2 5 3 ...
$ Work_accident : int 0 0 0 1 0 0 0 0 0 0 ...
$ left : num 1 0 0 0 1 0 0 0 0 1 ...
$ promotion_last_5years: int 0 0 0 0 0 0 0 0 0 0 ...
$ sal_1 : num 0 0 1 0 0 0 0 0 0 0 ...
$ sal_2 : num 1 1 0 1 1 1 1 0 1 0 ...
$ s_1 : num 0 0 0 0 0 0 0 0 0 0 ...
$ s_2 : num 0 0 0 0 0 0 0 0 0 0 ...
$ s_3 : num 0 0 0 0 0 0 0 0 0 0 ...
$ s_4 : num 0 0 0 0 0 0 0 1 0 0 ...
$ s_5 : num 1 1 1 1 1 1 1 0 1 0 ...
$ s_6 : num 0 0 0 0 0 0 0 0 0 0 ...
# Split the recoded data back into training and test parts.
# bind_rows() put the training rows first, so the first nrow(R_train) rows of
# R are the training rows and the remainder are the test rows. The boundary is
# derived from the data instead of being hard-coded.
n_train <- nrow(R_train)
stopifnot(n_train > 0, n_train < nrow(R))
R_train <- R[seq_len(n_train), ]
R_test <- R[(n_train + 1):nrow(R), ]
# The placeholder `left` column was only needed for stacking; remove it again
R_test$left <- NULL
# Inspect the training data after the split
str(R_train)
'data.frame': 10499 obs. of 16 variables:
$ satisfaction_level : num 0.42 0.66 0.55 0.22 0.2 0.83 0.87 0.85 0.89 0.45 ...
$ last_evaluation : num 0.46 0.77 0.49 0.88 0.72 0.84 0.49 0.99 0.92 0.56 ...
$ number_project : int 2 2 5 4 6 4 2 3 5 2 ...
$ average_montly_hours : int 150 171 240 213 224 206 251 208 237 154 ...
$ time_spend_company : int 3 2 3 3 4 2 3 2 5 3 ...
$ Work_accident : int 0 0 0 1 0 0 0 0 0 0 ...
$ left : num 1 0 0 0 1 0 0 0 0 1 ...
$ promotion_last_5years: int 0 0 0 0 0 0 0 0 0 0 ...
$ sal_1 : num 0 0 1 0 0 0 0 0 0 0 ...
$ sal_2 : num 1 1 0 1 1 1 1 0 1 0 ...
$ s_1 : num 0 0 0 0 0 0 0 0 0 0 ...
$ s_2 : num 0 0 0 0 0 0 0 0 0 0 ...
$ s_3 : num 0 0 0 0 0 0 0 0 0 0 ...
$ s_4 : num 0 0 0 0 0 0 0 1 0 0 ...
$ s_5 : num 1 1 1 1 1 1 1 0 1 0 ...
$ s_6 : num 0 0 0 0 0 0 0 0 0 0 ...
# Inspect the test data after the split (it no longer has a `left` column)
str(R_test)
'data.frame': 4500 obs. of 15 variables:
$ satisfaction_level : num 0.38 0.8 0.1 0.45 0.11 0.41 0.38 0.45 0.4 0.4 ...
$ last_evaluation : num 0.53 0.86 0.77 0.54 0.81 0.55 0.54 0.47 0.53 0.49 ...
$ number_project : int 2 5 6 2 6 2 2 2 2 2 ...
$ average_montly_hours : int 157 262 247 135 305 148 143 160 158 135 ...
$ time_spend_company : int 3 6 4 3 4 3 3 3 3 3 ...
$ Work_accident : int 0 0 0 0 0 0 0 0 0 0 ...
$ promotion_last_5years: int 0 0 0 0 0 0 0 0 0 0 ...
$ sal_1 : num 0 0 0 0 0 0 0 0 0 0 ...
$ sal_2 : num 0 1 0 0 0 0 0 0 0 0 ...
$ s_1 : num 0 0 0 0 0 0 0 0 0 0 ...
$ s_2 : num 0 0 0 0 0 0 0 0 0 0 ...
$ s_3 : num 0 0 0 0 0 0 0 0 0 0 ...
$ s_4 : num 0 0 0 0 0 0 0 0 0 0 ...
$ s_5 : num 1 1 1 1 1 1 1 1 1 1 ...
$ s_6 : num 0 0 0 0 0 0 0 0 0 0 ...
# Split the training data in two: the first 8000 rows are used to fit models,
# the remaining rows are held out to check predictions.
# NOTE(review): this is a positional split; it assumes the rows of the CSV are
# not ordered in a way that relates to `left` - confirm.
n_fit <- 8000
stopifnot(nrow(R_train) > n_fit)
R_train_1 <- R_train[seq_len(n_fit), ]
R_train_2 <- R_train[(n_fit + 1):nrow(R_train), ]
# Inspect the model-fitting part
str(R_train_1)
'data.frame': 8000 obs. of 16 variables:
$ satisfaction_level : num 0.42 0.66 0.55 0.22 0.2 0.83 0.87 0.85 0.89 0.45 ...
$ last_evaluation : num 0.46 0.77 0.49 0.88 0.72 0.84 0.49 0.99 0.92 0.56 ...
$ number_project : int 2 2 5 4 6 4 2 3 5 2 ...
$ average_montly_hours : int 150 171 240 213 224 206 251 208 237 154 ...
$ time_spend_company : int 3 2 3 3 4 2 3 2 5 3 ...
$ Work_accident : int 0 0 0 1 0 0 0 0 0 0 ...
$ left : num 1 0 0 0 1 0 0 0 0 1 ...
$ promotion_last_5years: int 0 0 0 0 0 0 0 0 0 0 ...
$ sal_1 : num 0 0 1 0 0 0 0 0 0 0 ...
$ sal_2 : num 1 1 0 1 1 1 1 0 1 0 ...
$ s_1 : num 0 0 0 0 0 0 0 0 0 0 ...
$ s_2 : num 0 0 0 0 0 0 0 0 0 0 ...
$ s_3 : num 0 0 0 0 0 0 0 0 0 0 ...
$ s_4 : num 0 0 0 0 0 0 0 1 0 0 ...
$ s_5 : num 1 1 1 1 1 1 1 0 1 0 ...
$ s_6 : num 0 0 0 0 0 0 0 0 0 0 ...
# Inspect the held-out part of the training data
str(R_train_2)
'data.frame': 2499 obs. of 16 variables:
$ satisfaction_level : num 0.94 0.32 0.99 0.69 0.57 0.56 0.54 0.88 0.78 0.76 ...
$ last_evaluation : num 0.57 0.8 0.77 0.72 0.78 0.79 0.55 0.67 0.59 0.93 ...
$ number_project : int 4 3 5 4 3 5 4 5 5 3 ...
$ average_montly_hours : int 251 263 222 210 206 248 252 141 236 271 ...
$ time_spend_company : int 2 3 2 2 4 3 3 2 3 5 ...
$ Work_accident : int 1 0 0 1 0 0 0 0 0 0 ...
$ left : num 0 0 0 0 0 1 0 0 0 0 ...
$ promotion_last_5years: int 0 0 0 0 0 0 0 0 0 0 ...
$ sal_1 : num 0 0 0 1 0 0 0 0 1 0 ...
$ sal_2 : num 1 1 1 0 1 1 1 1 0 0 ...
$ s_1 : num 0 0 0 0 0 0 0 0 0 0 ...
$ s_2 : num 0 0 0 0 1 0 1 0 0 0 ...
$ s_3 : num 0 0 0 0 0 0 0 0 0 0 ...
$ s_4 : num 0 0 0 0 0 0 0 1 0 0 ...
$ s_5 : num 1 0 0 1 0 1 0 0 1 0 ...
$ s_6 : num 0 0 1 0 0 0 0 0 0 1 ...
# Fit a linear model of `left` on every other column of the fitting data and
# check the predictors for multicollinearity.
# car supplies vif(); the original code loaded car but never called it, so the
# variance inflation factors are now actually computed.
library(car)
R_fit <- lm(formula = left ~ ., data = R_train_1)
vif(R_fit)
# Coefficient table used to decide which predictors to drop
summary(R_fit)
Call:
lm(formula = left ~ ., data = R_train_1)
Residuals:
Min 1Q Median 3Q Max
-0.7843 -0.3004 -0.1692 0.4305 1.0704
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.4997786 0.0344202 14.520 < 2e-16 ***
satisfaction_level -0.5205802 0.0198631 -26.208 < 2e-16 ***
last_evaluation 0.0949358 0.0310277 3.060 0.00222 **
number_project -0.0323241 0.0045314 -7.133 1.07e-12 ***
average_montly_hours 0.0006084 0.0001074 5.664 1.53e-08 ***
time_spend_company 0.0348142 0.0033825 10.292 < 2e-16 ***
Work_accident -0.1005581 0.0136176 -7.384 1.69e-13 ***
promotion_last_5years -0.0495646 0.0334804 -1.480 0.13880
sal_1 -0.1786174 0.0183777 -9.719 < 2e-16 ***
sal_2 -0.0705828 0.0099914 -7.064 1.75e-12 ***
s_1 -0.0542933 0.0312387 -1.738 0.08225 .
s_2 -0.0302769 0.0284416 -1.065 0.28712
s_3 -0.0419180 0.0258330 -1.623 0.10470
s_4 -0.0240547 0.0220289 -1.092 0.27488
s_5 -0.0131796 0.0206357 -0.639 0.52305
s_6 0.0076130 0.0245561 0.310 0.75655
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.4252 on 7984 degrees of freedom
Multiple R-squared: 0.1253, Adjusted R-squared: 0.1237
F-statistic: 76.26 on 15 and 7984 DF, p-value: < 2.2e-16
# The author's note says the VIF values are fine; from here predictors are
# removed one at a time by p-value. Step 1: s_6 has the largest p-value in the
# full model (0.75655), so refit without it.
R_fit <- lm(formula = left ~ . - s_6, data = R_train_1)
summary(R_fit)
Call:
lm(formula = left ~ . - s_6, data = R_train_1)
Residuals:
Min 1Q Median 3Q Max
-0.7843 -0.3003 -0.1688 0.4309 1.0725
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.5046686 0.0305918 16.497 < 2e-16 ***
satisfaction_level -0.5207236 0.0198566 -26.224 < 2e-16 ***
last_evaluation 0.0948226 0.0310238 3.056 0.00225 **
number_project -0.0323010 0.0045306 -7.130 1.09e-12 ***
average_montly_hours 0.0006085 0.0001074 5.665 1.52e-08 ***
time_spend_company 0.0347879 0.0033812 10.289 < 2e-16 ***
Work_accident -0.1006462 0.0136139 -7.393 1.58e-13 ***
promotion_last_5years -0.0498923 0.0334618 -1.491 0.13600
sal_1 -0.1786012 0.0183766 -9.719 < 2e-16 ***
sal_2 -0.0705249 0.0099891 -7.060 1.80e-12 ***
s_1 -0.0589871 0.0273231 -2.159 0.03089 *
s_2 -0.0350225 0.0239699 -1.461 0.14403
s_3 -0.0466763 0.0207783 -2.246 0.02471 *
s_4 -0.0288088 0.0158144 -1.822 0.06854 .
s_5 -0.0179328 0.0138112 -1.298 0.19418
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.4252 on 7985 degrees of freedom
Multiple R-squared: 0.1253, Adjusted R-squared: 0.1238
F-statistic: 81.71 on 14 and 7985 DF, p-value: < 2.2e-16
# Step 2: s_5 had the largest p-value in the previous fit (0.19418), so refit
# without s_6 and s_5.
R_fit <- lm(formula = left ~ . - s_6 - s_5, data = R_train_1)
summary(R_fit)
Call:
lm(formula = left ~ . - s_6 - s_5, data = R_train_1)
Residuals:
Min 1Q Median 3Q Max
-0.7892 -0.3002 -0.1680 0.4316 1.0861
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.4920425 0.0290064 16.963 < 2e-16 ***
satisfaction_level -0.5216218 0.0198454 -26.284 < 2e-16 ***
last_evaluation 0.0956468 0.0310187 3.084 0.00205 **
number_project -0.0325078 0.0045280 -7.179 7.63e-13 ***
average_montly_hours 0.0006088 0.0001074 5.667 1.50e-08 ***
time_spend_company 0.0347575 0.0033813 10.279 < 2e-16 ***
Work_accident -0.1007695 0.0136142 -7.402 1.48e-13 ***
promotion_last_5years -0.0489001 0.0334545 -1.462 0.14387
sal_1 -0.1783554 0.0183764 -9.706 < 2e-16 ***
sal_2 -0.0702594 0.0099874 -7.035 2.16e-12 ***
s_1 -0.0457954 0.0253651 -1.805 0.07104 .
s_2 -0.0217195 0.0216709 -1.002 0.31626
s_3 -0.0333445 0.0180649 -1.846 0.06496 .
s_4 -0.0154901 0.0120371 -1.287 0.19818
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.4252 on 7986 degrees of freedom
Multiple R-squared: 0.1251, Adjusted R-squared: 0.1237
F-statistic: 87.86 on 13 and 7986 DF, p-value: < 2.2e-16
# Step 3: s_2 had the largest p-value in the previous fit (0.31626), so refit
# without s_6, s_5 and s_2.
R_fit <- lm(formula = left ~ . - s_6 - s_5 - s_2, data = R_train_1)
summary(R_fit)
Call:
lm(formula = left ~ . - s_6 - s_5 - s_2, data = R_train_1)
Residuals:
Min 1Q Median 3Q Max
-0.7878 -0.3005 -0.1682 0.4317 1.0878
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.4904305 0.0289618 16.934 < 2e-16 ***
satisfaction_level -0.5218547 0.0198440 -26.298 < 2e-16 ***
last_evaluation 0.0957317 0.0310186 3.086 0.00203 **
number_project -0.0325580 0.0045277 -7.191 7.02e-13 ***
average_montly_hours 0.0006095 0.0001074 5.675 1.44e-08 ***
time_spend_company 0.0347764 0.0033812 10.285 < 2e-16 ***
Work_accident -0.1010494 0.0136113 -7.424 1.25e-13 ***
promotion_last_5years -0.0495417 0.0334484 -1.481 0.13861
sal_1 -0.1781509 0.0183753 -9.695 < 2e-16 ***
sal_2 -0.0702307 0.0099873 -7.032 2.20e-12 ***
s_1 -0.0441102 0.0253093 -1.743 0.08140 .
s_3 -0.0316663 0.0179872 -1.760 0.07836 .
s_4 -0.0138032 0.0119189 -1.158 0.24686
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.4252 on 7987 degrees of freedom
Multiple R-squared: 0.125, Adjusted R-squared: 0.1237
F-statistic: 95.09 on 12 and 7987 DF, p-value: < 2.2e-16
# Step 4: s_4 had the largest p-value in the previous fit (0.24686), so refit
# without s_6, s_5, s_2 and s_4.
R_fit <- lm(formula = left ~ . - s_6 - s_5 - s_2 - s_4, data = R_train_1)
summary(R_fit)
Call:
lm(formula = left ~ . - s_6 - s_5 - s_2 - s_4, data = R_train_1)
Residuals:
Min 1Q Median 3Q Max
-0.7852 -0.3002 -0.1682 0.4304 1.0912
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.4879266 0.0288816 16.894 < 2e-16 ***
satisfaction_level -0.5224802 0.0198371 -26.339 < 2e-16 ***
last_evaluation 0.0949758 0.0310124 3.063 0.0022 **
number_project -0.0325465 0.0045278 -7.188 7.16e-13 ***
average_montly_hours 0.0006090 0.0001074 5.670 1.48e-08 ***
time_spend_company 0.0348224 0.0033811 10.299 < 2e-16 ***
Work_accident -0.1012144 0.0136108 -7.436 1.14e-13 ***
promotion_last_5years -0.0478940 0.0334188 -1.433 0.1519
sal_1 -0.1779939 0.0183752 -9.687 < 2e-16 ***
sal_2 -0.0701696 0.0099874 -7.026 2.30e-12 ***
s_1 -0.0410768 0.0251739 -1.632 0.1028
s_3 -0.0283607 0.0177596 -1.597 0.1103
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.4252 on 7988 degrees of freedom
Multiple R-squared: 0.1249, Adjusted R-squared: 0.1237
F-statistic: 103.6 on 11 and 7988 DF, p-value: < 2.2e-16
# Step 5: promotion_last_5years had the largest p-value in the previous fit
# (0.1519), so it is removed in addition to s_6, s_5, s_2 and s_4.
R_fit <- lm(
  formula = left ~ . - s_6 - s_5 - s_2 - s_4 - promotion_last_5years,
  data = R_train_1
)
summary(R_fit)
Call:
lm(formula = left ~ . - s_6 - s_5 - s_2 - s_4 - promotion_last_5years,
data = R_train_1)
Residuals:
Min 1Q Median 3Q Max
-0.7827 -0.3003 -0.1683 0.4314 1.0933
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.4883854 0.0288817 16.910 < 2e-16 ***
satisfaction_level -0.5231718 0.0198325 -26.380 < 2e-16 ***
last_evaluation 0.0953436 0.0310133 3.074 0.00212 **
number_project -0.0324676 0.0045277 -7.171 8.12e-13 ***
average_montly_hours 0.0006089 0.0001074 5.669 1.49e-08 ***
time_spend_company 0.0345583 0.0033763 10.236 < 2e-16 ***
Work_accident -0.1019057 0.0136032 -7.491 7.55e-14 ***
sal_1 -0.1797026 0.0183377 -9.800 < 2e-16 ***
sal_2 -0.0709060 0.0099749 -7.108 1.27e-12 ***
s_1 -0.0453968 0.0249945 -1.816 0.06937 .
s_3 -0.0276537 0.0177539 -1.558 0.11937
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.4253 on 7989 degrees of freedom
Multiple R-squared: 0.1246, Adjusted R-squared: 0.1235
F-statistic: 113.8 on 10 and 7989 DF, p-value: < 2.2e-16
# Step 6: s_3 had the largest p-value in the previous fit (0.11937), so it is
# removed in addition to s_6, s_5, s_2, s_4 and promotion_last_5years.
# This is the last elimination step; the remaining predictors are the ones
# used for the random forest.
R_fit <- lm(
  formula = left ~ . - s_6 - s_5 - s_2 - s_4 - promotion_last_5years - s_3,
  data = R_train_1
)
summary(R_fit)
Call:
lm(formula = left ~ . - s_6 - s_5 - s_2 - s_4 - promotion_last_5years -
s_3, data = R_train_1)
Residuals:
Min 1Q Median 3Q Max
-0.7802 -0.3002 -0.1688 0.4325 1.0955
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.4864201 0.0288567 16.856 < 2e-16 ***
satisfaction_level -0.5235490 0.0198328 -26.398 < 2e-16 ***
last_evaluation 0.0957698 0.0310149 3.088 0.00202 **
number_project -0.0325445 0.0045279 -7.188 7.19e-13 ***
average_montly_hours 0.0006089 0.0001074 5.668 1.50e-08 ***
time_spend_company 0.0345160 0.0033765 10.223 < 2e-16 ***
Work_accident -0.1016023 0.0136030 -7.469 8.93e-14 ***
sal_1 -0.1794182 0.0183384 -9.784 < 2e-16 ***
sal_2 -0.0708363 0.0099756 -7.101 1.35e-12 ***
s_1 -0.0431841 0.0249563 -1.730 0.08360 .
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.4253 on 7990 degrees of freedom
Multiple R-squared: 0.1244, Adjusted R-squared: 0.1234
F-statistic: 126.1 on 9 and 7990 DF, p-value: < 2.2e-16
# Fit a random forest on the fitting data, using the predictors that survived
# the backward elimination above.
library(randomForest)
# Random forests draw bootstrap samples and candidate variables at random;
# fixing the seed makes the model, and the submission built from it,
# reproducible from run to run.
set.seed(2017)
# `left` is numeric, so randomForest() fits a regression forest (it warns
# about the response having five or fewer unique values). Its predictions are
# continuous scores, not 0/1 labels.
mod_RF <- randomForest(
  left ~ . - s_6 - s_5 - s_2 - s_4 - promotion_last_5years - s_3,
  data = R_train_1
)
The response has five or fewer unique values. Are you sure you want to do regression?
# Print the fitted forest: forest type, number of trees, variables tried per
# split and the error summary
print(mod_RF)
Call:
randomForest(formula = left ~ . - s_6 - s_5 - s_2 - s_4 - promotion_last_5years - s_3, data = R_train_1)
Type of random forest: regression
Number of trees: 500
No. of variables tried at each split: 3
Mean of squared residuals: 0.1063211
% Var explained: 48.47
# Score the held-out training rows with the fitted forest.
# type = "response" is the documented name for predicted values; the legacy
# "class" spelling is converted to it internally.
pred_RF <- predict(mod_RF, newdata = R_train_2, type = "response")
# Check accuracy on the held-out rows: a score above 0.5 counts as a predicted
# left = 1 and is compared with the known outcome. The original code stated
# this intent but never computed a metric.
holdout_accuracy <- mean(as.numeric(pred_RF > 0.5) == R_train_2$left)
print(holdout_accuracy)
# Look at the first few predicted scores
str(pred_RF)
Named num [1:2499] 0.2443 0.1787 0.1364 0.0378 0.1007 ...
- attr(*, "names")= chr [1:2499] "8001" "8002" "8003" "8004" ...
# Score the test rows with the fitted forest.
# NOTE(review): mod_RF is a regression forest, so the result is a numeric
# score per row rather than a 0/1 label - confirm that is what the submission
# expects.
pred_RF_new <- predict(mod_RF, newdata = R_test, type = "class")
# Look at the first few predicted scores
str(pred_RF_new)
Named num [1:4500] 0.903 0.618 0.978 0.959 0.949 ...
- attr(*, "names")= chr [1:4500] "10500" "10501" "10502" "10503" ...
# Store the predicted scores in the test data as `left`, the column that has
# to be delivered
R_test$left <- pred_RF_new
# Check the new column
str(R_test$left)
num [1:4500] 0.903 0.618 0.978 0.959 0.949 ...
# Build the submission: a single `left` column holding the predicted scores
submit <- data.frame(left = R_test$left)
# Write it to the working directory without a row-name column
write.csv(x = submit, file = "Atul_Kumar_P4_part2.csv", row.names = FALSE)
LS0tDQp0aXRsZTogIkhSIFByb2plY3QgTm90ZWJvb2siDQpvdXRwdXQ6DQogIGh0bWxfbm90ZWJvb2s6IGRlZmF1bHQNCiAgcGRmX2RvY3VtZW50OiBkZWZhdWx0DQogIGh0bWxfZG9jdW1lbnQ6DQogICAgZGZfcHJpbnQ6IHBhZ2VkDQotLS0NCiANCg0KYGBge3J9DQojTG9hZGluZyBsaWJyYXJpZXMgDQpsaWJyYXJ5KGdvb2dsZVZpcykNCmxpYnJhcnkoZ2dwbG90MikNCmxpYnJhcnkoY2FyZXQpDQpsaWJyYXJ5KGdibSkNCmxpYnJhcnkoTUFTUykNCg0KI1NldHRpbmcgd29ya2luZyBkaXJlY3RvcnkNCnNldHdkKCJHOlxcRG93bmxvYWRzXFxOZXcgZm9sZGVyIikjc2V0dGluZyB3b3JraW5nIGRpcmVjdG9yeQ0KDQojTG9hZGluZyB0cmFpbiBkYXRhc2V0DQpSX3RyYWluID0gcmVhZC5jc3YoImhyX3RyYWluLmNzdiIsICAgc3RyaW5nc0FzRmFjdG9ycyA9IEZBTFNFKQ0KDQojTG9va2luZyBhdCB0aGUgZGF0YSBzZXQNCnN0cihSX3RyYWluKQ0KYGBgDQoNCmBgYHtyfQ0KI0NoZWNraW5nIGlmIHdlIGhhdmUgYW55IE5BIHZhbHVlcyBpbiB0aGUgZGF0YXNldA0KYXBwbHkoUl90cmFpbiwgMiwgZnVuY3Rpb24oeCkgYW55KGlzLm5hKHgpKSkNCmBgYA0KDQpgYGB7cn0NCiNMb2FkaW5nIHRlc3QgZGF0YXNldA0KUl90ZXN0ID0gcmVhZC5jc3YoImhyX3Rlc3QuY3N2IiwgICBzdHJpbmdzQXNGYWN0b3JzID0gRkFMU0UpDQpzdHIoUl90ZXN0KQ0KYGBgDQpgYGB7cn0NCiNBZGRpbmcgMCB2YWx1ZXMgaW4gdGhlIGxlZnQgY29sdW1uIG9mIHRlc3QgZGF0YQ0KUl90ZXN0JGxlZnQ9cmVwKDAsNDUwMCkNCiNMb29raW5nIGF0IG5ldyBkYXRhc2V0DQpzdHIoUl90ZXN0KQ0KYGBgDQpgYGB7cn0NCiNDaGVja2luZyBpZiB3ZSBoYXZlIGFueSBOQSB2YWx1ZXMgaW4gdGhlIGRhdGFzZXQNCmFwcGx5KFJfdGVzdCwgMiwgZnVuY3Rpb24oeCkgYW55KGlzLm5hKHgpKSkNCmBgYA0KYGBge3J9DQojTG9hZGluZyBsaWJyYXJ5IGRwbHlyDQpsaWJyYXJ5KGRwbHlyKSANCg0KI0pvaW5pbmcgYm90aCBkYXRhc2V0cw0KUiA9IGJpbmRfcm93cyhSX3RyYWluLFJfdGVzdCkNCnN0cihSKQ0KYGBgDQpgYGB7cn0NCiNDaGVja2luZyBpZiB3ZSBoYXZlIGFueSBOQSB2YWx1ZXMgaW4gdGhlIGRhdGFzZXQNCmFwcGx5KFIsIDIsIGZ1bmN0aW9uKHgpIGFueShpcy5uYSh4KSkpDQpgYGANCg0KYGBge3J9DQojY2FsbGluZyBsaWJyYXJ5IGZvciBkYXRhIGNsZWFuaW5nDQpsaWJyYXJ5KGRwbHlyKQ0KdGFibGUoUiRzYWxhcnkpDQoNCiNQcm9iYWJpbGl0eSB0YWJsZSByb3VuZCB0byAyIGRlY2ltYWwgcGxhY2VzDQpyb3VuZChwcm9wLnRhYmxlKHRhYmxlKFIkc2FsYXJ5LFIkbGVmdCksMSksMikNCg0KI0Fzc2lnbmluZyBjYXRlZ29yaWVzDQojMSA6IGhpZ2ggMC45MA0KIzIgOiBtZWRpdW0gMC44MQ0KUj1SICU+JSBtdXRhdGUgKHNhbF8xID0gYXMubnVtZXJpYyhzYWxhcnkgJWluJSBjKCJoaWdoIikpLA0KICAgICAgICAgICAgICAgIHNhbF8yID0gYXMubnVtZXJpYyhzYWxh
cnkgJWluJSBjKCJtZWRpdW0iKSkpICU+JSANCiAgc2VsZWN0KC1zYWxhcnkpDQoNCnRhYmxlKFIkc2FsZXMpDQojUHJvYmFiaWxpdHkgdGFibGUgcm91bmQgdG8gMiBkZWNpbWFsIHBsYWNlcw0Kcm91bmQocHJvcC50YWJsZSh0YWJsZShSJHNhbGVzLFIkbGVmdCksMSksMikNCg0KI0Fzc2lnbmluZyBjYXRlZ29yaWVzDQojMSA6IGFjY291bnRpbmcsIGhyIDAuNzYNCiMyIDogcHJvZHVjdF9tbmcsIHN1cHBvcnQgMC44MA0KIzMgOnNhbGVzLCB0ZWNobmljYWwgMC43OQ0KIzQgOiBJVCAwLjgxDQojNSA6IE1hbmFnZW1lbnQgMC44NA0KIzYgOiBSYW5kRA0KUiA9IFIgJT4lICBtdXRhdGUoc18xID0gYXMubnVtZXJpYyhzYWxlcyAlaW4lIGMoIm1hbmFnZW1lbnQiKSksDQogICAgICAgICAgICAgICAgICBzXzIgPSBhcy5udW1lcmljKHNhbGVzICVpbiUgYygiUmFuZEQiKSksDQogICAgICAgICAgICAgICAgICBzXzMgPSBhcy5udW1lcmljKHNhbGVzICVpbiUgYygiSVQiKSksDQogICAgICAgICAgICAgICAgICBzXzQgPSBhcy5udW1lcmljKHNhbGVzICVpbiUgYygicHJvZHVjdF9tbmciLCJzdXBwb3J0IikpLA0KICAgICAgICAgICAgICAgICAgc181ID0gYXMubnVtZXJpYyhzYWxlcyAlaW4lIGMoInNhbGVzIiwidGVjaG5pY2FsIikpLA0KICAgICAgICAgICAgICAgICAgc182ID0gYXMubnVtZXJpYyhzYWxlcyAlaW4lIGMoImFjY291bnRpbmciLCJociIpKSkgICU+JSANCiAgc2VsZWN0KC1zYWxlcykNCmBgYA0KYGBge3J9DQojTG9va2luZyBhdCB0aGUgY29tcGxldGUgZGF0YXNldCBvbmNlIGFnYWluDQpzdHIoUikgIA0KYGBgDQpgYGB7cn0NCiNTcGxpdHRpbmcgdGhlIGRhdGFzZXQgYmFjayANClJfdHJhaW4gPC0gUlsxOjEwNDk5LF0NClJfdGVzdCA8LSBSWzEwNTAwOjE0OTk5LF0NCiNEcm9wcGluZyB0aGUgY29sdW1uIGxlZnQgZnJvbSB0ZXN0IGRhdGFzZXQNClJfdGVzdCRsZWZ0ID0gTlVMTA0KYGBgDQpgYGB7cn0NCiNMb29raW5nIGF0IHRoZSB0cmFpbmluZyBkYXRhc2V0IGFmdGVyIHNwbGl0IA0Kc3RyKFJfdHJhaW4pIA0KYGBgDQpgYGB7cn0NCiNMb29raW5nIGF0IHRoZSB0ZXN0IGRhdGFzZXQgYWZ0ZXIgc3BsaXQNCnN0cihSX3Rlc3QpICAjVGVzdCBkYXRhDQpgYGANCg0KYGBge3J9DQojU3BsaXRpbmcgdGhlIFRyYWluIGRhdGEgaW4gdHdvIHBhcnRzDQpSX3RyYWluXzEgPC0gUl90cmFpblsxOjgwMDAsXQ0KUl90cmFpbl8yIDwtIFJfdHJhaW5bODAwMToxMDQ5OSxdDQpgYGANCiANCmBgYHtyfQ0KI0xvb2tpbmcgYXQgZmlyc3QgdHJhaW4gZGF0YXNldA0Kc3RyKFJfdHJhaW5fMSkNCmBgYA0KDQpgYGB7cn0NCiNMb29raW5nIGF0IHNlY29uZCB0cmFpbiBkYXRhc2V0DQpzdHIoUl90cmFpbl8yKQ0KYGBgDQpgYGB7cn0NCiNjaGVja2luZyB1cCBWSUYgDQpsaWJyYXJ5KGNhcikNClJfZml0IDwtIGxtKGxlZnQgfiAuLCBkYXRhPVJfdHJhaW5fMSkNCnN1bW1hcnkoUl9maXQpDQpgYGANCg0KYGBge3J9DQojVklGIGlzIGZpbmUg
YnV0IHdlIHdpbGwgZHJvcCB0aGUgdmFyaWFibGVzIG9uZSBieSBvbmUgYXMgcGVyIHAtdmFsdWUuIExldCdzIGRyb3Agc182IGZpcnN0LiANClJfZml0IDwtIGxtKGxlZnQgfi4tc182LCBkYXRhPVJfdHJhaW5fMSkNCnN1bW1hcnkoUl9maXQpDQoNCmBgYA0KDQpgYGB7cn0NCiNSZXN1bHQgYWZ0ZXIgZHJvcHBpbmcgc181IHNfNg0KUl9maXQgPC0gbG0obGVmdCB+Li1zXzYgLXNfNSwgZGF0YT1SX3RyYWluXzEpDQpzdW1tYXJ5KFJfZml0KQ0KYGBgDQoNCmBgYHtyfQ0KI1Jlc3VsdCBhZnRlciBkcm9wcGluZyBzXzYgc181IHNfMg0KUl9maXQgPC0gbG0obGVmdCB+Li1zXzYgLXNfNSAtc18yLCBkYXRhPVJfdHJhaW5fMSkNCnN1bW1hcnkoUl9maXQpDQpgYGANCg0KYGBge3J9DQojUmVzdWx0IGFmdGVyIGRyb3BwaW5nIHNfNiBzXzUgc18yIHNfNA0KUl9maXQgPC0gbG0obGVmdCB+Li1zXzYgLXNfNSAtc18yIC1zXzQsIGRhdGE9Ul90cmFpbl8xKQ0Kc3VtbWFyeShSX2ZpdCkNCmBgYA0KDQpgYGB7cn0NCiNSZXN1bHQgYWZ0ZXIgZHJvcHBpbmcgc182IHNfNSBzXzIgc180IHByb21vdGlvbl9sYXN0XzV5ZWFycw0KUl9maXQgPC0gbG0obGVmdCB+Li1zXzYgLXNfNSAtc18yIC1zXzQgLXByb21vdGlvbl9sYXN0XzV5ZWFycywgZGF0YT1SX3RyYWluXzEpDQpzdW1tYXJ5KFJfZml0KQ0KDQpgYGANCg0KYGBge3J9DQojUmVzdWx0IGFmdGVyIGRyb3BwaW5nIHNfNiBzXzUgc18yIHNfNCBwcm9tb3Rpb25fbGFzdF81eWVhcnMgc18zDQpSX2ZpdCA8LSBsbShsZWZ0IH4uLXNfNiAtc181IC1zXzIgLXNfNCAtcHJvbW90aW9uX2xhc3RfNXllYXJzIC1zXzMsIGRhdGE9Ul90cmFpbl8xKQ0Kc3VtbWFyeShSX2ZpdCkNCmBgYA0KDQpgYGB7cn0NCiNDYWxsaW5nIGxpYnJhcnkgcmFuZG9tIGZvcmVzdA0KbGlicmFyeShyYW5kb21Gb3Jlc3QpDQpgYGANCg0KYGBge3J9DQojQ3JlYXRpbmcgbW9kX1JGIHRvIHRyYWluIGRhdGFzZXQgb24gUl90cmFpbl8xIGRhdGFzZXQNCm1vZF9SRiA8LSByYW5kb21Gb3Jlc3QobGVmdCB+Li1zXzYgLXNfNSAtc18yIC1zXzQgLXByb21vdGlvbl9sYXN0XzV5ZWFycyAtc18zLCBSX3RyYWluXzEpDQojTG9va2luZyBhdCByZXN1bHQNCm1vZF9SRg0KYGBgDQpgYGB7cn0NCiNQcmVkaWN0aW5nIHRoZSByZXN1bHQgb2YgdHJhaW4gbW9kZWwgb24gZGF0YXNldCBSX3RyYWluXzIgdG8gY2hlY2sgYWNjdXJhY3kNCnByZWRfUkYgPC0gcHJlZGljdChtb2RfUkYsIFJfdHJhaW5fMiwgdHlwZT0iY2xhc3MiKQ0KI1ZpZXdpbmcgdGhlIGZpcnN0IGZldyByZXN1bHRzDQpzdHIocHJlZF9SRikNCmBgYA0KYGBge3J9DQojUHJlZGljdGluZyB0aGUgcmVzdWx0IG9uIHRlc3QgZGF0YXNldCANCnByZWRfUkZfbmV3IDwtIHByZWRpY3QobW9kX1JGLCBSX3Rlc3QsIHR5cGU9ImNsYXNzIikNCiNWaWV3aW5nIHRoZSBmaXJzdCBmZXcgcHJlZGljdGVkIHJlc3VsdHMNCnN0cihwcmVkX1JGX25ldykNCmBgYA0KDQpgYGB7cn0NCiNHaXZlIG5hbWUg
dG8gcHJlZGljdGVkIHJlc3VsdHMgYXMgbGVmdCAodGhlIGNvbHVtbiB3ZSBuZWVkIHRvIGZpbmQpDQpSX3Rlc3QkbGVmdCA9IHByZWRfUkZfbmV3DQojTG9va2luZyBhdCB0aGUgbGVmdCBjb2x1bW4NCnN0cihSX3Rlc3QkbGVmdCkNCmBgYA0KDQpgYGB7cn0NCiNTdWJtaXR0aW5nIHRoZSBmaWxlIA0Kc3VibWl0ID0gZGF0YS5mcmFtZSAobGVmdD0gUl90ZXN0JGxlZnQpDQojR2l2aW5nIG5hbWUgdG8gdGhlIGZpbGUNCndyaXRlLmNzdihzdWJtaXQsIGZpbGUgPSAiQXR1bF9LdW1hcl9QNF9wYXJ0Mi5jc3YiLCByb3cubmFtZXMgPSBGQUxTRSkNCg0KYGBgDQo=