Import Libraries
library(caTools)
## Warning: package 'caTools' was built under R version 3.6.1
Data Import
# Load the employee data set from the current working directory.
# stringsAsFactors is set explicitly so that `groups` is read as a factor on
# every R version: it was the default before R 4.0 (this transcript was
# produced on R 3.6.x), but newer versions read strings as character, which
# would change the summary()/str() output shown below.
empData <- read.csv("empdata.csv", stringsAsFactors = TRUE)
Data Exploration
# Per-column overview: quartiles/mean for the numeric columns and level counts
# for the `groups` factor.
summary(empData)
## X id groups age healthy_eating
## Min. : 0.0 Min. : 0.0 A :375 Min. :18.00 Min. : 0.000
## 1st Qu.:249.8 1st Qu.:249.8 AB:125 1st Qu.:30.00 1st Qu.: 4.000
## Median :499.5 Median :499.5 B :125 Median :41.00 Median : 5.000
## Mean :499.5 Mean :499.5 O :375 Mean :41.16 Mean : 4.944
## 3rd Qu.:749.2 3rd Qu.:749.2 3rd Qu.:53.00 3rd Qu.: 6.000
## Max. :999.0 Max. :999.0 Max. :64.00 Max. :10.000
## active_lifestyle salary
## Min. : 0.000 Min. : 553
## 1st Qu.: 4.000 1st Qu.:1360
## Median : 6.000 Median :2174
## Mean : 5.683 Mean :2227
## 3rd Qu.: 7.000 3rd Qu.:2994
## Max. :10.000 Max. :5550
# Peek at the first six rows to check how the columns were parsed.
head(empData)
## X id groups age healthy_eating active_lifestyle salary
## 1 0 0 A 36 5 5 2297
## 2 1 1 A 55 3 5 1134
## 3 2 2 A 61 8 1 4969
## 4 3 3 O 29 3 6 902
## 5 4 4 O 34 6 2 3574
## 6 5 5 O 42 5 3 2761
# Show the data frame's dimensions and the storage type of every column.
str(empData)
## 'data.frame': 1000 obs. of 7 variables:
## $ X : int 0 1 2 3 4 5 6 7 8 9 ...
## $ id : int 0 1 2 3 4 5 6 7 8 9 ...
## $ groups : Factor w/ 4 levels "A","AB","B","O": 1 1 1 4 4 4 2 3 1 1 ...
## $ age : int 36 55 61 29 34 42 53 41 47 31 ...
## $ healthy_eating : int 5 3 8 3 6 5 4 8 5 4 ...
## $ active_lifestyle: int 5 5 1 6 2 3 6 6 6 8 ...
## $ salary : int 2297 1134 4969 902 3574 2761 1484 3809 2065 1020 ...
Split Data
# Fix the RNG seed so the train/test partition is reproducible between runs
# (the seed value itself is arbitrary).
set.seed(123)
# sample.split() expects a vector with one label per observation and returns a
# logical vector of the same length (TRUE = training row). Passing the whole
# data frame would yield one flag per *column* (7 values) that is then silently
# recycled across the 1000 rows, so the outcome column is passed instead.
splitData <- sample.split(empData$salary, SplitRatio = 0.8)
Training And Testing Data
# Partition the rows using the logical split vector: TRUE rows form the
# training set, the remaining rows form the test set.
# The flags are used directly as logicals; comparing against the strings
# 'TRUE'/'FASLE' relied on implicit coercion, and the misspelt 'FASLE' never
# matched, which left `test` with zero rows.
train <- subset(empData, splitData)
test <- subset(empData, !splitData)
Data Modeling
# Fit a linear regression of salary on the real predictors.
# `X` and `id` are both plain row indices (identical columns, see str() above),
# so they carry no information about salary and are perfectly collinear with
# each other; keeping them gives an NA coefficient and the "rank-deficient fit"
# warning from predict(). They are therefore dropped from the formula.
# NOTE: the printed coefficients below come from a previous run and need to be
# regenerated after this change.
model <- lm(salary ~ . - X - id, data = train)
# Print the fitted call and coefficient estimates.
model
##
## Call:
## lm(formula = salary ~ ., data = train)
##
## Coefficients:
## (Intercept) X id groupsAB
## 927.33188 -0.01095 NA 40.35707
## groupsB groupsO age healthy_eating
## 78.55021 64.02442 -0.17894 468.66112
## active_lifestyle
## -184.08023
Prediction
# Predict salary for the held-out rows using the fitted linear model.
pred <- predict(object = model, newdata = test)
## Warning in predict.lm(model, test): prediction from a rank-deficient fit
## may be misleading