Import Libraries

library(caTools)
## Warning: package 'caTools' was built under R version 3.6.1

Data Import

# Load the employee data set from the working directory.
# stringsAsFactors is set explicitly so that `groups` is read as a factor on
# every R version (the implicit default changed to FALSE in R 4.0.0).
empData <- read.csv("empdata.csv", stringsAsFactors = TRUE)

Data Exploration

# Per-column overview: quartiles/mean for numeric columns, level counts for
# the `groups` factor.
summary(empData)
##        X               id        groups        age        healthy_eating  
##  Min.   :  0.0   Min.   :  0.0   A :375   Min.   :18.00   Min.   : 0.000  
##  1st Qu.:249.8   1st Qu.:249.8   AB:125   1st Qu.:30.00   1st Qu.: 4.000  
##  Median :499.5   Median :499.5   B :125   Median :41.00   Median : 5.000  
##  Mean   :499.5   Mean   :499.5   O :375   Mean   :41.16   Mean   : 4.944  
##  3rd Qu.:749.2   3rd Qu.:749.2            3rd Qu.:53.00   3rd Qu.: 6.000  
##  Max.   :999.0   Max.   :999.0            Max.   :64.00   Max.   :10.000  
##  active_lifestyle     salary    
##  Min.   : 0.000   Min.   : 553  
##  1st Qu.: 4.000   1st Qu.:1360  
##  Median : 6.000   Median :2174  
##  Mean   : 5.683   Mean   :2227  
##  3rd Qu.: 7.000   3rd Qu.:2994  
##  Max.   :10.000   Max.   :5550
# Show the first rows to eyeball the raw values.
head(empData)
##   X id groups age healthy_eating active_lifestyle salary
## 1 0  0      A  36              5                5   2297
## 2 1  1      A  55              3                5   1134
## 3 2  2      A  61              8                1   4969
## 4 3  3      O  29              3                6    902
## 5 4  4      O  34              6                2   3574
## 6 5  5      O  42              5                3   2761
# Print the column types and dimensions of the data frame.
str(empData)
## 'data.frame':    1000 obs. of  7 variables:
##  $ X               : int  0 1 2 3 4 5 6 7 8 9 ...
##  $ id              : int  0 1 2 3 4 5 6 7 8 9 ...
##  $ groups          : Factor w/ 4 levels "A","AB","B","O": 1 1 1 4 4 4 2 3 1 1 ...
##  $ age             : int  36 55 61 29 34 42 53 41 47 31 ...
##  $ healthy_eating  : int  5 3 8 3 6 5 4 8 5 4 ...
##  $ active_lifestyle: int  5 5 1 6 2 3 6 6 6 8 ...
##  $ salary          : int  2297 1134 4969 902 3574 2761 1484 3809 2065 1020 ...

Split Data

# Fix the RNG seed so the train/test partition is reproducible across runs.
set.seed(123)

# Build a row-wise logical mask: TRUE = training row (~80%), FALSE = test row.
# sample.split() expects the vector of labels, not the data frame; given a
# data frame it treats the columns as observations and returns one flag per
# column, which is then silently recycled over the rows when subsetting.
splitData <- sample.split(empData$salary, SplitRatio = 0.8)

Training And Testing Data

# splitData is a logical mask, so index with it directly instead of comparing
# it against strings: a misspelled literal never matches and silently yields
# an empty data frame.
train <- subset(empData, splitData)
test <- subset(empData, !splitData)

Data Modeling

# Fit a linear model for salary on the substantive predictors only.
# `X` and `id` are identical row indices, not features: with `salary ~ .`
# they are perfectly collinear, which makes the fit rank-deficient (an NA
# coefficient) and causes predict() to warn.
# NOTE: the coefficient table recorded below was captured from the earlier
# `salary ~ .` fit; re-run the report to refresh it.
model <- lm(
  salary ~ groups + age + healthy_eating + active_lifestyle,
  data = train
)
model
## 
## Call:
## lm(formula = salary ~ ., data = train)
## 
## Coefficients:
##      (Intercept)                 X                id          groupsAB  
##        927.33188          -0.01095                NA          40.35707  
##          groupsB           groupsO               age    healthy_eating  
##         78.55021          64.02442          -0.17894         468.66112  
## active_lifestyle  
##       -184.08023

Prediction

# Predict salary for the rows in `test` using the fitted linear model.
# NOTE(review): the rank-deficiency warning recorded below appears to come
# from the fit itself (one coefficient is NA), not from this call.
pred <- predict(model,test)
## Warning in predict.lm(model, test): prediction from a rank-deficient fit
## may be misleading