Description

This report provide Salary Graduate Prediction using regression algorithms

the dataset using this report for modeling is real salary data in the India. The dataset is hosted in Kaggle. It can be downloaded here https://www.kaggle.com/manishkc06/engineering-graduate-salary-prediction

The report is structured as Follows:

  1. Data Extraction
  2. Exploratory Data Analysis
  3. Data Preparation
  4. Modeling
  5. Evaluation
  6. Recomendation

1. Data Extraction

Import necessary libraries.

rm(list = ls())
library(ggplot2)
library(corrgram)
library(gridExtra)
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'lattice'
## The following object is masked from 'package:corrgram':
## 
##     panel.fill

Library ggplot2: for graphics and visualization . Library corrgram: for visualization of correlation coefficient matrix. Library gridExtra: for plotting multiple graphs. Library caret: for One Hot Encoding. Read Salary Dataset from .csv file from data frame. Then see the dataframe’s structure.

## read data
salary_df <- read.csv("data/Engineering_graduate_salary.csv")
## structure of dataframe
str(salary_df)
## 'data.frame':    2998 obs. of  34 variables:
##  $ ID                   : int  604399 988334 301647 582313 339001 609356 1081649 610842 1183070 794062 ...
##  $ Gender               : chr  "f" "m" "m" "m" ...
##  $ DOB                  : chr  "1990-10-22" "1990-05-15" "1989-08-21" "1991-05-04" ...
##  $ X10percentage        : num  87.8 57 77.3 84.3 82 ...
##  $ X10board             : chr  "cbse" "cbse" "maharashtra state board,pune" "cbse" ...
##  $ X12graduation        : int  2009 2010 2007 2009 2008 2007 2007 2009 2010 2009 ...
##  $ X12percentage        : num  84 64.5 85.2 86 75 ...
##  $ X12board             : chr  "cbse" "cbse" "amravati divisional board" "cbse" ...
##  $ CollegeID            : int  6920 6624 9084 8195 4889 10950 14381 13208 5338 8346 ...
##  $ CollegeTier          : int  1 2 2 1 2 1 2 2 2 2 ...
##  $ Degree               : chr  "B.Tech/B.E." "B.Tech/B.E." "B.Tech/B.E." "B.Tech/B.E." ...
##  $ Specialization       : chr  "instrumentation and control engineering" "computer science & engineering" "electronics & telecommunications" "computer science & engineering" ...
##  $ collegeGPA           : num  73.8 65 61.9 80.4 64.3 ...
##  $ CollegeCityID        : int  6920 6624 9084 8195 4889 10950 14381 13208 5338 8346 ...
##  $ CollegeCityTier      : int  1 0 0 1 1 0 1 1 0 0 ...
##  $ CollegeState         : chr  "Delhi" "Uttar Pradesh" "Maharashtra" "Delhi" ...
##  $ GraduationYear       : int  2013 2014 2011 2013 2012 2013 2013 2013 2014 2014 ...
##  $ English              : int  650 440 485 675 575 535 510 370 510 500 ...
##  $ Logical              : int  665 435 475 620 495 595 495 470 555 410 ...
##  $ Quant                : int  810 210 505 635 365 620 405 280 440 560 ...
##  $ Domain               : num  0.694 0.342 0.825 0.99 0.278 ...
##  $ ComputerProgramming  : int  485 365 -1 655 315 455 -1 465 525 385 ...
##  $ ElectronicsAndSemicon: int  366 -1 400 -1 -1 300 -1 -1 -1 -1 ...
##  $ ComputerScience      : int  -1 -1 -1 -1 -1 -1 -1 -1 438 407 ...
##  $ MechanicalEngg       : int  -1 -1 -1 -1 -1 -1 469 -1 -1 -1 ...
##  $ ElectricalEngg       : int  -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 ...
##  $ TelecomEngg          : int  -1 -1 260 -1 -1 313 -1 -1 -1 -1 ...
##  $ CivilEngg            : int  -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 ...
##  $ conscientiousness    : num  -0.159 1.134 0.51 -0.446 -1.499 ...
##  $ agreeableness        : num  0.3789 0.0459 -0.1232 0.2124 -0.7473 ...
##  $ extraversion         : num  1.24 1.24 1.543 0.317 -1.07 ...
##  $ nueroticism          : num  0.1459 0.5262 -0.2902 0.2727 0.0622 ...
##  $ openess_to_experience: num  0.289 -0.286 -0.287 0.48 0.186 ...
##  $ Salary               : int  445000 110000 255000 420000 200000 440000 150000 105000 195000 200000 ...

The dataset has 2998 observations and 34 variables. The target variable is salary and the remaining variables are candidate features.

Compute statistical summary of each variabel.

## statistical summary
summary(salary_df)
##        ID             Gender              DOB            X10percentage  
##  Min.   :  11244   Length:2998        Length:2998        Min.   :43.00  
##  1st Qu.: 333465   Class :character   Class :character   1st Qu.:71.14  
##  Median : 639694   Mode  :character   Mode  :character   Median :78.97  
##  Mean   : 664893                                         Mean   :77.67  
##  3rd Qu.: 995177                                         3rd Qu.:85.60  
##  Max.   :1297877                                         Max.   :97.76  
##    X10board         X12graduation  X12percentage     X12board        
##  Length:2998        Min.   :1998   Min.   :40.00   Length:2998       
##  Class :character   1st Qu.:2007   1st Qu.:66.00   Class :character  
##  Mode  :character   Median :2008   Median :74.00   Mode  :character  
##                     Mean   :2008   Mean   :74.34                     
##                     3rd Qu.:2009   3rd Qu.:82.60                     
##                     Max.   :2012   Max.   :98.70                     
##    CollegeID        CollegeTier       Degree          Specialization    
##  Min.   :    2.0   Min.   :1.000   Length:2998        Length:2998       
##  1st Qu.:  526.2   1st Qu.:2.000   Class :character   Class :character  
##  Median : 4027.5   Median :2.000   Mode  :character   Mode  :character  
##  Mean   : 5210.2   Mean   :1.925                                        
##  3rd Qu.: 8822.2   3rd Qu.:2.000                                        
##  Max.   :18409.0   Max.   :2.000                                        
##    collegeGPA    CollegeCityID     CollegeCityTier  CollegeState      
##  Min.   : 6.63   Min.   :    2.0   Min.   :0.0000   Length:2998       
##  1st Qu.:66.53   1st Qu.:  526.2   1st Qu.:0.0000   Class :character  
##  Median :71.80   Median : 4027.5   Median :0.0000   Mode  :character  
##  Mean   :71.51   Mean   : 5210.2   Mean   :0.2962                     
##  3rd Qu.:76.30   3rd Qu.: 8822.2   3rd Qu.:1.0000                     
##  Max.   :99.93   Max.   :18409.0   Max.   :1.0000                     
##  GraduationYear    English         Logical          Quant      
##  Min.   :   0   Min.   :180.0   Min.   :195.0   Min.   :120.0  
##  1st Qu.:2012   1st Qu.:425.0   1st Qu.:441.2   1st Qu.:430.0  
##  Median :2013   Median :500.0   Median :505.0   Median :515.0  
##  Mean   :2012   Mean   :501.1   Mean   :500.4   Mean   :514.1  
##  3rd Qu.:2014   3rd Qu.:570.0   3rd Qu.:565.0   3rd Qu.:595.0  
##  Max.   :2017   Max.   :875.0   Max.   :795.0   Max.   :900.0  
##      Domain        ComputerProgramming ElectronicsAndSemicon ComputerScience 
##  Min.   :-1.0000   Min.   : -1.0       Min.   : -1.00        Min.   : -1.00  
##  1st Qu.: 0.3423   1st Qu.:295.0       1st Qu.: -1.00        1st Qu.: -1.00  
##  Median : 0.6226   Median :415.0       Median : -1.00        Median : -1.00  
##  Mean   : 0.5085   Mean   :351.9       Mean   : 96.23        Mean   : 94.15  
##  3rd Qu.: 0.8356   3rd Qu.:495.0       3rd Qu.:233.00        3rd Qu.: -1.00  
##  Max.   : 0.9999   Max.   :804.0       Max.   :612.00        Max.   :715.00  
##  MechanicalEngg   ElectricalEngg    TelecomEngg       CivilEngg      
##  Min.   : -1.00   Min.   : -1.00   Min.   : -1.00   Min.   : -1.000  
##  1st Qu.: -1.00   1st Qu.: -1.00   1st Qu.: -1.00   1st Qu.: -1.000  
##  Median : -1.00   Median : -1.00   Median : -1.00   Median : -1.000  
##  Mean   : 24.14   Mean   : 16.27   Mean   : 31.07   Mean   :  1.947  
##  3rd Qu.: -1.00   3rd Qu.: -1.00   3rd Qu.: -1.00   3rd Qu.: -1.000  
##  Max.   :623.00   Max.   :660.00   Max.   :548.00   Max.   :500.000  
##  conscientiousness  agreeableness      extraversion        nueroticism     
##  Min.   :-3.89330   Min.   :-5.7816   Min.   :-4.600900   Min.   :-2.6430  
##  1st Qu.:-0.64910   1st Qu.:-0.4353   1st Qu.:-0.604800   1st Qu.:-0.8682  
##  Median : 0.04640   Median : 0.2124   Median : 0.091400   Median :-0.1727  
##  Mean   :-0.03871   Mean   : 0.1262   Mean   :-0.008662   Mean   :-0.1460  
##  3rd Qu.: 0.70270   3rd Qu.: 0.8128   3rd Qu.: 0.672000   3rd Qu.: 0.5262  
##  Max.   : 1.99530   Max.   : 1.9048   Max.   : 2.161700   Max.   : 3.3525  
##  openess_to_experience     Salary       
##  Min.   :-7.3757       Min.   :  35000  
##  1st Qu.:-0.6692       1st Qu.: 180000  
##  Median :-0.0943       Median : 300000  
##  Mean   :-0.1411       Mean   : 305175  
##  3rd Qu.: 0.5024       3rd Qu.: 370000  
##  Max.   : 1.6302       Max.   :4000000

We can see the minimum, median, and maximum values of each numeric variable. We can also notice that the maximum values of salary is statistically far away for median and third quantile. This could be an outlier.

2. Exploratory Data Analysis

2.1. Univariate Analysis (one variable)

Plot distribution of salary using boxplot.

ggplot(data = salary_df, aes(y = Salary)) +
    geom_boxplot() +
    scale_y_continuous(limits = c(0, 1000000))

Based on boxplot above we can see taht there are outliers in salary.

2.2. Bivariate Analysis

Plot salary based on openess_to_experience.

ggplot(data = salary_df, aes(x = openess_to_experience,
                             y = Salary)) +
  geom_point() +
  scale_y_continuous(limits = c(0, 1000000))

Based on salary by openess_to_experience plot, we can see in general, the higher openess_to_experience, the higher the price.

2.3. Multivariate Analysis

Compute Person Correlation Coefficient (R) among all numerical variables. Then, visualize the result in a diagram.

salary_df_num <- salary_df[ ,c(13, 17:34)]
r <- cor(salary_df_num)
corrgram(salary_df_num, order = TRUE,
         upper.panel = panel.cor)

For target variable (salary), the variables with hight correlation in order are openess_to_experience (0.59), Quant(0.51), and conscientious(0.49)

Among features, several variables are highly correlated. For example, extraversion and logical(0.44)

3. Data Preparation

3.1. Data Cleaning

Removing outliers in salary

## Get outliers
out_salary <- boxplot.stats(salary_df_num$Salary)$out
## Get index of outliers
out_idx <- which(salary_df_num$Salary %in% c(out_salary))
## remove outlier
salary_df_num <- salary_df_num[ -out_idx,]
boxplot.stats(salary_df_num$Salary)
## $stats
## [1]  35000 180000 300000 360000 630000
## 
## $n
## [1] 2920
## 
## $conf
## [1] 294736.9 305263.1
## 
## $out
##  [1] 650000 650000 650000 650000 640000 650000 650000 645000 655000 645000
## [11] 655000 650000 650000 640000 655000

In dataset there are value (-1), we change (-1) to NA then NA to Mean to clean the data

Change NA to Mean (Domain)

## get index of NAs
idx_na_d <- which(is.na(salary_df_num$Domain))
## compute mean
mean_d <- mean(salary_df_num$Domain, na.rm = TRUE)
## input NAs with mean
salary_df_num$Domain[ idx_na_d ] <- mean_d

Change NA to Mean (ComputerProgramming) 2

## get index of NAs
idx_na_cp <- which(is.na(salary_df_num$ComputerProgramming))
## compute mean
mean_cp <- mean(salary_df_num$ComputerProgramming, na.rm = TRUE)
## input NAs with mean
salary_df_num$ComputerProgramming[ idx_na_cp ] <- mean_cp

Change NA to Mean (ElectronicsAndSemicon) 3

## get index of NAs
idx_na_electro <- which(is.na(salary_df_num$ElectronicsAndSemicon))
## compute mean
mean_electro <- mean(salary_df_num$ElectronicsAndSemicon, na.rm = TRUE)
## input NAs with mean
salary_df_num$ElectronicsAndSemicon[ idx_na_electro ] <- mean_electro

Change NA to Mean (Computer Science) 4

## get index of NAs
idx_na_cmptr <- which(is.na(salary_df_num$ComputerScience))
## compute mean
mean_cmptr <- mean(salary_df_num$ComputerScience, na.rm = TRUE)
## input NAs with mean
salary_df_num$ComputerScience[ idx_na_cmptr ] <- mean_cmptr

Change NA to Mean (MechanicalEngg) 5

## get index of NAs
idx_na_me <- which(is.na(salary_df_num$MechanicalEngg))
## compute mean
mean_me <- mean(salary_df_num$MechanicalEngg, na.rm = TRUE)
## input NAs with mean
salary_df_num$MechanicalEngg[ idx_na_me ] <- mean_me

Change NA to Mean (ElectricalEngg) 6

## get index of NAs
idx_na_ee <- which(is.na(salary_df_num$ElectricalEngg))
## compute mean
mean_ee <- mean(salary_df_num$ElectricalEngg, na.rm = TRUE)
## input NAs with mean
salary_df_num$ElectricalEngg[ idx_na_ee ] <- mean_ee

Change NA to Mean (TelecomEngg) 7

## get index of NAs
idx_na_te <- which(is.na(salary_df_num$TelecomEngg))
## compute mean
mean_te <- mean(salary_df_num$TelecomEngg, na.rm = TRUE)
## input NAs with mean
salary_df_num$TelecomEngg[ idx_na_te ] <- mean_te

Change NA to Mean (CivilEngg) 8

## get index of NAs
idx_na_ce <- which(is.na(salary_df_num$CivilEngg))
## compute mean
mean_ce <- mean(salary_df_num$CivilEngg, na.rm = TRUE)
## input NAs with mean
salary_df_num$CivilEngg[ idx_na_ce ] <- mean_ce

Delete Column (unique)

# Delete Column
salary_df$ID <- NULL
salary_df$DOB <- NULL

3.2 split data into train anda test

dim(salary_df_num)
## [1] 2920   19

traint:test = 70:30

# for reproducible result
set.seed(2021)
m <- nrow(salary_df_num)
m
## [1] 2920
m_train <- m * 0.7

train_idx <- sample(m, m_train)
train_df <- salary_df_num[ train_idx, ]
test_df <- salary_df_num[ -train_idx, ] 

4. Modeling

4.1 Simple linear Regression

model.slr <- lm(formula = Salary ~ openess_to_experience,
                data = train_df)
summary(model.slr)
## 
## Call:
## lm(formula = Salary ~ openess_to_experience, data = train_df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -247863 -103881   13251   74330  370641 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             285550       2842 100.492   <2e-16 ***
## openess_to_experience     2494       2707   0.921    0.357    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 127000 on 2041 degrees of freedom
## Multiple R-squared:  0.0004158,  Adjusted R-squared:  -7.393e-05 
## F-statistic: 0.849 on 1 and 2041 DF,  p-value: 0.3569
model.slr2 <- lm(formula = Salary ~ agreeableness,
                data = train_df)
summary(model.slr2)
## 
## Call:
## lm(formula = Salary ~ agreeableness, data = train_df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -256847 -100772    7690   74304  375984 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     283800       2827 100.403  < 2e-16 ***
## agreeableness    10548       2890   3.649  0.00027 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 126600 on 2041 degrees of freedom
## Multiple R-squared:  0.006482,   Adjusted R-squared:  0.005995 
## F-statistic: 13.32 on 1 and 2041 DF,  p-value: 0.0002698
model.slr3 <- lm(formula = Salary ~ Quant,
                 data = train_df)
summary(model.slr3)
## 
## Call:
## lm(formula = Salary ~ Quant, data = train_df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -272286  -88924   -9696   75421  414602 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 95423.72   11453.37   8.331   <2e-16 ***
## Quant         369.15      21.69  17.021   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 118900 on 2041 degrees of freedom
## Multiple R-squared:  0.1243, Adjusted R-squared:  0.1239 
## F-statistic: 289.7 on 1 and 2041 DF,  p-value: < 2.2e-16

4.2 Polynomial Regression

model.poly <- lm(formula = Salary ~ openess_to_experience + I(openess_to_experience^2),
                 data = train_df)
summary(model.poly)
## 
## Call:
## lm(formula = Salary ~ openess_to_experience + I(openess_to_experience^2), 
##     data = train_df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -253623 -103347   11404   74175  372335 
## 
## Coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                  288002       2956  97.418  < 2e-16 ***
## openess_to_experience         -3951       3480  -1.135  0.25631    
## I(openess_to_experience^2)    -3132       1066  -2.940  0.00332 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 126800 on 2040 degrees of freedom
## Multiple R-squared:  0.004632,   Adjusted R-squared:  0.003656 
## F-statistic: 4.747 on 2 and 2040 DF,  p-value: 0.008777

4.3 Multivariate Linear Regression

model.mlr1 <- lm(formula = Salary ~ openess_to_experience + agreeableness + extraversion +
                   Quant + Logical + English + ComputerScience + ElectronicsAndSemicon + 
                   TelecomEngg + MechanicalEngg + ComputerProgramming + ElectricalEngg + 
                   CivilEngg + conscientiousness + extraversion + nueroticism + Domain
                   ,
                 
                 data = train_df)
model.mlr1
## 
## Call:
## lm(formula = Salary ~ openess_to_experience + agreeableness + 
##     extraversion + Quant + Logical + English + ComputerScience + 
##     ElectronicsAndSemicon + TelecomEngg + MechanicalEngg + ComputerProgramming + 
##     ElectricalEngg + CivilEngg + conscientiousness + extraversion + 
##     nueroticism + Domain, data = train_df)
## 
## Coefficients:
##           (Intercept)  openess_to_experience          agreeableness  
##              16164.98               -1389.93                5552.95  
##          extraversion                  Quant                Logical  
##               -412.90                 263.67                  57.82  
##               English        ComputerScience  ElectronicsAndSemicon  
##                169.57                -141.89                 -37.90  
##           TelecomEngg         MechanicalEngg    ComputerProgramming  
##                 21.63                 -18.95                  80.08  
##        ElectricalEngg              CivilEngg      conscientiousness  
##                -93.40                 119.37               -3635.72  
##           nueroticism                 Domain  
##              -2025.74               15412.80

4.4 MLR with Interaction

model.int <- lm(formula = Salary ~ openess_to_experience + agreeableness + 
                  openess_to_experience:agreeableness,
                data = train_df)
model.int
## 
## Call:
## lm(formula = Salary ~ openess_to_experience + agreeableness + 
##     openess_to_experience:agreeableness, data = train_df)
## 
## Coefficients:
##                         (Intercept)                openess_to_experience  
##                              283857                                -7850  
##                       agreeableness  openess_to_experience:agreeableness  
##                               12252                                -2500

4.5. MyModel

mymodel <- lm(formula = Salary ~ openess_to_experience + agreeableness + Quant +
                I(openess_to_experience^2) + I(agreeableness^2) + I(Quant^2) +
                openess_to_experience:agreeableness + openess_to_experience:Quant +
                agreeableness:Quant,
              data = train_df)
mymodel
## 
## Call:
## lm(formula = Salary ~ openess_to_experience + agreeableness + 
##     Quant + I(openess_to_experience^2) + I(agreeableness^2) + 
##     I(Quant^2) + openess_to_experience:agreeableness + openess_to_experience:Quant + 
##     agreeableness:Quant, data = train_df)
## 
## Coefficients:
##                         (Intercept)                openess_to_experience  
##                           7.232e+04                           -2.381e+04  
##                       agreeableness                                Quant  
##                           3.252e+04                            4.596e+02  
##          I(openess_to_experience^2)                   I(agreeableness^2)  
##                          -2.246e+03                            3.644e+02  
##                          I(Quant^2)  openess_to_experience:agreeableness  
##                          -8.372e-02                            6.630e+02  
##         openess_to_experience:Quant                  agreeableness:Quant  
##                           3.285e+01                           -4.762e+01

5. Evaluation

##Actual value from test data
actual <- test_df$Salary
## Predicted values using SLR model
pred.slr <- predict(model.slr, test_df)
pred.slr2 <- predict(model.slr2, test_df)
pred.slr3 <- predict(model.slr3, test_df)
pred.poly <- predict(model.poly, test_df)
pred.mlr1 <- predict(model.mlr1, test_df)
pred.int <- predict(model.int, test_df)
pred.mymodel <- predict(mymodel, test_df)
## create dataframe for actual and prediction values
prediction_df <- data.frame(actual, pred.slr, pred.slr2, pred.slr3,
                            pred.poly, pred.mlr1, pred.int, pred.mymodel)

5.1 visualize Actual vc Predicted Values

p1 <- ggplot(data = prediction_df,
             aes(x=actual,
                 y=pred.slr)) +
  geom_point() +
  geom_smooth() +
  scale_x_continuous(limits = c(0, 750000),
                     breaks = c(250000, 500000, 750000),
                     labels = c("250000INR", "500000INR", "$750000INR")) +
  scale_y_continuous(limits = c(0, 750000),
                     breaks = c(250000, 500000, 750000),
                     labels = c("250000INR", "500000INR", "$750000INR")) +
  labs(title = "Simple Linear Regression vs Actual (openess_to_experience)")
p1
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

p2 <- ggplot(data = prediction_df,
             aes(x=actual,
                 y=pred.slr2)) +
  geom_point() +
  geom_smooth() +
  scale_x_continuous(limits = c(0, 750000),
                     breaks = c(250000, 500000, 750000),
                     labels = c("250000INR", "500000INR", "$750000INR")) +
  scale_y_continuous(limits = c(0, 750000),
                     breaks = c(250000, 500000, 750000),
                     labels = c("250000INR", "500000INR", "$750000INR")) +
  labs(title = "Simple Linear Regression vs Actual (agreeableness)")
p2
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

p3 <- ggplot(data = prediction_df,
             aes(x=actual,
                 y=pred.slr3)) +
  geom_point() +
  geom_smooth() +
  scale_x_continuous(limits = c(0, 750000),
                     breaks = c(250000, 500000, 750000),
                     labels = c("250000INR", "500000INR", "$750000INR")) +
  scale_y_continuous(limits = c(0, 750000),
                     breaks = c(250000, 500000, 750000),
                     labels = c("250000INR", "500000INR", "$750000INR")) +
  labs(title = "Simple Linear Regression vs Actual (Quant)")
p3
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

p4 <- ggplot(data = prediction_df,
             aes(x=actual,
                 y=pred.poly)) +
  geom_point() +
  geom_smooth() +
  scale_x_continuous(limits = c(0, 750000),
                     breaks = c(250000, 500000, 750000),
                     labels = c("250000INR", "500000INR", "$750000INR")) +
  scale_y_continuous(limits = c(0, 750000),
                     breaks = c(250000, 500000, 750000),
                     labels = c("250000INR", "500000INR", "$750000INR")) +
  labs(title = "Polynomial Regression vs Actual")
p4
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

p5 <- ggplot(data = prediction_df,
             aes(x=actual,
                 y=pred.mlr1)) +
  geom_point() +
  geom_smooth() +
  scale_x_continuous(limits = c(0, 750000),
                     breaks = c(250000, 500000, 750000),
                     labels = c("250000INR", "500000INR", "$750000INR")) +
  scale_y_continuous(limits = c(0, 750000),
                     breaks = c(250000, 500000, 750000),
                     labels = c("250000INR", "500000INR", "$750000INR")) +
  labs(title = "Multivariate Regression vs Actual")
p5
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

p6 <- ggplot(data = prediction_df,
             aes(x=actual,
                 y=pred.int)) +
  geom_point() +
  geom_smooth() +
  scale_x_continuous(limits = c(0, 750000),
                     breaks = c(250000, 500000, 750000),
                     labels = c("250000INR", "500000INR", "$750000INR")) +
  scale_y_continuous(limits = c(0, 750000),
                     breaks = c(250000, 500000, 750000),
                     labels = c("250000INR", "500000INR", "$750000INR")) +
  labs(title = "MLR with Interaction vs Actual")
p6
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

p7 <- ggplot(data = prediction_df,
             aes(x=actual,
                 y=pred.mymodel)) +
  geom_point() +
  geom_smooth() +
  scale_x_continuous(limits = c(0, 750000),
                     breaks = c(250000, 500000, 750000),
                     labels = c("250000INR", "500000INR", "$750000INR")) +
  scale_y_continuous(limits = c(0, 750000),
                     breaks = c(250000, 500000, 750000),
                     labels = c("250000INR", "500000INR", "$750000INR")) +
  labs(title = "Prediction myModel vs Actual")
p7
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

Multi plot for all Model

multi_plot <- grid.arrange(p1, p2, p3, p4, p5, p6, p7, ncol = 2,
                           top = "Model Predicition")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

5.2. Compute Performance Metrics (RMSE and R)

performance <- function(actual, predicted, model){
  # Root Mean Square Error (RMSE)
  e <- predicted - actual #error
  se <- e^2 # square error
  sse <- sum(se) # sum of square error
  mse <- mean(se) # mean squared error
  rmse <- sqrt(mse) # root mean squared error
  ## Pearson'sCorrelation Coefficient (R)
  r <- cor(predicted, actual)
  
  result <- paste("=== Model: ", model,
                  "\nRoot Mean Squared Error (RMSE): ", round(rmse, 2),
                  "\nCorrelation Coefficient (R): ", round(r,5),
                  "\n\n")
  cat(result)
  #return (rmse)
}
performance(actual, pred.slr, "Single Regression Openess_to_experience")
## === Model:  Single Regression Openess_to_experience 
## Root Mean Squared Error (RMSE):  126171.13 
## Correlation Coefficient (R):  -0.03426
performance(actual, pred.slr2, "Single Regression agreeableness")
## === Model:  Single Regression agreeableness 
## Root Mean Squared Error (RMSE):  125746.19 
## Correlation Coefficient (R):  0.07063
performance(actual, pred.slr3, "Single Regression Quant")
## === Model:  Single Regression Quant 
## Root Mean Squared Error (RMSE):  117501.99 
## Correlation Coefficient (R):  0.36146
performance(actual, pred.poly, "Polynominal Regression")
## === Model:  Polynominal Regression 
## Root Mean Squared Error (RMSE):  125788.36 
## Correlation Coefficient (R):  0.07221
performance(actual, pred.mlr1, "Multivariate Regression")
## === Model:  Multivariate Regression 
## Root Mean Squared Error (RMSE):  112897.5 
## Correlation Coefficient (R):  0.44475
performance(actual, pred.int, "Multivariate Intersaction")
## === Model:  Multivariate Intersaction 
## Root Mean Squared Error (RMSE):  125155.84 
## Correlation Coefficient (R):  0.12393
performance(actual, pred.mymodel, "My Model")
## === Model:  My Model 
## Root Mean Squared Error (RMSE):  117230.18 
## Correlation Coefficient (R):  0.36687

To improve (RMSE) and (R) using Principal Component Analysis (PCA)

# Principal Component Analysis (PCA)
pr.out <- prcomp(salary_df_num, scale = TRUE)
names(pr.out)
## [1] "sdev"     "rotation" "center"   "scale"    "x"
## Variant Explained
sdev <- pr.out$sdev
ve <- sdev ^ 2
ve
##  [1] 2.7675535 2.2626155 1.8463874 1.2917004 1.1195840 1.0549091 0.9942226
##  [8] 0.9827897 0.9441643 0.8813419 0.8226314 0.6587074 0.6184528 0.5991420
## [15] 0.5403762 0.4923130 0.4502987 0.3698562 0.3029541
## Proportion of Variance Explained (PVE)

pve <- ve / sum(ve)
pve
##  [1] 0.14566071 0.11908503 0.09717828 0.06798423 0.05892547 0.05552153
##  [7] 0.05232750 0.05172577 0.04969286 0.04638642 0.04329639 0.03466881
## [13] 0.03255015 0.03153379 0.02844085 0.02591121 0.02369993 0.01946612
## [19] 0.01594495

Plot PVE How many information of original data represented i each principal component

plot(pve,
     xlab = "Principal Component",
     ylab = "PVE",
     type = "b",
     ylim = c(0,1))

How many component (k) should we select

plot(cumsum(pve),
     xlab = "Principal Component",
     ylab = "Cumulative PVE",
     type = "b",
     ylim = c(0,1))

k <- 15
features_df <- pr.out$x[ , 1:k] 
features_df <- data.frame( features_df )
pve
##  [1] 0.14566071 0.11908503 0.09717828 0.06798423 0.05892547 0.05552153
##  [7] 0.05232750 0.05172577 0.04969286 0.04638642 0.04329639 0.03466881
## [13] 0.03255015 0.03153379 0.02844085 0.02591121 0.02369993 0.01946612
## [19] 0.01594495
cumsum(pve)
##  [1] 0.1456607 0.2647457 0.3619240 0.4299082 0.4888337 0.5443553 0.5966828
##  [8] 0.6484085 0.6981014 0.7444878 0.7877842 0.8224530 0.8550032 0.8865369
## [15] 0.9149778 0.9408890 0.9645889 0.9840550 1.0000000
### combine features and target
salary_pca_df <- cbind( salary_df_num$Salary, features_df)
colnames(salary_pca_df)[1] <- "salary"
# for reproducible result
set.seed(2021)
## PCA
m <- nrow(salary_pca_df)
m
## [1] 2920
m_train <- m * 0.7

train_idx <- sample(m, m_train)

train_pca_df <- salary_pca_df[ train_idx, ] 
test_pca_df <- salary_pca_df[ -train_idx, ] 

Modeling PCA

model.slr_pca <- lm(formula = salary ~ .,
                data = train_pca_df)
summary(model.slr_pca)
## 
## Call:
## lm(formula = salary ~ ., data = train_pca_df)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -96171 -15640   -614  15895  80717 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 284692.6      561.8 506.776  < 2e-16 ***
## PC1         -27938.7      323.1 -86.464  < 2e-16 ***
## PC2          39063.9      424.3  92.077  < 2e-16 ***
## PC3         -12665.9      604.8 -20.941  < 2e-16 ***
## PC4         -19597.9     3187.5  -6.148 9.40e-10 ***
## PC5          28276.5      695.8  40.637  < 2e-16 ***
## PC6          20041.9     3593.6   5.577 2.77e-08 ***
## PC7         -16624.3     7969.3  -2.086   0.0371 *  
## PC8          21043.5     5114.4   4.115 4.03e-05 ***
## PC9         -34666.4     3055.8 -11.345  < 2e-16 ***
## PC10        -20829.7     4409.4  -4.724 2.47e-06 ***
## PC11         46076.8     1089.8  42.281  < 2e-16 ***
## PC12         63085.5      632.1  99.801  < 2e-16 ***
## PC13        -32178.0      684.4 -47.019  < 2e-16 ***
## PC14        -23111.2      668.7 -34.563  < 2e-16 ***
## PC15        -32099.3      701.4 -45.762  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 23180 on 2027 degrees of freedom
## Multiple R-squared:  0.9669, Adjusted R-squared:  0.9667 
## F-statistic:  3952 on 15 and 2027 DF,  p-value: < 2.2e-16
pred.slr_pca <- predict(model.slr_pca, test_pca_df)
performance(actual, pred.slr_pca, "LR PCA")
## === Model:  LR PCA 
## Root Mean Squared Error (RMSE):  38833.68 
## Correlation Coefficient (R):  0.95353

Create dataframe PCA

## Create dataframe PCA
prediction_df_pca <- data.frame(pred.slr_pca)

Visualize PCA

## Visualize PCA
pca <- ggplot(data = prediction_df_pca,
             aes(x=actual,
                 y=pred.slr_pca)) +
  geom_point() +
  geom_smooth() +
  scale_x_continuous(limits = c(0, 750000),
                     breaks = c(250000, 500000, 750000),
                     labels = c("250000INR", "500000INR", "$750000INR")) +
  scale_y_continuous(limits = c(0, 750000),
                     breaks = c(250000, 500000, 750000),
                     labels = c("250000INR", "500000INR", "$750000INR")) +
  labs(title = "PCA Visualize")
pca
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_point).