This report provide Salary Graduate Prediction using regression algorithms
the dataset using this report for modeling is real salary data in the India. The dataset is hosted in Kaggle. It can be downloaded here https://www.kaggle.com/manishkc06/engineering-graduate-salary-prediction
The report is structured as Follows:
Import necessary libraries.
rm(list = ls())
library(ggplot2)
library(corrgram)
library(gridExtra)
library(caret)
## Loading required package: lattice
##
## Attaching package: 'lattice'
## The following object is masked from 'package:corrgram':
##
## panel.fill
Library ggplot2: for graphics and visualization . Library corrgram: for visualization of correlation coefficient matrix. Library gridExtra: for plotting multiple graphs. Library caret: for One Hot Encoding. Read Salary Dataset from .csv file from data frame. Then see the dataframe’s structure.
## read data
salary_df <- read.csv("data/Engineering_graduate_salary.csv")
## structure of dataframe
str(salary_df)
## 'data.frame': 2998 obs. of 34 variables:
## $ ID : int 604399 988334 301647 582313 339001 609356 1081649 610842 1183070 794062 ...
## $ Gender : chr "f" "m" "m" "m" ...
## $ DOB : chr "1990-10-22" "1990-05-15" "1989-08-21" "1991-05-04" ...
## $ X10percentage : num 87.8 57 77.3 84.3 82 ...
## $ X10board : chr "cbse" "cbse" "maharashtra state board,pune" "cbse" ...
## $ X12graduation : int 2009 2010 2007 2009 2008 2007 2007 2009 2010 2009 ...
## $ X12percentage : num 84 64.5 85.2 86 75 ...
## $ X12board : chr "cbse" "cbse" "amravati divisional board" "cbse" ...
## $ CollegeID : int 6920 6624 9084 8195 4889 10950 14381 13208 5338 8346 ...
## $ CollegeTier : int 1 2 2 1 2 1 2 2 2 2 ...
## $ Degree : chr "B.Tech/B.E." "B.Tech/B.E." "B.Tech/B.E." "B.Tech/B.E." ...
## $ Specialization : chr "instrumentation and control engineering" "computer science & engineering" "electronics & telecommunications" "computer science & engineering" ...
## $ collegeGPA : num 73.8 65 61.9 80.4 64.3 ...
## $ CollegeCityID : int 6920 6624 9084 8195 4889 10950 14381 13208 5338 8346 ...
## $ CollegeCityTier : int 1 0 0 1 1 0 1 1 0 0 ...
## $ CollegeState : chr "Delhi" "Uttar Pradesh" "Maharashtra" "Delhi" ...
## $ GraduationYear : int 2013 2014 2011 2013 2012 2013 2013 2013 2014 2014 ...
## $ English : int 650 440 485 675 575 535 510 370 510 500 ...
## $ Logical : int 665 435 475 620 495 595 495 470 555 410 ...
## $ Quant : int 810 210 505 635 365 620 405 280 440 560 ...
## $ Domain : num 0.694 0.342 0.825 0.99 0.278 ...
## $ ComputerProgramming : int 485 365 -1 655 315 455 -1 465 525 385 ...
## $ ElectronicsAndSemicon: int 366 -1 400 -1 -1 300 -1 -1 -1 -1 ...
## $ ComputerScience : int -1 -1 -1 -1 -1 -1 -1 -1 438 407 ...
## $ MechanicalEngg : int -1 -1 -1 -1 -1 -1 469 -1 -1 -1 ...
## $ ElectricalEngg : int -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 ...
## $ TelecomEngg : int -1 -1 260 -1 -1 313 -1 -1 -1 -1 ...
## $ CivilEngg : int -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 ...
## $ conscientiousness : num -0.159 1.134 0.51 -0.446 -1.499 ...
## $ agreeableness : num 0.3789 0.0459 -0.1232 0.2124 -0.7473 ...
## $ extraversion : num 1.24 1.24 1.543 0.317 -1.07 ...
## $ nueroticism : num 0.1459 0.5262 -0.2902 0.2727 0.0622 ...
## $ openess_to_experience: num 0.289 -0.286 -0.287 0.48 0.186 ...
## $ Salary : int 445000 110000 255000 420000 200000 440000 150000 105000 195000 200000 ...
The dataset has 2998 observations and 34 variables. The target variable is salary and the remaining variables are candidate features.
Compute statistical summary of each variabel.
## statistical summary
summary(salary_df)
## ID Gender DOB X10percentage
## Min. : 11244 Length:2998 Length:2998 Min. :43.00
## 1st Qu.: 333465 Class :character Class :character 1st Qu.:71.14
## Median : 639694 Mode :character Mode :character Median :78.97
## Mean : 664893 Mean :77.67
## 3rd Qu.: 995177 3rd Qu.:85.60
## Max. :1297877 Max. :97.76
## X10board X12graduation X12percentage X12board
## Length:2998 Min. :1998 Min. :40.00 Length:2998
## Class :character 1st Qu.:2007 1st Qu.:66.00 Class :character
## Mode :character Median :2008 Median :74.00 Mode :character
## Mean :2008 Mean :74.34
## 3rd Qu.:2009 3rd Qu.:82.60
## Max. :2012 Max. :98.70
## CollegeID CollegeTier Degree Specialization
## Min. : 2.0 Min. :1.000 Length:2998 Length:2998
## 1st Qu.: 526.2 1st Qu.:2.000 Class :character Class :character
## Median : 4027.5 Median :2.000 Mode :character Mode :character
## Mean : 5210.2 Mean :1.925
## 3rd Qu.: 8822.2 3rd Qu.:2.000
## Max. :18409.0 Max. :2.000
## collegeGPA CollegeCityID CollegeCityTier CollegeState
## Min. : 6.63 Min. : 2.0 Min. :0.0000 Length:2998
## 1st Qu.:66.53 1st Qu.: 526.2 1st Qu.:0.0000 Class :character
## Median :71.80 Median : 4027.5 Median :0.0000 Mode :character
## Mean :71.51 Mean : 5210.2 Mean :0.2962
## 3rd Qu.:76.30 3rd Qu.: 8822.2 3rd Qu.:1.0000
## Max. :99.93 Max. :18409.0 Max. :1.0000
## GraduationYear English Logical Quant
## Min. : 0 Min. :180.0 Min. :195.0 Min. :120.0
## 1st Qu.:2012 1st Qu.:425.0 1st Qu.:441.2 1st Qu.:430.0
## Median :2013 Median :500.0 Median :505.0 Median :515.0
## Mean :2012 Mean :501.1 Mean :500.4 Mean :514.1
## 3rd Qu.:2014 3rd Qu.:570.0 3rd Qu.:565.0 3rd Qu.:595.0
## Max. :2017 Max. :875.0 Max. :795.0 Max. :900.0
## Domain ComputerProgramming ElectronicsAndSemicon ComputerScience
## Min. :-1.0000 Min. : -1.0 Min. : -1.00 Min. : -1.00
## 1st Qu.: 0.3423 1st Qu.:295.0 1st Qu.: -1.00 1st Qu.: -1.00
## Median : 0.6226 Median :415.0 Median : -1.00 Median : -1.00
## Mean : 0.5085 Mean :351.9 Mean : 96.23 Mean : 94.15
## 3rd Qu.: 0.8356 3rd Qu.:495.0 3rd Qu.:233.00 3rd Qu.: -1.00
## Max. : 0.9999 Max. :804.0 Max. :612.00 Max. :715.00
## MechanicalEngg ElectricalEngg TelecomEngg CivilEngg
## Min. : -1.00 Min. : -1.00 Min. : -1.00 Min. : -1.000
## 1st Qu.: -1.00 1st Qu.: -1.00 1st Qu.: -1.00 1st Qu.: -1.000
## Median : -1.00 Median : -1.00 Median : -1.00 Median : -1.000
## Mean : 24.14 Mean : 16.27 Mean : 31.07 Mean : 1.947
## 3rd Qu.: -1.00 3rd Qu.: -1.00 3rd Qu.: -1.00 3rd Qu.: -1.000
## Max. :623.00 Max. :660.00 Max. :548.00 Max. :500.000
## conscientiousness agreeableness extraversion nueroticism
## Min. :-3.89330 Min. :-5.7816 Min. :-4.600900 Min. :-2.6430
## 1st Qu.:-0.64910 1st Qu.:-0.4353 1st Qu.:-0.604800 1st Qu.:-0.8682
## Median : 0.04640 Median : 0.2124 Median : 0.091400 Median :-0.1727
## Mean :-0.03871 Mean : 0.1262 Mean :-0.008662 Mean :-0.1460
## 3rd Qu.: 0.70270 3rd Qu.: 0.8128 3rd Qu.: 0.672000 3rd Qu.: 0.5262
## Max. : 1.99530 Max. : 1.9048 Max. : 2.161700 Max. : 3.3525
## openess_to_experience Salary
## Min. :-7.3757 Min. : 35000
## 1st Qu.:-0.6692 1st Qu.: 180000
## Median :-0.0943 Median : 300000
## Mean :-0.1411 Mean : 305175
## 3rd Qu.: 0.5024 3rd Qu.: 370000
## Max. : 1.6302 Max. :4000000
We can see the minimum, median, and maximum values of each numeric variable. We can also notice that the maximum values of salary is statistically far away for median and third quantile. This could be an outlier.
Plot distribution of salary using boxplot.
ggplot(data = salary_df, aes(y = Salary)) +
geom_boxplot() +
scale_y_continuous(limits = c(0, 1000000))
Based on boxplot above we can see taht there are outliers in salary.
Plot salary based on openess_to_experience.
ggplot(data = salary_df, aes(x = openess_to_experience,
y = Salary)) +
geom_point() +
scale_y_continuous(limits = c(0, 1000000))
Based on salary by openess_to_experience plot, we can see in general, the higher openess_to_experience, the higher the price.
Compute Person Correlation Coefficient (R) among all numerical variables. Then, visualize the result in a diagram.
salary_df_num <- salary_df[ ,c(13, 17:34)]
r <- cor(salary_df_num)
corrgram(salary_df_num, order = TRUE,
upper.panel = panel.cor)
For target variable (salary), the variables with hight correlation in order are openess_to_experience (0.59), Quant(0.51), and conscientious(0.49)
Among features, several variables are highly correlated. For example, extraversion and logical(0.44)
Removing outliers in salary
## Get outliers
out_salary <- boxplot.stats(salary_df_num$Salary)$out
## Get index of outliers
out_idx <- which(salary_df_num$Salary %in% c(out_salary))
## remove outlier
salary_df_num <- salary_df_num[ -out_idx,]
boxplot.stats(salary_df_num$Salary)
## $stats
## [1] 35000 180000 300000 360000 630000
##
## $n
## [1] 2920
##
## $conf
## [1] 294736.9 305263.1
##
## $out
## [1] 650000 650000 650000 650000 640000 650000 650000 645000 655000 645000
## [11] 655000 650000 650000 640000 655000
In dataset there are value (-1), we change (-1) to NA then NA to Mean to clean the data
Change NA to Mean (Domain)
## get index of NAs
idx_na_d <- which(is.na(salary_df_num$Domain))
## compute mean
mean_d <- mean(salary_df_num$Domain, na.rm = TRUE)
## input NAs with mean
salary_df_num$Domain[ idx_na_d ] <- mean_d
Change NA to Mean (ComputerProgramming) 2
## get index of NAs
idx_na_cp <- which(is.na(salary_df_num$ComputerProgramming))
## compute mean
mean_cp <- mean(salary_df_num$ComputerProgramming, na.rm = TRUE)
## input NAs with mean
salary_df_num$ComputerProgramming[ idx_na_cp ] <- mean_cp
Change NA to Mean (ElectronicsAndSemicon) 3
## get index of NAs
idx_na_electro <- which(is.na(salary_df_num$ElectronicsAndSemicon))
## compute mean
mean_electro <- mean(salary_df_num$ElectronicsAndSemicon, na.rm = TRUE)
## input NAs with mean
salary_df_num$ElectronicsAndSemicon[ idx_na_electro ] <- mean_electro
Change NA to Mean (Computer Science) 4
## get index of NAs
idx_na_cmptr <- which(is.na(salary_df_num$ComputerScience))
## compute mean
mean_cmptr <- mean(salary_df_num$ComputerScience, na.rm = TRUE)
## input NAs with mean
salary_df_num$ComputerScience[ idx_na_cmptr ] <- mean_cmptr
Change NA to Mean (MechanicalEngg) 5
## get index of NAs
idx_na_me <- which(is.na(salary_df_num$MechanicalEngg))
## compute mean
mean_me <- mean(salary_df_num$MechanicalEngg, na.rm = TRUE)
## input NAs with mean
salary_df_num$MechanicalEngg[ idx_na_me ] <- mean_me
Change NA to Mean (ElectricalEngg) 6
## get index of NAs
idx_na_ee <- which(is.na(salary_df_num$ElectricalEngg))
## compute mean
mean_ee <- mean(salary_df_num$ElectricalEngg, na.rm = TRUE)
## input NAs with mean
salary_df_num$ElectricalEngg[ idx_na_ee ] <- mean_ee
Change NA to Mean (TelecomEngg) 7
## get index of NAs
idx_na_te <- which(is.na(salary_df_num$TelecomEngg))
## compute mean
mean_te <- mean(salary_df_num$TelecomEngg, na.rm = TRUE)
## input NAs with mean
salary_df_num$TelecomEngg[ idx_na_te ] <- mean_te
Change NA to Mean (CivilEngg) 8
## get index of NAs
idx_na_ce <- which(is.na(salary_df_num$CivilEngg))
## compute mean
mean_ce <- mean(salary_df_num$CivilEngg, na.rm = TRUE)
## input NAs with mean
salary_df_num$CivilEngg[ idx_na_ce ] <- mean_ce
Delete Column (unique)
# Delete Column
salary_df$ID <- NULL
salary_df$DOB <- NULL
dim(salary_df_num)
## [1] 2920 19
traint:test = 70:30
# for reproducible result
set.seed(2021)
m <- nrow(salary_df_num)
m
## [1] 2920
m_train <- m * 0.7
train_idx <- sample(m, m_train)
train_df <- salary_df_num[ train_idx, ]
test_df <- salary_df_num[ -train_idx, ]
model.slr <- lm(formula = Salary ~ openess_to_experience,
data = train_df)
summary(model.slr)
##
## Call:
## lm(formula = Salary ~ openess_to_experience, data = train_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -247863 -103881 13251 74330 370641
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 285550 2842 100.492 <2e-16 ***
## openess_to_experience 2494 2707 0.921 0.357
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 127000 on 2041 degrees of freedom
## Multiple R-squared: 0.0004158, Adjusted R-squared: -7.393e-05
## F-statistic: 0.849 on 1 and 2041 DF, p-value: 0.3569
model.slr2 <- lm(formula = Salary ~ agreeableness,
data = train_df)
summary(model.slr2)
##
## Call:
## lm(formula = Salary ~ agreeableness, data = train_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -256847 -100772 7690 74304 375984
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 283800 2827 100.403 < 2e-16 ***
## agreeableness 10548 2890 3.649 0.00027 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 126600 on 2041 degrees of freedom
## Multiple R-squared: 0.006482, Adjusted R-squared: 0.005995
## F-statistic: 13.32 on 1 and 2041 DF, p-value: 0.0002698
model.slr3 <- lm(formula = Salary ~ Quant,
data = train_df)
summary(model.slr3)
##
## Call:
## lm(formula = Salary ~ Quant, data = train_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -272286 -88924 -9696 75421 414602
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 95423.72 11453.37 8.331 <2e-16 ***
## Quant 369.15 21.69 17.021 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 118900 on 2041 degrees of freedom
## Multiple R-squared: 0.1243, Adjusted R-squared: 0.1239
## F-statistic: 289.7 on 1 and 2041 DF, p-value: < 2.2e-16
model.poly <- lm(formula = Salary ~ openess_to_experience + I(openess_to_experience^2),
data = train_df)
summary(model.poly)
##
## Call:
## lm(formula = Salary ~ openess_to_experience + I(openess_to_experience^2),
## data = train_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -253623 -103347 11404 74175 372335
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 288002 2956 97.418 < 2e-16 ***
## openess_to_experience -3951 3480 -1.135 0.25631
## I(openess_to_experience^2) -3132 1066 -2.940 0.00332 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 126800 on 2040 degrees of freedom
## Multiple R-squared: 0.004632, Adjusted R-squared: 0.003656
## F-statistic: 4.747 on 2 and 2040 DF, p-value: 0.008777
model.mlr1 <- lm(formula = Salary ~ openess_to_experience + agreeableness + extraversion +
Quant + Logical + English + ComputerScience + ElectronicsAndSemicon +
TelecomEngg + MechanicalEngg + ComputerProgramming + ElectricalEngg +
CivilEngg + conscientiousness + extraversion + nueroticism + Domain
,
data = train_df)
model.mlr1
##
## Call:
## lm(formula = Salary ~ openess_to_experience + agreeableness +
## extraversion + Quant + Logical + English + ComputerScience +
## ElectronicsAndSemicon + TelecomEngg + MechanicalEngg + ComputerProgramming +
## ElectricalEngg + CivilEngg + conscientiousness + extraversion +
## nueroticism + Domain, data = train_df)
##
## Coefficients:
## (Intercept) openess_to_experience agreeableness
## 16164.98 -1389.93 5552.95
## extraversion Quant Logical
## -412.90 263.67 57.82
## English ComputerScience ElectronicsAndSemicon
## 169.57 -141.89 -37.90
## TelecomEngg MechanicalEngg ComputerProgramming
## 21.63 -18.95 80.08
## ElectricalEngg CivilEngg conscientiousness
## -93.40 119.37 -3635.72
## nueroticism Domain
## -2025.74 15412.80
model.int <- lm(formula = Salary ~ openess_to_experience + agreeableness +
openess_to_experience:agreeableness,
data = train_df)
model.int
##
## Call:
## lm(formula = Salary ~ openess_to_experience + agreeableness +
## openess_to_experience:agreeableness, data = train_df)
##
## Coefficients:
## (Intercept) openess_to_experience
## 283857 -7850
## agreeableness openess_to_experience:agreeableness
## 12252 -2500
mymodel <- lm(formula = Salary ~ openess_to_experience + agreeableness + Quant +
I(openess_to_experience^2) + I(agreeableness^2) + I(Quant^2) +
openess_to_experience:agreeableness + openess_to_experience:Quant +
agreeableness:Quant,
data = train_df)
mymodel
##
## Call:
## lm(formula = Salary ~ openess_to_experience + agreeableness +
## Quant + I(openess_to_experience^2) + I(agreeableness^2) +
## I(Quant^2) + openess_to_experience:agreeableness + openess_to_experience:Quant +
## agreeableness:Quant, data = train_df)
##
## Coefficients:
## (Intercept) openess_to_experience
## 7.232e+04 -2.381e+04
## agreeableness Quant
## 3.252e+04 4.596e+02
## I(openess_to_experience^2) I(agreeableness^2)
## -2.246e+03 3.644e+02
## I(Quant^2) openess_to_experience:agreeableness
## -8.372e-02 6.630e+02
## openess_to_experience:Quant agreeableness:Quant
## 3.285e+01 -4.762e+01
##Actual value from test data
actual <- test_df$Salary
## Predicted values using SLR model
pred.slr <- predict(model.slr, test_df)
pred.slr2 <- predict(model.slr2, test_df)
pred.slr3 <- predict(model.slr3, test_df)
pred.poly <- predict(model.poly, test_df)
pred.mlr1 <- predict(model.mlr1, test_df)
pred.int <- predict(model.int, test_df)
pred.mymodel <- predict(mymodel, test_df)
## create dataframe for actual and prediction values
prediction_df <- data.frame(actual, pred.slr, pred.slr2, pred.slr3,
pred.poly, pred.mlr1, pred.int, pred.mymodel)
p1 <- ggplot(data = prediction_df,
aes(x=actual,
y=pred.slr)) +
geom_point() +
geom_smooth() +
scale_x_continuous(limits = c(0, 750000),
breaks = c(250000, 500000, 750000),
labels = c("250000INR", "500000INR", "$750000INR")) +
scale_y_continuous(limits = c(0, 750000),
breaks = c(250000, 500000, 750000),
labels = c("250000INR", "500000INR", "$750000INR")) +
labs(title = "Simple Linear Regression vs Actual (openess_to_experience)")
p1
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
p2 <- ggplot(data = prediction_df,
aes(x=actual,
y=pred.slr2)) +
geom_point() +
geom_smooth() +
scale_x_continuous(limits = c(0, 750000),
breaks = c(250000, 500000, 750000),
labels = c("250000INR", "500000INR", "$750000INR")) +
scale_y_continuous(limits = c(0, 750000),
breaks = c(250000, 500000, 750000),
labels = c("250000INR", "500000INR", "$750000INR")) +
labs(title = "Simple Linear Regression vs Actual (agreeableness)")
p2
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
p3 <- ggplot(data = prediction_df,
aes(x=actual,
y=pred.slr3)) +
geom_point() +
geom_smooth() +
scale_x_continuous(limits = c(0, 750000),
breaks = c(250000, 500000, 750000),
labels = c("250000INR", "500000INR", "$750000INR")) +
scale_y_continuous(limits = c(0, 750000),
breaks = c(250000, 500000, 750000),
labels = c("250000INR", "500000INR", "$750000INR")) +
labs(title = "Simple Linear Regression vs Actual (Quant)")
p3
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
p4 <- ggplot(data = prediction_df,
aes(x=actual,
y=pred.poly)) +
geom_point() +
geom_smooth() +
scale_x_continuous(limits = c(0, 750000),
breaks = c(250000, 500000, 750000),
labels = c("250000INR", "500000INR", "$750000INR")) +
scale_y_continuous(limits = c(0, 750000),
breaks = c(250000, 500000, 750000),
labels = c("250000INR", "500000INR", "$750000INR")) +
labs(title = "Polynomial Regression vs Actual")
p4
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
p5 <- ggplot(data = prediction_df,
aes(x=actual,
y=pred.mlr1)) +
geom_point() +
geom_smooth() +
scale_x_continuous(limits = c(0, 750000),
breaks = c(250000, 500000, 750000),
labels = c("250000INR", "500000INR", "$750000INR")) +
scale_y_continuous(limits = c(0, 750000),
breaks = c(250000, 500000, 750000),
labels = c("250000INR", "500000INR", "$750000INR")) +
labs(title = "Multivariate Regression vs Actual")
p5
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
p6 <- ggplot(data = prediction_df,
aes(x=actual,
y=pred.int)) +
geom_point() +
geom_smooth() +
scale_x_continuous(limits = c(0, 750000),
breaks = c(250000, 500000, 750000),
labels = c("250000INR", "500000INR", "$750000INR")) +
scale_y_continuous(limits = c(0, 750000),
breaks = c(250000, 500000, 750000),
labels = c("250000INR", "500000INR", "$750000INR")) +
labs(title = "MLR with Interaction vs Actual")
p6
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
p7 <- ggplot(data = prediction_df,
aes(x=actual,
y=pred.mymodel)) +
geom_point() +
geom_smooth() +
scale_x_continuous(limits = c(0, 750000),
breaks = c(250000, 500000, 750000),
labels = c("250000INR", "500000INR", "$750000INR")) +
scale_y_continuous(limits = c(0, 750000),
breaks = c(250000, 500000, 750000),
labels = c("250000INR", "500000INR", "$750000INR")) +
labs(title = "Prediction myModel vs Actual")
p7
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
Multi plot for all Model
multi_plot <- grid.arrange(p1, p2, p3, p4, p5, p6, p7, ncol = 2,
top = "Model Predicition")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
performance <- function(actual, predicted, model){
# Root Mean Square Error (RMSE)
e <- predicted - actual #error
se <- e^2 # square error
sse <- sum(se) # sum of square error
mse <- mean(se) # mean squared error
rmse <- sqrt(mse) # root mean squared error
## Pearson'sCorrelation Coefficient (R)
r <- cor(predicted, actual)
result <- paste("=== Model: ", model,
"\nRoot Mean Squared Error (RMSE): ", round(rmse, 2),
"\nCorrelation Coefficient (R): ", round(r,5),
"\n\n")
cat(result)
#return (rmse)
}
performance(actual, pred.slr, "Single Regression Openess_to_experience")
## === Model: Single Regression Openess_to_experience
## Root Mean Squared Error (RMSE): 126171.13
## Correlation Coefficient (R): -0.03426
performance(actual, pred.slr2, "Single Regression agreeableness")
## === Model: Single Regression agreeableness
## Root Mean Squared Error (RMSE): 125746.19
## Correlation Coefficient (R): 0.07063
performance(actual, pred.slr3, "Single Regression Quant")
## === Model: Single Regression Quant
## Root Mean Squared Error (RMSE): 117501.99
## Correlation Coefficient (R): 0.36146
performance(actual, pred.poly, "Polynominal Regression")
## === Model: Polynominal Regression
## Root Mean Squared Error (RMSE): 125788.36
## Correlation Coefficient (R): 0.07221
performance(actual, pred.mlr1, "Multivariate Regression")
## === Model: Multivariate Regression
## Root Mean Squared Error (RMSE): 112897.5
## Correlation Coefficient (R): 0.44475
performance(actual, pred.int, "Multivariate Intersaction")
## === Model: Multivariate Intersaction
## Root Mean Squared Error (RMSE): 125155.84
## Correlation Coefficient (R): 0.12393
performance(actual, pred.mymodel, "My Model")
## === Model: My Model
## Root Mean Squared Error (RMSE): 117230.18
## Correlation Coefficient (R): 0.36687
To improve (RMSE) and (R) using Principal Component Analysis (PCA)
# Principal Component Analysis (PCA)
pr.out <- prcomp(salary_df_num, scale = TRUE)
names(pr.out)
## [1] "sdev" "rotation" "center" "scale" "x"
## Variant Explained
sdev <- pr.out$sdev
ve <- sdev ^ 2
ve
## [1] 2.7675535 2.2626155 1.8463874 1.2917004 1.1195840 1.0549091 0.9942226
## [8] 0.9827897 0.9441643 0.8813419 0.8226314 0.6587074 0.6184528 0.5991420
## [15] 0.5403762 0.4923130 0.4502987 0.3698562 0.3029541
## Proportion of Variance Explained (PVE)
pve <- ve / sum(ve)
pve
## [1] 0.14566071 0.11908503 0.09717828 0.06798423 0.05892547 0.05552153
## [7] 0.05232750 0.05172577 0.04969286 0.04638642 0.04329639 0.03466881
## [13] 0.03255015 0.03153379 0.02844085 0.02591121 0.02369993 0.01946612
## [19] 0.01594495
Plot PVE How many information of original data represented i each principal component
plot(pve,
xlab = "Principal Component",
ylab = "PVE",
type = "b",
ylim = c(0,1))
How many component (k) should we select
plot(cumsum(pve),
xlab = "Principal Component",
ylab = "Cumulative PVE",
type = "b",
ylim = c(0,1))
k <- 15
features_df <- pr.out$x[ , 1:k]
features_df <- data.frame( features_df )
pve
## [1] 0.14566071 0.11908503 0.09717828 0.06798423 0.05892547 0.05552153
## [7] 0.05232750 0.05172577 0.04969286 0.04638642 0.04329639 0.03466881
## [13] 0.03255015 0.03153379 0.02844085 0.02591121 0.02369993 0.01946612
## [19] 0.01594495
cumsum(pve)
## [1] 0.1456607 0.2647457 0.3619240 0.4299082 0.4888337 0.5443553 0.5966828
## [8] 0.6484085 0.6981014 0.7444878 0.7877842 0.8224530 0.8550032 0.8865369
## [15] 0.9149778 0.9408890 0.9645889 0.9840550 1.0000000
### combine features and target
salary_pca_df <- cbind( salary_df_num$Salary, features_df)
colnames(salary_pca_df)[1] <- "salary"
# for reproducible result
set.seed(2021)
## PCA
m <- nrow(salary_pca_df)
m
## [1] 2920
m_train <- m * 0.7
train_idx <- sample(m, m_train)
train_pca_df <- salary_pca_df[ train_idx, ]
test_pca_df <- salary_pca_df[ -train_idx, ]
Modeling PCA
model.slr_pca <- lm(formula = salary ~ .,
data = train_pca_df)
summary(model.slr_pca)
##
## Call:
## lm(formula = salary ~ ., data = train_pca_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -96171 -15640 -614 15895 80717
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 284692.6 561.8 506.776 < 2e-16 ***
## PC1 -27938.7 323.1 -86.464 < 2e-16 ***
## PC2 39063.9 424.3 92.077 < 2e-16 ***
## PC3 -12665.9 604.8 -20.941 < 2e-16 ***
## PC4 -19597.9 3187.5 -6.148 9.40e-10 ***
## PC5 28276.5 695.8 40.637 < 2e-16 ***
## PC6 20041.9 3593.6 5.577 2.77e-08 ***
## PC7 -16624.3 7969.3 -2.086 0.0371 *
## PC8 21043.5 5114.4 4.115 4.03e-05 ***
## PC9 -34666.4 3055.8 -11.345 < 2e-16 ***
## PC10 -20829.7 4409.4 -4.724 2.47e-06 ***
## PC11 46076.8 1089.8 42.281 < 2e-16 ***
## PC12 63085.5 632.1 99.801 < 2e-16 ***
## PC13 -32178.0 684.4 -47.019 < 2e-16 ***
## PC14 -23111.2 668.7 -34.563 < 2e-16 ***
## PC15 -32099.3 701.4 -45.762 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 23180 on 2027 degrees of freedom
## Multiple R-squared: 0.9669, Adjusted R-squared: 0.9667
## F-statistic: 3952 on 15 and 2027 DF, p-value: < 2.2e-16
pred.slr_pca <- predict(model.slr_pca, test_pca_df)
performance(actual, pred.slr_pca, "LR PCA")
## === Model: LR PCA
## Root Mean Squared Error (RMSE): 38833.68
## Correlation Coefficient (R): 0.95353
Create dataframe PCA
## Create dataframe PCA
prediction_df_pca <- data.frame(pred.slr_pca)
Visualize PCA
## Visualize PCA
pca <- ggplot(data = prediction_df_pca,
aes(x=actual,
y=pred.slr_pca)) +
geom_point() +
geom_smooth() +
scale_x_continuous(limits = c(0, 750000),
breaks = c(250000, 500000, 750000),
labels = c("250000INR", "500000INR", "$750000INR")) +
scale_y_continuous(limits = c(0, 750000),
breaks = c(250000, 500000, 750000),
labels = c("250000INR", "500000INR", "$750000INR")) +
labs(title = "PCA Visualize")
pca
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_point).