# Attach psych first so describe() is available for the summary below.
library(psych)
## Warning: package 'psych' was built under R version 3.4.3
# Read the MBA starting-salaries survey from its CSV file.
MBA <- read.csv(
  file = "E:/Documents/internship-R/MBA Starting Salaries Data.csv"
)
# Print per-column summary statistics (n, mean, sd, median, ...).
describe(MBA)
## vars n mean sd median trimmed mad min max
## age 1 274 27.36 3.71 27 26.76 2.97 22 48
## sex 2 274 1.25 0.43 1 1.19 0.00 1 2
## gmat_tot 3 274 619.45 57.54 620 618.86 59.30 450 790
## gmat_qpc 4 274 80.64 14.87 83 82.31 14.83 28 99
## gmat_vpc 5 274 78.32 16.86 81 80.33 14.83 16 99
## gmat_tpc 6 274 84.20 14.02 87 86.12 11.86 0 99
## s_avg 7 274 3.03 0.38 3 3.03 0.44 2 4
## f_avg 8 274 3.06 0.53 3 3.09 0.37 0 4
## quarter 9 274 2.48 1.11 2 2.47 1.48 1 4
## work_yrs 10 274 3.87 3.23 3 3.29 1.48 0 22
## frstlang 11 274 1.12 0.32 1 1.02 0.00 1 2
## salary 12 274 39025.69 50951.56 999 33607.86 1481.12 0 220000
## satis 13 274 172.18 371.61 6 91.50 1.48 1 998
## range skew kurtosis se
## age 26 2.16 6.45 0.22
## sex 1 1.16 -0.66 0.03
## gmat_tot 340 -0.01 0.06 3.48
## gmat_qpc 71 -0.92 0.30 0.90
## gmat_vpc 83 -1.04 0.74 1.02
## gmat_tpc 99 -2.28 9.02 0.85
## s_avg 2 -0.06 -0.38 0.02
## f_avg 4 -2.08 10.85 0.03
## quarter 3 0.02 -1.35 0.07
## work_yrs 22 2.78 9.80 0.20
## frstlang 1 2.37 3.65 0.02
## salary 220000 0.70 -1.05 3078.10
## satis 997 1.77 1.13 22.45
DESCRIPTIVE PLOTS
# Descriptive plots for the main demographic and score variables.

# Age distribution.
hist(MBA$age, breaks = 20, col = "red", xlab = "Age in years",
     main = "Age distribution")

# sex is stored as a numeric code (the summary above shows only 1 and 2), so
# plot() would draw the codes against the row index. Tabulate first so the
# chart really shows the number of respondents per code.
barplot(table(MBA$sex), main = "Graph showing number of Males and Females",
        col = "green")

# Work-experience distribution.
hist(MBA$work_yrs, breaks = 20, col = "grey",
     xlab = "Work Experience in years",
     main = "Work experience distribution")

# Total GMAT score distribution.
hist(MBA$gmat_tot, breaks = 40, col = "dark green",
     xlab = "score out of 800", main = "Gmat Score distribution")

# frstlang is also a numeric 1/2 code, so it is tabulated like sex.
barplot(table(MBA$frstlang), main = "First Language Distribution",
        col = "orange")

# Keep only rows on the 1-7 satisfaction scale (the summary shows values up
# to 998, which are not on that scale). Compare against the number 7: with
# the quoted '7' R compares the values as strings, so e.g. 10 would count
# as <= 7.
newdata <- MBA[which(MBA$satis <= 7), ]
hist(newdata$satis, breaks = 5, col = "magenta",
     xlab = "Degree of Satisfaction,1=low 7=high",
     main = "Satisfaction distribution")
SCATTER PLOTS
library(car)
## Warning: package 'car' was built under R version 3.4.3
##
## Attaching package: 'car'
## The following object is masked from 'package:psych':
##
## logit
# Scatterplot matrix of salary against the other survey variables.
# NOTE(review): this still uses the raw MBA data, so the 998/999 salary
# values and the out-of-scale satis values are plotted as-is -- confirm
# whether they should be filtered here as well.
pairs(~ salary + sex + age + gmat_tpc + frstlang + satis + work_yrs,
      data = MBA, main = "Comparision of Salary and other variables")

# Drop rows whose salary is 998 or 999 (presumably non-response codes rather
# than real salaries -- confirm against the data dictionary). The values are
# compared as numbers; comparing against the strings "998"/"999" forces a
# character comparison of the whole column.
newdata1 <- MBA[which(MBA$salary != 998 & MBA$salary != 999), ]

# Salary against age. Drawn once (the original repeated this plot verbatim)
# and on the filtered data so it is consistent with the plot below.
scatterplot(salary ~ age, data = newdata1,
            spread = FALSE, smoother.args = list(lty = 2),
            main = "Scatter plot of salary vs age",
            xlab = "age",
            ylab = "salary")

# Salary against years of work experience.
scatterplot(salary ~ work_yrs, data = newdata1,
            main = "Scatter plot of salary vs Work exp.",
            xlab = "Work experience in years",
            ylab = "salary")
library(corrgram)
## Warning: package 'corrgram' was built under R version 3.4.3
# Correlogram of every column in the salary-filtered data: shaded cells in
# the lower triangle, pie glyphs in the upper triangle, variable names on
# the diagonal.
corrgram(
  newdata1,
  order = TRUE,
  lower.panel = panel.shade,
  upper.panel = panel.pie,
  text.panel = panel.txt,
  main = "MBA starting salary analysis Correlogram"
)
CONTINGENCY TABLES
# Mean salary for each distinct age.
# NOTE(review): computed on the raw MBA data, so the 998/999 salary values
# that are filtered out for the scatter plots are averaged in here --
# confirm whether newdata1 should be used instead.
aggregate(salary ~ age, data = MBA, FUN = mean)
## age salary
## 1 22 42500.00
## 2 23 57282.00
## 3 24 49342.24
## 4 25 43395.55
## 5 26 35982.07
## 6 27 31499.37
## 7 28 39809.00
## 8 29 28067.95
## 9 30 55291.25
## 10 31 40599.40
## 11 32 13662.25
## 12 33 118000.00
## 13 34 26250.00
## 14 35 0.00
## 15 36 0.00
## 16 37 0.00
## 17 39 56000.00
## 18 40 183000.00
## 19 42 0.00
## 20 43 0.00
## 21 48 0.00
# Pearson chi-squared test of independence on the salary-by-age table.
# NOTE(review): both variables are numeric with many distinct values, so the
# table is sparse (df = 880 in the output below) and R warns that the
# approximation may be incorrect; a correlation test may be more
# appropriate -- confirm.
chisq.test(table(MBA$salary,MBA$age))
## Warning in chisq.test(table(MBA$salary, MBA$age)): Chi-squared
## approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: table(MBA$salary, MBA$age)
## X-squared = 1114.2, df = 880, p-value = 1.178e-07
SALARY AND EXPERIENCE
# Mean salary for each distinct number of years of work experience.
# NOTE(review): uses the raw MBA data, so the 998/999 salary values that are
# filtered out for the scatter plots are averaged in (e.g. the 499.50 mean
# in the output below) -- confirm whether they should be excluded.
aggregate(MBA$salary~MBA$work_yrs,FUN=mean)
## MBA$work_yrs MBA$salary
## 1 0 31999.67
## 2 1 34677.08
## 3 2 45531.24
## 4 3 38494.21
## 5 4 27510.81
## 6 5 34476.10
## 7 6 62041.33
## 8 7 11221.78
## 9 8 60156.86
## 10 9 499.50
## 11 10 59000.00
## 12 11 0.00
## 13 12 0.00
## 14 13 0.00
## 15 15 183000.00
## 16 16 72333.33
## 17 18 0.00
## 18 22 0.00
# Pearson chi-squared test of independence on the salary-by-work_yrs table.
# NOTE(review): the table is sparse (df = 748 in the output below) and R
# warns that the approximation may be incorrect; treat the p-value with
# caution -- a correlation test may be more appropriate, confirm.
chisq.test(table(MBA$salary,MBA$work_yrs))
## Warning in chisq.test(table(MBA$salary, MBA$work_yrs)): Chi-squared
## approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: table(MBA$salary, MBA$work_yrs)
## X-squared = 846.05, df = 748, p-value = 0.007162
SALARY AND GMAT SCORES
# Mean salary for each distinct total GMAT score.
# NOTE(review): uses the raw MBA data, so the 998/999 salary values that are
# filtered out for the scatter plots are averaged in (several group means in
# the output below are at or under 999) -- confirm whether they should be
# excluded.
aggregate(MBA$salary~MBA$gmat_tot,FUN=mean)
## MBA$gmat_tot MBA$salary
## 1 450 499.000
## 2 460 998.000
## 3 480 0.000
## 4 500 105833.000
## 5 510 0.000
## 6 520 78256.000
## 7 530 39800.000
## 8 540 41600.000
## 9 550 42213.625
## 10 560 36047.238
## 11 570 40610.889
## 12 580 53466.333
## 13 590 21999.333
## 14 600 48849.350
## 15 610 26944.000
## 16 620 62664.800
## 17 630 38885.636
## 18 640 9582.667
## 19 650 44562.125
## 20 660 33456.500
## 21 670 41793.471
## 22 680 51332.917
## 23 690 998.500
## 24 700 73400.000
## 25 710 40699.700
## 26 720 21499.500
## 27 730 499.500
## 28 740 748.750
## 29 750 0.000
## 30 760 0.000
## 31 790 999.000
# Pearson chi-squared test of independence on the salary-by-gmat_tot table.
# NOTE(review): the table is sparse (df = 1320 in the output below) and R
# warns that the approximation may be incorrect; treat the p-value with
# caution -- a correlation test may be more appropriate, confirm.
chisq.test(table(MBA$salary,MBA$gmat_tot))
## Warning in chisq.test(table(MBA$salary, MBA$gmat_tot)): Chi-squared
## approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: table(MBA$salary, MBA$gmat_tot)
## X-squared = 1267.7, df = 1320, p-value = 0.8456
REGRESSION ANALYSIS
MODEL 1
# Model 1: linear regression of salary on the four GMAT measures.
# NOTE(review): fitted on the raw MBA data, so the 998/999 salary values
# that are filtered out for the scatter plots are treated as real salaries
# -- confirm.
mod1 <- lm(
  formula = salary ~ gmat_tot + gmat_qpc + gmat_vpc + gmat_tpc,
  data = MBA
)
summary(mod1)
##
## Call:
## lm(formula = salary ~ gmat_tot + gmat_qpc + gmat_vpc + gmat_tpc,
## data = MBA)
##
## Residuals:
## Min 1Q Median 3Q Max
## -48199 -41195 -33034 56735 182897
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 141539.0 59303.9 2.387 0.0177 *
## gmat_tot -369.7 222.7 -1.660 0.0980 .
## gmat_qpc 465.7 615.2 0.757 0.4497
## gmat_vpc 573.4 563.0 1.018 0.3094
## gmat_tpc 523.2 443.0 1.181 0.2386
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 50900 on 269 degrees of freedom
## Multiple R-squared: 0.01651, Adjusted R-squared: 0.001889
## F-statistic: 1.129 on 4 and 269 DF, p-value: 0.343
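The overall p-value of the F-test (0.343) is greater than 0.05, so this model is not statistically significant and is not a good model. The multiple R-squared of 0.0165 means the GMAT variables explain only about 1.7% of the variance in salary, and the residual standard error is 50900.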
MODEL 2
# Model 2: linear regression of salary on satisfaction, work experience and
# first language.
# NOTE(review): fitted on the raw MBA data; satis has values up to 998 in
# the summary although its plot labels it as a 1-7 scale, and salary
# contains the 998/999 values filtered out elsewhere -- confirm.
mod2 <- lm(
  formula = salary ~ satis + work_yrs + frstlang,
  data = MBA
)
summary(mod2)
##
## Call:
## lm(formula = salary ~ satis + work_yrs + frstlang, data = MBA)
##
## Residuals:
## Min 1Q Median 3Q Max
## -49117 -47235 -2246 49225 187005
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 59455.437 11221.844 5.298 2.43e-07 ***
## satis -45.735 7.912 -5.780 2.05e-08 ***
## work_yrs -458.980 907.095 -0.506 0.613
## frstlang -9650.834 9087.065 -1.062 0.289
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 48150 on 270 degrees of freedom
## Multiple R-squared: 0.1168, Adjusted R-squared: 0.107
## F-statistic: 11.91 on 3 and 270 DF, p-value: 2.4e-07
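The overall p-value of the F-test (2.4e-07) is less than 0.05, so this model is statistically significant and better than model 1; among the predictors only satis has a significant coefficient. The multiple R-squared of 0.1168 means the model explains about 11.7% of the variance in salary, and the residual standard error is 48150.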
MODEL 3
# Model 3: linear regression of salary on age and sex.
# NOTE(review): fitted on the raw MBA data, so the 998/999 salary values
# that are filtered out for the scatter plots are treated as real salaries
# -- confirm.
mod3 <- lm(
  formula = salary ~ age + sex,
  data = MBA
)
summary(mod3)
##
## Call:
## lm(formula = salary ~ age + sex, data = MBA)
##
## Residuals:
## Min 1Q Median 3Q Max
## -48601 -38030 -35529 54440 185565
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 51953.6 24832.5 2.092 0.0374 *
## age -833.3 830.9 -1.003 0.3168
## sex 7906.6 7124.9 1.110 0.2681
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 50920 on 271 degrees of freedom
## Multiple R-squared: 0.008421, Adjusted R-squared: 0.001103
## F-statistic: 1.151 on 2 and 271 DF, p-value: 0.3179
This shows that neither age nor sex is significantly related to MBA starting salary. The overall p-value of the F-test (0.3179) is also greater than 0.05, so this model is not a good model. The multiple R-squared of 0.0084 means the model explains less than 1% of the variance in salary, and the residual standard error is 50920.