# ---- Load the data ----
# Print the current working directory. The CSV below is read through a
# relative path, so it is resolved against this directory.
getwd()
## [1] "C:/Users/TANAY/Downloads"
# Read the MBA starting-salaries file into the data frame `mba`.
mba <- read.csv("MBA Starting Salaries Data.csv")
# Per-column summary statistics; the recorded console output follows.
# NOTE(review): `salary` has a median of 999 and `satis` a maximum of 998
# (mean 172.2 against a median of 6). These look like sentinel codes rather
# than real measurements -- confirm against the data dictionary.
summary(mba)
## age sex gmat_tot gmat_qpc
## Min. :22.00 Min. :1.000 Min. :450.0 Min. :28.00
## 1st Qu.:25.00 1st Qu.:1.000 1st Qu.:580.0 1st Qu.:72.00
## Median :27.00 Median :1.000 Median :620.0 Median :83.00
## Mean :27.36 Mean :1.248 Mean :619.5 Mean :80.64
## 3rd Qu.:29.00 3rd Qu.:1.000 3rd Qu.:660.0 3rd Qu.:93.00
## Max. :48.00 Max. :2.000 Max. :790.0 Max. :99.00
## gmat_vpc gmat_tpc s_avg f_avg
## Min. :16.00 Min. : 0.0 Min. :2.000 Min. :0.000
## 1st Qu.:71.00 1st Qu.:78.0 1st Qu.:2.708 1st Qu.:2.750
## Median :81.00 Median :87.0 Median :3.000 Median :3.000
## Mean :78.32 Mean :84.2 Mean :3.025 Mean :3.062
## 3rd Qu.:91.00 3rd Qu.:94.0 3rd Qu.:3.300 3rd Qu.:3.250
## Max. :99.00 Max. :99.0 Max. :4.000 Max. :4.000
## quarter work_yrs frstlang salary
## Min. :1.000 Min. : 0.000 Min. :1.000 Min. : 0
## 1st Qu.:1.250 1st Qu.: 2.000 1st Qu.:1.000 1st Qu.: 0
## Median :2.000 Median : 3.000 Median :1.000 Median : 999
## Mean :2.478 Mean : 3.872 Mean :1.117 Mean : 39026
## 3rd Qu.:3.000 3rd Qu.: 4.000 3rd Qu.:1.000 3rd Qu.: 97000
## Max. :4.000 Max. :22.000 Max. :2.000 Max. :220000
## satis
## Min. : 1.0
## 1st Qu.: 5.0
## Median : 6.0
## Mean :172.2
## 3rd Qu.: 7.0
## Max. :998.0
# ---- Split by placement status and explore the placed group ----
# Plotting packages used below: lattice supplies histogram(), car supplies
# scatterplot().
library(lattice)
library(car)

# Rows whose recorded salary is above 1000; every later plot and model uses
# this subset.
# NOTE(review): the cutoff presumably excludes coded values such as 999 --
# confirm against the data dictionary.
placed <- subset(mba, salary > 1000)
# Rows whose recorded salary is exactly 0.
unplaced <- subset(mba, salary == 0)

# Spread of salary within `placed`.
boxplot(placed$salary, main = "Salary", horizontal = TRUE)

# Univariate distributions of age and total GMAT score within `placed`.
histogram(placed$age, main = "Distribution of age", xlab = "age")
histogram(placed$gmat_tot,
  main = "Distribution of GMAT score",
  xlab = "GMAT Score"
)

# Salary against each candidate predictor, one plot per predictor.
scatterplot(salary ~ age,
  data = placed,
  main = "Salary Vs Age", ylab = "Salary", xlab = "age"
)
scatterplot(salary ~ gmat_tot,
  data = placed,
  main = "Salary Vs GMAT score", ylab = "Salary", xlab = "GMAT score"
)
scatterplot(salary ~ work_yrs,
  data = placed,
  main = "Salary Vs Work experience",
  ylab = "Salary", xlab = "Work experience"
)
scatterplot(salary ~ satis,
  data = placed,
  xlab = "Satisfaction level", ylab = "Salary",
  main = "Salary v/s Satisfaction level"
)

# Salary by sex. Boxes are horizontal, so salary runs along the x axis.
# `main` is supplied exactly once so the plot title is unambiguous.
boxplot(salary ~ sex,
  data = placed,
  main = "Salary v/s Sex",
  horizontal = TRUE,
  xlab = "Salary"
)
# ---- Mean salary by group (placed students only) ----
# Each call below groups `placed` by the distinct values of one column and
# reports the mean salary per group; the recorded output follows each call.

# Mean salary for each distinct value of f_avg.
# NOTE(review): the f_avg == 0.00 row may be a missing-value code rather than
# a real average -- confirm.
aggregate(placed$salary~placed$f_avg, FUN=mean)
## placed$f_avg placed$salary
## 1 0.00 146000.0
## 2 2.00 95000.0
## 3 2.25 90000.0
## 4 2.50 99880.0
## 5 2.67 86000.0
## 6 2.75 107404.4
## 7 2.83 105000.0
## 8 3.00 103596.0
## 9 3.25 99660.0
## 10 3.33 82000.0
## 11 3.50 103547.1
## 12 3.60 95500.0
## 13 3.67 97500.0
## 14 3.75 113333.3
## 15 4.00 117500.0
# Mean salary for each distinct value of s_avg.
aggregate(placed$salary~placed$s_avg, FUN=mean)
## placed$s_avg placed$salary
## 1 2.20 105000.00
## 2 2.30 98000.00
## 3 2.40 93500.00
## 4 2.50 131000.00
## 5 2.60 107285.00
## 6 2.70 92375.00
## 7 2.80 101657.14
## 8 2.90 96888.92
## 9 2.91 105000.00
## 10 3.00 103250.00
## 11 3.09 103500.00
## 12 3.10 103542.86
## 13 3.20 104888.89
## 14 3.27 95000.00
## 15 3.30 99681.82
## 16 3.40 94600.00
## 17 3.45 105000.00
## 18 3.50 99700.00
## 19 3.60 116500.00
## 20 3.70 110500.00
## 21 3.80 112500.00
## 22 4.00 146000.00
# Mean salary for each distinct age.
aggregate(placed$salary~placed$age, FUN=mean)
## placed$age placed$salary
## 1 22 85000.00
## 2 23 91651.20
## 3 24 101518.75
## 4 25 99086.96
## 5 26 101665.00
## 6 27 102214.29
## 7 28 103625.00
## 8 29 102083.33
## 9 30 109916.67
## 10 31 100500.00
## 11 32 107300.00
## 12 33 118000.00
## 13 34 105000.00
## 14 39 112000.00
## 15 40 183000.00
# Mean salary for each distinct number of years of work experience.
aggregate(placed$salary~placed$work_yrs, FUN=mean)
## placed$work_yrs placed$salary
## 1 0 95000.00
## 2 1 103532.00
## 3 2 97673.68
## 4 3 101652.86
## 5 4 105454.55
## 6 5 103142.86
## 7 6 105928.57
## 8 7 98000.00
## 9 8 105025.00
## 10 10 118000.00
## 11 15 183000.00
## 12 16 108500.00
# ---- Correlations with salary (placed students only) ----
# Correlogram of all columns of `placed`, with pie glyphs in the upper panel.
library("corrgram")
corrgram(placed,upper.panel=panel.pie, main="Corrgram of placed MBAs variables")
# Pairwise Pearson correlation tests of salary against individual columns.
# Each recorded result reports the t statistic, p-value, 95% confidence
# interval and the estimated correlation.

# salary vs age: r = 0.50, p < 0.001.
cor.test(placed$salary,placed$age)
##
## Pearson's product-moment correlation
##
## data: placed$salary and placed$age
## t = 5.7968, df = 101, p-value = 7.748e-08
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.3388862 0.6320523
## sample estimates:
## cor
## 0.4996428
# salary vs work_yrs: r = 0.45, p < 0.001.
cor.test(placed$salary,placed$work_yrs)
##
## Pearson's product-moment correlation
##
## data: placed$salary and placed$work_yrs
## t = 5.1303, df = 101, p-value = 1.403e-06
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.2863362 0.5957697
## sample estimates:
## cor
## -0.1662887
# salary vs sex: not significant at the 5% level (p = 0.093).
# NOTE(review): `sex` appears to be a two-level numeric code (1/2), so this is
# a correlation with a group indicator -- confirm the coding.
cor.test(placed$salary,placed$sex)
##
## Pearson's product-moment correlation
##
## data: placed$salary and placed$sex
## t = -1.6948, df = 101, p-value = 0.0932
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.3485952 0.0281416
## sample estimates:
## cor
## -0.1662887
# salary vs satis: not significant (p = 0.69).
cor.test(placed$salary,placed$satis)
##
## Pearson's product-moment correlation
##
## data: placed$salary and placed$satis
## t = -0.40283, df = 101, p-value = 0.6879
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.2317788 0.1546729
## sample estimates:
## cor
## -0.0400506
# salary vs gmat_tot: not significant (p = 0.36).
cor.test(placed$salary,placed$gmat_tot)
##
## Pearson's product-moment correlation
##
## data: placed$salary and placed$gmat_tot
## t = -0.91501, df = 101, p-value = 0.3624
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.2792952 0.1046903
## sample estimates:
## cor
## -0.09067141
# salary vs frstlang: r = 0.27, significant at the 1% level (p = 0.0064).
# NOTE(review): `frstlang` appears to be a two-level numeric code (1/2) --
# confirm the coding before interpreting the sign.
cor.test(placed$salary,placed$frstlang)
##
## Pearson's product-moment correlation
##
## data: placed$salary and placed$frstlang
## t = 2.7846, df = 101, p-value = 0.0064
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.07749965 0.43791500
## sample estimates:
## cor
## 0.2670195
# salary vs f_avg: not significant (p = 0.29).
cor.test(placed$salary,placed$f_avg)
##
## Pearson's product-moment correlation
##
## data: placed$salary and placed$f_avg
## t = -1.0717, df = 101, p-value = 0.2864
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.29353985 0.08931862
## sample estimates:
## cor
## -0.106039
# salary vs s_avg: not significant (p = 0.31).
cor.test(placed$salary,placed$s_avg)
##
## Pearson's product-moment correlation
##
## data: placed$salary and placed$s_avg
## t = 1.0277, df = 101, p-value = 0.3065
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.09363639 0.28955576
## sample estimates:
## cor
## 0.1017317
# ---- Linear models for salary (placed students only) ----
# Full model: salary regressed on all twelve other columns of `placed`.
# In the recorded output only gmat_tpc is significant at the 5% level, and
# adjusted R-squared is 0.25.
fit <- lm(salary~ age + sex + gmat_tot + gmat_qpc + gmat_vpc + gmat_tpc + s_avg +f_avg + quarter + work_yrs +frstlang + satis ,data=placed)
summary(fit)
##
## Call:
## lm(formula = salary ~ age + sex + gmat_tot + gmat_qpc + gmat_vpc +
## gmat_tpc + s_avg + f_avg + quarter + work_yrs + frstlang +
## satis, data = placed)
##
## Residuals:
## Min 1Q Median 3Q Max
## -26489 -7983 -373 5923 70602
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 78005.66 52981.93 1.472 0.1444
## age 1750.65 1130.92 1.548 0.1251
## sex -3584.07 3595.85 -0.997 0.3216
## gmat_tot 16.19 178.85 0.090 0.9281
## gmat_qpc 796.55 496.78 1.603 0.1123
## gmat_vpc 546.31 501.97 1.088 0.2794
## gmat_tpc -1457.09 714.94 -2.038 0.0445 *
## s_avg -931.53 8240.31 -0.113 0.9102
## f_avg -2222.82 3894.57 -0.571 0.5696
## quarter -2336.56 2721.89 -0.858 0.3929
## work_yrs 749.66 1135.90 0.660 0.5110
## frstlang 7719.42 7373.27 1.047 0.2979
## satis -1086.54 2157.76 -0.504 0.6158
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15430 on 90 degrees of freedom
## Multiple R-squared: 0.3422, Adjusted R-squared: 0.2545
## F-statistic: 3.902 on 12 and 90 DF, p-value: 8.086e-05
# Reduced model 1: age, sex and work experience.
# Only age is significant at the 5% level (p = 0.027).
fit1 <- lm(salary~ age + sex + work_yrs ,data=placed)
summary(fit1)
##
## Call:
## lm(formula = salary ~ age + sex + work_yrs, data = placed)
##
## Residuals:
## Min 1Q Median 3Q Max
## -29250 -9239 -1146 5429 84318
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 45674.9 24522.2 1.863 0.0655 .
## age 2263.4 1004.8 2.253 0.0265 *
## sex -3852.2 3395.2 -1.135 0.2593
## work_yrs 478.2 1085.3 0.441 0.6604
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15600 on 99 degrees of freedom
## Multiple R-squared: 0.2602, Adjusted R-squared: 0.2378
## F-statistic: 11.61 on 3 and 99 DF, p-value: 1.389e-06
# Reduced model 2: age, work experience and first language.
# No individual coefficient reaches the 5% level (age: p = 0.082).
fit2 <- lm(salary~ age + work_yrs + frstlang ,data=placed)
summary(fit2)
##
## Call:
## lm(formula = salary ~ age + work_yrs + frstlang, data = placed)
##
## Residuals:
## Min 1Q Median 3Q Max
## -31941 -9139 -1086 4793 75526
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 40492.7 23417.5 1.729 0.0869 .
## age 1892.0 1075.9 1.759 0.0818 .
## work_yrs 747.2 1116.9 0.669 0.5050
## frstlang 8546.9 6728.1 1.270 0.2069
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15570 on 99 degrees of freedom
## Multiple R-squared: 0.2626, Adjusted R-squared: 0.2403
## F-statistic: 11.75 on 3 and 99 DF, p-value: 1.188e-06
# Reduced model 3: age and work experience only.
# age is significant (p = 0.017); work_yrs is not (p = 0.72) once age is in
# the model.
fit3 <- lm(salary~ age + work_yrs ,data=placed)
summary(fit3)
##
## Call:
## lm(formula = salary ~ age + work_yrs, data = placed)
##
## Residuals:
## Min 1Q Median 3Q Max
## -31675 -8099 -2108 4411 80650
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 36967.5 23323.8 1.585 0.1161
## age 2413.8 997.4 2.420 0.0173 *
## work_yrs 388.8 1084.0 0.359 0.7206
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15620 on 100 degrees of freedom
## Multiple R-squared: 0.2506, Adjusted R-squared: 0.2356
## F-statistic: 16.72 on 2 and 100 DF, p-value: 5.438e-07
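# Therefore, MBA starting salary depends on age and work experience: each is
# significantly correlated with salary on its own (r = 0.50 and r = 0.45).
# Note, however, that when both enter the same model (fit3) only age remains
# significant at the 5% level.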