# Load the MBA starting-salaries survey into mba.df.
# NOTE(review): setwd() with a machine-specific absolute path makes this script
# non-portable. It is kept so that relative paths keep resolving exactly as
# before; consider launching the script from the data directory instead.
setwd("C:/Users/Saurabh Sethia/Desktop/R")
# The file name is a single literal, so wrapping it in paste(..., sep = "")
# was a no-op and has been dropped.
mba.df <- read.csv("MBA Starting Salaries Data.csv")
# View() opens a GUI data viewer; only call it in an interactive session so the
# script can also run in batch mode.
if (interactive()) {
  View(mba.df)
}
Creating summary statistics for the variable columns of the dataset.
# Print per-column summary statistics for the raw survey data.
# The recorded output below shows the values 998/999 in salary and 998 in
# satis; these are treated as non-response codes and filtered out later.
summary(mba.df)
## age sex gmat_tot gmat_qpc
## Min. :22.00 Min. :1.000 Min. :450.0 Min. :28.00
## 1st Qu.:25.00 1st Qu.:1.000 1st Qu.:580.0 1st Qu.:72.00
## Median :27.00 Median :1.000 Median :620.0 Median :83.00
## Mean :27.36 Mean :1.248 Mean :619.5 Mean :80.64
## 3rd Qu.:29.00 3rd Qu.:1.000 3rd Qu.:660.0 3rd Qu.:93.00
## Max. :48.00 Max. :2.000 Max. :790.0 Max. :99.00
## gmat_vpc gmat_tpc s_avg f_avg
## Min. :16.00 Min. : 0.0 Min. :2.000 Min. :0.000
## 1st Qu.:71.00 1st Qu.:78.0 1st Qu.:2.708 1st Qu.:2.750
## Median :81.00 Median :87.0 Median :3.000 Median :3.000
## Mean :78.32 Mean :84.2 Mean :3.025 Mean :3.062
## 3rd Qu.:91.00 3rd Qu.:94.0 3rd Qu.:3.300 3rd Qu.:3.250
## Max. :99.00 Max. :99.0 Max. :4.000 Max. :4.000
## quarter work_yrs frstlang salary
## Min. :1.000 Min. : 0.000 Min. :1.000 Min. : 0
## 1st Qu.:1.250 1st Qu.: 2.000 1st Qu.:1.000 1st Qu.: 0
## Median :2.000 Median : 3.000 Median :1.000 Median : 999
## Mean :2.478 Mean : 3.872 Mean :1.117 Mean : 39026
## 3rd Qu.:3.000 3rd Qu.: 4.000 3rd Qu.:1.000 3rd Qu.: 97000
## Max. :4.000 Max. :22.000 Max. :2.000 Max. :220000
## satis
## Min. : 1.0
## 1st Qu.: 5.0
## Median : 6.0
## Mean :172.2
## 3rd Qu.: 7.0
## Max. :998.0
# Show the structure of the data frame: number of observations, column names,
# column types and the first few values of each column.
str(mba.df)
## 'data.frame': 274 obs. of 13 variables:
## $ age : int 23 24 24 24 24 24 25 25 25 25 ...
## $ sex : int 2 1 1 1 2 1 1 2 1 1 ...
## $ gmat_tot: int 620 610 670 570 710 640 610 650 630 680 ...
## $ gmat_qpc: int 77 90 99 56 93 82 89 88 79 99 ...
## $ gmat_vpc: int 87 71 78 81 98 89 74 89 91 81 ...
## $ gmat_tpc: int 87 87 95 75 98 91 87 92 89 96 ...
## $ s_avg : num 3.4 3.5 3.3 3.3 3.6 3.9 3.4 3.3 3.3 3.45 ...
## $ f_avg : num 3 4 3.25 2.67 3.75 3.75 3.5 3.75 3.25 3.67 ...
## $ quarter : int 1 1 1 1 1 1 1 1 1 1 ...
## $ work_yrs: int 2 2 2 1 2 2 2 2 2 2 ...
## $ frstlang: int 1 1 1 1 1 1 1 1 2 1 ...
## $ salary : int 0 0 0 0 999 0 0 0 999 998 ...
## $ satis : int 7 6 6 7 5 6 5 6 4 998 ...
We create a new dataset without the 998 and 999 codes, i.e. excluding responses where students did not answer the survey or did not disclose their salary.
# Build mba2.df: the survey rows without the codes 998/999 in salary and
# without the code 998 in satis.
# The columns are integers, so compare against numeric literals; the previous
# string literals ('999', '998') forced an implicit character coercion of the
# whole column and only worked by coincidence.
# which() is kept on purpose: it drops rows where the condition evaluates to
# NA instead of turning them into all-NA rows.
mba2.df <- mba.df[
  which(
    mba.df$salary != 999 &
      mba.df$salary != 998 &
      mba.df$satis != 998
  ),
]
# Only open the GUI viewer when someone is there to look at it.
if (interactive()) {
  View(mba2.df)
}
Now we plot the variables individually. Let's start with a frequency table of sex and a box plot of the age of the students in the dataset.
# Count how many students fall into each sex code.
table(mba.df[["sex"]])
##
## 1 2
## 206 68
library(lattice)
# Horizontal box plot of the students' ages.
boxplot(
  mba.df[["age"]],
  xlab = "Age of students",
  horizontal = TRUE
)
Now we plot the other variables related to the GMAT score, i.e. total GMAT score, quantitative GMAT percentile, verbal GMAT percentile and overall GMAT percentile.
# Horizontal box plots of the four GMAT variables.
# Each plot gets an axis label (as the age plot does); without one the four
# plots cannot be told apart once rendered.
boxplot(mba.df$gmat_tot, horizontal = TRUE, xlab = "Total GMAT score")
boxplot(mba.df$gmat_qpc, horizontal = TRUE,
        xlab = "Quantitative GMAT percentile")
boxplot(mba.df$gmat_vpc, horizontal = TRUE, xlab = "Verbal GMAT percentile")
boxplot(mba.df$gmat_tpc, horizontal = TRUE, xlab = "Overall GMAT percentile")
After analysing the GMAT scores with which the students applied for a seat in the college, let's look at their MBA performance. Here we see the box plots of the spring MBA average, the fall MBA average and the quartile ranking.
# Horizontal box plots of MBA performance: spring average, fall average and
# quartile ranking.
# Axis labels are added (as on the age plot) so the rendered plots are
# identifiable on their own.
boxplot(mba.df$s_avg, horizontal = TRUE, xlab = "Spring MBA average")
boxplot(mba.df$f_avg, horizontal = TRUE, xlab = "Fall MBA average")
boxplot(mba.df$quarter, horizontal = TRUE, xlab = "Quartile ranking")
Now we look at a box plot of the number of years of work experience the candidates acquired before entering the college, followed by a bar plot of their first language.
# Box plot of prior work experience (vertical, as in the original).
# A label is added so the plot is identifiable once rendered.
boxplot(mba.df$work_yrs, ylab = "Years of work experience")
# Bar plot of how many students fall into each first-language code.
barplot(
  table(mba.df$frstlang),
  xlab = "First language (coded value)",
  ylab = "Number of students"
)
After getting information on the individual variables, we would like to see how the variables relate to each other. We plot a scatter plot matrix in order to compare some of the variables against each other.
library(car)
# Pairwise scatter plots of the GMAT scores and the two MBA averages.
scatterplotMatrix(
  ~ gmat_tot + gmat_qpc + gmat_vpc + s_avg + f_avg,
  data = mba.df
)
Plotting corrgram from the mba2 dataset
library(corrgram)
# Correlogram of all columns of the filtered data (mba2.df): shaded cells in
# the lower panel, pie glyphs in the upper panel, variable names in the text
# panel.
corrgram(mba2.df, order=TRUE, lower.panel=panel.shade,
upper.panel=panel.pie, text.panel=panel.txt,
main="Corrgram of correlations between variables")
Now we analyse the students who were placed. We restrict the data to respondents who disclosed a salary, i.e. excluding those who declined to state their salary or did not take part in the process.
# Keep only the students with a positive salary, i.e. those who were placed.
# salary is an integer column, so compare it with the number 0. The previous
# comparison against the string '0' coerced every salary to character and
# compared lexicographically, which only happened to give the right answer.
# which() drops rows where the comparison is NA.
mbaplaced.df <- mba2.df[which(mba2.df$salary > 0), ]
# Only open the GUI viewer in an interactive session.
if (interactive()) {
  View(mbaplaced.df)
}
Now we examine the factors which contribute most to the chances of students getting placed by analysing the other variables.
# 1. Age
# Age distribution of the placed students.
table(mbaplaced.df[["age"]])
##
## 22 23 24 25 26 27 28 29 30 31 32 33 34 39 40
## 1 5 16 23 14 14 8 6 6 4 1 1 1 1 2
# Cross-tabulate salary against age and test the two for independence.
mytable <- xtabs(formula = ~ salary + age, data = mbaplaced.df)
chisq.test(x = mytable)
## Warning in chisq.test(mytable): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: mytable
## X-squared = 717.62, df = 574, p-value = 3.929e-05
# Correlation between age and salary.
with(mbaplaced.df, cor(age, salary))
## [1] 0.4996428
A very low p-value signifies that the null hypothesis of no relation between age and salary can be rejected. The correlation (about 0.50) is also substantial.
# Sex distribution of the placed students.
table(mbaplaced.df[["sex"]])
##
## 1 2
## 72 31
# Cross-tabulate salary against sex and test the two for independence.
mytable <- xtabs(formula = ~ salary + sex, data = mbaplaced.df)
chisq.test(x = mytable)
## Warning in chisq.test(mytable): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: mytable
## X-squared = 52.681, df = 41, p-value = 0.1045
# Correlation between salary and the sex code.
with(mbaplaced.df, cor(salary, sex))
## [1] -0.1662887
Here the p-value is not low enough to reject the null hypothesis. Thus we cannot reject the hypothesis that salary is independent of sex.
# Scatter plot of salary against total GMAT score for the placed students.
scatterplot(mbaplaced.df$gmat_tot, mbaplaced.df$salary)
# Independence test: salary vs. total GMAT score.
mytable <- xtabs(formula = ~ salary + gmat_tot, data = mbaplaced.df)
chisq.test(x = mytable)
## Warning in chisq.test(mytable): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: mytable
## X-squared = 927.24, df = 820, p-value = 0.005279
# Independence test: salary vs. quantitative GMAT percentile.
mytable <- xtabs(formula = ~ salary + gmat_qpc, data = mbaplaced.df)
chisq.test(x = mytable)
## Warning in chisq.test(mytable): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: mytable
## X-squared = 1464.3, df = 1353, p-value = 0.018
# Independence test: salary vs. verbal GMAT percentile.
mytable <- xtabs(formula = ~ salary + gmat_vpc, data = mbaplaced.df)
chisq.test(x = mytable)
## Warning in chisq.test(mytable): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: mytable
## X-squared = 1183.3, df = 1066, p-value = 0.006802
# Independence test: salary vs. overall GMAT percentile.
mytable <- xtabs(formula = ~ salary + gmat_tpc, data = mbaplaced.df)
chisq.test(x = mytable)
## Warning in chisq.test(mytable): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: mytable
## X-squared = 1422.2, df = 1230, p-value = 0.0001065
# Correlation between salary and overall GMAT percentile.
with(mbaplaced.df, cor(salary, gmat_tpc))
## [1] -0.1320178
The p-values are not very low, so the null hypothesis cannot be rejected. However, gmat_tpc has a low p-value, although there is no strong correlation attached to it.
# Independence test: salary vs. spring MBA average.
mytable <- xtabs(formula = ~ salary + s_avg, data = mbaplaced.df)
chisq.test(x = mytable)
## Warning in chisq.test(mytable): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: mytable
## X-squared = 792.97, df = 861, p-value = 0.9524
# Independence test: salary vs. fall MBA average.
mytable <- xtabs(formula = ~ salary + f_avg, data = mbaplaced.df)
chisq.test(x = mytable)
## Warning in chisq.test(mytable): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: mytable
## X-squared = 596.28, df = 574, p-value = 0.2518
# Correlation of salary with each of the two averages.
with(mbaplaced.df, cor(salary, s_avg))
## [1] 0.1017317
with(mbaplaced.df, cor(salary, f_avg))
## [1] -0.106039
The p-values obtained are not very low, so we are unable to reject the null hypothesis.
# Independence test: salary vs. quartile ranking.
mytable <- xtabs(formula = ~ salary + quarter, data = mbaplaced.df)
chisq.test(x = mytable)
## Warning in chisq.test(mytable): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: mytable
## X-squared = 129.85, df = 123, p-value = 0.3186
# Independence test: salary vs. years of work experience.
mytable <- xtabs(formula = ~ salary + work_yrs, data = mbaplaced.df)
chisq.test(x = mytable)
## Warning in chisq.test(mytable): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: mytable
## X-squared = 535.23, df = 451, p-value = 0.003809
# Independence test: salary vs. first-language code.
mytable <- xtabs(formula = ~ salary + frstlang, data = mbaplaced.df)
chisq.test(x = mytable)
## Warning in chisq.test(mytable): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: mytable
## X-squared = 69.847, df = 41, p-value = 0.003296
# Correlation of salary with each of the three variables tested above.
with(mbaplaced.df, cor(salary, quarter))
## [1] -0.1284853
with(mbaplaced.df, cor(salary, work_yrs))
## [1] 0.4546663
with(mbaplaced.df, cor(salary, frstlang))
## [1] 0.2670195
Low p-values are observed for work experience in years and for first language. A positive and significant correlation with salary is also observed for both.
# Independence test: salary vs. satisfaction rating (satis).
mytable <- xtabs(~ salary + satis, data = mbaplaced.df)
chisq.test(mytable)
## Warning in chisq.test(mytable): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: mytable
## X-squared = 109.1, df = 164, p-value = 0.9997
# Correlate salary with satis, the variable tested just above. The previous
# call was a copy-paste of cor(salary, frstlang), so the satisfaction section
# reported the first-language correlation instead of its own.
cor(mbaplaced.df$salary, mbaplaced.df$satis)
## NOTE: re-run to record this value; the 0.2670195 previously recorded here
## was the salary/frstlang correlation, not salary/satis.
Although a strong correlation is observed between the two, the p-value is not low enough for us to reject the null hypothesis.
We now run linear regression models on the variables which have shown strong correlation and low p-values.
# Simple linear regression of salary on age (placed students only).
fitage <- lm(formula = salary ~ age, data = mbaplaced.df)
summary(object = fitage)
##
## Call:
## lm(formula = salary ~ age, data = mbaplaced.df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -31454 -8533 -2182 4546 80886
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 29962.6 12697.8 2.360 0.0202 *
## age 2728.8 470.7 5.797 7.75e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15550 on 101 degrees of freedom
## Multiple R-squared: 0.2496, Adjusted R-squared: 0.2422
## F-statistic: 33.6 on 1 and 101 DF, p-value: 7.748e-08
# Simple linear regression of salary on overall GMAT percentile.
fitgmat <- lm(formula = salary ~ gmat_tpc, data = mbaplaced.df)
summary(object = fitgmat)
##
## Call:
## lm(formula = salary ~ gmat_tpc, data = mbaplaced.df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -41929 -7964 -2071 4107 109784
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 121147.3 13648.4 8.876 2.66e-14 ***
## gmat_tpc -214.3 160.1 -1.338 0.184
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 17800 on 101 degrees of freedom
## Multiple R-squared: 0.01743, Adjusted R-squared: 0.0077
## F-statistic: 1.792 on 1 and 101 DF, p-value: 0.1837
# Simple linear regression of salary on spring MBA average.
fitsavg <- lm(formula = salary ~ s_avg, data = mbaplaced.df)
summary(object = fitsavg)
##
## Call:
## lm(formula = salary ~ s_avg, data = mbaplaced.df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -40989 -8087 -2068 3682 119814
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 88179 14558 6.057 2.39e-08 ***
## s_avg 4803 4673 1.028 0.307
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 17860 on 101 degrees of freedom
## Multiple R-squared: 0.01035, Adjusted R-squared: 0.0005508
## F-statistic: 1.056 on 1 and 101 DF, p-value: 0.3065
# Simple linear regression of salary on years of work experience.
fitexp <- lm(formula = salary ~ work_yrs, data = mbaplaced.df)
summary(object = fitexp)
##
## Call:
## lm(formula = salary ~ work_yrs, data = mbaplaced.df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -34498 -7745 -498 3803 86419
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 93101 2496 37.30 < 2e-16 ***
## work_yrs 2699 526 5.13 1.4e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15990 on 101 degrees of freedom
## Multiple R-squared: 0.2067, Adjusted R-squared: 0.1989
## F-statistic: 26.32 on 1 and 101 DF, p-value: 1.403e-06
# Simple linear regression of salary on the first-language code.
fitlang <- lm(formula = salary ~ frstlang, data = mbaplaced.df)
summary(object = fitlang)
##
## Call:
## lm(formula = salary ~ frstlang, data = mbaplaced.df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -37749 -7749 -1749 3751 99386
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 82883 7434 11.150 <2e-16 ***
## frstlang 18866 6775 2.785 0.0064 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 17310 on 101 degrees of freedom
## Multiple R-squared: 0.0713, Adjusted R-squared: 0.0621
## F-statistic: 7.754 on 1 and 101 DF, p-value: 0.0064
Based on the above linear regressions, a single mathematical model to best fit the salary would be:
# Multiple regression of salary on age, spring average, work experience and
# first-language code, fitted on the placed students.
Bestfit <- lm(
  formula = salary ~ age + s_avg + work_yrs + frstlang,
  data = mbaplaced.df
)
summary(object = Bestfit)
##
## Call:
## lm(formula = salary ~ age + s_avg + work_yrs + frstlang, data = mbaplaced.df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -32957 -9005 -1362 4613 76947
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 34470.6 26189.2 1.316 0.1912
## age 1833.3 1085.7 1.689 0.0945 .
## s_avg 2207.1 4233.5 0.521 0.6033
## work_yrs 746.2 1121.1 0.666 0.5072
## frstlang 9270.9 6894.3 1.345 0.1818
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15630 on 98 degrees of freedom
## Multiple R-squared: 0.2647, Adjusted R-squared: 0.2347
## F-statistic: 8.818 on 4 and 98 DF, p-value: 4.008e-06
# Print the fitted coefficients of the combined model.
coef(Bestfit)
## (Intercept) age s_avg work_yrs frstlang
## 34470.5888 1833.2517 2207.1095 746.1779 9270.9037
Now we separately create a dataset to compare students who were not placed with those who were placed.
# Copy the filtered data and recode salary into a placed indicator:
# 0 = no salary reported (not placed), 1 = any positive salary (placed).
mbanotplaced.df <- mba2.df
# Assign the integer 1L rather than the string '1': assigning a string into
# the integer column silently converted the whole salary column to character.
mbanotplaced.df$salary[mbanotplaced.df$salary > 0] <- 1L
# Only open the GUI viewer in an interactive session.
if (interactive()) {
  View(mbanotplaced.df)
}
# Count not-placed (0) vs. placed (1) students.
table(mbanotplaced.df$salary)
##
## 0 1
## 90 103
Thus, among the students who participated in the survey, 103 were placed and 90 were not. Now we analyse the various factors.
# Independence tests of the placed indicator (stored in salary) against
# individual variables, using the recoded data in mbanotplaced.df.

# Placed indicator vs. age.
mytable <- xtabs(formula = ~ salary + age, data = mbanotplaced.df)
chisq.test(x = mytable)
## Warning in chisq.test(mytable): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: mytable
## X-squared = 27.943, df = 20, p-value = 0.1108
# Placed indicator vs. sex.
mytable <- xtabs(formula = ~ salary + sex, data = mbanotplaced.df)
chisq.test(x = mytable)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: mytable
## X-squared = 0.29208, df = 1, p-value = 0.5889
# Placed indicator vs. total GMAT score.
mytable <- xtabs(formula = ~ salary + gmat_tot, data = mbanotplaced.df)
chisq.test(x = mytable)
## Warning in chisq.test(mytable): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: mytable
## X-squared = 27.919, df = 27, p-value = 0.4152
# Placed indicator vs. overall GMAT percentile.
mytable <- xtabs(formula = ~ salary + gmat_tpc, data = mbanotplaced.df)
chisq.test(x = mytable)
## Warning in chisq.test(mytable): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: mytable
## X-squared = 37.397, df = 38, p-value = 0.4971
# Placed indicator vs. years of work experience.
mytable <- xtabs(formula = ~ salary + work_yrs, data = mbanotplaced.df)
chisq.test(x = mytable)
## Warning in chisq.test(mytable): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: mytable
## X-squared = 24.663, df = 17, p-value = 0.1025
# Placed indicator vs. spring MBA average.
mytable <- xtabs(formula = ~ salary + s_avg, data = mbanotplaced.df)
chisq.test(x = mytable)
## Warning in chisq.test(mytable): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: mytable
## X-squared = 33.09, df = 30, p-value = 0.3187
The factors which mattered most for the placed students were age, overall GMAT percentile (gmat_tpc), spring average, work experience in years and having English as their first language.