# Load the MBA starting-salary survey export and open it in the data viewer.
# NOTE(review): the working directory is a hard-coded, machine-specific path;
# the script will only run as-is on a machine that has this folder.
setwd(dir = "C:/Users/Dell/Downloads/Sameer Mathur")
mba.df <- read.csv(file = "MBA Starting Salaries.csv")
View(mba.df)
Summarizing every variable of the dataset.
# Per-column min / quartiles / mean / max for every variable.
# NOTE(review): salary holds codes such as 998/999 alongside real amounts, and
# satis peaks at 998 in the printed output (presumably a similar code - confirm),
# so the means of those two columns should not be read at face value.
summary(object = mba.df)
## age sex gmat_tot gmat_qpc
## Min. :22.00 Min. :1.000 Min. :450.0 Min. :28.00
## 1st Qu.:25.00 1st Qu.:1.000 1st Qu.:580.0 1st Qu.:72.00
## Median :27.00 Median :1.000 Median :620.0 Median :83.00
## Mean :27.36 Mean :1.248 Mean :619.5 Mean :80.64
## 3rd Qu.:29.00 3rd Qu.:1.000 3rd Qu.:660.0 3rd Qu.:93.00
## Max. :48.00 Max. :2.000 Max. :790.0 Max. :99.00
## gmat_vpc gmat_tpc s_avg f_avg
## Min. :16.00 Min. : 0.0 Min. :2.000 Min. :0.000
## 1st Qu.:71.00 1st Qu.:78.0 1st Qu.:2.708 1st Qu.:2.750
## Median :81.00 Median :87.0 Median :3.000 Median :3.000
## Mean :78.32 Mean :84.2 Mean :3.025 Mean :3.062
## 3rd Qu.:91.00 3rd Qu.:94.0 3rd Qu.:3.300 3rd Qu.:3.250
## Max. :99.00 Max. :99.0 Max. :4.000 Max. :4.000
## quarter work_yrs frstlang salary
## Min. :1.000 Min. : 0.000 Min. :1.000 Min. : 0
## 1st Qu.:1.250 1st Qu.: 2.000 1st Qu.:1.000 1st Qu.: 0
## Median :2.000 Median : 3.000 Median :1.000 Median : 999
## Mean :2.478 Mean : 3.872 Mean :1.117 Mean : 39026
## 3rd Qu.:3.000 3rd Qu.: 4.000 3rd Qu.:1.000 3rd Qu.: 97000
## Max. :4.000 Max. :22.000 Max. :2.000 Max. :220000
## satis
## Min. : 1.0
## 1st Qu.: 5.0
## Median : 6.0
## Mean :172.2
## 3rd Qu.: 7.0
## Max. :998.0
library(psych)
# Keep only the first five columns of the describe() table
# (vars, n, mean, sd, median in the printed output).
describe(mba.df)[, seq_len(5)]
## vars n mean sd median
## age 1 274 27.36 3.71 27
## sex 2 274 1.25 0.43 1
## gmat_tot 3 274 619.45 57.54 620
## gmat_qpc 4 274 80.64 14.87 83
## gmat_vpc 5 274 78.32 16.86 81
## gmat_tpc 6 274 84.20 14.02 87
## s_avg 7 274 3.03 0.38 3
## f_avg 8 274 3.06 0.53 3
## quarter 9 274 2.48 1.11 2
## work_yrs 10 274 3.87 3.23 3
## frstlang 11 274 1.12 0.32 1
## salary 12 274 39025.69 50951.56 999
## satis 13 274 172.18 371.61 6
Drawing histograms to visualize the distribution of each variable independently.
# Histograms of individual variables, drawn two per page.
par(mfrow = c(1, 2))
hist(mba.df$age, xlab = "Age", col = "peachpuff1")
# sex is stored as the codes 1/2: hide the numeric axis and label the codes.
hist(mba.df$sex, xlab = "Sex", col = "peachpuff1", xaxt = "n")
axis(side = 1, at = c(1, 2), labels = c("Males", "Females"))
hist(mba.df$gmat_tot, xlab = "GMAT total score", col = "peachpuff1",
     breaks = 10)
hist(mba.df$work_yrs, xlab = "Work Experience", col = "peachpuff1")
# frstlang is also a 1/2 code (1 = English, 2 = Other); label it like sex.
hist(mba.df$frstlang, xlab = "First language", col = "peachpuff1", xaxt = "n")
axis(side = 1, at = c(1, 2), labels = c("English", "Other"))
# Each x-axis is labelled with the variable actually plotted (salary and
# satis are not ages).
hist(mba.df$salary, xlab = "Starting salary", main = "Salaries",
     col = "peachpuff1", breaks = 10)
hist(mba.df$satis, xlab = "Satisfaction (satis)", col = "peachpuff1")
Drawing scatter plots to understand how the variables are correlated pairwise.
# Pairwise scatter plots, two panels per page.
par(mfrow = c(1, 2))

# GMAT total score against experience, age and the two grade averages.
plot(x = mba.df$gmat_tot, y = mba.df$work_yrs)
plot(x = mba.df$gmat_tot, y = mba.df$age)
plot(x = mba.df$gmat_tot, y = mba.df$s_avg)
plot(x = mba.df$gmat_tot, y = mba.df$f_avg)

# Salary (x-axis) against GMAT total score and age.
plot(x = mba.df$salary, y = mba.df$gmat_tot)
plot(x = mba.df$salary, y = mba.df$age)
Finding the covariance matrix, i.e. how every pair of variables varies together (covariances are unscaled; cor() would give the correlations).
# Covariance between every pair of columns. Values are in the product of the
# two columns' units, so magnitudes are not comparable across pairs.
cov(x = mba.df)
## age sex gmat_tot gmat_qpc
## age 1.376904e+01 -4.513248e-02 -3.115879e+01 -1.192655e+01
## sex -4.513248e-02 1.872677e-01 -1.328841e+00 -1.053769e+00
## gmat_tot -3.115879e+01 -1.328841e+00 3.310688e+03 6.200233e+02
## gmat_qpc -1.192655e+01 -1.053769e+00 6.200233e+02 2.210731e+02
## gmat_vpc -2.763643e+00 5.463758e-01 7.260006e+02 3.814826e+01
## gmat_tpc -8.839978e+00 -4.908960e-02 6.839911e+02 1.357997e+02
## s_avg 2.116874e-01 2.096227e-02 2.480257e+00 -1.691233e-01
## f_avg -3.399348e-02 2.082698e-02 3.154688e+00 5.753854e-01
## quarter -2.045935e-01 -6.414267e-02 -5.891153e+00 6.001979e-01
## work_yrs 1.029494e+01 -1.580172e-02 -3.391634e+01 -1.137186e+01
## frstlang 6.796610e-02 2.138980e-04 -2.499933e+00 6.646346e-01
## salary -1.183042e+04 1.518264e+03 -1.611600e+05 -3.335823e+04
## satis -1.763499e+02 -8.780808e+00 1.765263e+03 3.348371e+02
## gmat_vpc gmat_tpc s_avg f_avg
## age -2.7636427 -8.8399775 0.21168739 -0.03399348
## sex 0.5463758 -0.0490896 0.02096227 0.02082698
## gmat_tot 726.0006417 683.9910698 2.48025721 3.15468838
## gmat_qpc 38.1482581 135.7996845 -0.16912329 0.57538542
## gmat_vpc 284.2481217 157.4932488 1.31357023 0.67207000
## gmat_tpc 157.4932488 196.6057057 0.62710008 0.58698618
## s_avg 1.3135702 0.6271001 0.14521760 0.11016898
## f_avg 0.6720700 0.5869862 0.11016898 0.27567237
## quarter -3.2676666 -1.2923719 -0.32237213 -0.26080880
## work_yrs -3.6181653 -7.8575172 0.15926392 -0.06628700
## frstlang -2.1145691 -0.4663244 -0.01671372 -0.00626026
## salary -5273.8523836 3522.7500067 2831.60098580 787.65597177
## satis 392.3562739 484.2466779 -4.62884495 2.12532927
## quarter work_yrs frstlang salary
## age -2.045935e-01 10.29493864 6.796610e-02 -1.183042e+04
## sex -6.414267e-02 -0.01580172 2.138980e-04 1.518264e+03
## gmat_tot -5.891153e+00 -33.91633914 -2.499933e+00 -1.611600e+05
## gmat_qpc 6.001979e-01 -11.37186171 6.646346e-01 -3.335823e+04
## gmat_vpc -3.267667e+00 -3.61816529 -2.114569e+00 -5.273852e+03
## gmat_tpc -1.292372e+00 -7.85751718 -4.663244e-01 3.522750e+03
## s_avg -3.223721e-01 0.15926392 -1.671372e-02 2.831601e+03
## f_avg -2.608088e-01 -0.06628700 -6.260260e-03 7.876560e+02
## quarter 1.232119e+00 -0.30866822 3.553381e-02 -9.296214e+03
## work_yrs -3.086682e-01 10.44882490 -2.898318e-02 1.486147e+03
## frstlang 3.553381e-02 -0.02898318 1.035266e-01 -1.419586e+03
## salary -9.296214e+03 1486.14704152 -1.419586e+03 2.596062e+09
## satis -5.227133e-03 -131.24080907 9.484532e+00 -6.347115e+06
## satis
## age -1.763499e+02
## sex -8.780808e+00
## gmat_tot 1.765263e+03
## gmat_qpc 3.348371e+02
## gmat_vpc 3.923563e+02
## gmat_tpc 4.842467e+02
## s_avg -4.628845e+00
## f_avg 2.125329e+00
## quarter -5.227133e-03
## work_yrs -1.312408e+02
## frstlang 9.484532e+00
## salary -6.347115e+06
## satis 1.380974e+05
Recoding the numeric codes in columns ‘sex’ and ‘frstlang’ (first language) as labelled factors.
# Swap the sex codes for labels (1 -> "Male", 2 -> "Female"), then store the
# column as a factor. The first replacement turns the column into character,
# so the second comparison relies on R coercing 2 to "2".
mba.df$sex <- replace(mba.df$sex, mba.df$sex == 1, "Male")
mba.df$sex <- replace(mba.df$sex, mba.df$sex == 2, "Female")
mba.df$sex <- factor(mba.df$sex)

# Same treatment for first language (1 -> "English", 2 -> "Other").
mba.df$frstlang <- replace(mba.df$frstlang, mba.df$frstlang == 1, "English")
mba.df$frstlang <- replace(mba.df$frstlang, mba.df$frstlang == 2, "Other")
mba.df$frstlang <- factor(mba.df$frstlang)
Plotting a corrgram of the numeric variables in the data frame (corrgram ignores the factor columns ‘sex’ and ‘frstlang’)
library(corrgram)
# Correlation overview of mba.df: panel.shade below the diagonal, panel.pie
# above it, and panel.txt for the variable names.
corrgram(
  mba.df,
  order = TRUE,
  lower.panel = panel.shade,
  upper.panel = panel.pie,
  text.panel = panel.txt,
  main = "Corrgram of Relations between MBA salaries and various factors"
)
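Creating four separate data frames (one per salary coding) and a combined data frame of the respondents whose placement status is known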
# Placed and disclosed a salary: salary values above 1000.
placed.df <- mba.df[!is.na(mba.df$salary) & mba.df$salary > 1000, ]
View(placed.df)

# Not placed: salary recorded as 0.
notPlaced.df <- mba.df[mba.df$salary %in% 0, ]
View(notPlaced.df)

# Placed but salary not disclosed: salary recorded as the code 999.
notDisclosedSalary.df <- mba.df[mba.df$salary %in% 999, ]
View(notDisclosedSalary.df)

# Did not answer the survey: salary recorded as the code 998.
notAnsweredSurvey.df <- mba.df[mba.df$salary %in% 998, ]
View(notAnsweredSurvey.df)

# Everyone whose placement outcome is known; survey non-respondents are
# left out.
knownMBA.df <- rbind(
  placed.df,
  notDisclosedSalary.df,
  notPlaced.df
)
View(knownMBA.df)
Creating a new column ‘GotPlaced’ with values TRUE and FALSE
# Flag rows whose salary is above 1000 as placed (logical column first).
# NOTE(review): rows carrying the 999 "not disclosed" code come out FALSE here
# even though they are described elsewhere in this file as placed - confirm
# that this is intended.
knownMBA.df[["GotPlaced"]] <- knownMBA.df[["salary"]] > 1000
View(knownMBA.df)

# Store the flag as a two-level factor and inspect the resulting structure.
knownMBA.df[["GotPlaced"]] <- factor(knownMBA.df[["GotPlaced"]])
str(knownMBA.df)
## 'data.frame': 228 obs. of 14 variables:
## $ age : int 22 27 25 25 27 28 24 25 25 25 ...
## $ sex : Factor w/ 2 levels "Female","Male": 1 1 1 1 2 1 2 1 1 2 ...
## $ gmat_tot : int 660 700 680 650 710 620 670 560 530 650 ...
## $ gmat_qpc : int 90 94 87 82 96 52 84 52 50 79 ...
## $ gmat_vpc : int 92 98 96 91 96 98 96 81 62 93 ...
## $ gmat_tpc : int 94 98 96 93 98 87 95 72 61 93 ...
## $ s_avg : num 3.5 3.3 3.5 3.4 3.3 3.4 3.3 3.3 3.6 3.3 ...
## $ f_avg : num 3.75 3.25 2.67 3.25 3.5 3.75 3.25 3.5 3.67 3.5 ...
## $ quarter : int 1 1 1 1 1 1 1 1 1 1 ...
## $ work_yrs : int 1 2 2 3 2 5 0 1 3 1 ...
## $ frstlang : Factor w/ 2 levels "English","Other": 1 1 1 1 1 1 1 1 1 1 ...
## $ salary : int 85000 85000 86000 88000 92000 93000 95000 95000 95000 96000 ...
## $ satis : int 5 6 5 7 6 5 4 5 3 7 ...
## $ GotPlaced: Factor w/ 2 levels "FALSE","TRUE": 2 2 2 2 2 2 2 2 2 2 ...
library(lattice)
# Plot only respondents with a reported salary. In the full mba.df the salary
# column also holds 0 (not placed) and the 998/999 survey codes; these are not
# salaries and would pile up at the left edge of a "starting salary"
# distribution.
histogram(~salary, data = placed.df,
          main = "Distribution of MBA's Starting Salary",
          xlab = "MBA's Starting Salary", col = "grey")
# Cross-tabulate salary (rows) against age (columns) for placed respondents;
# the outer parentheses print the table as well as storing it.
(age <- table(placed.df[["salary"]], placed.df[["age"]]))
##
## 22 23 24 25 26 27 28 29 30 31 32 33 34 39 40
## 64000 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0
## 77000 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0
## 78256 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0
## 82000 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
## 85000 1 0 0 1 1 1 0 0 0 0 0 0 0 0 0
## 86000 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0
## 88000 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0
## 88500 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0
## 90000 0 0 0 2 0 1 0 0 0 0 0 0 0 0 0
## 92000 0 0 0 2 0 1 0 0 0 0 0 0 0 0 0
## 93000 0 0 0 1 0 0 1 0 0 1 0 0 0 0 0
## 95000 0 0 1 5 0 0 0 1 0 0 0 0 0 0 0
## 96000 0 0 1 1 2 0 0 0 0 0 0 0 0 0 0
## 96500 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0
## 97000 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0
## 98000 0 1 3 2 1 1 1 1 0 0 0 0 0 0 0
## 99000 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0
## 100000 0 1 4 1 1 1 0 0 0 1 0 0 0 0 0
## 100400 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0
## 101000 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0
## 101100 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0
## 101600 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
## 102500 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
## 103000 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0
## 104000 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0
## 105000 0 1 1 2 3 1 0 0 1 1 0 0 1 0 0
## 106000 0 0 0 0 0 0 0 1 2 0 0 0 0 0 0
## 107000 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
## 107300 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
## 107500 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0
## 108000 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0
## 110000 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0
## 112000 0 0 1 0 0 0 0 1 0 0 0 0 0 1 0
## 115000 0 0 1 1 0 3 0 0 0 0 0 0 0 0 0
## 118000 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
## 120000 0 0 0 0 0 1 1 0 2 0 0 0 0 0 0
## 126710 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
## 130000 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
## 145800 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0
## 146000 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
## 162000 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0
## 220000 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
# Count of placed respondents for each salary / sex combination; the outer
# parentheses print the table as well as storing it.
(mytable <- xtabs(~ salary + sex, data = placed.df))
## sex
## salary Female Male
## 64000 1 0
## 77000 0 1
## 78256 1 0
## 82000 1 0
## 85000 3 1
## 86000 2 0
## 88000 1 0
## 88500 0 1
## 90000 0 3
## 92000 1 2
## 93000 1 2
## 95000 3 4
## 96000 1 3
## 96500 0 1
## 97000 0 2
## 98000 4 6
## 99000 1 0
## 100000 5 4
## 100400 0 1
## 101000 2 0
## 101100 0 1
## 101600 0 1
## 102500 0 1
## 103000 0 1
## 104000 0 2
## 105000 0 11
## 106000 1 2
## 107000 0 1
## 107300 0 1
## 107500 0 1
## 108000 0 2
## 110000 1 0
## 112000 0 3
## 115000 0 5
## 118000 0 1
## 120000 1 3
## 126710 0 1
## 130000 0 1
## 145800 0 1
## 146000 0 1
## 162000 0 1
## 220000 1 0
# Salary summed within each first-language group (a left-hand side in xtabs
# is totalled, not counted). Stored and printed in one step.
# NOTE(review): knownMBA.df includes rows with salary 0 and the 999 code, so
# these totals mix real salaries with codes.
(language <- xtabs(salary ~ frstlang, data = knownMBA.df))
## frstlang
## English Other
## 9793840 853291
# Salary summed within each years-of-work-experience group, placed
# respondents only. Stored and printed in one step.
(work <- xtabs(salary ~ work_yrs, data = placed.df))
## work_yrs
## 0 1 2 3 4 5 6 7 8
## 95000 828256 3711600 2134710 1160000 722000 741500 98000 420100
## 10 15 16
## 118000 366000 217000
# GMAT total score summed over the placed respondents at each salary value.
# Stored and printed in one step.
(score <- xtabs(gmat_tot ~ salary, data = placed.df))
## salary
## 64000 77000 78256 82000 85000 86000 88000 88500 90000 92000
## 560 660 520 670 2700 1310 650 620 1860 1990
## 93000 95000 96000 96500 97000 98000 99000 100000 100400 101000
## 1770 4210 2400 500 1200 6110 580 5550 630 1220
## 101100 101600 102500 103000 104000 105000 106000 107000 107300 107500
## 660 630 670 620 1090 6530 1980 600 660 630
## 108000 110000 112000 115000 118000 120000 126710 130000 145800 146000
## 1170 640 1950 3070 620 2570 550 650 620 630
## 162000 220000
## 700 500
# Mean of the salary column per sex.
# NOTE(review): knownMBA.df contains rows with salary 0 (not placed) and the
# 999 "not disclosed" code, so these are not average salaries of placed MBAs.
aggregate(x = knownMBA.df$salary, by = list(knownMBA.df$sex), FUN = mean)
## Group.1 x
## 1 Female 51851.71
## 2 Male 44898.70
# Mean GMAT total score per sex.
aggregate(x = knownMBA.df$gmat_tot, by = list(knownMBA.df$sex), FUN = mean)
## Group.1 x
## 1 Female 612.8814
## 2 Male 618.8757
# Mean salary of placed respondents per years of work experience.
aggregate(x = placed.df$salary, by = list(placed.df$work_yrs), FUN = mean)
## Group.1 x
## 1 0 95000.00
## 2 1 103532.00
## 3 2 97673.68
## 4 3 101652.86
## 5 4 105454.55
## 6 5 103142.86
## 7 6 105928.57
## 8 7 98000.00
## 9 8 105025.00
## 10 10 118000.00
## 11 15 183000.00
## 12 16 108500.00
# Mean salary of placed respondents per age.
aggregate(x = placed.df$salary, by = list(placed.df$age), FUN = mean)
## Group.1 x
## 1 22 85000.00
## 2 23 91651.20
## 3 24 101518.75
## 4 25 99086.96
## 5 26 101665.00
## 6 27 102214.29
## 7 28 103625.00
## 8 29 102083.33
## 9 30 109916.67
## 10 31 100500.00
## 11 32 107300.00
## 12 33 118000.00
## 13 34 105000.00
## 14 39 112000.00
## 15 40 183000.00
# Mean GMAT total score per age.
aggregate(x = knownMBA.df$gmat_tot, by = list(knownMBA.df$age), FUN = mean)
## Group.1 x
## 1 22 630.0000
## 2 23 622.5000
## 3 24 619.6774
## 4 25 628.6842
## 5 26 595.8621
## 6 27 643.4286
## 7 28 602.9412
## 8 29 620.0000
## 9 30 630.9091
## 10 31 580.0000
## 11 32 608.7500
## 12 33 620.0000
## 13 34 597.5000
## 14 35 580.0000
## 15 36 590.0000
## 16 37 560.0000
## 17 39 650.0000
## 18 40 565.0000
## 19 42 650.0000
## 20 43 555.0000
## 21 48 590.0000
# Pearson chi-squared test on the salary-by-age contingency table.
# NOTE(review): the table is very sparse (mostly 0/1 cells), which is what the
# "approximation may be incorrect" warning in the output is about.
chisq.test(x = age)
## Warning in chisq.test(age): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: age
## X-squared = 717.62, df = 574, p-value = 3.929e-05
# Pearson chi-squared test on the salary-by-sex contingency table.
# NOTE(review): sparse cells again trigger the approximation warning.
chisq.test(x = mytable)
## Warning in chisq.test(mytable): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: mytable
## X-squared = 52.681, df = 41, p-value = 0.1045
# Chi-squared test on the one-dimensional table of salary totals per first
# language; the output reports it as a test "for given probabilities".
# NOTE(review): the cells are summed salaries, not counts, so the statistic
# is not a meaningful test of association - revisit the choice of test.
chisq.test(x = language)
##
## Chi-squared test for given probabilities
##
## data: language
## X-squared = 7507500, df = 1, p-value < 2.2e-16
# Chi-squared test on the one-dimensional table of salary totals per years of
# work experience; the output reports it as a test "for given probabilities".
# NOTE(review): the cells are summed salaries, not counts, so the statistic
# is not a meaningful test of association - revisit the choice of test.
chisq.test(x = work)
##
## Chi-squared test for given probabilities
##
## data: work
## X-squared = 14068000, df = 11, p-value < 2.2e-16
# Linear model of salary for placed respondents on experience, age, sex,
# first language and satisfaction score.
Model <- salary ~ work_yrs + age + sex + frstlang + satis
fit <- lm(formula = Model, data = placed.df)
summary(object = fit)
##
## Call:
## lm(formula = Model, data = placed.df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -25863 -9753 -834 5571 78637
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 64196.3 26797.8 2.396 0.0185 *
## work_yrs 850.8 1117.2 0.762 0.4482
## age 1719.3 1089.4 1.578 0.1178
## sexMale 4999.4 3420.1 1.462 0.1470
## frstlangOther 10459.5 6775.8 1.544 0.1259
## satis -2471.2 1978.7 -1.249 0.2147
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15460 on 97 degrees of freedom
## Multiple R-squared: 0.2878, Adjusted R-squared: 0.2511
## F-statistic: 7.839 on 5 and 97 DF, p-value: 3.121e-06
# Linear model of GMAT total score on age, sex, experience and first language
# for every respondent with a known placement outcome.
Model1 <- gmat_tot ~ age + sex + work_yrs + frstlang
fit1 <- lm(formula = Model1, data = knownMBA.df)
summary(object = fit1)
##
## Call:
## lm(formula = Model1, data = knownMBA.df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -168.958 -47.435 -1.049 38.001 168.977
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 579.817 43.916 13.203 <2e-16 ***
## age 2.066 1.847 1.119 0.2645
## sexMale 6.207 8.463 0.733 0.4641
## work_yrs -5.195 2.107 -2.465 0.0144 *
## frstlangOther -29.556 12.286 -2.406 0.0170 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 55.9 on 223 degrees of freedom
## Multiple R-squared: 0.06419, Adjusted R-squared: 0.0474
## F-statistic: 3.824 on 4 and 223 DF, p-value: 0.005