# Load the MBA starting-salaries data from the current working directory.
Salary.df <- read.csv("MBA Starting Salaries Data.csv")
# View() opens a spreadsheet-style viewer; skip it in non-interactive runs
# (Rscript, knitting) where a viewer may not be available.
if (interactive()) {
  View(Salary.df)
}
# Per-column minimum, quartiles, mean and maximum.
summary(Salary.df)
## age sex gmat_tot gmat_qpc
## Min. :22.00 Min. :1.000 Min. :450.0 Min. :28.00
## 1st Qu.:25.00 1st Qu.:1.000 1st Qu.:580.0 1st Qu.:72.00
## Median :27.00 Median :1.000 Median :620.0 Median :83.00
## Mean :27.36 Mean :1.248 Mean :619.5 Mean :80.64
## 3rd Qu.:29.00 3rd Qu.:1.000 3rd Qu.:660.0 3rd Qu.:93.00
## Max. :48.00 Max. :2.000 Max. :790.0 Max. :99.00
## gmat_vpc gmat_tpc s_avg f_avg
## Min. :16.00 Min. : 0.0 Min. :2.000 Min. :0.000
## 1st Qu.:71.00 1st Qu.:78.0 1st Qu.:2.708 1st Qu.:2.750
## Median :81.00 Median :87.0 Median :3.000 Median :3.000
## Mean :78.32 Mean :84.2 Mean :3.025 Mean :3.062
## 3rd Qu.:91.00 3rd Qu.:94.0 3rd Qu.:3.300 3rd Qu.:3.250
## Max. :99.00 Max. :99.0 Max. :4.000 Max. :4.000
## quarter work_yrs frstlang salary
## Min. :1.000 Min. : 0.000 Min. :1.000 Min. : 0
## 1st Qu.:1.250 1st Qu.: 2.000 1st Qu.:1.000 1st Qu.: 0
## Median :2.000 Median : 3.000 Median :1.000 Median : 999
## Mean :2.478 Mean : 3.872 Mean :1.117 Mean : 39026
## 3rd Qu.:3.000 3rd Qu.: 4.000 3rd Qu.:1.000 3rd Qu.: 97000
## Max. :4.000 Max. :22.000 Max. :2.000 Max. :220000
## satis
## Min. : 1.0
## 1st Qu.: 5.0
## Median : 6.0
## Mean :172.2
## 3rd Qu.: 7.0
## Max. :998.0
library(psych)
# Extended descriptive statistics (n, mean, sd, median, trimmed mean, mad,
# min/max, range, skew, kurtosis, se) for every column.
psych::describe(Salary.df)
## vars n mean sd median trimmed mad min max
## age 1 274 27.36 3.71 27 26.76 2.97 22 48
## sex 2 274 1.25 0.43 1 1.19 0.00 1 2
## gmat_tot 3 274 619.45 57.54 620 618.86 59.30 450 790
## gmat_qpc 4 274 80.64 14.87 83 82.31 14.83 28 99
## gmat_vpc 5 274 78.32 16.86 81 80.33 14.83 16 99
## gmat_tpc 6 274 84.20 14.02 87 86.12 11.86 0 99
## s_avg 7 274 3.03 0.38 3 3.03 0.44 2 4
## f_avg 8 274 3.06 0.53 3 3.09 0.37 0 4
## quarter 9 274 2.48 1.11 2 2.47 1.48 1 4
## work_yrs 10 274 3.87 3.23 3 3.29 1.48 0 22
## frstlang 11 274 1.12 0.32 1 1.02 0.00 1 2
## salary 12 274 39025.69 50951.56 999 33607.86 1481.12 0 220000
## satis 13 274 172.18 371.61 6 91.50 1.48 1 998
## range skew kurtosis se
## age 26 2.16 6.45 0.22
## sex 1 1.16 -0.66 0.03
## gmat_tot 340 -0.01 0.06 3.48
## gmat_qpc 71 -0.92 0.30 0.90
## gmat_vpc 83 -1.04 0.74 1.02
## gmat_tpc 99 -2.28 9.02 0.85
## s_avg 2 -0.06 -0.38 0.02
## f_avg 4 -2.08 10.85 0.03
## quarter 3 0.02 -1.35 0.07
## work_yrs 22 2.78 9.80 0.20
## frstlang 1 2.37 3.65 0.02
## salary 220000 0.70 -1.05 3078.10
## satis 997 1.77 1.13 22.45
# Compact structure listing: dimensions, column types and leading values.
utils::str(Salary.df)
## 'data.frame': 274 obs. of 13 variables:
## $ age : int 23 24 24 24 24 24 25 25 25 25 ...
## $ sex : int 2 1 1 1 2 1 1 2 1 1 ...
## $ gmat_tot: int 620 610 670 570 710 640 610 650 630 680 ...
## $ gmat_qpc: int 77 90 99 56 93 82 89 88 79 99 ...
## $ gmat_vpc: int 87 71 78 81 98 89 74 89 91 81 ...
## $ gmat_tpc: int 87 87 95 75 98 91 87 92 89 96 ...
## $ s_avg : num 3.4 3.5 3.3 3.3 3.6 3.9 3.4 3.3 3.3 3.45 ...
## $ f_avg : num 3 4 3.25 2.67 3.75 3.75 3.5 3.75 3.25 3.67 ...
## $ quarter : int 1 1 1 1 1 1 1 1 1 1 ...
## $ work_yrs: int 2 2 2 1 2 2 2 2 2 2 ...
## $ frstlang: int 1 1 1 1 1 1 1 1 2 1 ...
## $ salary : int 0 0 0 0 999 0 0 0 999 998 ...
## $ satis : int 7 6 6 7 5 6 5 6 4 998 ...
# Mean, median and standard deviation of the main numeric columns.
# NOTE(review): salary and satis contain coded values (998/999 show up in the
# str() output and satis peaks at 998), so their figures below are not on the
# natural scale -- confirm how those codes should be handled.

# Means
mean(Salary.df[["age"]])
## [1] 27.35766
mean(Salary.df[["s_avg"]])
## [1] 3.025401
mean(Salary.df[["f_avg"]])
## [1] 3.061533
mean(Salary.df[["work_yrs"]])
## [1] 3.872263
mean(Salary.df[["salary"]])
## [1] 39025.69
mean(Salary.df[["satis"]])
## [1] 172.1788
# Medians
median(Salary.df[["age"]])
## [1] 27
median(Salary.df[["s_avg"]])
## [1] 3
median(Salary.df[["f_avg"]])
## [1] 3
median(Salary.df[["work_yrs"]])
## [1] 3
median(Salary.df[["salary"]])
## [1] 999
median(Salary.df[["satis"]])
## [1] 6
# Standard deviations
sd(Salary.df[["age"]])
## [1] 3.710666
sd(Salary.df[["s_avg"]])
## [1] 0.3810743
sd(Salary.df[["f_avg"]])
## [1] 0.5250451
sd(Salary.df[["work_yrs"]])
## [1] 3.232464
sd(Salary.df[["salary"]])
## [1] 50951.56
sd(Salary.df[["satis"]])
## [1] 371.6146
# Horizontal box plots, one per variable, to inspect spread and outliers.
boxplot(
  Salary.df[["age"]],
  main = "Visualisation of age",
  horizontal = TRUE
)
boxplot(
  Salary.df[["gmat_tot"]],
  main = "Visualisation of GMAT Total Score",
  horizontal = TRUE
)
boxplot(
  Salary.df[["work_yrs"]],
  main = "Visualisation of number of work years",
  horizontal = TRUE
)
boxplot(
  Salary.df[["salary"]],
  main = "Visualisation of salary",
  horizontal = TRUE
)
library(car)
##
## Attaching package: 'car'
## The following object is masked from 'package:psych':
##
## logit
# Pairwise scatterplots of the main variables, histograms on the diagonal.
# Calls scatterplotMatrix() directly: the scatterplot.matrix() spelling is
# deprecated in car and only forwarded here with a warning.
# NOTE(review): car >= 3.0 expects diagonal = list(method = "histogram");
# switch to that form if the installed car version is 3.x.
scatterplotMatrix(
  ~ age + sex + gmat_tot + s_avg + f_avg + quarter + work_yrs + frstlang +
    salary + satis,
  data = Salary.df,
  cex = 0.6,
  diagonal = "histogram"
)
library(corrgram)
# Correlogram of every column: shaded cells below the diagonal, pie glyphs
# above it, variable names on the diagonal. order = TRUE lets corrgram
# reorder the variables; TRUE is spelled out because T can be reassigned.
corrgram(
  Salary.df,
  order = TRUE,
  text.panel = panel.txt,
  lower.panel = panel.shade,
  upper.panel = panel.pie,
  main = "Corrgram of all variables"
)
# Covariance matrix across all columns of the data frame.
stats::cov(Salary.df)
## age sex gmat_tot gmat_qpc
## age 1.376904e+01 -4.513248e-02 -3.115879e+01 -1.192655e+01
## sex -4.513248e-02 1.872677e-01 -1.328841e+00 -1.053769e+00
## gmat_tot -3.115879e+01 -1.328841e+00 3.310688e+03 6.200233e+02
## gmat_qpc -1.192655e+01 -1.053769e+00 6.200233e+02 2.210731e+02
## gmat_vpc -2.763643e+00 5.463758e-01 7.260006e+02 3.814826e+01
## gmat_tpc -8.839978e+00 -4.908960e-02 6.839911e+02 1.357997e+02
## s_avg 2.116874e-01 2.096227e-02 2.480257e+00 -1.691233e-01
## f_avg -3.399348e-02 2.082698e-02 3.154688e+00 5.753854e-01
## quarter -2.045935e-01 -6.414267e-02 -5.891153e+00 6.001979e-01
## work_yrs 1.029494e+01 -1.580172e-02 -3.391634e+01 -1.137186e+01
## frstlang 6.796610e-02 2.138980e-04 -2.499933e+00 6.646346e-01
## salary -1.183042e+04 1.518264e+03 -1.611600e+05 -3.335823e+04
## satis -1.763499e+02 -8.780808e+00 1.765263e+03 3.348371e+02
## gmat_vpc gmat_tpc s_avg f_avg
## age -2.7636427 -8.8399775 0.21168739 -0.03399348
## sex 0.5463758 -0.0490896 0.02096227 0.02082698
## gmat_tot 726.0006417 683.9910698 2.48025721 3.15468838
## gmat_qpc 38.1482581 135.7996845 -0.16912329 0.57538542
## gmat_vpc 284.2481217 157.4932488 1.31357023 0.67207000
## gmat_tpc 157.4932488 196.6057057 0.62710008 0.58698618
## s_avg 1.3135702 0.6271001 0.14521760 0.11016898
## f_avg 0.6720700 0.5869862 0.11016898 0.27567237
## quarter -3.2676666 -1.2923719 -0.32237213 -0.26080880
## work_yrs -3.6181653 -7.8575172 0.15926392 -0.06628700
## frstlang -2.1145691 -0.4663244 -0.01671372 -0.00626026
## salary -5273.8523836 3522.7500067 2831.60098580 787.65597177
## satis 392.3562739 484.2466779 -4.62884495 2.12532927
## quarter work_yrs frstlang salary
## age -2.045935e-01 10.29493864 6.796610e-02 -1.183042e+04
## sex -6.414267e-02 -0.01580172 2.138980e-04 1.518264e+03
## gmat_tot -5.891153e+00 -33.91633914 -2.499933e+00 -1.611600e+05
## gmat_qpc 6.001979e-01 -11.37186171 6.646346e-01 -3.335823e+04
## gmat_vpc -3.267667e+00 -3.61816529 -2.114569e+00 -5.273852e+03
## gmat_tpc -1.292372e+00 -7.85751718 -4.663244e-01 3.522750e+03
## s_avg -3.223721e-01 0.15926392 -1.671372e-02 2.831601e+03
## f_avg -2.608088e-01 -0.06628700 -6.260260e-03 7.876560e+02
## quarter 1.232119e+00 -0.30866822 3.553381e-02 -9.296214e+03
## work_yrs -3.086682e-01 10.44882490 -2.898318e-02 1.486147e+03
## frstlang 3.553381e-02 -0.02898318 1.035266e-01 -1.419586e+03
## salary -9.296214e+03 1486.14704152 -1.419586e+03 2.596062e+09
## satis -5.227133e-03 -131.24080907 9.484532e+00 -6.347115e+06
## satis
## age -1.763499e+02
## sex -8.780808e+00
## gmat_tot 1.765263e+03
## gmat_qpc 3.348371e+02
## gmat_vpc 3.923563e+02
## gmat_tpc 4.842467e+02
## s_avg -4.628845e+00
## f_avg 2.125329e+00
## quarter -5.227133e-03
## work_yrs -1.312408e+02
## frstlang 9.484532e+00
## salary -6.347115e+06
## satis 1.380974e+05
# var() on a data frame returns the column-by-column covariance matrix, so the
# output below repeats the cov() result; the diagonal holds the variances.
stats::var(Salary.df)
## age sex gmat_tot gmat_qpc
## age 1.376904e+01 -4.513248e-02 -3.115879e+01 -1.192655e+01
## sex -4.513248e-02 1.872677e-01 -1.328841e+00 -1.053769e+00
## gmat_tot -3.115879e+01 -1.328841e+00 3.310688e+03 6.200233e+02
## gmat_qpc -1.192655e+01 -1.053769e+00 6.200233e+02 2.210731e+02
## gmat_vpc -2.763643e+00 5.463758e-01 7.260006e+02 3.814826e+01
## gmat_tpc -8.839978e+00 -4.908960e-02 6.839911e+02 1.357997e+02
## s_avg 2.116874e-01 2.096227e-02 2.480257e+00 -1.691233e-01
## f_avg -3.399348e-02 2.082698e-02 3.154688e+00 5.753854e-01
## quarter -2.045935e-01 -6.414267e-02 -5.891153e+00 6.001979e-01
## work_yrs 1.029494e+01 -1.580172e-02 -3.391634e+01 -1.137186e+01
## frstlang 6.796610e-02 2.138980e-04 -2.499933e+00 6.646346e-01
## salary -1.183042e+04 1.518264e+03 -1.611600e+05 -3.335823e+04
## satis -1.763499e+02 -8.780808e+00 1.765263e+03 3.348371e+02
## gmat_vpc gmat_tpc s_avg f_avg
## age -2.7636427 -8.8399775 0.21168739 -0.03399348
## sex 0.5463758 -0.0490896 0.02096227 0.02082698
## gmat_tot 726.0006417 683.9910698 2.48025721 3.15468838
## gmat_qpc 38.1482581 135.7996845 -0.16912329 0.57538542
## gmat_vpc 284.2481217 157.4932488 1.31357023 0.67207000
## gmat_tpc 157.4932488 196.6057057 0.62710008 0.58698618
## s_avg 1.3135702 0.6271001 0.14521760 0.11016898
## f_avg 0.6720700 0.5869862 0.11016898 0.27567237
## quarter -3.2676666 -1.2923719 -0.32237213 -0.26080880
## work_yrs -3.6181653 -7.8575172 0.15926392 -0.06628700
## frstlang -2.1145691 -0.4663244 -0.01671372 -0.00626026
## salary -5273.8523836 3522.7500067 2831.60098580 787.65597177
## satis 392.3562739 484.2466779 -4.62884495 2.12532927
## quarter work_yrs frstlang salary
## age -2.045935e-01 10.29493864 6.796610e-02 -1.183042e+04
## sex -6.414267e-02 -0.01580172 2.138980e-04 1.518264e+03
## gmat_tot -5.891153e+00 -33.91633914 -2.499933e+00 -1.611600e+05
## gmat_qpc 6.001979e-01 -11.37186171 6.646346e-01 -3.335823e+04
## gmat_vpc -3.267667e+00 -3.61816529 -2.114569e+00 -5.273852e+03
## gmat_tpc -1.292372e+00 -7.85751718 -4.663244e-01 3.522750e+03
## s_avg -3.223721e-01 0.15926392 -1.671372e-02 2.831601e+03
## f_avg -2.608088e-01 -0.06628700 -6.260260e-03 7.876560e+02
## quarter 1.232119e+00 -0.30866822 3.553381e-02 -9.296214e+03
## work_yrs -3.086682e-01 10.44882490 -2.898318e-02 1.486147e+03
## frstlang 3.553381e-02 -0.02898318 1.035266e-01 -1.419586e+03
## salary -9.296214e+03 1486.14704152 -1.419586e+03 2.596062e+09
## satis -5.227133e-03 -131.24080907 9.484532e+00 -6.347115e+06
## satis
## age -1.763499e+02
## sex -8.780808e+00
## gmat_tot 1.765263e+03
## gmat_qpc 3.348371e+02
## gmat_vpc 3.923563e+02
## gmat_tpc 4.842467e+02
## s_avg -4.628845e+00
## f_avg 2.125329e+00
## quarter -5.227133e-03
## work_yrs -1.312408e+02
## frstlang 9.484532e+00
## salary -6.347115e+06
## satis 1.380974e+05
# Students with a usable starting salary.
# Besides 0, the salary column carries the values 998 and 999 (81 rows in the
# table recorded below), far below the smallest real figure (64000). They look
# like survey codes rather than salaries -- NOTE(review): confirm against the
# data dictionary. Filtering on `salary != 0` kept those rows and pulled every
# salary table, test and regression built on `placed` towards zero, so only
# rows above the codes are retained. which() keeps any NA salaries out.
# NOTE: the "##" outputs recorded further down were produced with the coded
# rows included and need to be regenerated.
placed <- Salary.df[which(Salary.df$salary > 999), ]
# Cross-tabulate salary against sex.
mytable1 <- xtabs(~ salary + sex, data = placed)
mytable1
## sex
## salary 1 2
## 998 37 9
## 999 30 5
## 64000 0 1
## 77000 1 0
## 78256 0 1
## 82000 0 1
## 85000 1 3
## 86000 0 2
## 88000 0 1
## 88500 1 0
## 90000 3 0
## 92000 2 1
## 93000 2 1
## 95000 4 3
## 96000 3 1
## 96500 1 0
## 97000 2 0
## 98000 6 4
## 99000 0 1
## 100000 4 5
## 100400 1 0
## 101000 0 2
## 101100 1 0
## 101600 1 0
## 102500 1 0
## 103000 1 0
## 104000 2 0
## 105000 11 0
## 106000 2 1
## 107000 1 0
## 107300 1 0
## 107500 1 0
## 108000 2 0
## 110000 0 1
## 112000 3 0
## 115000 5 0
## 118000 1 0
## 120000 3 1
## 126710 1 0
## 130000 1 0
## 145800 1 0
## 146000 1 0
## 162000 1 0
## 220000 0 1
# Contingency table of salary (rows) by age (columns) for `placed`.
mytable2 <- xtabs(formula = ~ salary + age, data = placed)
mytable2
## age
## salary 22 23 24 25 26 27 28 29 30 31 32 33 34 39 40
## 998 0 0 2 15 11 11 4 0 1 2 0 0 0 0 0
## 999 0 0 2 6 5 7 3 5 3 2 2 0 0 0 0
## 64000 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0
## 77000 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0
## 78256 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0
## 82000 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
## 85000 1 0 0 1 1 1 0 0 0 0 0 0 0 0 0
## 86000 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0
## 88000 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0
## 88500 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0
## 90000 0 0 0 2 0 1 0 0 0 0 0 0 0 0 0
## 92000 0 0 0 2 0 1 0 0 0 0 0 0 0 0 0
## 93000 0 0 0 1 0 0 1 0 0 1 0 0 0 0 0
## 95000 0 0 1 5 0 0 0 1 0 0 0 0 0 0 0
## 96000 0 0 1 1 2 0 0 0 0 0 0 0 0 0 0
## 96500 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0
## 97000 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0
## 98000 0 1 3 2 1 1 1 1 0 0 0 0 0 0 0
## 99000 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0
## 100000 0 1 4 1 1 1 0 0 0 1 0 0 0 0 0
## 100400 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0
## 101000 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0
## 101100 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0
## 101600 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
## 102500 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
## 103000 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0
## 104000 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0
## 105000 0 1 1 2 3 1 0 0 1 1 0 0 1 0 0
## 106000 0 0 0 0 0 0 0 1 2 0 0 0 0 0 0
## 107000 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
## 107300 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
## 107500 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0
## 108000 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0
## 110000 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0
## 112000 0 0 1 0 0 0 0 1 0 0 0 0 0 1 0
## 115000 0 0 1 1 0 3 0 0 0 0 0 0 0 0 0
## 118000 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
## 120000 0 0 0 0 0 1 1 0 2 0 0 0 0 0 0
## 126710 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
## 130000 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
## 145800 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0
## 146000 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
## 162000 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0
## 220000 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
# Contingency table of GMAT total score (rows) by sex (columns) for `placed`.
mytable3 <- xtabs(formula = ~ gmat_tot + sex, data = placed)
mytable3
## sex
## gmat_tot 1 2
## 450 0 1
## 460 1 0
## 500 2 1
## 520 0 1
## 530 1 1
## 540 2 0
## 550 4 0
## 560 9 4
## 570 9 2
## 580 8 3
## 590 5 1
## 600 14 3
## 610 7 2
## 620 12 4
## 630 14 3
## 640 4 2
## 650 7 4
## 660 8 3
## 670 8 5
## 680 7 2
## 690 4 0
## 700 2 1
## 710 5 1
## 720 1 1
## 730 1 0
## 740 3 0
## 790 1 0
# Contingency table of years of work experience (rows) by sex (columns)
# for `placed`.
mytable4 <- xtabs(formula = ~ work_yrs + sex, data = placed)
mytable4
## sex
## work_yrs 1 2
## 0 2 0
## 1 8 4
## 2 41 19
## 3 34 8
## 4 28 6
## 5 6 3
## 6 7 3
## 7 4 0
## 8 4 1
## 9 1 0
## 10 1 0
## 15 1 1
## 16 2 0
# Contingency table of salary (rows) by first language (columns) for `placed`.
mytable5 <- xtabs(formula = ~ salary + frstlang, data = placed)
mytable5
## frstlang
## salary 1 2
## 998 38 8
## 999 26 9
## 64000 1 0
## 77000 1 0
## 78256 1 0
## 82000 1 0
## 85000 4 0
## 86000 2 0
## 88000 1 0
## 88500 1 0
## 90000 3 0
## 92000 3 0
## 93000 3 0
## 95000 7 0
## 96000 4 0
## 96500 1 0
## 97000 2 0
## 98000 8 2
## 99000 0 1
## 100000 9 0
## 100400 1 0
## 101000 2 0
## 101100 1 0
## 101600 1 0
## 102500 1 0
## 103000 1 0
## 104000 1 1
## 105000 11 0
## 106000 3 0
## 107000 1 0
## 107300 0 1
## 107500 1 0
## 108000 2 0
## 110000 1 0
## 112000 3 0
## 115000 5 0
## 118000 0 1
## 120000 4 0
## 126710 1 0
## 130000 1 0
## 145800 1 0
## 146000 1 0
## 162000 1 0
## 220000 0 1
# Pearson chi-squared test of independence on the salary-by-sex table.
# Most cells hold 0 or 1 observations, hence the approximation warning.
chisq.test(mytable1)
## Warning in chisq.test(mytable1): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: mytable1
## X-squared = 64.319, df = 43, p-value = 0.0192
# Pearson chi-squared test of independence on the salary-by-age table.
# The table is very sparse, so the reported p-value is unreliable (see warning).
chisq.test(mytable2)
## Warning in chisq.test(mytable2): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: mytable2
## X-squared = 948.87, df = 602, p-value < 2.2e-16
# Pearson chi-squared test of independence on the GMAT-total-by-sex table
# built from `placed`.
chisq.test(mytable3)
## Warning in chisq.test(mytable3): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: mytable3
## X-squared = 17.414, df = 26, p-value = 0.896
# Pearson chi-squared test of independence on the work-years-by-sex table
# built from `placed`.
chisq.test(mytable4)
## Warning in chisq.test(mytable4): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: mytable4
## X-squared = 8.2662, df = 12, p-value = 0.764
# Pearson chi-squared test of independence on the salary-by-first-language
# table.
chisq.test(mytable5)
## Warning in chisq.test(mytable5): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: mytable5
## X-squared = 48.273, df = 43, p-value = 0.2682
# Mean comparisons on the underlying columns of `placed`.
# The earlier calls passed the xtabs objects themselves to t.test(), which ran
# a one-sample test of whether the mean *cell count* differs from zero
# (e.g. df = 87 for the 44 x 2 salary-by-sex table). That says nothing about
# the variables, so each table is replaced by a test on the columns it was
# built from. The previously recorded output described the cell-count tests
# and has been removed; re-run to regenerate.

# Salary by sex (Welch two-sample t-test).
t.test(salary ~ sex, data = placed)
# Salary and age are both numeric, so test their correlation instead.
cor.test(~ salary + age, data = placed)
# GMAT total score by sex.
t.test(gmat_tot ~ sex, data = placed)
# Years of work experience by sex.
t.test(work_yrs ~ sex, data = placed)
# Salary by first language.
t.test(salary ~ frstlang, data = placed)
# Model 1: linear regression of salary on age, sex and work experience.
reg <- lm(formula = salary ~ age + sex + work_yrs, data = placed)
summary(reg)
##
## Call:
## lm(formula = salary ~ age + sex + work_yrs, data = placed)
##
## Residuals:
## Min 1Q Median 3Q Max
## -82407 -52377 22515 45293 117966
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 74699 62860 1.188 0.2363
## age -2005 2548 -0.787 0.4324
## sex 13777 8961 1.537 0.1259
## work_yrs 5673 2813 2.017 0.0452 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 51660 on 180 degrees of freedom
## Multiple R-squared: 0.04794, Adjusted R-squared: 0.03207
## F-statistic: 3.021 on 3 and 180 DF, p-value: 0.03112
# Model 2: linear regression of salary on GMAT total, sex, first language and
# quarter. `gmat_tot` was listed twice in the formula; R collapses duplicate
# terms, so the fitted coefficients are unchanged -- only the echoed call
# differs from the recorded output.
reg <- lm(salary ~ gmat_tot + sex + frstlang + quarter, data = placed)
summary(reg)
##
## Call:
## lm(formula = salary ~ gmat_tot + sex + gmat_tot + frstlang +
## quarter, data = placed)
##
## Residuals:
## Min 1Q Median 3Q Max
## -90378 -48397 15960 40122 170419
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 206446.17 50007.22 4.128 5.6e-05 ***
## gmat_tot -174.55 69.37 -2.516 0.0127 *
## sex 10419.93 8725.34 1.194 0.2340
## frstlang -27618.09 11267.00 -2.451 0.0152 *
## quarter -8798.76 3360.58 -2.618 0.0096 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 50340 on 179 degrees of freedom
## Multiple R-squared: 0.1012, Adjusted R-squared: 0.08115
## F-statistic: 5.041 on 4 and 179 DF, p-value: 0.0007144
# Model 3: linear regression of salary on age and work experience only.
reg <- lm(formula = salary ~ age + work_yrs, data = placed)
summary(reg)
##
## Call:
## lm(formula = salary ~ age + work_yrs, data = placed)
##
## Residuals:
## Min 1Q Median 3Q Max
## -87007 -54400 27484 44513 126316
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 104828 59952 1.749 0.0821 .
## age -2534 2534 -1.000 0.3186
## work_yrs 6014 2815 2.137 0.0340 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 51860 on 181 degrees of freedom
## Multiple R-squared: 0.03543, Adjusted R-squared: 0.02478
## F-statistic: 3.325 on 2 and 181 DF, p-value: 0.0382
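Comparing the three models above, the second one (salary ~ gmat_tot + sex + frstlang + quarter) fits best: it has the highest adjusted R-squared (0.081, against 0.032 and 0.025 for the other two).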
# Students whose recorded salary is exactly 0.
notplaced <- Salary.df[which(Salary.df$salary == 0), ]
# View() needs an interactive viewer; skip it when run as a script or knitted.
if (interactive()) {
  View(notplaced)
}
# GMAT total score by sex for `notplaced`.
# NOTE: this reuses the name mytable3 and replaces the table built from
# `placed`.
mytable3 <- xtabs(~ gmat_tot + sex, data = notplaced)
mytable3
## sex
## gmat_tot 1 2
## 450 1 0
## 480 1 0
## 510 2 0
## 530 2 1
## 540 2 1
## 550 3 1
## 560 3 5
## 570 5 2
## 580 4 0
## 590 3 0
## 600 3 0
## 610 7 2
## 620 2 2
## 630 4 1
## 640 3 3
## 650 3 2
## 660 3 0
## 670 4 0
## 680 2 1
## 700 1 1
## 710 3 1
## 720 2 0
## 730 1 0
## 740 1 0
## 750 1 0
## 760 1 0
# Years of work experience (rows) by sex (columns) for `notplaced`.
# NOTE: reuses the name mytable4, replacing the table built from `placed`.
mytable4 <- xtabs(formula = ~ work_yrs + sex, data = notplaced)
mytable4
## sex
## work_yrs 1 2
## 0 1 0
## 1 12 0
## 2 16 6
## 3 9 5
## 4 8 1
## 5 7 5
## 6 2 0
## 7 3 2
## 8 2 0
## 9 0 1
## 10 0 1
## 11 1 1
## 12 2 0
## 13 0 1
## 16 1 0
## 18 1 0
## 22 2 0
These contingency tables can be compared with the corresponding tables built from the `placed` data frame.
# Pearson chi-squared test of independence on the GMAT-total-by-sex table
# built from `notplaced`.
chisq.test(mytable3)
## Warning in chisq.test(mytable3): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: mytable3
## X-squared = 19.78, df = 25, p-value = 0.7583
# Pearson chi-squared test of independence on the work-years-by-sex table
# built from `notplaced`.
chisq.test(mytable4)
## Warning in chisq.test(mytable4): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: mytable4
## X-squared = 21.229, df = 16, p-value = 0.1699
# Re-read the raw file for modelling, treating empty strings as missing.
# The hard-coded setwd() to a user-specific Windows path was dropped: the same
# relative file name is already read at the top of this script, and a
# machine-specific path stops the script from running anywhere else.
Model.df <- read.csv(
  "MBA Starting Salaries Data.csv",
  header = TRUE,
  na.strings = ""
)
# Number of missing values per column; vapply() pins the result to exactly
# one integer per column.
vapply(Model.df, function(column) sum(is.na(column)), integer(1))
## age sex gmat_tot gmat_qpc gmat_vpc gmat_tpc s_avg f_avg
## 0 0 0 0 0 0 0 0
## quarter work_yrs frstlang salary satis
## 0 0 0 0 0
library(Amelia)
## Loading required package: Rcpp
## ##
## ## Amelia II: Multiple Imputation
## ## (Version 1.7.4, built: 2015-12-05)
## ## Copyright (C) 2005-2018 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##
# Visual map of missing versus observed cells in the modelling data.
Amelia::missmap(Model.df, main = "Missing values vs observed")
# Keep the first twelve columns for modelling (the 13th column is left out).
data <- Model.df[, 1:12]
# Turn salary into a 0/1 outcome: every positive value becomes 1.
# NOTE(review): this also maps the 998/999 values to 1 -- confirm that is the
# intended meaning of those codes.
data$salary[data$salary > 0] <- 1

# Hold out roughly 20% of the rows for testing.
# The previous fixed ranges (1:800 for train, 801:889 for test) exceed the 274
# rows of this data set: `train` was padded with 526 all-NA rows (reported by
# glm as "deleted due to missingness") and `test` consisted only of NA rows.
# Rows are sampled with a fixed seed so the split is reproducible and does not
# depend on the row order of the file.
set.seed(1)
n_rows <- nrow(data)
train_idx <- sample(seq_len(n_rows), size = floor(0.8 * n_rows))
train <- data[train_idx, ]
test <- data[-train_idx, ]

# Logistic regression of the 0/1 salary indicator on the GMAT quantitative
# percentile. The previously recorded summary came from the faulty split and
# has been removed; re-run to regenerate.
model <- glm(
  salary ~ gmat_qpc,
  family = binomial(link = "logit"),
  data = train
)
summary(model)