Read data
salary<-read.csv(file="MBA Starting Salaries Data.csv")
summary statistics
summary(salary)
## age sex gmat_tot gmat_qpc
## Min. :22.00 Min. :1.000 Min. :450.0 Min. :28.00
## 1st Qu.:25.00 1st Qu.:1.000 1st Qu.:580.0 1st Qu.:72.00
## Median :27.00 Median :1.000 Median :620.0 Median :83.00
## Mean :27.36 Mean :1.248 Mean :619.5 Mean :80.64
## 3rd Qu.:29.00 3rd Qu.:1.000 3rd Qu.:660.0 3rd Qu.:93.00
## Max. :48.00 Max. :2.000 Max. :790.0 Max. :99.00
## gmat_vpc gmat_tpc s_avg f_avg
## Min. :16.00 Min. : 0.0 Min. :2.000 Min. :0.000
## 1st Qu.:71.00 1st Qu.:78.0 1st Qu.:2.708 1st Qu.:2.750
## Median :81.00 Median :87.0 Median :3.000 Median :3.000
## Mean :78.32 Mean :84.2 Mean :3.025 Mean :3.062
## 3rd Qu.:91.00 3rd Qu.:94.0 3rd Qu.:3.300 3rd Qu.:3.250
## Max. :99.00 Max. :99.0 Max. :4.000 Max. :4.000
## quarter work_yrs frstlang salary
## Min. :1.000 Min. : 0.000 Min. :1.000 Min. : 0
## 1st Qu.:1.250 1st Qu.: 2.000 1st Qu.:1.000 1st Qu.: 0
## Median :2.000 Median : 3.000 Median :1.000 Median : 999
## Mean :2.478 Mean : 3.872 Mean :1.117 Mean : 39026
## 3rd Qu.:3.000 3rd Qu.: 4.000 3rd Qu.:1.000 3rd Qu.: 97000
## Max. :4.000 Max. :22.000 Max. :2.000 Max. :220000
## satis
## Min. : 1.0
## 1st Qu.: 5.0
## Median : 6.0
## Mean :172.2
## 3rd Qu.: 7.0
## Max. :998.0
standard deviation of 1)age
sd(salary$age)
## [1] 3.710666
2)salary
sd(salary$salary)
## [1] 50951.56
3)gmat_qpc,gmat_vpc,gmat_tpc,gmat_tot
sd(salary$gmat_qpc)
## [1] 14.86853
sd(salary$gmat_vpc)
## [1] 16.85966
sd(salary$gmat_tpc)
## [1] 14.02162
sd(salary$gmat_tot)
## [1] 57.53858
boxplot of Age
boxplot(salary$age, main="Age",
col=c("yellow"),horizontal=TRUE,
xlab="Age (Years)" )
boxplot of gmat_qpc
boxplot(salary$gmat_qpc, main="gmat_qpc",
col=c("green"),horizontal=TRUE,
xlab="percentile" )
boxplot of gmat_tpc
boxplot(salary$gmat_tpc, main="gmat_tpc",
col=c("blue"),horizontal=TRUE,
xlab="percentile" )
boxplot of gmat_vpc
boxplot(salary$gmat_vpc, main="gmat_vpc",
col=c("grey"),horizontal=TRUE,
xlab="percentile" )
boxplot of gmat_tot
boxplot(salary$gmat_tot, main="gmat_tot",
col=c("pink"),horizontal=TRUE,
xlab="gmat_tot" )
boxplot of spring MBA avg
boxplot(salary$s_avg, main="spring MBA avg",
col=c("red"),horizontal=TRUE,
xlab="average" )
boxplot of fail MBA avg
boxplot(salary$f_avg, main="fail MBA avg",
col=c("light blue"),horizontal=TRUE,
xlab="average" )
boxplot of starting salary
boxplot(salary$salary, main="salary",
col=c("yellow"),horizontal=TRUE,
xlab="amount" )
boxplot of work years
boxplot(salary$work_yrs, main="work years",
col=c("maroon"),horizontal=TRUE,
xlab="Years" )
scatterplotmatrix of gmat_tpc,salary,age,
library(car)
scatterplotMatrix(formula = ~ gmat_tpc + salary + age, cex=0.6,
data=salary, diagonal="histogram")
scatterplotmatrix of f_avg,s_avg,salary,age
library(car)
scatterplotMatrix(formula = ~ s_avg + f_avg + age + salary, cex=0.5,
data=salary, diagonal="histogram")
corrgram of intercorrelations
library(corrgram)
corrgram(salary, order=FALSE,
lower.panel=panel.shade,
upper.panel=panel.pie,
diag.panel=panel.minmax,
text.panel=panel.txt,
main="Corrgram of salary intercorrelations")
varience matrix of given dataset
var(salary)
## age sex gmat_tot gmat_qpc
## age 1.376904e+01 -4.513248e-02 -3.115879e+01 -1.192655e+01
## sex -4.513248e-02 1.872677e-01 -1.328841e+00 -1.053769e+00
## gmat_tot -3.115879e+01 -1.328841e+00 3.310688e+03 6.200233e+02
## gmat_qpc -1.192655e+01 -1.053769e+00 6.200233e+02 2.210731e+02
## gmat_vpc -2.763643e+00 5.463758e-01 7.260006e+02 3.814826e+01
## gmat_tpc -8.839978e+00 -4.908960e-02 6.839911e+02 1.357997e+02
## s_avg 2.116874e-01 2.096227e-02 2.480257e+00 -1.691233e-01
## f_avg -3.399348e-02 2.082698e-02 3.154688e+00 5.753854e-01
## quarter -2.045935e-01 -6.414267e-02 -5.891153e+00 6.001979e-01
## work_yrs 1.029494e+01 -1.580172e-02 -3.391634e+01 -1.137186e+01
## frstlang 6.796610e-02 2.138980e-04 -2.499933e+00 6.646346e-01
## salary -1.183042e+04 1.518264e+03 -1.611600e+05 -3.335823e+04
## satis -1.763499e+02 -8.780808e+00 1.765263e+03 3.348371e+02
## gmat_vpc gmat_tpc s_avg f_avg
## age -2.7636427 -8.8399775 0.21168739 -0.03399348
## sex 0.5463758 -0.0490896 0.02096227 0.02082698
## gmat_tot 726.0006417 683.9910698 2.48025721 3.15468838
## gmat_qpc 38.1482581 135.7996845 -0.16912329 0.57538542
## gmat_vpc 284.2481217 157.4932488 1.31357023 0.67207000
## gmat_tpc 157.4932488 196.6057057 0.62710008 0.58698618
## s_avg 1.3135702 0.6271001 0.14521760 0.11016898
## f_avg 0.6720700 0.5869862 0.11016898 0.27567237
## quarter -3.2676666 -1.2923719 -0.32237213 -0.26080880
## work_yrs -3.6181653 -7.8575172 0.15926392 -0.06628700
## frstlang -2.1145691 -0.4663244 -0.01671372 -0.00626026
## salary -5273.8523836 3522.7500067 2831.60098580 787.65597177
## satis 392.3562739 484.2466779 -4.62884495 2.12532927
## quarter work_yrs frstlang salary
## age -2.045935e-01 10.29493864 6.796610e-02 -1.183042e+04
## sex -6.414267e-02 -0.01580172 2.138980e-04 1.518264e+03
## gmat_tot -5.891153e+00 -33.91633914 -2.499933e+00 -1.611600e+05
## gmat_qpc 6.001979e-01 -11.37186171 6.646346e-01 -3.335823e+04
## gmat_vpc -3.267667e+00 -3.61816529 -2.114569e+00 -5.273852e+03
## gmat_tpc -1.292372e+00 -7.85751718 -4.663244e-01 3.522750e+03
## s_avg -3.223721e-01 0.15926392 -1.671372e-02 2.831601e+03
## f_avg -2.608088e-01 -0.06628700 -6.260260e-03 7.876560e+02
## quarter 1.232119e+00 -0.30866822 3.553381e-02 -9.296214e+03
## work_yrs -3.086682e-01 10.44882490 -2.898318e-02 1.486147e+03
## frstlang 3.553381e-02 -0.02898318 1.035266e-01 -1.419586e+03
## salary -9.296214e+03 1486.14704152 -1.419586e+03 2.596062e+09
## satis -5.227133e-03 -131.24080907 9.484532e+00 -6.347115e+06
## satis
## age -1.763499e+02
## sex -8.780808e+00
## gmat_tot 1.765263e+03
## gmat_qpc 3.348371e+02
## gmat_vpc 3.923563e+02
## gmat_tpc 4.842467e+02
## s_avg -4.628845e+00
## f_avg 2.125329e+00
## quarter -5.227133e-03
## work_yrs -1.312408e+02
## frstlang 9.484532e+00
## salary -6.347115e+06
## satis 1.380974e+05
covarience matrix of given dataset
cov(salary)
## age sex gmat_tot gmat_qpc
## age 1.376904e+01 -4.513248e-02 -3.115879e+01 -1.192655e+01
## sex -4.513248e-02 1.872677e-01 -1.328841e+00 -1.053769e+00
## gmat_tot -3.115879e+01 -1.328841e+00 3.310688e+03 6.200233e+02
## gmat_qpc -1.192655e+01 -1.053769e+00 6.200233e+02 2.210731e+02
## gmat_vpc -2.763643e+00 5.463758e-01 7.260006e+02 3.814826e+01
## gmat_tpc -8.839978e+00 -4.908960e-02 6.839911e+02 1.357997e+02
## s_avg 2.116874e-01 2.096227e-02 2.480257e+00 -1.691233e-01
## f_avg -3.399348e-02 2.082698e-02 3.154688e+00 5.753854e-01
## quarter -2.045935e-01 -6.414267e-02 -5.891153e+00 6.001979e-01
## work_yrs 1.029494e+01 -1.580172e-02 -3.391634e+01 -1.137186e+01
## frstlang 6.796610e-02 2.138980e-04 -2.499933e+00 6.646346e-01
## salary -1.183042e+04 1.518264e+03 -1.611600e+05 -3.335823e+04
## satis -1.763499e+02 -8.780808e+00 1.765263e+03 3.348371e+02
## gmat_vpc gmat_tpc s_avg f_avg
## age -2.7636427 -8.8399775 0.21168739 -0.03399348
## sex 0.5463758 -0.0490896 0.02096227 0.02082698
## gmat_tot 726.0006417 683.9910698 2.48025721 3.15468838
## gmat_qpc 38.1482581 135.7996845 -0.16912329 0.57538542
## gmat_vpc 284.2481217 157.4932488 1.31357023 0.67207000
## gmat_tpc 157.4932488 196.6057057 0.62710008 0.58698618
## s_avg 1.3135702 0.6271001 0.14521760 0.11016898
## f_avg 0.6720700 0.5869862 0.11016898 0.27567237
## quarter -3.2676666 -1.2923719 -0.32237213 -0.26080880
## work_yrs -3.6181653 -7.8575172 0.15926392 -0.06628700
## frstlang -2.1145691 -0.4663244 -0.01671372 -0.00626026
## salary -5273.8523836 3522.7500067 2831.60098580 787.65597177
## satis 392.3562739 484.2466779 -4.62884495 2.12532927
## quarter work_yrs frstlang salary
## age -2.045935e-01 10.29493864 6.796610e-02 -1.183042e+04
## sex -6.414267e-02 -0.01580172 2.138980e-04 1.518264e+03
## gmat_tot -5.891153e+00 -33.91633914 -2.499933e+00 -1.611600e+05
## gmat_qpc 6.001979e-01 -11.37186171 6.646346e-01 -3.335823e+04
## gmat_vpc -3.267667e+00 -3.61816529 -2.114569e+00 -5.273852e+03
## gmat_tpc -1.292372e+00 -7.85751718 -4.663244e-01 3.522750e+03
## s_avg -3.223721e-01 0.15926392 -1.671372e-02 2.831601e+03
## f_avg -2.608088e-01 -0.06628700 -6.260260e-03 7.876560e+02
## quarter 1.232119e+00 -0.30866822 3.553381e-02 -9.296214e+03
## work_yrs -3.086682e-01 10.44882490 -2.898318e-02 1.486147e+03
## frstlang 3.553381e-02 -0.02898318 1.035266e-01 -1.419586e+03
## salary -9.296214e+03 1486.14704152 -1.419586e+03 2.596062e+09
## satis -5.227133e-03 -131.24080907 9.484532e+00 -6.347115e+06
## satis
## age -1.763499e+02
## sex -8.780808e+00
## gmat_tot 1.765263e+03
## gmat_qpc 3.348371e+02
## gmat_vpc 3.923563e+02
## gmat_tpc 4.842467e+02
## s_avg -4.628845e+00
## f_avg 2.125329e+00
## quarter -5.227133e-03
## work_yrs -1.312408e+02
## frstlang 9.484532e+00
## salary -6.347115e+06
## satis 1.380974e+05
subset of people who got job
employed <- subset(salary, salary>998, select = age:satis)
View(employed)
contengency-tables 1)age and salary they get
aggregate(age~salary,data=employed,mean)
## salary age
## 1 999 27.48571
## 2 64000 24.00000
## 3 77000 23.00000
## 4 78256 23.00000
## 5 82000 26.00000
## 6 85000 25.00000
## 7 86000 25.50000
## 8 88000 25.00000
## 9 88500 27.00000
## 10 90000 25.66667
## 11 92000 25.66667
## 12 93000 28.00000
## 13 95000 25.42857
## 14 96000 25.25000
## 15 96500 24.00000
## 16 97000 27.50000
## 17 98000 25.50000
## 18 99000 28.00000
## 19 100000 25.33333
## 20 100400 29.00000
## 21 101000 24.50000
## 22 101100 29.00000
## 23 101600 26.00000
## 24 102500 30.00000
## 25 103000 27.00000
## 26 104000 29.50000
## 27 105000 27.00000
## 28 106000 29.66667
## 29 107000 26.00000
## 30 107300 32.00000
## 31 107500 27.00000
## 32 108000 26.50000
## 33 110000 28.00000
## 34 112000 30.66667
## 35 115000 26.00000
## 36 118000 33.00000
## 37 120000 28.75000
## 38 126710 26.00000
## 39 130000 26.00000
## 40 145800 24.00000
## 41 146000 40.00000
## 42 162000 25.00000
## 43 220000 40.00000
2)gender and salary
aggregate(sex==1~salary,data=employed,mean)
## salary sex == 1
## 1 999 0.8571429
## 2 64000 0.0000000
## 3 77000 1.0000000
## 4 78256 0.0000000
## 5 82000 0.0000000
## 6 85000 0.2500000
## 7 86000 0.0000000
## 8 88000 0.0000000
## 9 88500 1.0000000
## 10 90000 1.0000000
## 11 92000 0.6666667
## 12 93000 0.6666667
## 13 95000 0.5714286
## 14 96000 0.7500000
## 15 96500 1.0000000
## 16 97000 1.0000000
## 17 98000 0.6000000
## 18 99000 0.0000000
## 19 100000 0.4444444
## 20 100400 1.0000000
## 21 101000 0.0000000
## 22 101100 1.0000000
## 23 101600 1.0000000
## 24 102500 1.0000000
## 25 103000 1.0000000
## 26 104000 1.0000000
## 27 105000 1.0000000
## 28 106000 0.6666667
## 29 107000 1.0000000
## 30 107300 1.0000000
## 31 107500 1.0000000
## 32 108000 1.0000000
## 33 110000 0.0000000
## 34 112000 1.0000000
## 35 115000 1.0000000
## 36 118000 1.0000000
## 37 120000 0.7500000
## 38 126710 1.0000000
## 39 130000 1.0000000
## 40 145800 1.0000000
## 41 146000 1.0000000
## 42 162000 1.0000000
## 43 220000 0.0000000
3)GMAT percentile and salary
aggregate(gmat_tpc~salary,data=employed,mean)
## salary gmat_tpc
## 1 999 84.34286
## 2 64000 71.00000
## 3 77000 95.00000
## 4 78256 58.00000
## 5 82000 95.00000
## 6 85000 95.00000
## 7 86000 93.00000
## 8 88000 93.00000
## 9 88500 87.00000
## 10 90000 87.33333
## 11 92000 93.33333
## 12 93000 79.66667
## 13 95000 81.00000
## 14 96000 82.25000
## 15 96500 52.00000
## 16 97000 83.00000
## 17 98000 83.40000
## 18 99000 79.00000
## 19 100000 85.00000
## 20 100400 89.00000
## 21 101000 86.00000
## 22 101100 95.00000
## 23 101600 91.00000
## 24 102500 96.00000
## 25 103000 88.00000
## 26 104000 67.00000
## 27 105000 80.18182
## 28 106000 93.00000
## 29 107000 83.00000
## 30 107300 94.00000
## 31 107500 91.00000
## 32 108000 79.00000
## 33 110000 91.00000
## 34 112000 91.66667
## 35 115000 82.80000
## 36 118000 87.00000
## 37 120000 90.00000
## 38 126710 69.00000
## 39 130000 93.00000
## 40 145800 87.00000
## 41 146000 91.00000
## 42 162000 98.00000
## 43 220000 51.00000
chi-square tests 1)salary and gender
chisq.test(salary,sex)
## Warning in chisq.test(salary, sex): Chi-squared approximation may be
## incorrect
##
## Pearson's Chi-squared test
##
## data: salary
## X-squared = 8451300, df = 3276, p-value < 2.2e-16
2)salary and Age
chisq.test(salary,age)
## Warning in chisq.test(salary, age): Chi-squared approximation may be
## incorrect
##
## Pearson's Chi-squared test
##
## data: salary
## X-squared = 8451300, df = 3276, p-value < 2.2e-16
3)salary and gmat_tot
chisq.test(salary,gmat_tot)
## Warning in chisq.test(salary, gmat_tot): Chi-squared approximation may be
## incorrect
##
## Pearson's Chi-squared test
##
## data: salary
## X-squared = 8451300, df = 3276, p-value < 2.2e-16
4)salary and work years
chisq.test(salary,work_yrs)
## Warning in chisq.test(salary, work_yrs): Chi-squared approximation may be
## incorrect
##
## Pearson's Chi-squared test
##
## data: salary
## X-squared = 8451300, df = 3276, p-value < 2.2e-16
5)salary and satis
chisq.test(salary,satis)
## Warning in chisq.test(salary, satis): Chi-squared approximation may be
## incorrect
##
## Pearson's Chi-squared test
##
## data: salary
## X-squared = 8451300, df = 3276, p-value < 2.2e-16
T-tests 1)salary and gender
t.test(employed$salary,employed$sex)
##
## Welch Two Sample t-test
##
## data: employed$salary and employed$sex
## t = 19.224, df = 137, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 69215.68 85088.04
## sample estimates:
## mean of x mean of y
## 77153.12319 1.26087
2)salary and Age
t.test(employed$salary,employed$age)
##
## Welch Two Sample t-test
##
## data: employed$salary and employed$age
## t = 19.217, df = 137, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 69189.99 85062.35
## sample estimates:
## mean of x mean of y
## 77153.12319 26.95652
3)salary and gmat_tot
t.test(employed$salary,employed$gmat_tot)
##
## Welch Two Sample t-test
##
## data: employed$salary and employed$gmat_tot
## t = 19.07, df = 137, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 68597.66 84470.03
## sample estimates:
## mean of x mean of y
## 77153.1232 619.2754
4)salary and work years
t.test(employed$salary,employed$work_yrs)
##
## Welch Two Sample t-test
##
## data: employed$salary and employed$work_yrs
## t = 19.223, df = 137, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 69213.28 85085.64
## sample estimates:
## mean of x mean of y
## 77153.123188 3.666667
5)salary and satis
t.test(employed$salary,employed$satis)
##
## Welch Two Sample t-test
##
## data: employed$salary and employed$satis
## t = 19.223, df = 137, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 69211.41 85083.77
## sample estimates:
## mean of x mean of y
## 77153.123188 5.528986
Regression analysis model 1
model1<-lm(salary ~ age+sex+gmat_tpc+frstlang+quarter+work_yrs, data=employed)
summary(model1)
##
## Call:
## lm(formula = salary ~ age + sex + gmat_tpc + frstlang + quarter +
## work_yrs, data = employed)
##
## Residuals:
## Min 1Q Median 3Q Max
## -101412 -29018 14532 27276 142207
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 155229 66812 2.323 0.02170 *
## age -1778 2459 -0.723 0.47081
## sex 9460 8977 1.054 0.29387
## gmat_tpc -117 300 -0.390 0.69719
## frstlang -21882 13077 -1.673 0.09666 .
## quarter -9442 3436 -2.748 0.00684 **
## work_yrs 4151 2608 1.591 0.11391
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 44950 on 131 degrees of freedom
## Multiple R-squared: 0.1309, Adjusted R-squared: 0.09105
## F-statistic: 3.287 on 6 and 131 DF, p-value: 0.0048
Regression analysis model 2
model2<-lm(salary ~ gmat_tot+s_avg+f_avg+work_yrs+satis, data=employed)
summary(model2)
##
## Call:
## lm(formula = salary ~ gmat_tot + s_avg + f_avg + work_yrs + satis,
## data = employed)
##
## Residuals:
## Min 1Q Median 3Q Max
## -102471 -20817 8022 25893 123201
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -60752.4 48719.8 -1.247 0.21461
## gmat_tot -123.8 63.0 -1.965 0.05147 .
## s_avg 32579.5 10397.5 3.133 0.00213 **
## f_avg -1093.7 8894.6 -0.123 0.90232
## work_yrs 1060.2 1301.6 0.815 0.41678
## satis 20852.6 2989.6 6.975 1.33e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 38620 on 132 degrees of freedom
## Multiple R-squared: 0.3533, Adjusted R-squared: 0.3288
## F-statistic: 14.42 on 5 and 132 DF, p-value: 2.902e-11
MOdel 1 is more suitable.
COMPARING THOSE WHO GOT A JOB WITH THOSE WHO DID NOT GET A JOB.
subset of unemployed.
unemployed<-subset(salary,salary== 0)
Employed<-employed[1:90,]
chi-square tests 1)age of people with job and without job
chisq.test(Employed$age,unemployed$age)
## Warning in chisq.test(Employed$age, unemployed$age): Chi-squared
## approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: Employed$age and unemployed$age
## X-squared = 277.03, df = 252, p-value = 0.1336
2)work experience of people with job and without job
chisq.test(Employed$work_yrs,unemployed$work_yrs)
## Warning in chisq.test(Employed$work_yrs, unemployed$work_yrs): Chi-squared
## approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: Employed$work_yrs and unemployed$work_yrs
## X-squared = 168.43, df = 176, p-value = 0.6456
3)gender of with job and without job people
chisq.test(Employed$sex,unemployed$sex)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: Employed$sex and unemployed$sex
## X-squared = 0.0035941, df = 1, p-value = 0.9522
4)gmat total scores comparison
chisq.test(Employed$gmat_tot,unemployed$gmat_tot)
## Warning in chisq.test(Employed$gmat_tot, unemployed$gmat_tot): Chi-squared
## approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: Employed$gmat_tot and unemployed$gmat_tot
## X-squared = 482.83, df = 550, p-value = 0.9819
5)gmat pecentile comparison
chisq.test(Employed$gmat_tpc,unemployed$gmat_tpc)
## Warning in chisq.test(Employed$gmat_tpc, unemployed$gmat_tpc): Chi-squared
## approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: Employed$gmat_tpc and unemployed$gmat_tpc
## X-squared = 832.7, df = 812, p-value = 0.2995
6)gmat_vpc,gmat_qpc,s_avg and f_avg
chisq.test(Employed$gmat_vpc,unemployed$gmat_vpc)
## Warning in chisq.test(Employed$gmat_vpc, unemployed$gmat_vpc): Chi-squared
## approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: Employed$gmat_vpc and unemployed$gmat_vpc
## X-squared = 679.89, df = 625, p-value = 0.06324
chisq.test(Employed$gmat_qpc,unemployed$gmat_qpc)
## Warning in chisq.test(Employed$gmat_qpc, unemployed$gmat_qpc): Chi-squared
## approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: Employed$gmat_qpc and unemployed$gmat_qpc
## X-squared = 1417.6, df = 1295, p-value = 0.009397
chisq.test(Employed$s_avg,unemployed$s_avg)
## Warning in chisq.test(Employed$s_avg, unemployed$s_avg): Chi-squared
## approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: Employed$s_avg and unemployed$s_avg
## X-squared = 412.8, df = 390, p-value = 0.2048
chisq.test(Employed$f_avg,unemployed$f_avg)
## Warning in chisq.test(Employed$f_avg, unemployed$f_avg): Chi-squared
## approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: Employed$f_avg and unemployed$f_avg
## X-squared = 240.27, df = 238, p-value = 0.4465
Logistic regression
library(Amelia)
## Loading required package: Rcpp
## ##
## ## Amelia II: Multiple Imputation
## ## (Version 1.7.4, built: 2015-12-05)
## ## Copyright (C) 2005-2017 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##
Obs<-subset(salary, salary != 998 & salary != 999 & salary!=0)
missmap(Obs, main = "Missing values vs observed")
Obs$employed <- ifelse(Obs$salary > 0,1, 0)
splitting dataset
c1<-Obs[1:96,]
c2<-Obs[96:193,]
annova test for c1
nmodel <- glm(formula = employed ~ age+gmat_tpc+gmat_qpc+gmat_vpc+frstlang+quarter, family = binomial(link = "logit"), data = c1)
anova(nmodel,test="Chisq")
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: employed
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 95 0.0000e+00
## age 1 0 94 5.5695e-10 1
## gmat_tpc 1 0 93 5.5695e-10 1
## gmat_qpc 1 0 92 5.5695e-10 1
## gmat_vpc 1 0 91 5.5695e-10 1
## frstlang 1 0 90 5.5695e-10 1
## quarter 1 0 89 5.5695e-10 1
annova test for c2
nmodel <- glm(formula = employed ~ age+gmat_tpc+gmat_qpc+gmat_vpc+frstlang+quarter, family = binomial(link = "logit"), data = c2)
anova(nmodel,test="Chisq")
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: employed
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 7 0.0000e+00
## age 1 0 6 3.4294e-10 1
## gmat_tpc 1 0 5 3.4294e-10 1
## gmat_qpc 1 0 4 3.4294e-10 1
## gmat_vpc 1 0 3 3.4294e-10 1
## frstlang 1 0 2 3.4294e-10 1
## quarter 0 0 2 3.4294e-10