# Load the MBA starting-salaries data and print descriptive statistics for
# every column.
# NOTE(review): setwd() ties the script to one machine's directory layout; it
# is kept so that any relative paths used later resolve as before.
setwd("~/winter internship")
# The file name is a single literal, so it is passed directly instead of being
# wrapped in a paste() call that joined nothing.
sal <- read.csv("MBA Starting Salaries Data.csv")
# View() opens a data viewer, so it is only called in an interactive session.
if (interactive()) {
  View(sal)
}
library(psych)
describe(sal)
## vars n mean sd median trimmed mad min max
## age 1 274 27.36 3.71 27 26.76 2.97 22 48
## sex 2 274 1.25 0.43 1 1.19 0.00 1 2
## gmat_tot 3 274 619.45 57.54 620 618.86 59.30 450 790
## gmat_qpc 4 274 80.64 14.87 83 82.31 14.83 28 99
## gmat_vpc 5 274 78.32 16.86 81 80.33 14.83 16 99
## gmat_tpc 6 274 84.20 14.02 87 86.12 11.86 0 99
## s_avg 7 274 3.03 0.38 3 3.03 0.44 2 4
## f_avg 8 274 3.06 0.53 3 3.09 0.37 0 4
## quarter 9 274 2.48 1.11 2 2.47 1.48 1 4
## work_yrs 10 274 3.87 3.23 3 3.29 1.48 0 22
## frstlang 11 274 1.12 0.32 1 1.02 0.00 1 2
## salary 12 274 39025.69 50951.56 999 33607.86 1481.12 0 220000
## satis 13 274 172.18 371.61 6 91.50 1.48 1 998
## range skew kurtosis se
## age 26 2.16 6.45 0.22
## sex 1 1.16 -0.66 0.03
## gmat_tot 340 -0.01 0.06 3.48
## gmat_qpc 71 -0.92 0.30 0.90
## gmat_vpc 83 -1.04 0.74 1.02
## gmat_tpc 99 -2.28 9.02 0.85
## s_avg 2 -0.06 -0.38 0.02
## f_avg 4 -2.08 10.85 0.03
## quarter 3 0.02 -1.35 0.07
## work_yrs 22 2.78 9.80 0.20
## frstlang 1 2.37 3.65 0.02
## salary 220000 0.70 -1.05 3078.10
## satis 997 1.77 1.13 22.45
# Horizontal box plots of salary, one plot per grouping variable.

# Salary by sex; the extra right-hand axis labels the two sex codes.
boxplot(
  salary ~ sex,
  data = sal,
  horizontal = TRUE,
  las = 1,
  main = "Salary distribution based on gender",
  xlab = "salary",
  ylab = "gender"
)
axis(side = 4, at = c(1, 2), labels = c("male", "female"))

# Salary by quartile ranking.
boxplot(
  salary ~ quarter,
  data = sal,
  horizontal = TRUE,
  las = 1,
  main = "Salary distribution based on quartile ranking",
  xlab = "salary",
  ylab = "quarter"
)

# Salary by years of work experience.
boxplot(
  salary ~ work_yrs,
  data = sal,
  horizontal = TRUE,
  las = 1,
  main = "Salary distribution based on work experience",
  xlab = "salary",
  ylab = "work experience"
)

# Salary by first language; the extra right-hand axis labels the two codes.
boxplot(
  salary ~ frstlang,
  data = sal,
  horizontal = TRUE,
  las = 1,
  main = "Salary distribution based on first language spoken",
  xlab = "salary",
  ylab = "language"
)
axis(side = 4, at = c(1, 2), labels = c("english", "other"))

# Salary by satisfaction rating.
boxplot(
  salary ~ satis,
  data = sal,
  horizontal = TRUE,
  las = 1,
  main = "Salary distribution based on satisfaction",
  xlab = "salary",
  ylab = "salary satisfaction"
)
library(lattice)

# lattice bar charts, plus base bar plots of salary-by-group count tables.

# Satisfaction and first language.
barchart(satis ~ salary, data = sal, col = "orange")
x3 <- table(sal$salary, sal$frstlang)
barplot(
  x3,
  main = "Starting Salary vs language",
  xlab = "language-english or others",
  ylab = "Starting Salary",
  col = "red"
)

# Quartile ranking.
barchart(quarter ~ salary, data = sal, col = "orange")
x4 <- table(sal$salary, sal$quarter)
barplot(
  x4,
  xlab = "quartile perncentile",
  ylab = "Starting Salary count",
  col = "red"
)

# Work experience.
x2 <- table(sal$salary, sal$work_yrs)
barplot(
  x2,
  xlab = "work experience",
  ylab = "Starting Salary count",
  col = "red"
)

# Sex.
x1 <- table(sal$salary, sal$sex)
barplot(
  x1,
  xlab = "Sex",
  ylab = "Starting Salary count",
  col = "red"
)
library(car)
##
## Attaching package: 'car'
## The following object is masked from 'package:psych':
##
## logit
# car scatterplots of salary against each continuous variable.
# car only needs to be attached once for the whole series.
library(car)

scatterplot(
  sal$age, sal$salary,
  main = "scatterplot- Starting Salary vs age ",
  pch = 16
)
scatterplot(
  sal$gmat_qpc, sal$salary,
  main = "scatterplot- Starting Salary vs gmat quantitative percentile",
  pch = 16
)
scatterplot(
  sal$gmat_tpc, sal$salary,
  main = "scatterplot- Starting Salary vs total gmat percentile",
  pch = 16
)
scatterplot(
  sal$gmat_vpc, sal$salary,
  main = "scatterplot- Starting Salary vs verbal gmat percentile ",
  pch = 16
)
scatterplot(
  sal$gmat_tot, sal$salary,
  main = "scatterplot- Starting Salary vs total gmat score ",
  pch = 16
)
scatterplot(
  sal$s_avg, sal$salary,
  main = "scatterplot- Starting Salary vs summer mba average ",
  pch = 16
)
scatterplot(
  sal$f_avg, sal$salary,
  main = "scatterplot- Starting Salary vs fall mba average ",
  pch = 16
)
library(corrgram)

# Correlogram of every column in `sal`, followed by the correlation matrix
# rounded to two decimals.
corrgram(
  sal,
  order = TRUE,
  lower.panel = panel.shade,
  upper.panel = panel.pie,
  text.panel = panel.txt
)
round(cor(sal), digits = 2)
## age sex gmat_tot gmat_qpc gmat_vpc gmat_tpc s_avg f_avg
## age 1.00 -0.03 -0.15 -0.22 -0.04 -0.17 0.15 -0.02
## sex -0.03 1.00 -0.05 -0.16 0.07 -0.01 0.13 0.09
## gmat_tot -0.15 -0.05 1.00 0.72 0.75 0.85 0.11 0.10
## gmat_qpc -0.22 -0.16 0.72 1.00 0.15 0.65 -0.03 0.07
## gmat_vpc -0.04 0.07 0.75 0.15 1.00 0.67 0.20 0.08
## gmat_tpc -0.17 -0.01 0.85 0.65 0.67 1.00 0.12 0.08
## s_avg 0.15 0.13 0.11 -0.03 0.20 0.12 1.00 0.55
## f_avg -0.02 0.09 0.10 0.07 0.08 0.08 0.55 1.00
## quarter -0.05 -0.13 -0.09 0.04 -0.17 -0.08 -0.76 -0.45
## work_yrs 0.86 -0.01 -0.18 -0.24 -0.07 -0.17 0.13 -0.04
## frstlang 0.06 0.00 -0.14 0.14 -0.39 -0.10 -0.14 -0.04
## salary -0.06 0.07 -0.05 -0.04 -0.01 0.00 0.15 0.03
## satis -0.13 -0.05 0.08 0.06 0.06 0.09 -0.03 0.01
## quarter work_yrs frstlang salary satis
## age -0.05 0.86 0.06 -0.06 -0.13
## sex -0.13 -0.01 0.00 0.07 -0.05
## gmat_tot -0.09 -0.18 -0.14 -0.05 0.08
## gmat_qpc 0.04 -0.24 0.14 -0.04 0.06
## gmat_vpc -0.17 -0.07 -0.39 -0.01 0.06
## gmat_tpc -0.08 -0.17 -0.10 0.00 0.09
## s_avg -0.76 0.13 -0.14 0.15 -0.03
## f_avg -0.45 -0.04 -0.04 0.03 0.01
## quarter 1.00 -0.09 0.10 -0.16 0.00
## work_yrs -0.09 1.00 -0.03 0.01 -0.11
## frstlang 0.10 -0.03 1.00 -0.09 0.08
## salary -0.16 0.01 -0.09 1.00 -0.34
## satis 0.00 -0.11 0.08 -0.34 1.00
# Covariance matrix of the continuous columns. `x` and `y` hold the same
# columns, so cov(x, y) is the covariance of that set with itself.
x <- sal[, c(
  "age", "gmat_tot", "gmat_qpc", "gmat_vpc", "gmat_tpc",
  "s_avg", "f_avg", "work_yrs", "salary"
)]
y <- x
cov(x, y)
## age gmat_tot gmat_qpc gmat_vpc
## age 1.376904e+01 -3.115879e+01 -1.192655e+01 -2.763643
## gmat_tot -3.115879e+01 3.310688e+03 6.200233e+02 726.000642
## gmat_qpc -1.192655e+01 6.200233e+02 2.210731e+02 38.148258
## gmat_vpc -2.763643e+00 7.260006e+02 3.814826e+01 284.248122
## gmat_tpc -8.839978e+00 6.839911e+02 1.357997e+02 157.493249
## s_avg 2.116874e-01 2.480257e+00 -1.691233e-01 1.313570
## f_avg -3.399348e-02 3.154688e+00 5.753854e-01 0.672070
## work_yrs 1.029494e+01 -3.391634e+01 -1.137186e+01 -3.618165
## salary -1.183042e+04 -1.611600e+05 -3.335823e+04 -5273.852384
## gmat_tpc s_avg f_avg work_yrs salary
## age -8.8399775 0.2116874 -0.03399348 10.2949386 -1.183042e+04
## gmat_tot 683.9910698 2.4802572 3.15468838 -33.9163391 -1.611600e+05
## gmat_qpc 135.7996845 -0.1691233 0.57538542 -11.3718617 -3.335823e+04
## gmat_vpc 157.4932488 1.3135702 0.67207000 -3.6181653 -5.273852e+03
## gmat_tpc 196.6057057 0.6271001 0.58698618 -7.8575172 3.522750e+03
## s_avg 0.6271001 0.1452176 0.11016898 0.1592639 2.831601e+03
## f_avg 0.5869862 0.1101690 0.27567237 -0.0662870 7.876560e+02
## work_yrs -7.8575172 0.1592639 -0.06628700 10.4488249 1.486147e+03
## salary 3522.7500067 2831.6009858 787.65597177 1486.1470415 2.596062e+09
# Keep only the rows whose salary is an actual amount: 0, 998 and 999 are
# excluded as special codes rather than salaries.
# The codes are matched as numbers with %in%; the previous version compared the
# numeric column against string literals and relied on implicit coercion.
# The is.na() guard keeps the old which()-based behaviour of dropping rows with
# a missing salary.
job <- sal[!is.na(sal$salary) & !(sal$salary %in% c(0, 998, 999)), ]
head(job)
## age sex gmat_tot gmat_qpc gmat_vpc gmat_tpc s_avg f_avg quarter
## 35 22 2 660 90 92 94 3.5 3.75 1
## 36 27 2 700 94 98 98 3.3 3.25 1
## 37 25 2 680 87 96 96 3.5 2.67 1
## 38 25 2 650 82 91 93 3.4 3.25 1
## 39 27 1 710 96 96 98 3.3 3.50 1
## 40 28 2 620 52 98 87 3.4 3.75 1
## work_yrs frstlang salary satis
## 35 1 1 85000 5
## 36 2 1 85000 6
## 37 2 1 86000 5
## 38 3 1 88000 7
## 39 2 1 92000 6
## 40 5 1 93000 5
# View() opens a data viewer, so it is only called in an interactive session.
if (interactive()) {
  View(job)
}
# Pearson chi-squared tests between each variable in `job` and salary.
# Each call is followed by its recorded warning and result.
# NOTE(review): every recorded run warns that the chi-squared approximation may
# be incorrect, so these p-values should be read with caution. Confirm whether
# a test on binned salary, or a correlation/regression, was intended instead.
chisq.test(job$age, job$salary)
## Warning in chisq.test(job$age, job$salary): Chi-squared approximation may
## be incorrect
##
## Pearson's Chi-squared test
##
## data: job$age and job$salary
## X-squared = 717.62, df = 574, p-value = 3.929e-05
chisq.test(job$sex, job$salary)
## Warning in chisq.test(job$sex, job$salary): Chi-squared approximation may
## be incorrect
##
## Pearson's Chi-squared test
##
## data: job$sex and job$salary
## X-squared = 52.681, df = 41, p-value = 0.1045
chisq.test(job$gmat_tot, job$salary)
## Warning in chisq.test(job$gmat_tot, job$salary): Chi-squared approximation
## may be incorrect
##
## Pearson's Chi-squared test
##
## data: job$gmat_tot and job$salary
## X-squared = 927.24, df = 820, p-value = 0.005279
chisq.test(job$gmat_qpc, job$salary)
## Warning in chisq.test(job$gmat_qpc, job$salary): Chi-squared approximation
## may be incorrect
##
## Pearson's Chi-squared test
##
## data: job$gmat_qpc and job$salary
## X-squared = 1464.3, df = 1353, p-value = 0.018
chisq.test(job$gmat_vpc, job$salary)
## Warning in chisq.test(job$gmat_vpc, job$salary): Chi-squared approximation
## may be incorrect
##
## Pearson's Chi-squared test
##
## data: job$gmat_vpc and job$salary
## X-squared = 1183.3, df = 1066, p-value = 0.006802
chisq.test(job$gmat_tpc, job$salary)
## Warning in chisq.test(job$gmat_tpc, job$salary): Chi-squared approximation
## may be incorrect
##
## Pearson's Chi-squared test
##
## data: job$gmat_tpc and job$salary
## X-squared = 1422.2, df = 1230, p-value = 0.0001065
chisq.test(job$s_avg, job$salary)
## Warning in chisq.test(job$s_avg, job$salary): Chi-squared approximation may
## be incorrect
##
## Pearson's Chi-squared test
##
## data: job$s_avg and job$salary
## X-squared = 792.97, df = 861, p-value = 0.9524
chisq.test(job$f_avg, job$salary)
## Warning in chisq.test(job$f_avg, job$salary): Chi-squared approximation may
## be incorrect
##
## Pearson's Chi-squared test
##
## data: job$f_avg and job$salary
## X-squared = 596.28, df = 574, p-value = 0.2518
chisq.test(job$quarter, job$salary)
## Warning in chisq.test(job$quarter, job$salary): Chi-squared approximation
## may be incorrect
##
## Pearson's Chi-squared test
##
## data: job$quarter and job$salary
## X-squared = 129.85, df = 123, p-value = 0.3186
chisq.test(job$satis, job$salary)
## Warning in chisq.test(job$satis, job$salary): Chi-squared approximation may
## be incorrect
##
## Pearson's Chi-squared test
##
## data: job$satis and job$salary
## X-squared = 109.1, df = 164, p-value = 0.9997
chisq.test(job$work_yrs, job$salary)
## Warning in chisq.test(job$work_yrs, job$salary): Chi-squared approximation
## may be incorrect
##
## Pearson's Chi-squared test
##
## data: job$work_yrs and job$salary
## X-squared = 535.23, df = 451, p-value = 0.003809
chisq.test(job$frstlang, job$salary)
## Warning in chisq.test(job$frstlang, job$salary): Chi-squared approximation
## may be incorrect
##
## Pearson's Chi-squared test
##
## data: job$frstlang and job$salary
## X-squared = 69.847, df = 41, p-value = 0.003296
# NOTE(review): same call as the sex test earlier in this series; the recorded
# result is identical.
chisq.test(job$sex, job$salary)
## Warning in chisq.test(job$sex, job$salary): Chi-squared approximation may
## be incorrect
##
## Pearson's Chi-squared test
##
## data: job$sex and job$salary
## X-squared = 52.681, df = 41, p-value = 0.1045
# Two-sample t-tests with each variable in `job` as x and salary as y.
# NOTE(review): the recorded output shows these are Welch tests comparing the
# mean of x with the mean of salary (e.g. 26.78 vs 103030.74 for age), so the
# tiny p-values only reflect that the two columns are on different scales.
# They do not measure association with salary. Confirm whether cor.test() or a
# regression was intended.
t.test(job$age, job$salary)
##
## Welch Two Sample t-test
##
## data: job$age and job$salary
## t = -58.503, df = 102, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -106496.23 -99511.69
## sample estimates:
## mean of x mean of y
## 26.7767 103030.7379
t.test(job$sex, job$salary)
##
## Welch Two Sample t-test
##
## data: job$sex and job$salary
## t = -58.517, df = 102, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -106521.71 -99537.17
## sample estimates:
## mean of x mean of y
## 1.300971e+00 1.030307e+05
t.test(job$gmat_tot, job$salary)
##
## Welch Two Sample t-test
##
## data: job$gmat_tot and job$salary
## t = -58.168, df = 102, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -105907.00 -98922.43
## sample estimates:
## mean of x mean of y
## 616.0194 103030.7379
t.test(job$gmat_qpc, job$salary)
##
## Welch Two Sample t-test
##
## data: job$gmat_qpc and job$salary
## t = -58.473, df = 102, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -106443.28 -99458.74
## sample estimates:
## mean of x mean of y
## 79.72816 103030.73786
t.test(job$gmat_vpc, job$salary)
##
## Welch Two Sample t-test
##
## data: job$gmat_vpc and job$salary
## t = -58.473, df = 102, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -106444.4 -99459.9
## sample estimates:
## mean of x mean of y
## 78.56311 103030.73786
t.test(job$gmat_tpc, job$salary)
##
## Welch Two Sample t-test
##
## data: job$gmat_tpc and job$salary
## t = -58.47, df = 102, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -106438.49 -99453.94
## sample estimates:
## mean of x mean of y
## 84.52427 103030.73786
t.test(job$s_avg, job$salary)
##
## Welch Two Sample t-test
##
## data: job$s_avg and job$salary
## t = -58.516, df = 102, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -106519.92 -99535.37
## sample estimates:
## mean of x mean of y
## 3.09233 103030.73786
t.test(job$f_avg, job$salary)
##
## Welch Two Sample t-test
##
## data: job$f_avg and job$salary
## t = -58.516, df = 102, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -106519.92 -99535.38
## sample estimates:
## mean of x mean of y
## 3.090971e+00 1.030307e+05
t.test(job$quarter, job$salary)
##
## Welch Two Sample t-test
##
## data: job$quarter and job$salary
## t = -58.517, df = 102, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -106520.7 -99536.2
## sample estimates:
## mean of x mean of y
## 2.262136e+00 1.030307e+05
t.test(job$satis, job$salary)
##
## Welch Two Sample t-test
##
## data: job$satis and job$salary
## t = -58.515, df = 102, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -106517.13 -99532.58
## sample estimates:
## mean of x mean of y
## 5.883495e+00 1.030307e+05
t.test(job$work_yrs, job$salary)
##
## Welch Two Sample t-test
##
## data: job$work_yrs and job$salary
## t = -58.516, df = 102, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -106519.33 -99534.79
## sample estimates:
## mean of x mean of y
## 3.679612e+00 1.030307e+05
t.test(job$frstlang, job$salary)
##
## Welch Two Sample t-test
##
## data: job$frstlang and job$salary
## t = -58.517, df = 102, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -106521.9 -99537.4
## sample estimates:
## mean of x mean of y
## 1.067961e+00 1.030307e+05
# NOTE(review): repeats the sex test from earlier in this series; the recorded
# result is identical.
t.test(job$sex, job$salary)
##
## Welch Two Sample t-test
##
## data: job$sex and job$salary
## t = -58.517, df = 102, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -106521.71 -99537.17
## sample estimates:
## mean of x mean of y
## 1.300971e+00 1.030307e+05
# Count table of salary (rows) by sex (columns) for the rows in `job`.
mytable <- xtabs(formula = ~ salary + sex, data = job)
mytable
## sex
## salary 1 2
## 64000 0 1
## 77000 1 0
## 78256 0 1
## 82000 0 1
## 85000 1 3
## 86000 0 2
## 88000 0 1
## 88500 1 0
## 90000 3 0
## 92000 2 1
## 93000 2 1
## 95000 4 3
## 96000 3 1
## 96500 1 0
## 97000 2 0
## 98000 6 4
## 99000 0 1
## 100000 4 5
## 100400 1 0
## 101000 0 2
## 101100 1 0
## 101600 1 0
## 102500 1 0
## 103000 1 0
## 104000 2 0
## 105000 11 0
## 106000 2 1
## 107000 1 0
## 107300 1 0
## 107500 1 0
## 108000 2 0
## 110000 0 1
## 112000 3 0
## 115000 5 0
## 118000 1 0
## 120000 3 1
## 126710 1 0
## 130000 1 0
## 145800 1 0
## 146000 1 0
## 162000 1 0
## 220000 0 1
# Count table of salary (rows) by years of work experience (columns).
mytable1 <- xtabs(formula = ~ salary + work_yrs, data = job)
mytable1
## work_yrs
## salary 0 1 2 3 4 5 6 7 8 10 15 16
## 64000 0 0 1 0 0 0 0 0 0 0 0 0
## 77000 0 0 1 0 0 0 0 0 0 0 0 0
## 78256 0 1 0 0 0 0 0 0 0 0 0 0
## 82000 0 1 0 0 0 0 0 0 0 0 0 0
## 85000 0 1 2 1 0 0 0 0 0 0 0 0
## 86000 0 0 1 1 0 0 0 0 0 0 0 0
## 88000 0 0 0 1 0 0 0 0 0 0 0 0
## 88500 0 0 0 1 0 0 0 0 0 0 0 0
## 90000 0 0 2 0 0 1 0 0 0 0 0 0
## 92000 0 0 3 0 0 0 0 0 0 0 0 0
## 93000 0 0 0 0 1 1 0 0 1 0 0 0
## 95000 1 1 2 2 0 1 0 0 0 0 0 0
## 96000 0 1 2 0 1 0 0 0 0 0 0 0
## 96500 0 0 1 0 0 0 0 0 0 0 0 0
## 97000 0 0 0 1 1 0 0 0 0 0 0 0
## 98000 0 0 7 1 1 0 0 1 0 0 0 0
## 99000 0 0 0 0 0 1 0 0 0 0 0 0
## 100000 0 0 6 1 1 0 1 0 0 0 0 0
## 100400 0 0 0 1 0 0 0 0 0 0 0 0
## 101000 0 0 2 0 0 0 0 0 0 0 0 0
## 101100 0 0 0 0 0 0 0 0 1 0 0 0
## 101600 0 0 0 1 0 0 0 0 0 0 0 0
## 102500 0 0 0 0 0 0 1 0 0 0 0 0
## 103000 0 0 0 1 0 0 0 0 0 0 0 0
## 104000 0 0 0 0 2 0 0 0 0 0 0 0
## 105000 0 0 4 4 0 1 1 0 0 0 0 1
## 106000 0 0 0 0 0 0 2 0 1 0 0 0
## 107000 0 0 1 0 0 0 0 0 0 0 0 0
## 107300 0 0 1 0 0 0 0 0 0 0 0 0
## 107500 0 0 0 1 0 0 0 0 0 0 0 0
## 108000 0 0 0 1 1 0 0 0 0 0 0 0
## 110000 0 0 0 0 0 0 1 0 0 0 0 0
## 112000 0 0 1 0 0 0 1 0 0 0 0 1
## 115000 0 2 0 1 2 0 0 0 0 0 0 0
## 118000 0 0 0 0 0 0 0 0 0 1 0 0
## 120000 0 0 0 1 0 2 0 0 1 0 0 0
## 126710 0 0 0 1 0 0 0 0 0 0 0 0
## 130000 0 0 0 0 1 0 0 0 0 0 0 0
## 145800 0 0 1 0 0 0 0 0 0 0 0 0
## 146000 0 0 0 0 0 0 0 0 0 0 1 0
## 162000 0 1 0 0 0 0 0 0 0 0 0 0
## 220000 0 0 0 0 0 0 0 0 0 0 1 0
# Count table of salary (rows) by first language (columns).
mytable2 <- xtabs(formula = ~ salary + frstlang, data = job)
mytable2
## frstlang
## salary 1 2
## 64000 1 0
## 77000 1 0
## 78256 1 0
## 82000 1 0
## 85000 4 0
## 86000 2 0
## 88000 1 0
## 88500 1 0
## 90000 3 0
## 92000 3 0
## 93000 3 0
## 95000 7 0
## 96000 4 0
## 96500 1 0
## 97000 2 0
## 98000 8 2
## 99000 0 1
## 100000 9 0
## 100400 1 0
## 101000 2 0
## 101100 1 0
## 101600 1 0
## 102500 1 0
## 103000 1 0
## 104000 1 1
## 105000 11 0
## 106000 3 0
## 107000 1 0
## 107300 0 1
## 107500 1 0
## 108000 2 0
## 110000 1 0
## 112000 3 0
## 115000 5 0
## 118000 0 1
## 120000 4 0
## 126710 1 0
## 130000 1 0
## 145800 1 0
## 146000 1 0
## 162000 1 0
## 220000 0 1
# Count table of salary (rows) by total GMAT score (columns).
mytable3 <- xtabs(formula = ~ salary + gmat_tot, data = job)
mytable3
## gmat_tot
## salary 500 520 530 540 550 560 570 580 590 600 610 620 630 640 650 660
## 64000 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
## 77000 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
## 78256 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 82000 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 85000 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1
## 86000 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
## 88000 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0
## 88500 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
## 90000 0 0 0 0 0 0 0 1 0 0 0 0 1 0 1 0
## 92000 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1
## 93000 0 0 0 1 0 0 0 0 0 0 1 1 0 0 0 0
## 95000 0 0 1 0 0 2 0 0 0 0 2 0 0 0 0 0
## 96000 0 0 0 0 0 1 0 0 1 1 0 0 0 0 1 0
## 96500 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 97000 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0
## 98000 0 0 0 0 0 1 3 1 1 0 1 0 0 0 0 0
## 99000 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0
## 100000 0 0 0 0 0 2 0 1 0 1 1 0 1 0 2 0
## 100400 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
## 101000 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0
## 101100 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
## 101600 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
## 102500 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 103000 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
## 104000 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0
## 105000 0 0 0 0 2 0 2 3 0 1 0 1 0 0 1 0
## 106000 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
## 107000 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
## 107300 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
## 107500 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
## 108000 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0
## 110000 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0
## 112000 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
## 115000 0 0 0 1 0 0 1 0 0 0 0 1 1 0 0 0
## 118000 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
## 120000 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0
## 126710 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0
## 130000 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0
## 145800 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
## 146000 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
## 162000 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 220000 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## gmat_tot
## salary 670 680 700 710 720
## 64000 0 0 0 0 0
## 77000 0 0 0 0 0
## 78256 0 0 0 0 0
## 82000 1 0 0 0 0
## 85000 0 0 1 0 1
## 86000 0 1 0 0 0
## 88000 0 0 0 0 0
## 88500 0 0 0 0 0
## 90000 0 0 0 0 0
## 92000 0 0 0 1 0
## 93000 0 0 0 0 0
## 95000 2 0 0 0 0
## 96000 0 0 0 0 0
## 96500 0 0 0 0 0
## 97000 0 0 0 0 0
## 98000 1 1 0 1 0
## 99000 0 0 0 0 0
## 100000 0 0 0 1 0
## 100400 0 0 0 0 0
## 101000 0 0 0 0 0
## 101100 0 0 0 0 0
## 101600 0 0 0 0 0
## 102500 1 0 0 0 0
## 103000 0 0 0 0 0
## 104000 0 0 0 0 0
## 105000 0 1 0 0 0
## 106000 0 2 0 0 0
## 107000 0 0 0 0 0
## 107300 0 0 0 0 0
## 107500 0 0 0 0 0
## 108000 0 0 0 0 0
## 110000 0 0 0 0 0
## 112000 1 1 0 0 0
## 115000 0 0 0 1 0
## 118000 0 0 0 0 0
## 120000 1 0 1 0 0
## 126710 0 0 0 0 0
## 130000 0 0 0 0 0
## 145800 0 0 0 0 0
## 146000 0 0 0 0 0
## 162000 0 0 1 0 0
## 220000 0 0 0 0 0
# Count table of salary (rows) by age (columns).
mytable4 <- xtabs(formula = ~ salary + age, data = job)
mytable4
## age
## salary 22 23 24 25 26 27 28 29 30 31 32 33 34 39 40
## 64000 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0
## 77000 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0
## 78256 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0
## 82000 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
## 85000 1 0 0 1 1 1 0 0 0 0 0 0 0 0 0
## 86000 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0
## 88000 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0
## 88500 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0
## 90000 0 0 0 2 0 1 0 0 0 0 0 0 0 0 0
## 92000 0 0 0 2 0 1 0 0 0 0 0 0 0 0 0
## 93000 0 0 0 1 0 0 1 0 0 1 0 0 0 0 0
## 95000 0 0 1 5 0 0 0 1 0 0 0 0 0 0 0
## 96000 0 0 1 1 2 0 0 0 0 0 0 0 0 0 0
## 96500 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0
## 97000 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0
## 98000 0 1 3 2 1 1 1 1 0 0 0 0 0 0 0
## 99000 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0
## 100000 0 1 4 1 1 1 0 0 0 1 0 0 0 0 0
## 100400 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0
## 101000 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0
## 101100 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0
## 101600 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
## 102500 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
## 103000 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0
## 104000 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0
## 105000 0 1 1 2 3 1 0 0 1 1 0 0 1 0 0
## 106000 0 0 0 0 0 0 0 1 2 0 0 0 0 0 0
## 107000 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
## 107300 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
## 107500 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0
## 108000 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0
## 110000 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0
## 112000 0 0 1 0 0 0 0 1 0 0 0 0 0 1 0
## 115000 0 0 1 1 0 3 0 0 0 0 0 0 0 0 0
## 118000 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
## 120000 0 0 0 0 0 1 1 0 2 0 0 0 0 0 0
## 126710 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
## 130000 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
## 145800 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0
## 146000 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
## 162000 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0
## 220000 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
# Count table of salary (rows) by satisfaction rating (columns).
mytable5 <- xtabs(formula = ~ salary + satis, data = job)
mytable5
## satis
## salary 3 4 5 6 7
## 64000 0 0 0 0 1
## 77000 0 0 0 1 0
## 78256 0 0 1 0 0
## 82000 0 0 0 0 1
## 85000 0 0 1 3 0
## 86000 0 0 2 0 0
## 88000 0 0 0 0 1
## 88500 0 0 0 1 0
## 90000 0 0 2 0 1
## 92000 0 0 1 1 1
## 93000 0 0 1 2 0
## 95000 1 1 1 2 2
## 96000 0 0 1 1 2
## 96500 0 0 0 1 0
## 97000 0 0 0 1 1
## 98000 0 0 2 5 3
## 99000 0 0 0 1 0
## 100000 0 0 1 6 2
## 100400 0 0 0 0 1
## 101000 0 0 1 1 0
## 101100 0 0 0 1 0
## 101600 0 0 0 1 0
## 102500 0 0 1 0 0
## 103000 0 0 0 1 0
## 104000 0 0 1 1 0
## 105000 0 0 4 6 1
## 106000 0 0 0 2 1
## 107000 0 0 1 0 0
## 107300 0 0 0 0 1
## 107500 0 0 1 0 0
## 108000 0 0 0 2 0
## 110000 0 0 1 0 0
## 112000 0 0 0 2 1
## 115000 0 0 3 2 0
## 118000 0 0 0 0 1
## 120000 0 0 2 2 0
## 126710 0 0 0 1 0
## 130000 0 0 0 0 1
## 145800 0 0 0 1 0
## 146000 0 0 0 1 0
## 162000 0 0 1 0 0
## 220000 0 0 0 1 0
# Model 1: regress salary on all twelve candidate predictors.
# Linear regression of salary on all twelve predictors, fitted on `job`.
# Predictor order is kept so the coefficient table lists terms as before.
fit <- lm(
  formula = salary ~ gmat_tot + gmat_qpc + gmat_vpc + gmat_tpc + age + sex +
    quarter + frstlang + work_yrs + satis + f_avg + s_avg,
  data = job
)
summary(fit)
##
## Call:
## lm(formula = salary ~ gmat_tot + gmat_qpc + gmat_vpc + gmat_tpc +
## age + sex + quarter + frstlang + work_yrs + satis + f_avg +
## s_avg, data = job)
##
## Residuals:
## Min 1Q Median 3Q Max
## -26489 -7983 -373 5923 70602
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 78005.66 52981.93 1.472 0.1444
## gmat_tot 16.19 178.85 0.090 0.9281
## gmat_qpc 796.55 496.78 1.603 0.1123
## gmat_vpc 546.31 501.97 1.088 0.2794
## gmat_tpc -1457.09 714.94 -2.038 0.0445 *
## age 1750.65 1130.92 1.548 0.1251
## sex -3584.07 3595.85 -0.997 0.3216
## quarter -2336.56 2721.89 -0.858 0.3929
## frstlang 7719.42 7373.27 1.047 0.2979
## work_yrs 749.66 1135.90 0.660 0.5110
## satis -1086.54 2157.76 -0.504 0.6158
## f_avg -2222.82 3894.57 -0.571 0.5696
## s_avg -931.53 8240.31 -0.113 0.9102
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15430 on 90 degrees of freedom
## Multiple R-squared: 0.3422, Adjusted R-squared: 0.2545
## F-statistic: 3.902 on 12 and 90 DF, p-value: 8.086e-05
# Model 2: Model 1 without age and work_yrs.
# Linear regression of salary on the GMAT scores, sex, quarter, first
# language, satisfaction and the two MBA averages, fitted on `job`.
fit <- lm(
  formula = salary ~ gmat_tot + gmat_qpc + gmat_vpc + gmat_tpc + sex +
    quarter + frstlang + satis + f_avg + s_avg,
  data = job
)
summary(fit)
##
## Call:
## lm(formula = salary ~ gmat_tot + gmat_qpc + gmat_vpc + gmat_tpc +
## sex + quarter + frstlang + satis + f_avg + s_avg, data = job)
##
## Residuals:
## Min 1Q Median 3Q Max
## -31452 -7909 -1321 6283 93310
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 109064.86 52356.52 2.083 0.04001 *
## gmat_tot 53.22 193.43 0.275 0.78384
## gmat_qpc 727.40 537.14 1.354 0.17899
## gmat_vpc 606.30 541.39 1.120 0.26567
## gmat_tpc -1702.39 770.31 -2.210 0.02959 *
## sex -6306.25 3784.14 -1.666 0.09902 .
## quarter -2598.15 2946.61 -0.882 0.38021
## frstlang 18737.05 6970.54 2.688 0.00853 **
## satis -341.23 2327.82 -0.147 0.88378
## f_avg -7546.95 3989.74 -1.892 0.06169 .
## s_avg 6097.52 8737.71 0.698 0.48704
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 16710 on 92 degrees of freedom
## Multiple R-squared: 0.2115, Adjusted R-squared: 0.1258
## F-statistic: 2.468 on 10 and 92 DF, p-value: 0.01156
# Model 3: Model 1 without f_avg and s_avg.
# Linear regression of salary on the GMAT scores, age, sex, quarter, first
# language, work experience and satisfaction, fitted on `job`.
fit <- lm(
  formula = salary ~ gmat_tot + gmat_qpc + gmat_tpc + gmat_vpc + age + sex +
    quarter + frstlang + work_yrs + satis,
  data = job
)
summary(fit)
##
## Call:
## lm(formula = salary ~ gmat_tot + gmat_qpc + gmat_tpc + gmat_vpc +
## age + sex + quarter + frstlang + work_yrs + satis, data = job)
##
## Residuals:
## Min 1Q Median 3Q Max
## -27095 -8075 -265 5590 70231
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 74344.83 51260.45 1.450 0.1504
## gmat_tot -14.53 164.63 -0.088 0.9299
## gmat_qpc 834.28 479.73 1.739 0.0854 .
## gmat_tpc -1406.80 703.65 -1.999 0.0485 *
## gmat_vpc 594.88 481.23 1.236 0.2195
## age 1796.21 1111.80 1.616 0.1096
## sex -3888.24 3511.97 -1.107 0.2711
## quarter -1614.07 1443.38 -1.118 0.2664
## frstlang 7745.73 7167.62 1.081 0.2827
## work_yrs 796.13 1122.85 0.709 0.4801
## satis -1240.34 2107.59 -0.589 0.5576
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15290 on 92 degrees of freedom
## Multiple R-squared: 0.3395, Adjusted R-squared: 0.2677
## F-statistic: 4.73 on 10 and 92 DF, p-value: 1.877e-05
# Model 4: Model 3 without work_yrs.
# Linear regression of salary on the GMAT scores, age, sex, quarter, first
# language and satisfaction, fitted on `job`.
fit <- lm(
  formula = salary ~ gmat_tot + gmat_qpc + gmat_tpc + gmat_vpc + age + sex +
    quarter + frstlang + satis,
  data = job
)
summary(fit)
##
## Call:
## lm(formula = salary ~ gmat_tot + gmat_qpc + gmat_tpc + gmat_vpc +
## age + sex + quarter + frstlang + satis, data = job)
##
## Residuals:
## Min 1Q Median 3Q Max
## -23910 -8097 -938 5154 71005
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 60845.48 47466.11 1.282 0.2031
## gmat_tot -14.14 164.19 -0.086 0.9315
## gmat_qpc 818.90 477.95 1.713 0.0900 .
## gmat_tpc -1380.08 700.76 -1.969 0.0519 .
## gmat_vpc 568.98 478.55 1.189 0.2375
## age 2488.70 529.81 4.697 9.08e-06 ***
## sex -3593.73 3477.98 -1.033 0.3042
## quarter -1610.02 1439.50 -1.118 0.2663
## frstlang 6285.79 6847.12 0.918 0.3610
## satis -1270.86 2101.51 -0.605 0.5468
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15250 on 93 degrees of freedom
## Multiple R-squared: 0.3359, Adjusted R-squared: 0.2717
## F-statistic: 5.227 on 9 and 93 DF, p-value: 9.591e-06
# Model 5: Model 4 without gmat_tot, frstlang and satis.
# Linear regression of salary on the three GMAT percentiles, age, sex and
# quarter, fitted on `job`.
fit <- lm(
  formula = salary ~ gmat_qpc + gmat_tpc + gmat_vpc + age + sex + quarter,
  data = job
)
summary(fit)
##
## Call:
## lm(formula = salary ~ gmat_qpc + gmat_tpc + gmat_vpc + age +
## sex + quarter, data = job)
##
## Residuals:
## Min 1Q Median 3Q Max
## -25414 -7424 -895 5108 72791
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 53235.7 20930.8 2.543 0.0126 *
## gmat_qpc 836.3 349.0 2.396 0.0185 *
## gmat_tpc -1449.3 681.8 -2.126 0.0361 *
## gmat_vpc 537.3 350.9 1.531 0.1290
## age 2647.9 475.8 5.565 2.37e-07 ***
## sex -2889.2 3340.4 -0.865 0.3892
## quarter -1651.4 1361.4 -1.213 0.2281
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15100 on 96 degrees of freedom
## Multiple R-squared: 0.3279, Adjusted R-squared: 0.2859
## F-statistic: 7.807 on 6 and 96 DF, p-value: 7.449e-07
# In Model 5, age, gmat_tpc and gmat_qpc are statistically significant at the
# 5% level; sex, quarter and gmat_vpc are not.
# Recode salary in place into a two-level placement label: values above 0
# become "placed", everything else " not placed" (leading space kept as in the
# original label).
# The recode is a single ifelse() on the numeric column. The previous two-step
# version turned the column into character with its first assignment, so its
# second test (`< 1`) was a string comparison that depended on collation order.
# NOTE(review): the 998/999 codes that are excluded when `job` is built are
# greater than 0 and therefore end up in "placed" here — confirm that is
# intended.
sal$salary <- ifelse(sal$salary > 0, "placed", " not placed")

# Sex-by-placement count table.
table1 <- xtabs(~ sex + salary, data = sal)
table1
## salary
## sex not placed placed
## 1 67 139
## 2 23 45
# Chi-squared test of independence on the sex-by-placement count table.
chisq.test(table1)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: table1
## X-squared = 0.0023919, df = 1, p-value = 0.961
# Placement does not appear to depend on sex (p = 0.961).
# Age-by-placement count table.
table2 <- xtabs(formula = ~ age + salary, data = sal)
table2
## salary
## age not placed placed
## 22 1 1
## 23 3 5
## 24 13 20
## 25 9 44
## 26 10 30
## 27 14 32
## 28 6 15
## 29 11 11
## 30 2 10
## 31 2 8
## 32 5 3
## 33 0 1
## 34 3 1
## 35 3 0
## 36 2 0
## 37 1 0
## 39 1 1
## 40 0 2
## 42 1 0
## 43 2 0
## 48 1 0
# Chi-squared test of independence on the age-by-placement count table.
chisq.test(table2)
## Warning in chisq.test(table2): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: table2
## X-squared = 42.144, df = 20, p-value = 0.002649
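# Placement appears to depend on age (p = 0.0026), although the warning above
# means the chi-squared approximation may be unreliable for this table.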
# Chi-squared test between placement status (the recoded salary column) and
# total GMAT score.
chisq.test(sal$salary,sal$gmat_tot)
## Warning in chisq.test(sal$salary, sal$gmat_tot): Chi-squared approximation
## may be incorrect
##
## Pearson's Chi-squared test
##
## data: sal$salary and sal$gmat_tot
## X-squared = 33.128, df = 30, p-value = 0.317
# Placement does not appear to depend on total GMAT score (p = 0.317).
# Chi-squared test between placement status (the recoded salary column) and
# years of work experience.
chisq.test(sal$salary,sal$work_yrs)
## Warning in chisq.test(sal$salary, sal$work_yrs): Chi-squared approximation
## may be incorrect
##
## Pearson's Chi-squared test
##
## data: sal$salary and sal$work_yrs
## X-squared = 35.953, df = 17, p-value = 0.004653
# Placement appears to depend on years of work experience (p = 0.0047).
# Satisfaction-by-placement count table.
table3 <- xtabs(formula = ~ satis + salary, data = sal)
table3
## salary
## satis not placed placed
## 1 0 1
## 2 0 1
## 3 0 5
## 4 4 13
## 5 36 38
## 6 40 57
## 7 10 23
## 998 0 46
# Chi-squared test between placement status (the recoded salary column) and
# satisfaction rating.
# NOTE(review): the recorded table above includes a satis value of 998,
# presumably a non-response code — confirm whether it should be excluded
# before testing.
chisq.test(sal$salary,sal$satis)
## Warning in chisq.test(sal$salary, sal$satis): Chi-squared approximation may
## be incorrect
##
## Pearson's Chi-squared test
##
## data: sal$salary and sal$satis
## X-squared = 38.163, df = 7, p-value = 2.822e-06
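# Placement appears to depend on satisfaction level (p = 2.8e-06). Note that
# the satis = 998 group (46 rows, all labelled placed) is part of this test.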
# Quarter-by-placement count table.
table4 <- xtabs(formula = ~ quarter + salary, data = sal)
table4
## salary
## quarter not placed placed
## 1 18 51
## 2 27 43
## 3 23 47
## 4 22 43
# Chi-squared test between placement status (the recoded salary column) and
# overall GMAT percentile.
# NOTE(review): the table built just above is quarter by placement, but this
# test uses gmat_tpc; no test on quarter is visible — confirm whether
# chisq.test(table4) was also intended.
chisq.test(sal$salary,sal$gmat_tpc)
## Warning in chisq.test(sal$salary, sal$gmat_tpc): Chi-squared approximation
## may be incorrect
##
## Pearson's Chi-squared test
##
## data: sal$salary and sal$gmat_tpc
## X-squared = 41.21, df = 41, p-value = 0.4614
# Placement does not appear to depend on overall GMAT percentile (p = 0.461).
# First-language-by-placement count table.
table5 <- xtabs(formula = ~ frstlang + salary, data = sal)
table5
## salary
## frstlang not placed placed
## 1 82 160
## 2 8 24
# Chi-squared test of independence on the first-language-by-placement table.
chisq.test(table5)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: table5
## X-squared = 0.64868, df = 1, p-value = 0.4206
# Placement does not appear to depend on first language (p = 0.421).
library(Amelia)
## Loading required package: Rcpp
## ##
## ## Amelia II: Multiple Imputation
## ## (Version 1.7.4, built: 2015-12-05)
## ## Copyright (C) 2005-2017 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##
# Map of missing versus observed values in `sal`.
missmap(sal, main = "Missing values vs observed")

# Build the data for the logistic regression: drop the 998/999 salary codes,
# turn salary into a 0/1 placement indicator, then split into train and test.
#
# Both exclusions are now one row condition. The previous call passed the
# second condition as subset()'s third argument, which is `select` (column
# selection), so it never filtered rows. %in% matches the codes whether salary
# is numeric or character, and the is.na() guard keeps subset()'s behaviour of
# dropping rows with a missing salary.
# NOTE(review): salary has already been recoded to "placed" / " not placed"
# earlier in this script, so no 998/999 values remain and this filter removes
# no rows at this point. To really exclude them, filter before the recode.
sal1 <- subset(sal, !is.na(salary) & !(salary %in% c("998", "999")))

# 1 = placed, 0 = not placed. The label is matched explicitly when salary is
# character, instead of comparing strings against numbers.
sal1$salary <- as.numeric(
  if (is.character(sal1$salary)) sal1$salary == "placed" else sal1$salary > 0
)

# NOTE(review): the split takes rows in file order rather than sampling, and
# rows after 228 are used in neither set — confirm that this is intended.
train <- sal1[1:128, ]
test <- sal1[129:228, ]
# Logistic regression of the placement indicator on every other column of the
# training set.
x <- glm(
  formula = salary ~ .,
  family = binomial(link = "logit"),
  data = train
)
summary(x)
##
## Call:
## glm(formula = salary ~ ., family = binomial(link = "logit"),
## data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.00823 -1.01971 0.03027 0.90522 1.56385
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.073601 8.425423 0.127 0.899
## age -0.158356 0.119846 -1.321 0.186
## sex 0.093349 0.476485 0.196 0.845
## gmat_tot -0.000669 0.016705 -0.040 0.968
## gmat_qpc 0.010828 0.045364 0.239 0.811
## gmat_vpc 0.010181 0.043432 0.234 0.815
## gmat_tpc -0.016722 0.032506 -0.514 0.607
## s_avg 1.519797 1.692086 0.898 0.369
## f_avg -0.296257 0.417324 -0.710 0.478
## quarter -0.702429 0.770050 -0.912 0.362
## work_yrs 0.038171 0.140488 0.272 0.786
## frstlang 0.425971 0.825812 0.516 0.606
## satis 0.007840 0.012721 0.616 0.538
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 165.99 on 127 degrees of freedom
## Residual deviance: 130.04 on 115 degrees of freedom
## AIC: 156.04
##
## Number of Fisher Scoring iterations: 10
# Predict placement probabilities for the test rows, classify at a 0.5
# cut-off, and print the share of test rows classified correctly.
fitted.results <- predict(x, newdata = test, type = "response")
fitted.results <- ifelse(fitted.results > 0.5, 1, 0)
misClasificError <- mean(test$salary != fitted.results)
print(paste("Accuracy", 1 - misClasificError))
## [1] "Accuracy 0.52"
library(ROCR)
## Loading required package: gplots
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
# ROC curve for the test rows: true-positive rate against false-positive
# rate, computed with ROCR from the predicted probabilities.
p <- predict(x, newdata = test, type = "response")
pr <- prediction(predictions = p, labels = test$salary)
prf <- performance(pr, measure = "tpr", x.measure = "fpr")
plot(prf)