Summary statistics:

# Per-column descriptive statistics for mba (n, mean, sd, median, trimmed
# mean, mad, min, max, range, skew, kurtosis, se), as printed below.
describe(x = mba)
##          vars   n     mean       sd median  trimmed     mad min    max
## age         1 274    27.36     3.71     27    26.76    2.97  22     48
## sex         2 274     1.25     0.43      1     1.19    0.00   1      2
## gmat_tot    3 274   619.45    57.54    620   618.86   59.30 450    790
## gmat_qpc    4 274    80.64    14.87     83    82.31   14.83  28     99
## gmat_vpc    5 274    78.32    16.86     81    80.33   14.83  16     99
## gmat_tpc    6 274    84.20    14.02     87    86.12   11.86   0     99
## s_avg       7 274     3.03     0.38      3     3.03    0.44   2      4
## f_avg       8 274     3.06     0.53      3     3.09    0.37   0      4
## quarter     9 274     2.48     1.11      2     2.47    1.48   1      4
## work_yrs   10 274     3.87     3.23      3     3.29    1.48   0     22
## frstlang   11 274     1.12     0.32      1     1.02    0.00   1      2
## salary     12 274 39025.69 50951.56    999 33607.86 1481.12   0 220000
## satis      13 274   172.18   371.61      6    91.50    1.48   1    998
##           range  skew kurtosis      se
## age          26  2.16     6.45    0.22
## sex           1  1.16    -0.66    0.03
## gmat_tot    340 -0.01     0.06    3.48
## gmat_qpc     71 -0.92     0.30    0.90
## gmat_vpc     83 -1.04     0.74    1.02
## gmat_tpc     99 -2.28     9.02    0.85
## s_avg         2 -0.06    -0.38    0.02
## f_avg         4 -2.08    10.85    0.03
## quarter       3  0.02    -1.35    0.07
## work_yrs     22  2.78     9.80    0.20
## frstlang      1  2.37     3.65    0.02
## salary   220000  0.70    -1.05 3078.10
## satis       997  1.77     1.13   22.45
# Split the respondents by reported salary.
# notplaced: rows whose salary is exactly 0.
notplaced <- mba[mba$salary == 0, ]
# placed: rows that carry an actual salary figure. 998 and 999 are excluded
# along with 0 because the later filters in this file also drop them
# (presumably survey placeholder codes -- confirm against the data dictionary).
placed <- mba[!(mba$salary %in% c(0, 998, 999)), ]

Some plots to visualize the distribution of each variable independently:

# Keep only rows with a real salary figure: 0, 998 and 999 are filtered out
# (presumably placeholder codes rather than amounts -- confirm).
placed <- mba[!(mba$salary %in% c(0, 998, 999)), ]

# Two panels side by side; titles and axis labels make them distinguishable.
par(mfrow = c(1, 2))
boxplot(placed$salary, main = "Salary (placed)", ylab = "salary")
boxplot(placed$age, main = "Age (placed)", ylab = "age")

Scatter plots to understand how the variables are correlated pairwise:

# 2 x 2 grid of pairwise scatter plots. The first argument of each plot() call
# is drawn on the x axis, the second on the y axis. Explicit labels replace
# the default "placed$salary"-style axis text, and each panel gets a title.
par(mfrow = c(2, 2))
plot(placed$salary, placed$work_yrs,
  xlab = "salary", ylab = "work_yrs", main = "Placed: work_yrs vs salary"
)
plot(placed$salary, placed$gmat_tot,
  xlab = "salary", ylab = "gmat_tot", main = "Placed: gmat_tot vs salary"
)
plot(notplaced$age, notplaced$work_yrs,
  xlab = "age", ylab = "work_yrs", main = "Not placed: work_yrs vs age"
)
plot(notplaced$sex, notplaced$gmat_tot,
  xlab = "sex", ylab = "gmat_tot", main = "Not placed: gmat_tot vs sex"
)

Corrgram:

# Correlogram of every column in mba: shaded cells below the diagonal, pie
# glyphs above it, and each variable's min/max on the diagonal.
# The title now names this dataset; the previous "Premium vs Economy" text
# did not describe the mba data.
# NOTE(review): mba is passed unfiltered, so the 998/999 values that other
# parts of this file filter out are included in these correlations.
corrgram(mba,
  order = TRUE,
  lower.panel = panel.shade,
  upper.panel = panel.pie,
  text.panel = panel.txt,
  diag.panel = panel.minmax,
  main = "Corrgram of MBA starting salary variables"
)

Variance-Covariance Matrix:

"variance"
## [1] "variance"
# var() applied to the whole data frame prints the full matrix of variances
# (diagonal) and covariances (off-diagonal) shown below.
var(x = mba)
##                    age           sex      gmat_tot      gmat_qpc
## age       1.376904e+01 -4.513248e-02 -3.115879e+01 -1.192655e+01
## sex      -4.513248e-02  1.872677e-01 -1.328841e+00 -1.053769e+00
## gmat_tot -3.115879e+01 -1.328841e+00  3.310688e+03  6.200233e+02
## gmat_qpc -1.192655e+01 -1.053769e+00  6.200233e+02  2.210731e+02
## gmat_vpc -2.763643e+00  5.463758e-01  7.260006e+02  3.814826e+01
## gmat_tpc -8.839978e+00 -4.908960e-02  6.839911e+02  1.357997e+02
## s_avg     2.116874e-01  2.096227e-02  2.480257e+00 -1.691233e-01
## f_avg    -3.399348e-02  2.082698e-02  3.154688e+00  5.753854e-01
## quarter  -2.045935e-01 -6.414267e-02 -5.891153e+00  6.001979e-01
## work_yrs  1.029494e+01 -1.580172e-02 -3.391634e+01 -1.137186e+01
## frstlang  6.796610e-02  2.138980e-04 -2.499933e+00  6.646346e-01
## salary   -1.183042e+04  1.518264e+03 -1.611600e+05 -3.335823e+04
## satis    -1.763499e+02 -8.780808e+00  1.765263e+03  3.348371e+02
##               gmat_vpc     gmat_tpc         s_avg        f_avg
## age         -2.7636427   -8.8399775    0.21168739  -0.03399348
## sex          0.5463758   -0.0490896    0.02096227   0.02082698
## gmat_tot   726.0006417  683.9910698    2.48025721   3.15468838
## gmat_qpc    38.1482581  135.7996845   -0.16912329   0.57538542
## gmat_vpc   284.2481217  157.4932488    1.31357023   0.67207000
## gmat_tpc   157.4932488  196.6057057    0.62710008   0.58698618
## s_avg        1.3135702    0.6271001    0.14521760   0.11016898
## f_avg        0.6720700    0.5869862    0.11016898   0.27567237
## quarter     -3.2676666   -1.2923719   -0.32237213  -0.26080880
## work_yrs    -3.6181653   -7.8575172    0.15926392  -0.06628700
## frstlang    -2.1145691   -0.4663244   -0.01671372  -0.00626026
## salary   -5273.8523836 3522.7500067 2831.60098580 787.65597177
## satis      392.3562739  484.2466779   -4.62884495   2.12532927
##                quarter      work_yrs      frstlang        salary
## age      -2.045935e-01   10.29493864  6.796610e-02 -1.183042e+04
## sex      -6.414267e-02   -0.01580172  2.138980e-04  1.518264e+03
## gmat_tot -5.891153e+00  -33.91633914 -2.499933e+00 -1.611600e+05
## gmat_qpc  6.001979e-01  -11.37186171  6.646346e-01 -3.335823e+04
## gmat_vpc -3.267667e+00   -3.61816529 -2.114569e+00 -5.273852e+03
## gmat_tpc -1.292372e+00   -7.85751718 -4.663244e-01  3.522750e+03
## s_avg    -3.223721e-01    0.15926392 -1.671372e-02  2.831601e+03
## f_avg    -2.608088e-01   -0.06628700 -6.260260e-03  7.876560e+02
## quarter   1.232119e+00   -0.30866822  3.553381e-02 -9.296214e+03
## work_yrs -3.086682e-01   10.44882490 -2.898318e-02  1.486147e+03
## frstlang  3.553381e-02   -0.02898318  1.035266e-01 -1.419586e+03
## salary   -9.296214e+03 1486.14704152 -1.419586e+03  2.596062e+09
## satis    -5.227133e-03 -131.24080907  9.484532e+00 -6.347115e+06
##                  satis
## age      -1.763499e+02
## sex      -8.780808e+00
## gmat_tot  1.765263e+03
## gmat_qpc  3.348371e+02
## gmat_vpc  3.923563e+02
## gmat_tpc  4.842467e+02
## s_avg    -4.628845e+00
## f_avg     2.125329e+00
## quarter  -5.227133e-03
## work_yrs -1.312408e+02
## frstlang  9.484532e+00
## salary   -6.347115e+06
## satis     1.380974e+05
"covariance"
## [1] "covariance"
# Covariance matrix of all mba columns; the printed result matches the
# var(mba) matrix above entry for entry.
cov(x = mba)
##                    age           sex      gmat_tot      gmat_qpc
## age       1.376904e+01 -4.513248e-02 -3.115879e+01 -1.192655e+01
## sex      -4.513248e-02  1.872677e-01 -1.328841e+00 -1.053769e+00
## gmat_tot -3.115879e+01 -1.328841e+00  3.310688e+03  6.200233e+02
## gmat_qpc -1.192655e+01 -1.053769e+00  6.200233e+02  2.210731e+02
## gmat_vpc -2.763643e+00  5.463758e-01  7.260006e+02  3.814826e+01
## gmat_tpc -8.839978e+00 -4.908960e-02  6.839911e+02  1.357997e+02
## s_avg     2.116874e-01  2.096227e-02  2.480257e+00 -1.691233e-01
## f_avg    -3.399348e-02  2.082698e-02  3.154688e+00  5.753854e-01
## quarter  -2.045935e-01 -6.414267e-02 -5.891153e+00  6.001979e-01
## work_yrs  1.029494e+01 -1.580172e-02 -3.391634e+01 -1.137186e+01
## frstlang  6.796610e-02  2.138980e-04 -2.499933e+00  6.646346e-01
## salary   -1.183042e+04  1.518264e+03 -1.611600e+05 -3.335823e+04
## satis    -1.763499e+02 -8.780808e+00  1.765263e+03  3.348371e+02
##               gmat_vpc     gmat_tpc         s_avg        f_avg
## age         -2.7636427   -8.8399775    0.21168739  -0.03399348
## sex          0.5463758   -0.0490896    0.02096227   0.02082698
## gmat_tot   726.0006417  683.9910698    2.48025721   3.15468838
## gmat_qpc    38.1482581  135.7996845   -0.16912329   0.57538542
## gmat_vpc   284.2481217  157.4932488    1.31357023   0.67207000
## gmat_tpc   157.4932488  196.6057057    0.62710008   0.58698618
## s_avg        1.3135702    0.6271001    0.14521760   0.11016898
## f_avg        0.6720700    0.5869862    0.11016898   0.27567237
## quarter     -3.2676666   -1.2923719   -0.32237213  -0.26080880
## work_yrs    -3.6181653   -7.8575172    0.15926392  -0.06628700
## frstlang    -2.1145691   -0.4663244   -0.01671372  -0.00626026
## salary   -5273.8523836 3522.7500067 2831.60098580 787.65597177
## satis      392.3562739  484.2466779   -4.62884495   2.12532927
##                quarter      work_yrs      frstlang        salary
## age      -2.045935e-01   10.29493864  6.796610e-02 -1.183042e+04
## sex      -6.414267e-02   -0.01580172  2.138980e-04  1.518264e+03
## gmat_tot -5.891153e+00  -33.91633914 -2.499933e+00 -1.611600e+05
## gmat_qpc  6.001979e-01  -11.37186171  6.646346e-01 -3.335823e+04
## gmat_vpc -3.267667e+00   -3.61816529 -2.114569e+00 -5.273852e+03
## gmat_tpc -1.292372e+00   -7.85751718 -4.663244e-01  3.522750e+03
## s_avg    -3.223721e-01    0.15926392 -1.671372e-02  2.831601e+03
## f_avg    -2.608088e-01   -0.06628700 -6.260260e-03  7.876560e+02
## quarter   1.232119e+00   -0.30866822  3.553381e-02 -9.296214e+03
## work_yrs -3.086682e-01   10.44882490 -2.898318e-02  1.486147e+03
## frstlang  3.553381e-02   -0.02898318  1.035266e-01 -1.419586e+03
## salary   -9.296214e+03 1486.14704152 -1.419586e+03  2.596062e+09
## satis    -5.227133e-03 -131.24080907  9.484532e+00 -6.347115e+06
##                  satis
## age      -1.763499e+02
## sex      -8.780808e+00
## gmat_tot  1.765263e+03
## gmat_qpc  3.348371e+02
## gmat_vpc  3.923563e+02
## gmat_tpc  4.842467e+02
## s_avg    -4.628845e+00
## f_avg     2.125329e+00
## quarter  -5.227133e-03
## work_yrs -1.312408e+02
## frstlang  9.484532e+00
## salary   -6.347115e+06
## satis     1.380974e+05

Effect of sex on salary

# Mean salary for each sex code among the placed rows.
aggregate(salary ~ sex, data = placed, FUN = mean)
##   sex    salary
## 1   1 104970.97
## 2   2  98524.39

Effect of satisfaction level on salary

# Mean salary for each satisfaction score among the placed rows.
aggregate(salary ~ satis, data = placed, FUN = mean)
##   satis    salary
## 1     3  95000.00
## 2     4  95000.00
## 3     5 102974.34
## 4     6 105364.20
## 5     7  98531.82

Effect of work experience on MBA starting salary

# Mean salary for each distinct work_yrs value among the placed rows.
aggregate(salary ~ work_yrs, data = placed, FUN = mean)
##    work_yrs    salary
## 1         0  95000.00
## 2         1 103532.00
## 3         2  97673.68
## 4         3 101652.86
## 5         4 105454.55
## 6         5 103142.86
## 7         6 105928.57
## 8         7  98000.00
## 9         8 105025.00
## 10       10 118000.00
## 11       15 183000.00
## 12       16 108500.00

Consider some hypotheses: 1) Males get a higher starting salary than females. 2) People who have English as their first language earn a higher salary than other people.

Hypothesis 1: T test:

# Hypothesis 1: compare mean salary between the two sex groups.
# The formula interface splits salary by the levels of sex. The previous call,
# t.test(placed$salary, placed$sex), passed the sex codes as the second
# sample, so it compared salaries with values of 1 and 2.
# NOTE(review): the output printed below came from that previous call and
# must be regenerated.
t.test(salary ~ sex, data = placed)
## 
##  Welch Two Sample t-test
## 
## data:  placed$salary and placed$sex
## t = 58.517, df = 102, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##   99537.17 106521.71
## sample estimates:
##    mean of x    mean of y 
## 1.030307e+05 1.300971e+00

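H0: Females and males have the same salary, i.e. the difference between the mean salary of females and the mean salary of males is 0. H1: Males have a higher salary than females, i.e. the difference between the mean salary of females and the mean salary of males is not 0. Due to the very low p-value we reject the null hypothesis. Caution: the output above reports `data: placed$salary and placed$sex` with a mean of y of about 1.30, so this p-value comes from comparing salaries with the numeric sex codes rather than comparing the salaries of the two sexes; it does not by itself support the conclusion.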

Chi sq test: H0: the salary and sex are independent of each other. H1: there is a dependency between salary and sex

# Contingency table with one row per distinct salary value and one column per
# sex code, followed by Pearson's chi-squared test on that table.
table1 <- xtabs(formula = ~ salary + sex, data = placed)
chisq.test(table1)
## Warning in chisq.test(table1): Chi-squared approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  table1
## X-squared = 52.681, df = 41, p-value = 0.1045

P value is higher than 0.05 therefore we fail to reject the null hypothesis

Hypothesis 2: T test:

# Hypothesis 2: compare mean salary between the two first-language groups.
# The formula interface splits salary by the levels of frstlang. The previous
# call, t.test(placed$salary, placed$frstlang), passed the frstlang codes as
# the second sample, so it compared salaries with values of 1 and 2.
# NOTE(review): the output printed below came from that previous call and
# must be regenerated.
t.test(salary ~ frstlang, data = placed)
## 
##  Welch Two Sample t-test
## 
## data:  placed$salary and placed$frstlang
## t = 58.517, df = 102, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##   99537.4 106521.9
## sample estimates:
##    mean of x    mean of y 
## 1.030307e+05 1.067961e+00

H0: People who have English as first language as well as people who have other languages as first language have the same salary. Difference between the mean salary of both of them is 0 H1: People who have English as first language have a higher salary compared to the people who do not have English as their first language. Difference between the mean salary of both of them is not 0.

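Due to the very low p-value we reject the null hypothesis. Caution: the output above reports `data: placed$salary and placed$frstlang` with a mean of y of about 1.07, so this p-value comes from comparing salaries with the numeric first-language codes rather than comparing salaries between the two language groups; it does not by itself support the conclusion.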

Chi sq test: H0:the salary and first language are independent of each other. H1:there is a dependency between salary and first language

# Contingency table with one row per distinct salary value and one column per
# frstlang code, followed by Pearson's chi-squared test on that table.
table2 <- xtabs(formula = ~ salary + frstlang, data = placed)
chisq.test(table2)
## Warning in chisq.test(table2): Chi-squared approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  table2
## X-squared = 69.847, df = 41, p-value = 0.003296

P value is found to be less than 0.05. Therefore we reject the null hypothesis.

Regression models Hypothesis 1:

# Model 1: ordinary least squares fit of salary on sex and gmat_tot, using
# the placed rows only; summary() prints the coefficient table and R-squared.
fit <- lm(formula = salary ~ sex + gmat_tot, data = placed)
summary(fit)
## 
## Call:
## lm(formula = salary ~ sex + gmat_tot, data = placed)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -36330  -8517  -2179   2651 117683 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 131914.55   22051.24   5.982 3.44e-08 ***
## sex          -6517.82    3806.11  -1.712   0.0899 .  
## gmat_tot       -33.12      34.61  -0.957   0.3409    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 17710 on 100 degrees of freedom
## Multiple R-squared:  0.03648,    Adjusted R-squared:  0.01721 
## F-statistic: 1.893 on 2 and 100 DF,  p-value: 0.156

salary=(-6517.82)sex+(-33.12)gmat_tot+131914.55

# Model 2: ordinary least squares fit of salary on frstlang and work_yrs,
# using the placed rows only. This assignment replaces whatever `fit` held.
fit <- lm(formula = salary ~ frstlang + work_yrs, data = placed)
summary(fit)
## 
## Call:
## lm(formula = salary ~ frstlang + work_yrs, data = placed)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -33972  -8955   -455   4545  76681 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  79941.4     6788.8  11.775  < 2e-16 ***
## frstlang     13064.0     6283.2   2.079   0.0402 *  
## work_yrs      2483.3      527.9   4.704 8.18e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15740 on 100 degrees of freedom
## Multiple R-squared:  0.2396, Adjusted R-squared:  0.2244 
## F-statistic: 15.75 on 2 and 100 DF,  p-value: 1.128e-06

salary = (13064.0)frstlang + (2483.3)work_yrs + 79941.4. Which is the best model? R-squared values: Model 1: 0.03648, Model 2: 0.2396. Thus model 2 is the model that best fits the data.

Comparing those who got a job and those who didn't

"setup"
## [1] "setup"
# Drop rows whose salary is 998 or 999; rows with salary 0 are kept.
clean <- mba[!(mba$salary == 998 | mba$salary == 999), ]
"a new column is added to the data frame which has 1 if the person has a job and 0 if that person does not have a job"
## [1] "a new column is added to the data frame which has 1 if the person has a job and 0 if that person does not have a job"
# job is 0 where salary is 0 and 1 otherwise; it is appended as the last
# column of clean.
job <- as.numeric(clean$salary != 0)
clean$job <- job

Does gender(sex) play a role in getting/not getting a job?

# Counts of job status (rows) by sex code (columns), printed with row and
# column totals.
mytable <- xtabs(formula = ~ job + sex, data = clean)
addmargins(A = mytable)
##      sex
## job     1   2 Sum
##   0    67  23  90
##   1    72  31 103
##   Sum 139  54 193

among people who have job, how many are male and female

# Row proportions: within each job status, the share of each sex code.
prop.table(mytable, margin = 1)
##    sex
## job         1         2
##   0 0.7444444 0.2555556
##   1 0.6990291 0.3009709

among people of a particular sex, how many have jobs

# Column proportions: within each sex code, the share of each job status.
prop.table(mytable, margin = 2)
##    sex
## job         1         2
##   0 0.4820144 0.4259259
##   1 0.5179856 0.5740741

Almost 70% of the jobs have gone to males and only 30% of the jobs have gone to females. About 52% of males have a job, while 57% of females have a job.

Chi sq test: Null hypothesis is that the job and the sex are independent Alternate hypothesis is that the job and the sex are not independent

# Chi-squared test of independence on the 2 x 2 job-by-sex table; the output
# below shows Yates' continuity correction being applied.
chisq.test(x = mytable)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  mytable
## X-squared = 0.29208, df = 1, p-value = 0.5889

We get a relatively high p value therefore we fail to reject the null hypothesis.

Does English as first language play a role in getting/not getting a job?

# Counts of job status (rows) by frstlang code (columns), printed with row
# and column totals. This reassigns mytable.
mytable <- xtabs(formula = ~ job + frstlang, data = clean)
addmargins(A = mytable)
##      frstlang
## job     1   2 Sum
##   0    82   8  90
##   1    96   7 103
##   Sum 178  15 193

among people who have job, trying to analyse % of english first language people

# Row proportions: within each job status, the share of each frstlang code.
prop.table(mytable, margin = 1)
##    frstlang
## job          1          2
##   0 0.91111111 0.08888889
##   1 0.93203883 0.06796117

among people who have english as first language, trying to find out how many have a job

# Column proportions: within each frstlang code, the share of each job status.
prop.table(mytable, margin = 2)
##    frstlang
## job         1         2
##   0 0.4606742 0.5333333
##   1 0.5393258 0.4666667

93% of people who have got jobs have English as their first language. Almost 54% of people who have English as their first language have got jobs.

Chi sq test: Null hypothesis is that job and first language are independent alternate hypothesis is that the job and the first language are not independent

# Chi-squared test of independence on the 2 x 2 job-by-frstlang table; the
# output below shows Yates' continuity correction being applied.
chisq.test(x = mytable)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  mytable
## X-squared = 0.074127, df = 1, p-value = 0.7854

p value is relatively high so we fail to reject the null hypothesis.

Logistic Regression

Logistic regression is used when the dependent variable is categorical (it takes one of a fixed set of values; here job is binary, 0 or 1). The independent variables may be categorical, continuous, or a mixture of both.

We split the data into two chunks: training and testing set. The training set will be used to fit our model which we will be testing over the testing set.

# Split clean by row order: the first n_train rows form the training set and
# every remaining row forms the test set. Taking the complement avoids
# hard-coding the total row count (193 in the data shown in this report).
# NOTE(review): the split is positional, not random, so it depends on the row
# order of clean.
n_train <- 174L
train <- clean[seq_len(n_train), ]
test <- clean[-seq_len(n_train), ]

Model: We consider age and work experience as the independent variables with job( 0 or 1) being the dependent variable.

# Logistic regression (binomial family, logit link) of job on age and
# work_yrs, fitted on the training rows; summary() prints the coefficients.
model <- glm(
  formula = job ~ age + work_yrs,
  family = binomial(link = 'logit'),
  data = train
)
summary(model)
## 
## Call:
## glm(formula = job ~ age + work_yrs, family = binomial(link = "logit"), 
##     data = train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.4759  -1.1674  -0.5629   1.1316   1.7677  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)   
## (Intercept)  5.26635    1.98814   2.649  0.00808 **
## age         -0.21512    0.08353  -2.575  0.01002 * 
## work_yrs     0.14511    0.08997   1.613  0.10677   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 241.01  on 173  degrees of freedom
## Residual deviance: 231.40  on 171  degrees of freedom
## AIC: 237.4
## 
## Number of Fisher Scoring iterations: 4
# Analysis-of-deviance table: terms are added one at a time in formula order,
# each with a chi-squared test.
anova(object = model, test = "Chisq")
## Analysis of Deviance Table
## 
## Model: binomial, link: logit
## 
## Response: job
## 
## Terms added sequentially (first to last)
## 
## 
##          Df Deviance Resid. Df Resid. Dev Pr(>Chi)   
## NULL                       173     241.01            
## age       1   6.9171       172     234.09 0.008537 **
## work_yrs  1   2.6950       171     231.40 0.100661   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

We find out that age is statistically significant with a low p value.

accuracy of the fitted values that are used for testing after being trained

# Predicted probability of job == 1 for every test row. The two predictors
# are selected by name; the previous subset(test, select = c(1, 10)) relied on
# age and work_yrs sitting in columns 1 and 10.
fitted.results <- predict(
  model,
  newdata = test[, c("age", "work_yrs")],
  type = "response"
)
# Classify as 1 when the predicted probability exceeds 0.5, otherwise 0.
fitted.results <- ifelse(fitted.results > 0.5, 1, 0)
# Share of test rows whose predicted class differs from the observed job.
misClasificError <- mean(fitted.results != test$job)
print(paste('Accuracy', 1 - misClasificError))
## [1] "Accuracy 0.684210526315789"

Now after training, we try testing the model and report an accuracy of 0.684210526315789.