This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both the content and the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
summary(cars)
##      speed           dist
##  Min.   : 4.0   Min.   :  2.00
##  1st Qu.:12.0   1st Qu.: 26.00
##  Median :15.0   Median : 36.00
##  Mean   :15.4   Mean   : 42.98
##  3rd Qu.:19.0   3rd Qu.: 56.00
##  Max.   :25.0   Max.   :120.00
You can also embed plots, for example:
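For instance, a plotting chunk might contain nothing more than the single call below; with echo = FALSE in the chunk header, only the resulting figure appears in the knitted document (the pressure dataset is just an illustrative choice):
plot(pressure)  # pressure is a built-in R dataset; any plotting call works here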
#Raj Kapoor
#mr.rajkapoor393@gmail.com
#NIT Rourkela
dataset = read.csv('MBA Starting Salaries Data.csv')
View(dataset)
#removing placeholder/erroneous salary values (0, 998 and 999) from the dataset
dataset2 = dataset[which(dataset$salary != 0 & dataset$salary != 999 & dataset$salary != 998), ]
View(dataset2)
#dataset2 has 103 entries
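As a quick sanity check on the filter above (assuming the placeholder codes are exactly 0, 998 and 999):
sum(dataset$salary %in% c(0, 998, 999))  # number of rows carrying a placeholder code
nrow(dataset2)                           # 103 rows remain after filtering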
#Splitting dataset2 into training and test sets
library(caTools)
set.seed(123)
split = sample.split(dataset2$salary, SplitRatio = 0.8)
training_set = subset(dataset2, split == TRUE)
test_set = subset(dataset2, split == FALSE)
View(training_set)
View(test_set)
cor(dataset2)
## age sex gmat_tot gmat_qpc gmat_vpc
## age 1.00000000 -0.14352927 -0.07871678 -0.165039057 0.01799420
## sex -0.14352927 1.00000000 -0.01955548 -0.147099027 0.05341428
## gmat_tot -0.07871678 -0.01955548 1.00000000 0.666382266 0.78038546
## gmat_qpc -0.16503906 -0.14709903 0.66638227 1.000000000 0.09466541
## gmat_vpc 0.01799420 0.05341428 0.78038546 0.094665411 1.00000000
## gmat_tpc -0.09609156 -0.04686981 0.96680810 0.658650025 0.78443167
## s_avg 0.15654954 0.08079985 0.17198874 0.015471662 0.15865101
## f_avg -0.21699191 0.16572186 0.12246257 0.098418869 0.02290167
## quarter -0.12568145 -0.02139041 -0.10578964 0.012648346 -0.12862079
## work_yrs 0.88052470 -0.09233003 -0.12280018 -0.182701263 -0.02812182
## frstlang 0.35026743 0.07512009 -0.13164323 0.014198516 -0.21835333
## salary 0.49964284 -0.16628869 -0.09067141 0.014141299 -0.13743230
## satis 0.10832308 -0.09199534 0.06474206 -0.003984632 0.14863481
## gmat_tpc s_avg f_avg quarter work_yrs
## age -0.09609156 0.15654954 -0.21699191 -0.12568145 0.88052470
## sex -0.04686981 0.08079985 0.16572186 -0.02139041 -0.09233003
## gmat_tot 0.96680810 0.17198874 0.12246257 -0.10578964 -0.12280018
## gmat_qpc 0.65865003 0.01547166 0.09841887 0.01264835 -0.18270126
## gmat_vpc 0.78443167 0.15865101 0.02290167 -0.12862079 -0.02812182
## gmat_tpc 1.00000000 0.13938500 0.07051391 -0.09955033 -0.13246963
## s_avg 0.13938500 1.00000000 0.44590413 -0.84038355 0.16328236
## f_avg 0.07051391 0.44590413 1.00000000 -0.43144819 -0.21633018
## quarter -0.09955033 -0.84038355 -0.43144819 1.00000000 -0.12896722
## work_yrs -0.13246963 0.16328236 -0.21633018 -0.12896722 1.00000000
## frstlang -0.16437561 -0.13788905 -0.05061394 0.10955726 0.19627277
## salary -0.13201783 0.10173175 -0.10603897 -0.12848526 0.45466634
## satis 0.11630842 -0.14356557 -0.11773304 0.22511985 0.06299926
## frstlang salary satis
## age 0.35026743 0.49964284 0.108323083
## sex 0.07512009 -0.16628869 -0.091995338
## gmat_tot -0.13164323 -0.09067141 0.064742057
## gmat_qpc 0.01419852 0.01414130 -0.003984632
## gmat_vpc -0.21835333 -0.13743230 0.148634805
## gmat_tpc -0.16437561 -0.13201783 0.116308417
## s_avg -0.13788905 0.10173175 -0.143565573
## f_avg -0.05061394 -0.10603897 -0.117733043
## quarter 0.10955726 -0.12848526 0.225119851
## work_yrs 0.19627277 0.45466634 0.062999256
## frstlang 1.00000000 0.26701953 0.089834769
## salary 0.26701953 1.00000000 -0.040050600
## satis 0.08983477 -0.04005060 1.000000000
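The wide matrix is easier to scan if the variables are ranked by their correlation with salary; the one-liner below is only a reading aid:
sort(cor(dataset2)[, "salary"], decreasing = TRUE)  # correlations with salary, largest first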
#Visualizing the correlogram with the corrgram package
library(corrgram)
corrgram(dataset2, order=NULL, lower.panel=panel.shade,
         upper.panel=NULL, text.panel=panel.txt,
         main="MBA Starting Salaries")
#We observe that age, work_yrs and frstlang have the strongest correlations with salary,
#so we build a regression model using these predictors
attach(dataset2)
regressor = lm(formula = salary ~ age + work_yrs + frstlang,
               data = dataset2)
summary(regressor)
##
## Call:
## lm(formula = salary ~ age + work_yrs + frstlang, data = dataset2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -31941 -9139 -1086 4793 75526
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 40492.7 23417.5 1.729 0.0869 .
## age 1892.0 1075.9 1.759 0.0818 .
## work_yrs 747.2 1116.9 0.669 0.5050
## frstlang 8546.9 6728.1 1.270 0.2069
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15570 on 99 degrees of freedom
## Multiple R-squared: 0.2626, Adjusted R-squared: 0.2403
## F-statistic: 11.75 on 3 and 99 DF, p-value: 1.188e-06
#the fit is weak (adjusted R-squared of only about 0.24), so trying a regression on all features
regressor2 = lm(formula = salary ~ .,
                data = dataset2)
summary(regressor2)
##
## Call:
## lm(formula = salary ~ ., data = dataset2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -26489 -7983 -373 5923 70602
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 78005.66 52981.93 1.472 0.1444
## age 1750.65 1130.92 1.548 0.1251
## sex -3584.07 3595.85 -0.997 0.3216
## gmat_tot 16.19 178.85 0.090 0.9281
## gmat_qpc 796.55 496.78 1.603 0.1123
## gmat_vpc 546.31 501.97 1.088 0.2794
## gmat_tpc -1457.09 714.94 -2.038 0.0445 *
## s_avg -931.53 8240.31 -0.113 0.9102
## f_avg -2222.82 3894.57 -0.571 0.5696
## quarter -2336.56 2721.89 -0.858 0.3929
## work_yrs 749.66 1135.90 0.660 0.5110
## frstlang 7719.42 7373.27 1.047 0.2979
## satis -1086.54 2157.76 -0.504 0.6158
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15430 on 90 degrees of freedom
## Multiple R-squared: 0.3422, Adjusted R-squared: 0.2545
## F-statistic: 3.902 on 12 and 90 DF, p-value: 8.086e-05
#gmat_tpc comes out with a p-value of about 0.04
#so, at a 0.05 significance level, we fit a model with gmat_tpc and age
regressor3 = lm(formula = salary ~ gmat_tpc + age,
                data = dataset2)
summary(regressor3)
##
## Call:
## lm(formula = salary ~ gmat_tpc + age, data = dataset2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -33439 -8093 -1819 4692 76859
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 42789.6 18247.8 2.345 0.021 *
## gmat_tpc -137.7 140.6 -0.979 0.330
## age 2684.3 473.0 5.675 1.36e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15560 on 100 degrees of freedom
## Multiple R-squared: 0.2568, Adjusted R-squared: 0.2419
## F-statistic: 17.27 on 2 and 100 DF, p-value: 3.6e-07
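Since all three regressors were fit on dataset2, their AIC values offer one quick way to compare the fits before scoring them on the test set (lower is better):
AIC(regressor, regressor2, regressor3)  # compares the three lm fits by information criterion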
#predicting on the test set with regressors 1, 2 and 3
y_pred1 = predict(regressor, newdata = test_set)
y_pred2 = predict(regressor2, newdata = test_set)
y_pred3 = predict(regressor3, newdata = test_set)
View(y_pred1)
View(y_pred2)
View(y_pred3)
#for row 39 of the test set, the actual salary was 92000
#regressor2 performed best of the three
#but further improvements are needed
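A more quantitative comparison is each model's root-mean-squared error on the held-out test set; rmse below is a small helper defined only for this check:
rmse = function(actual, predicted) sqrt(mean((actual - predicted)^2))  # helper: RMSE of predictions
rmse(test_set$salary, y_pred1)
rmse(test_set$salary, y_pred2)
rmse(test_set$salary, y_pred3)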
#proceeding to the second part of the task
# Logistic Regression
# Importing the dataset again, removing only the 998 and 999 placeholder values (salary = 0 now means "not placed")
dataset3 = dataset[which(dataset$salary != 999 & dataset$salary != 998), ]
View(dataset3)
#adding a new field indicating whether a student was placed (salary > 0) or not
dataset3$Placed_B <- ifelse(dataset3$salary == 0, 0, 1)
#removing the salary field so that the classification is judged on the independent parameters only
dataset3 <- dataset3[-c(12)]
# Splitting the dataset into the Training set and Test set
# install.packages('caTools')
library(caTools)
set.seed(123)
split = sample.split(dataset3$Placed_B, SplitRatio = 0.75)
training_set = subset(dataset3, split == TRUE)
test_set = subset(dataset3, split == FALSE)
# Feature Scaling
training_set[,1:12] = scale(training_set[,1:12])
test_set[,1:12] = scale(test_set[,1:12])
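Note that the two scale() calls above standardize the training and test sets with their own means and standard deviations. An alternative sketch (train_scaled and test_scaled are illustrative names) reuses the training-set parameters when scaling the test set:
train_scaled = scale(training_set[, 1:12])                    # keeps the centering/scaling attributes
test_scaled  = scale(test_set[, 1:12],
                     center = attr(train_scaled, "scaled:center"),
                     scale  = attr(train_scaled, "scaled:scale"))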
# Fitting Logistic Regression to the Training set
classifier = glm(formula = Placed_B ~ .,
                 family = binomial,
                 data = training_set)
# Predicting the Test set results
prob_pred = predict(classifier, type = 'response', newdata = test_set[,1:12])
y_pred = ifelse(prob_pred > 0.5, 1, 0)
# Making the Confusion Matrix
cm = table(test_set[, 13], y_pred > 0.5)  # y_pred is already 0/1, so the comparison just relabels it as FALSE/TRUE
cm
##
## FALSE TRUE
## 0 12 10
## 1 9 17
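From this confusion matrix the overall test-set accuracy is (12 + 17) / 48, roughly 0.60; as a quick added check:
sum(diag(cm)) / sum(cm)  # proportion of correct predictions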
Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.