R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

Read the data

# Load the MBA starting-salaries data from the working directory
dataset <- read.csv("MBA Starting Salaries Data.csv")
# View() opens a spreadsheet-style viewer, which is only useful (and only
# reliably available) in an interactive session; skip it when the document
# is knitted or the script is run non-interactively.
if (interactive()) {
  View(dataset)
}

Attach the dataframe

# Put the columns of `dataset` on the search path so they can be referenced
# by bare name.
# NOTE(review): the code shown in this document always accesses columns via
# `dataset$...` or a `data =` argument, so this attach() looks unnecessary;
# a later attach(dataset2) also masks every one of these names. Confirm that
# nothing relies on bare column names before removing it.
attach(dataset)

Summarize the data

# Per-column minimum, quartiles, mean and maximum of the raw data
summary(object = dataset)
##       age             sex           gmat_tot        gmat_qpc    
##  Min.   :22.00   Min.   :1.000   Min.   :450.0   Min.   :28.00  
##  1st Qu.:25.00   1st Qu.:1.000   1st Qu.:580.0   1st Qu.:72.00  
##  Median :27.00   Median :1.000   Median :620.0   Median :83.00  
##  Mean   :27.36   Mean   :1.248   Mean   :619.5   Mean   :80.64  
##  3rd Qu.:29.00   3rd Qu.:1.000   3rd Qu.:660.0   3rd Qu.:93.00  
##  Max.   :48.00   Max.   :2.000   Max.   :790.0   Max.   :99.00  
##     gmat_vpc        gmat_tpc        s_avg           f_avg      
##  Min.   :16.00   Min.   : 0.0   Min.   :2.000   Min.   :0.000  
##  1st Qu.:71.00   1st Qu.:78.0   1st Qu.:2.708   1st Qu.:2.750  
##  Median :81.00   Median :87.0   Median :3.000   Median :3.000  
##  Mean   :78.32   Mean   :84.2   Mean   :3.025   Mean   :3.062  
##  3rd Qu.:91.00   3rd Qu.:94.0   3rd Qu.:3.300   3rd Qu.:3.250  
##  Max.   :99.00   Max.   :99.0   Max.   :4.000   Max.   :4.000  
##     quarter         work_yrs         frstlang         salary      
##  Min.   :1.000   Min.   : 0.000   Min.   :1.000   Min.   :     0  
##  1st Qu.:1.250   1st Qu.: 2.000   1st Qu.:1.000   1st Qu.:     0  
##  Median :2.000   Median : 3.000   Median :1.000   Median :   999  
##  Mean   :2.478   Mean   : 3.872   Mean   :1.117   Mean   : 39026  
##  3rd Qu.:3.000   3rd Qu.: 4.000   3rd Qu.:1.000   3rd Qu.: 97000  
##  Max.   :4.000   Max.   :22.000   Max.   :2.000   Max.   :220000  
##      satis      
##  Min.   :  1.0  
##  1st Qu.:  5.0  
##  Median :  6.0  
##  Mean   :172.2  
##  3rd Qu.:  7.0  
##  Max.   :998.0
# describe() from psych extends the basic summary with n, sd, trimmed mean,
# MAD, range, skew, kurtosis and standard error for every column
library("psych")
describe(x = dataset)
##          vars   n     mean       sd median  trimmed     mad min    max
## age         1 274    27.36     3.71     27    26.76    2.97  22     48
## sex         2 274     1.25     0.43      1     1.19    0.00   1      2
## gmat_tot    3 274   619.45    57.54    620   618.86   59.30 450    790
## gmat_qpc    4 274    80.64    14.87     83    82.31   14.83  28     99
## gmat_vpc    5 274    78.32    16.86     81    80.33   14.83  16     99
## gmat_tpc    6 274    84.20    14.02     87    86.12   11.86   0     99
## s_avg       7 274     3.03     0.38      3     3.03    0.44   2      4
## f_avg       8 274     3.06     0.53      3     3.09    0.37   0      4
## quarter     9 274     2.48     1.11      2     2.47    1.48   1      4
## work_yrs   10 274     3.87     3.23      3     3.29    1.48   0     22
## frstlang   11 274     1.12     0.32      1     1.02    0.00   1      2
## salary     12 274 39025.69 50951.56    999 33607.86 1481.12   0 220000
## satis      13 274   172.18   371.61      6    91.50    1.48   1    998
##           range  skew kurtosis      se
## age          26  2.16     6.45    0.22
## sex           1  1.16    -0.66    0.03
## gmat_tot    340 -0.01     0.06    3.48
## gmat_qpc     71 -0.92     0.30    0.90
## gmat_vpc     83 -1.04     0.74    1.02
## gmat_tpc     99 -2.28     9.02    0.85
## s_avg         2 -0.06    -0.38    0.02
## f_avg         4 -2.08    10.85    0.03
## quarter       3  0.02    -1.35    0.07
## work_yrs     22  2.78     9.80    0.20
## frstlang      1  2.37     3.65    0.02
## salary   220000  0.70    -1.05 3078.10
## satis       997  1.77     1.13   22.45
# Keep only rows with a usable salary figure. Salaries of 0, 998 and 999 are
# not real salaries in this data (0 is treated as "not placed" further down
# in the analysis; 998/999 appear to be placeholder codes -- TODO confirm
# against the data dictionary), so those rows are dropped. Rows with a
# missing salary are dropped as well.
unusable_salary <- c(0, 998, 999)
dataset2 <- dataset[!is.na(dataset$salary) &
                      !(dataset$salary %in% unusable_salary), ]
# Only open the data viewer when running interactively
if (interactive()) {
  View(dataset2)
}

# Split the cleaned data 80/20 into a training set and a test set
library(caTools)
set.seed(123)  # fixed seed so the split is reproducible
split <- sample.split(dataset2$salary, SplitRatio = 0.8)
training_set <- subset(dataset2, split == TRUE)
test_set <- subset(dataset2, split == FALSE)
# Only open the data viewers when running interactively; knitting the
# document should not depend on a viewer being available.
if (interactive()) {
  View(training_set)
  View(test_set)
}

# Pairwise correlation matrix of every column in the cleaned data
cor(x = dataset2)
##                  age         sex    gmat_tot     gmat_qpc    gmat_vpc
## age       1.00000000 -0.14352927 -0.07871678 -0.165039057  0.01799420
## sex      -0.14352927  1.00000000 -0.01955548 -0.147099027  0.05341428
## gmat_tot -0.07871678 -0.01955548  1.00000000  0.666382266  0.78038546
## gmat_qpc -0.16503906 -0.14709903  0.66638227  1.000000000  0.09466541
## gmat_vpc  0.01799420  0.05341428  0.78038546  0.094665411  1.00000000
## gmat_tpc -0.09609156 -0.04686981  0.96680810  0.658650025  0.78443167
## s_avg     0.15654954  0.08079985  0.17198874  0.015471662  0.15865101
## f_avg    -0.21699191  0.16572186  0.12246257  0.098418869  0.02290167
## quarter  -0.12568145 -0.02139041 -0.10578964  0.012648346 -0.12862079
## work_yrs  0.88052470 -0.09233003 -0.12280018 -0.182701263 -0.02812182
## frstlang  0.35026743  0.07512009 -0.13164323  0.014198516 -0.21835333
## salary    0.49964284 -0.16628869 -0.09067141  0.014141299 -0.13743230
## satis     0.10832308 -0.09199534  0.06474206 -0.003984632  0.14863481
##             gmat_tpc       s_avg       f_avg     quarter    work_yrs
## age      -0.09609156  0.15654954 -0.21699191 -0.12568145  0.88052470
## sex      -0.04686981  0.08079985  0.16572186 -0.02139041 -0.09233003
## gmat_tot  0.96680810  0.17198874  0.12246257 -0.10578964 -0.12280018
## gmat_qpc  0.65865003  0.01547166  0.09841887  0.01264835 -0.18270126
## gmat_vpc  0.78443167  0.15865101  0.02290167 -0.12862079 -0.02812182
## gmat_tpc  1.00000000  0.13938500  0.07051391 -0.09955033 -0.13246963
## s_avg     0.13938500  1.00000000  0.44590413 -0.84038355  0.16328236
## f_avg     0.07051391  0.44590413  1.00000000 -0.43144819 -0.21633018
## quarter  -0.09955033 -0.84038355 -0.43144819  1.00000000 -0.12896722
## work_yrs -0.13246963  0.16328236 -0.21633018 -0.12896722  1.00000000
## frstlang -0.16437561 -0.13788905 -0.05061394  0.10955726  0.19627277
## salary   -0.13201783  0.10173175 -0.10603897 -0.12848526  0.45466634
## satis     0.11630842 -0.14356557 -0.11773304  0.22511985  0.06299926
##             frstlang      salary        satis
## age       0.35026743  0.49964284  0.108323083
## sex       0.07512009 -0.16628869 -0.091995338
## gmat_tot -0.13164323 -0.09067141  0.064742057
## gmat_qpc  0.01419852  0.01414130 -0.003984632
## gmat_vpc -0.21835333 -0.13743230  0.148634805
## gmat_tpc -0.16437561 -0.13201783  0.116308417
## s_avg    -0.13788905  0.10173175 -0.143565573
## f_avg    -0.05061394 -0.10603897 -0.117733043
## quarter   0.10955726 -0.12848526  0.225119851
## work_yrs  0.19627277  0.45466634  0.062999256
## frstlang  1.00000000  0.26701953  0.089834769
## salary    0.26701953  1.00000000 -0.040050600
## satis     0.08983477 -0.04005060  1.000000000
# Visualize the correlations as a corrgram: shaded cells in the lower
# triangle, nothing in the upper triangle, panel.txt for the text panel
library("corrgram")
corrgram(
  dataset2,
  order = NULL,
  main = "Mba Starting Salaries",
  lower.panel = panel.shade,
  upper.panel = NULL,
  text.panel = panel.txt
)

# age, work_yrs and frstlang are the columns most strongly correlated with
# salary, so the first linear model uses just those three predictors.
# NOTE(review): lm() below is given `data = dataset2` explicitly, so the fit
# does not depend on this attach(); it only puts dataset2's columns on the
# search path (masking the earlier attach, as the message shows).
attach(dataset2)
## The following objects are masked from dataset:
## 
##     age, f_avg, frstlang, gmat_qpc, gmat_tot, gmat_tpc, gmat_vpc,
##     quarter, s_avg, salary, satis, sex, work_yrs
regressor <- lm(formula = salary ~ age + work_yrs + frstlang, data = dataset2)
summary(object = regressor)
## 
## Call:
## lm(formula = salary ~ age + work_yrs + frstlang, data = dataset2)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -31941  -9139  -1086   4793  75526 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept)  40492.7    23417.5   1.729   0.0869 .
## age           1892.0     1075.9   1.759   0.0818 .
## work_yrs       747.2     1116.9   0.669   0.5050  
## frstlang      8546.9     6728.1   1.270   0.2069  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15570 on 99 degrees of freedom
## Multiple R-squared:  0.2626, Adjusted R-squared:  0.2403 
## F-statistic: 11.75 on 3 and 99 DF,  p-value: 1.188e-06
# The three-predictor model is not a good fit, so try a second model that
# regresses salary on every other column (`salary ~ .`)
regressor2 <- lm(formula = salary ~ ., data = dataset2)
summary(object = regressor2)
## 
## Call:
## lm(formula = salary ~ ., data = dataset2)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -26489  -7983   -373   5923  70602 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept) 78005.66   52981.93   1.472   0.1444  
## age          1750.65    1130.92   1.548   0.1251  
## sex         -3584.07    3595.85  -0.997   0.3216  
## gmat_tot       16.19     178.85   0.090   0.9281  
## gmat_qpc      796.55     496.78   1.603   0.1123  
## gmat_vpc      546.31     501.97   1.088   0.2794  
## gmat_tpc    -1457.09     714.94  -2.038   0.0445 *
## s_avg        -931.53    8240.31  -0.113   0.9102  
## f_avg       -2222.82    3894.57  -0.571   0.5696  
## quarter     -2336.56    2721.89  -0.858   0.3929  
## work_yrs      749.66    1135.90   0.660   0.5110  
## frstlang     7719.42    7373.27   1.047   0.2979  
## satis       -1086.54    2157.76  -0.504   0.6158  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15430 on 90 degrees of freedom
## Multiple R-squared:  0.3422, Adjusted R-squared:  0.2545 
## F-statistic: 3.902 on 12 and 90 DF,  p-value: 8.086e-05
# Applying a 0.05 significance level to the full model leaves gmat_tpc as
# the only predictor under the threshold; model salary on gmat_tpc plus age
regressor3 <- lm(formula = salary ~ gmat_tpc + age, data = dataset2)
summary(object = regressor3)
## 
## Call:
## lm(formula = salary ~ gmat_tpc + age, data = dataset2)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -33439  -8093  -1819   4692  76859 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  42789.6    18247.8   2.345    0.021 *  
## gmat_tpc      -137.7      140.6  -0.979    0.330    
## age           2684.3      473.0   5.675 1.36e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15560 on 100 degrees of freedom
## Multiple R-squared:  0.2568, Adjusted R-squared:  0.2419 
## F-statistic: 17.27 on 2 and 100 DF,  p-value: 3.6e-07
# Predict salaries for the held-out test set with each of the three models.
# regressor, regressor2 and regressor3 were fitted on all of dataset2, which
# includes the test rows, so predicting from them directly would score the
# models on data they have already seen. update() refits each model with the
# same formula on the training rows only, making these predictions genuinely
# out-of-sample.
y_pred1 <- predict(update(regressor, data = training_set), newdata = test_set)
y_pred2 <- predict(update(regressor2, data = training_set), newdata = test_set)
y_pred3 <- predict(update(regressor3, data = training_set), newdata = test_set)

# Only open the data viewers when running interactively
if (interactive()) {
  View(y_pred1)
  View(y_pred2)
  View(y_pred3)
}
# Logistic Regression: predict whether a student was placed

# Start again from the raw data and drop only the 998/999 salary codes.
# Rows with salary 0 are kept because they form the "not placed" class.
# Rows with a missing salary are dropped as well.
placeholder_salary <- c(998, 999)
dataset3 <- dataset[!is.na(dataset$salary) &
                      !(dataset$salary %in% placeholder_salary), ]
# Only open the data viewer when running interactively
if (interactive()) {
  View(dataset3)
}

# Binary target: 0 when the salary is 0 (not placed), 1 otherwise
dataset3$Placed_B <- ifelse(dataset3$salary == 0, 0, 1)
# Remove salary (by name rather than by position) so the classifier is judged
# on the independent variables only; the target was derived from salary.
dataset3$salary <- NULL

# Splitting the dataset into the Training set and Test set
# install.packages('caTools')
library(caTools)
set.seed(123)  # fixed seed so the split is reproducible
split <- sample.split(dataset3$Placed_B, SplitRatio = 0.75)
training_set <- subset(dataset3, split == TRUE)
test_set <- subset(dataset3, split == FALSE)

# Feature Scaling
# Every column except the target is a feature. The centring and scaling
# parameters are estimated on the training set only and then reused for the
# test set, so both sets live on the same scale and no information from the
# test rows leaks into the preprocessing.
feature_cols <- setdiff(names(dataset3), "Placed_B")
train_scaled <- scale(training_set[, feature_cols])
training_set[, feature_cols] <- train_scaled
test_set[, feature_cols] <- scale(
  test_set[, feature_cols],
  center = attr(train_scaled, "scaled:center"),
  scale = attr(train_scaled, "scaled:scale")
)

# Fitting Logistic Regression to the Training set
classifier <- glm(formula = Placed_B ~ .,
                  family = binomial,
                  data = training_set)

# Predicting the Test set results: probabilities, then a 0.5 cut-off
prob_pred <- predict(classifier, type = "response",
                     newdata = test_set[, feature_cols, drop = FALSE])
y_pred <- ifelse(prob_pred > 0.5, 1, 0)

# Making the Confusion Matrix
# Fixed factor levels keep the table 2 x 2 even if one class is never
# predicted or never occurs in the test set.
# NOTE: re-knit the document to refresh the recorded confusion matrix output.
cm <- table(
  actual = factor(test_set$Placed_B, levels = c(0, 1)),
  predicted = factor(y_pred, levels = c(0, 1))
)
cm
##    
##     FALSE TRUE
##   0    12   10
##   1     9   17