# Load the MBA starting-salaries data and print a per-column summary.
# NOTE(review): the path is absolute and machine-specific; consider a
# project-relative path so the script runs on other machines.
mba_sal <- read.csv(
  "C:/Program Files/RStudio/files/MBA Starting Salaries Data.csv"
)
# View() opens a data viewer and is only meaningful in an interactive
# session, so it is skipped when the script is run or knitted in batch mode.
if (interactive()) {
  View(mba_sal)
}
summary(mba_sal)
##       age             sex           gmat_tot        gmat_qpc    
##  Min.   :22.00   Min.   :1.000   Min.   :450.0   Min.   :28.00  
##  1st Qu.:25.00   1st Qu.:1.000   1st Qu.:580.0   1st Qu.:72.00  
##  Median :27.00   Median :1.000   Median :620.0   Median :83.00  
##  Mean   :27.36   Mean   :1.248   Mean   :619.5   Mean   :80.64  
##  3rd Qu.:29.00   3rd Qu.:1.000   3rd Qu.:660.0   3rd Qu.:93.00  
##  Max.   :48.00   Max.   :2.000   Max.   :790.0   Max.   :99.00  
##     gmat_vpc        gmat_tpc        s_avg           f_avg      
##  Min.   :16.00   Min.   : 0.0   Min.   :2.000   Min.   :0.000  
##  1st Qu.:71.00   1st Qu.:78.0   1st Qu.:2.708   1st Qu.:2.750  
##  Median :81.00   Median :87.0   Median :3.000   Median :3.000  
##  Mean   :78.32   Mean   :84.2   Mean   :3.025   Mean   :3.062  
##  3rd Qu.:91.00   3rd Qu.:94.0   3rd Qu.:3.300   3rd Qu.:3.250  
##  Max.   :99.00   Max.   :99.0   Max.   :4.000   Max.   :4.000  
##     quarter         work_yrs         frstlang         salary      
##  Min.   :1.000   Min.   : 0.000   Min.   :1.000   Min.   :     0  
##  1st Qu.:1.250   1st Qu.: 2.000   1st Qu.:1.000   1st Qu.:     0  
##  Median :2.000   Median : 3.000   Median :1.000   Median :   999  
##  Mean   :2.478   Mean   : 3.872   Mean   :1.117   Mean   : 39026  
##  3rd Qu.:3.000   3rd Qu.: 4.000   3rd Qu.:1.000   3rd Qu.: 97000  
##  Max.   :4.000   Max.   :22.000   Max.   :2.000   Max.   :220000  
##      satis      
##  Min.   :  1.0  
##  1st Qu.:  5.0  
##  Median :  6.0  
##  Mean   :172.2  
##  3rd Qu.:  7.0  
##  Max.   :998.0

==> Different age groups

# Histogram of the age column.
hist(
  x = mba_sal[["age"]],
  main = "Ages",
  xlab = "ages",
  col = "cyan"
)

==> Score Distributions

# Draw the four GMAT score histograms on a single 2 x 2 panel.
# par() returns the previous settings; they are restored at the end so that
# later plots are not drawn into the 2 x 2 grid.
old_par <- par(mfrow = c(2, 2))

# One entry per panel: the column to plot, the title and the x-axis label.
gmat_panels <- list(
  list(column = "gmat_tot", main = "GMAT total", xlab = "score"),
  list(
    column = "gmat_qpc",
    main = "quantitative GMAT percentile",
    xlab = "percentile"
  ),
  list(
    column = "gmat_vpc",
    main = "verbal GMAT percentile",
    xlab = "percentile"
  ),
  list(
    column = "gmat_tpc",
    main = "overall GMAT percentile",
    xlab = "percentile"
  )
)

for (panel in gmat_panels) {
  hist(
    mba_sal[[panel$column]],
    main = panel$main,
    ylab = "Frequency",
    xlab = panel$xlab,
    col = "cyan",
    breaks = 10
  )
}

par(old_par)

==> Fall MBA average

# Histogram of the fall MBA average (f_avg).
hist(
  x = mba_sal[["f_avg"]],
  main = "Fall MBA average",
  xlab = "average",
  col = "cyan"
)

==> Spring MBA average

# Histogram of the spring MBA average (s_avg).
hist(
  x = mba_sal[["s_avg"]],
  main = "Spring MBA average",
  xlab = "average",
  col = "cyan"
)

==> Quartile ranking

# Histogram of the quartile ranking (quarter takes values 1 to 4).
hist(
  x = mba_sal[["quarter"]],
  main = "Quartile ranking",
  xlab = "ranking",
  col = "cyan"
)

==> Years of work experience.

# Histogram of years of work experience (work_yrs).
hist(
  x = mba_sal[["work_yrs"]],
  main = "Years of work experience",
  xlab = "years",
  col = "cyan"
)

==> Starting salaries.

# Index plot of the salary column: with a single vector, plot() puts the row
# position on the x axis and the values on the y axis, so the axes are
# labelled accordingly.
plot(
  mba_sal$salary,
  main = "Starting salaries",
  xlab = "Observation index",
  ylab = "salary",
  col = "red"
)

==> Scatterplot of Years Of Work Experience vs Starting salaries

# Scatterplot of salary (y) against years of work experience (x).
with(
  mba_sal,
  plot(
    x = work_yrs,
    y = salary,
    main = "Years Of Work Experience vs Starting Salary",
    xlab = "Years Of Work Experience",
    ylab = "Starting Salary",
    col = "red"
  )
)

==> Scatterplot of spring MBA average vs Starting Salary

# Scatterplot of salary (y) against the spring MBA average (x).
with(
  mba_sal,
  plot(
    x = s_avg,
    y = salary,
    main = "MBA average vs Starting Salary",
    xlab = "Spring MBA Average",
    ylab = "Starting Salary",
    col = "red"
  )
)

==> Corrgram of the pairwise correlations between the variables in the dataset

# Correlogram of all columns of mba_sal: shaded cells below the diagonal,
# pie glyphs above it, variable names on the diagonal. order = TRUE lets
# corrgram() reorder the variables.
library(corrgram)
corrgram(
  mba_sal,
  order = TRUE,
  lower.panel = panel.shade,
  upper.panel = panel.pie,
  text.panel = panel.txt,
  main = "Starting salaries corrgram"
)

==> Placed dataset

# Rows with a salary above 999.
# NOTE(review): values in (0, 999] are excluded here; presumably they are
# non-response codes rather than real salaries - confirm against the data
# dictionary.
placed <- subset(mba_sal, salary > 999)
# View() is only meaningful in an interactive session.
if (interactive()) {
  View(placed)
}
# attach() is intentionally not used: the analysis below refers to columns
# explicitly as placed$<column>, and attaching several data frames with the
# same column names makes bare names ambiguous.

==> Regression models y = f(x)

==> GMAT Score

# Simple linear regression of salary on total GMAT score for the rows in
# `placed`; summary() prints the coefficient table and fit statistics.
x <- lm(formula = placed$salary ~ placed$gmat_tot)
summary(x)
## 
## Call:
## lm(formula = placed$salary ~ placed$gmat_tot)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -40821  -8223  -2543   3756 113261 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     122721.08   21591.40   5.684 1.28e-07 ***
## placed$gmat_tot    -31.96      34.93  -0.915    0.362    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 17880 on 101 degrees of freedom
## Multiple R-squared:  0.008221,   Adjusted R-squared:  -0.001598 
## F-statistic: 0.8372 on 1 and 101 DF,  p-value: 0.3624

==> GMAT Quant Percentile

# Simple linear regression of salary on the quantitative GMAT percentile for
# the rows in `placed`; summary() prints the coefficient table.
x <- lm(formula = placed$salary ~ placed$gmat_qpc)
summary(x)
## 
## Call:
## lm(formula = placed$salary ~ placed$gmat_qpc)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -38564  -7621  -2979   3162 117342 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     101525.69   10735.85   9.457 1.41e-15 ***
## placed$gmat_qpc     18.88     132.81   0.142    0.887    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 17960 on 101 degrees of freedom
## Multiple R-squared:  0.0002, Adjusted R-squared:  -0.009699 
## F-statistic: 0.0202 on 1 and 101 DF,  p-value: 0.8873

==> GMAT Total percentile

# Simple linear regression of salary on the overall GMAT percentile for the
# rows in `placed`; summary() prints the coefficient table.
x <- lm(formula = placed$salary ~ placed$gmat_tpc)
summary(x)
## 
## Call:
## lm(formula = placed$salary ~ placed$gmat_tpc)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -41929  -7964  -2071   4107 109784 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     121147.3    13648.4   8.876 2.66e-14 ***
## placed$gmat_tpc   -214.3      160.1  -1.338    0.184    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 17800 on 101 degrees of freedom
## Multiple R-squared:  0.01743,    Adjusted R-squared:  0.0077 
## F-statistic: 1.792 on 1 and 101 DF,  p-value: 0.1837

==> GMAT Verbal Percentile

# Simple linear regression of salary on the verbal GMAT percentile for the
# rows in `placed`; summary() prints the coefficient table.
x <- lm(formula = placed$salary ~ placed$gmat_vpc)
summary(x)
## 
## Call:
## lm(formula = placed$salary ~ placed$gmat_vpc)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -39116  -8268  -2660   4264 111864 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     114980.7     8747.3  13.145   <2e-16 ***
## placed$gmat_vpc   -152.1      109.1  -1.394    0.166    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 17790 on 101 degrees of freedom
## Multiple R-squared:  0.01889,    Adjusted R-squared:  0.009174 
## F-statistic: 1.944 on 1 and 101 DF,  p-value: 0.1663

==> Spring MBA Average

# Simple linear regression of salary on the spring MBA average for the rows
# in `placed`; summary() prints the coefficient table.
x <- lm(formula = placed$salary ~ placed$s_avg)
summary(x)
## 
## Call:
## lm(formula = placed$salary ~ placed$s_avg)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -40989  -8087  -2068   3682 119814 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     88179      14558   6.057 2.39e-08 ***
## placed$s_avg     4803       4673   1.028    0.307    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 17860 on 101 degrees of freedom
## Multiple R-squared:  0.01035,    Adjusted R-squared:  0.0005508 
## F-statistic: 1.056 on 1 and 101 DF,  p-value: 0.3065

==> Fall MBA Average

# Simple linear regression of salary on the fall MBA average for the rows in
# `placed`; summary() prints the coefficient table.
x <- lm(formula = placed$salary ~ placed$f_avg)
summary(x)
## 
## Call:
## lm(formula = placed$salary ~ placed$f_avg)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -38413  -7413  -3384   4087 115645 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    115039      11342  10.143   <2e-16 ***
## placed$f_avg    -3885       3625  -1.072    0.286    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 17860 on 101 degrees of freedom
## Multiple R-squared:  0.01124,    Adjusted R-squared:  0.001455 
## F-statistic: 1.149 on 1 and 101 DF,  p-value: 0.2864

==> Years of work experience

# Simple linear regression of salary on years of work experience for the
# rows in `placed`; summary() prints the coefficient table.
x <- lm(formula = placed$salary ~ placed$work_yrs)
summary(x)
## 
## Call:
## lm(formula = placed$salary ~ placed$work_yrs)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -34498  -7745   -498   3803  86419 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        93101       2496   37.30  < 2e-16 ***
## placed$work_yrs     2699        526    5.13  1.4e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15990 on 101 degrees of freedom
## Multiple R-squared:  0.2067, Adjusted R-squared:  0.1989 
## F-statistic: 26.32 on 1 and 101 DF,  p-value: 1.403e-06

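The work_yrs coefficient has a p-value < 0.05 (1.4e-06), so it is statistically significant; this model also has the highest R-squared (0.2067) of the single-predictor models above, so it is the best of them.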

==> Chi Square test

# Pearson chi-squared test of independence between salary and work_yrs for
# the rows in `placed`. correct = FALSE turns off the continuity correction
# (which is only applied to 2 x 2 tables).
# NOTE(review): both arguments are numeric columns, so each distinct value is
# treated as its own category. The resulting table is very sparse (df = 451
# in the output below) and R warns that the chi-squared approximation may be
# incorrect. Consider binning the variables or using a correlation test
# instead - confirm the intended analysis.
chisq.test(placed$salary,placed$work_yrs, correct=FALSE)
## Warning in chisq.test(placed$salary, placed$work_yrs, correct = FALSE):
## Chi-squared approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  placed$salary and placed$work_yrs
## X-squared = 535.23, df = 451, p-value = 0.003809

==> t-Test

# Unpaired two-sample t-test with pooled variance (var.equal = TRUE) comparing
# the mean of salary with the mean of work_yrs for the rows in `placed`.
# NOTE(review): the two samples are different quantities on different scales
# (means of about 1.03e5 and 3.68 in the output below), so a difference in
# their means has no meaningful interpretation. Presumably a comparison of
# salary between two groups was intended - confirm.
t.test(placed$salary,placed$work_yrs,var.equal=TRUE, paired=FALSE)
## 
##  Two Sample t-test
## 
## data:  placed$salary and placed$work_yrs
## t = 58.516, df = 204, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##   99555.62 106498.49
## sample estimates:
##    mean of x    mean of y 
## 1.030307e+05 3.679612e+00

==> Not placed dataset

# Rows whose salary is exactly 0.
notplaced <- subset(mba_sal, salary == 0)
# View() is only meaningful in an interactive session.
if (interactive()) {
  View(notplaced)
}
# attach() is intentionally not used: it only masked identically named
# columns from other attached data frames and nothing below relies on it.

==> New Subset

# Keep the rows with salary equal to 0 or above 999.
# NOTE(review): rows with salary in (0, 999] are dropped; presumably those
# values are non-response codes - confirm against the data dictionary.
mba_sub <- subset(mba_sal, salary == 0 | salary > 999)
# View() is only meaningful in an interactive session.
if (interactive()) {
  View(mba_sub)
}
# attach() is intentionally not used: the model below receives the data
# frame through its `data` argument.
# Binary outcome: 1 when salary is positive, 0 otherwise, stored as a factor.
mba_sub$placed <- factor(ifelse(mba_sub$salary > 0, 1, 0))

# Logistic regression of `placed` on every other column except salary.
# salary must be excluded: `placed` is computed directly from it, so using it
# as a predictor separates the two classes perfectly, which makes glm.fit
# fail to converge and report fitted probabilities of 0 or 1 with
# meaningless standard errors.
x <- glm(
  placed ~ . - salary,
  data = mba_sub,
  family = binomial(link = "logit")
)
summary(x)
# Rendered output is not reproduced here; re-run to regenerate it.