mba=read.csv("MBA Starting Salaries Data.csv")

library(psych)

describe(mba)
##          vars   n     mean       sd median  trimmed     mad min    max
## age         1 274    27.36     3.71     27    26.76    2.97  22     48
## sex         2 274     1.25     0.43      1     1.19    0.00   1      2
## gmat_tot    3 274   619.45    57.54    620   618.86   59.30 450    790
## gmat_qpc    4 274    80.64    14.87     83    82.31   14.83  28     99
## gmat_vpc    5 274    78.32    16.86     81    80.33   14.83  16     99
## gmat_tpc    6 274    84.20    14.02     87    86.12   11.86   0     99
## s_avg       7 274     3.03     0.38      3     3.03    0.44   2      4
## f_avg       8 274     3.06     0.53      3     3.09    0.37   0      4
## quarter     9 274     2.48     1.11      2     2.47    1.48   1      4
## work_yrs   10 274     3.87     3.23      3     3.29    1.48   0     22
## frstlang   11 274     1.12     0.32      1     1.02    0.00   1      2
## salary     12 274 39025.69 50951.56    999 33607.86 1481.12   0 220000
## satis      13 274   172.18   371.61      6    91.50    1.48   1    998
##           range  skew kurtosis      se
## age          26  2.16     6.45    0.22
## sex           1  1.16    -0.66    0.03
## gmat_tot    340 -0.01     0.06    3.48
## gmat_qpc     71 -0.92     0.30    0.90
## gmat_vpc     83 -1.04     0.74    1.02
## gmat_tpc     99 -2.28     9.02    0.85
## s_avg         2 -0.06    -0.38    0.02
## f_avg         4 -2.08    10.85    0.03
## quarter       3  0.02    -1.35    0.07
## work_yrs     22  2.78     9.80    0.20
## frstlang      1  2.37     3.65    0.02
## salary   220000  0.70    -1.05 3078.10
## satis       997  1.77     1.13   22.45
str(mba)
## 'data.frame':    274 obs. of  13 variables:
##  $ age     : int  23 24 24 24 24 24 25 25 25 25 ...
##  $ sex     : int  2 1 1 1 2 1 1 2 1 1 ...
##  $ gmat_tot: int  620 610 670 570 710 640 610 650 630 680 ...
##  $ gmat_qpc: int  77 90 99 56 93 82 89 88 79 99 ...
##  $ gmat_vpc: int  87 71 78 81 98 89 74 89 91 81 ...
##  $ gmat_tpc: int  87 87 95 75 98 91 87 92 89 96 ...
##  $ s_avg   : num  3.4 3.5 3.3 3.3 3.6 3.9 3.4 3.3 3.3 3.45 ...
##  $ f_avg   : num  3 4 3.25 2.67 3.75 3.75 3.5 3.75 3.25 3.67 ...
##  $ quarter : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ work_yrs: int  2 2 2 1 2 2 2 2 2 2 ...
##  $ frstlang: int  1 1 1 1 1 1 1 1 2 1 ...
##  $ salary  : int  0 0 0 0 999 0 0 0 999 998 ...
##  $ satis   : int  7 6 6 7 5 6 5 6 4 998 ...
mba$frstlang[mba$frstlang == 2] = 'Other'
mba$frstlang[mba$frstlang == 1] = 'Eng'
mba$sex[mba$sex == 1] = 'M'
mba$sex[mba$sex == 2] = 'F'
mba$frstlang = factor(mba$frstlang)
mba$sex = factor(mba$sex)
str(mba)
## 'data.frame':    274 obs. of  13 variables:
##  $ age     : int  23 24 24 24 24 24 25 25 25 25 ...
##  $ sex     : Factor w/ 2 levels "F","M": 1 2 2 2 1 2 2 1 2 2 ...
##  $ gmat_tot: int  620 610 670 570 710 640 610 650 630 680 ...
##  $ gmat_qpc: int  77 90 99 56 93 82 89 88 79 99 ...
##  $ gmat_vpc: int  87 71 78 81 98 89 74 89 91 81 ...
##  $ gmat_tpc: int  87 87 95 75 98 91 87 92 89 96 ...
##  $ s_avg   : num  3.4 3.5 3.3 3.3 3.6 3.9 3.4 3.3 3.3 3.45 ...
##  $ f_avg   : num  3 4 3.25 2.67 3.75 3.75 3.5 3.75 3.25 3.67 ...
##  $ quarter : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ work_yrs: int  2 2 2 1 2 2 2 2 2 2 ...
##  $ frstlang: Factor w/ 2 levels "Eng","Other": 1 1 1 1 1 1 1 1 2 1 ...
##  $ salary  : int  0 0 0 0 999 0 0 0 999 998 ...
##  $ satis   : int  7 6 6 7 5 6 5 6 4 998 ...
placed = mba[which(mba$salary > 0), ]
View(placed)

notplaced=mba[which(mba$salary==0), ]


library(lattice)

histogram(~salary, data = placed, xlab="Starting Salary", col='blue' )

aggregate(cbind(salary, work_yrs, age) ~ sex, 
          data = placed, mean)
##   sex   salary work_yrs      age
## 1   F 68182.96 3.244444 26.17778
## 2   M 54854.72 3.611511 26.99281
library(car)
## 
## Attaching package: 'car'
## The following object is masked from 'package:psych':
## 
##     logit
scatterplot(salary ~ work_yrs ,data=placed, xlab="Work Experience", ylab="MBA Starting Salaries", horizontal=TRUE)
## Warning in plot.window(...): "horizontal" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "horizontal" is not a graphical
## parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "horizontal"
## is not a graphical parameter

## Warning in axis(side = side, at = at, labels = labels, ...): "horizontal"
## is not a graphical parameter
## Warning in box(...): "horizontal" is not a graphical parameter
## Warning in title(...): "horizontal" is not a graphical parameter

boxplot(salary ~ work_yrs ,data=placed,ylab="Work Experience", xlab="MBA Starting Salaries", horizontal=TRUE)

scatterplot(salary ~ sex ,data=mba, ylab="Work Experience", xlab="Starting Salaries", horizontal=FALSE)

scatterplot(salary ~ gmat_tot , data=placed,xlab="GMAT Total", ylab="Salary",labels=row.names(placed))

scatterplot(salary ~ gmat_tot |frstlang, data=placed, 
            xlab="GMAT Total", ylab="Salary",
            labels=row.names(placed))

scatterplot(salary ~ gmat_tot |sex, data=placed, 
            xlab="GMAT Total", ylab="Salary",
            labels=row.names(placed))

scatterplot(salary ~ gmat_tot |satis, data=placed, 
            xlab="GMAT Total", ylab="Salary",
            labels=row.names(placed))
## Warning in smoother(.x[subs], .y[subs], col = col[i], log.x =
## logged("x"), : could not fit smooth
## Warning in smoother(.x[subs], .y[subs], col = col[i], log.x =
## logged("x"), : could not fit smooth

## Warning in smoother(.x[subs], .y[subs], col = col[i], log.x =
## logged("x"), : could not fit smooth

## Warning in smoother(.x[subs], .y[subs], col = col[i], log.x =
## logged("x"), : could not fit smooth

scatterplot.matrix(~salary+s_avg+gmat_tot, data=placed)
## Warning: 'scatterplot.matrix' is deprecated.
## Use 'scatterplotMatrix' instead.
## See help("Deprecated") and help("car-deprecated").

mod1 = lm(salary ~ work_yrs + s_avg + gmat_tot + sex + frstlang + satis , data = placed)
summary(mod1)
## 
## Call:
## lm(formula = salary ~ work_yrs + s_avg + gmat_tot + sex + frstlang + 
##     satis, data = placed)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -96610 -12069   9177  24468 133347 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    69714.585  40393.827   1.726  0.08612 .  
## work_yrs        1962.803   1192.425   1.646  0.10153    
## s_avg          23786.577   7834.309   3.036  0.00276 ** 
## gmat_tot        -104.718     54.670  -1.915  0.05705 .  
## sexM           -5960.444   6853.894  -0.870  0.38567    
## frstlangOther -19178.846   8827.964  -2.173  0.03115 *  
## satis            -72.048      6.793 -10.606  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 39010 on 177 degrees of freedom
## Multiple R-squared:  0.4661, Adjusted R-squared:  0.448 
## F-statistic: 25.76 on 6 and 177 DF,  p-value: < 2.2e-16
mod2 = lm(salary ~ work_yrs + sex +frstlang +satis, data = placed)
summary(mod2)
## 
## Call:
## lm(formula = salary ~ work_yrs + sex + frstlang + satis, data = placed)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -90235  -6145   9521  24271 123037 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    77570.41    7398.84  10.484   <2e-16 ***
## work_yrs        2700.83    1202.03   2.247   0.0259 *  
## sexM          -10204.55    6911.96  -1.476   0.1416    
## frstlangOther -20680.37    8835.08  -2.341   0.0203 *  
## satis            -73.18       6.95 -10.529   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 40050 on 179 degrees of freedom
## Multiple R-squared:  0.4309, Adjusted R-squared:  0.4182 
## F-statistic: 33.88 on 4 and 179 DF,  p-value: < 2.2e-16
mod3= lm(salary ~ work_yrs + s_avg + gmat_tot + sex +  satis , data = placed)
summary(mod3)
## 
## Call:
## lm(formula = salary ~ work_yrs + s_avg + gmat_tot + sex + satis, 
##     data = placed)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -92127  -8848  10355  25202 122965 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 48820.072  39639.962   1.232 0.219728    
## work_yrs     1757.467   1201.029   1.463 0.145149    
## s_avg       26241.312   7832.985   3.350 0.000986 ***
## gmat_tot      -86.620     54.593  -1.587 0.114370    
## sexM        -4901.693   6907.611  -0.710 0.478876    
## satis         -73.465      6.832 -10.753  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 39420 on 178 degrees of freedom
## Multiple R-squared:  0.4519, Adjusted R-squared:  0.4365 
## F-statistic: 29.35 on 5 and 178 DF,  p-value: < 2.2e-16
mod4 <- lm(salary ~ work_yrs + gmat_tot + sex + frstlang , data = placed)
summary(mod4)
## 
## Call:
## lm(formula = salary ~ work_yrs + gmat_tot + sex + frstlang, data = placed)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -94178 -51200  20857  40147 123092 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)   
## (Intercept)   143515.31   45158.17   3.178  0.00175 **
## work_yrs        3701.92    1520.74   2.434  0.01590 * 
## gmat_tot        -133.27      69.99  -1.904  0.05850 . 
## sexM          -15021.45    8691.56  -1.728  0.08566 . 
## frstlangOther -32223.83   11244.08  -2.866  0.00466 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 50460 on 179 degrees of freedom
## Multiple R-squared:  0.09672,    Adjusted R-squared:  0.07653 
## F-statistic: 4.792 on 4 and 179 DF,  p-value: 0.001074