THE DATA

# Attach psych for describe(), then load the MBA starting-salaries CSV
# (path is relative to the working directory) and print per-column
# descriptive statistics.
library(psych)
store <- read.csv("MBA Starting Salaries Data.csv")
describe(store)
##          vars   n     mean       sd median  trimmed     mad min    max
## age         1 274    27.36     3.71     27    26.76    2.97  22     48
## sex         2 274     1.25     0.43      1     1.19    0.00   1      2
## gmat_tot    3 274   619.45    57.54    620   618.86   59.30 450    790
## gmat_qpc    4 274    80.64    14.87     83    82.31   14.83  28     99
## gmat_vpc    5 274    78.32    16.86     81    80.33   14.83  16     99
## gmat_tpc    6 274    84.20    14.02     87    86.12   11.86   0     99
## s_avg       7 274     3.03     0.38      3     3.03    0.44   2      4
## f_avg       8 274     3.06     0.53      3     3.09    0.37   0      4
## quarter     9 274     2.48     1.11      2     2.47    1.48   1      4
## work_yrs   10 274     3.87     3.23      3     3.29    1.48   0     22
## frstlang   11 274     1.12     0.32      1     1.02    0.00   1      2
## salary     12 274 39025.69 50951.56    999 33607.86 1481.12   0 220000
## satis      13 274   172.18   371.61      6    91.50    1.48   1    998
##           range  skew kurtosis      se
## age          26  2.16     6.45    0.22
## sex           1  1.16    -0.66    0.03
## gmat_tot    340 -0.01     0.06    3.48
## gmat_qpc     71 -0.92     0.30    0.90
## gmat_vpc     83 -1.04     0.74    1.02
## gmat_tpc     99 -2.28     9.02    0.85
## s_avg         2 -0.06    -0.38    0.02
## f_avg         4 -2.08    10.85    0.03
## quarter       3  0.02    -1.35    0.07
## work_yrs     22  2.78     9.80    0.20
## frstlang      1  2.37     3.65    0.02
## salary   220000  0.70    -1.05 3078.10
## satis       997  1.77     1.13   22.45

DESCRIPTIVE PLOTS

# Histogram of age; breaks = 20 is only a suggested number of cells.
hist(
  store$age,
  main = "Age  distribution",
  xlab = "Age in years",
  col = "blue",
  breaks = 20
)

# Bar chart of the number of respondents per sex code.
# `sex` is read in as a numeric code (values 1 and 2), so calling plot() on
# the raw column draws value-against-row-index points instead of counts.
# Tabulating first gives one bar per code, which is what the title describes.
# NOTE(review): presumably 1 = male and 2 = female - confirm against the
# data dictionary before relabelling the bars.
barplot(
  table(store$sex),
  main = "Graph showing number of Males and Females",
  col = "blue"
)

# Histogram of years of work experience (about 20 suggested cells).
hist(
  store$work_yrs,
  main = "Work experience distribution",
  xlab = "Work Experience in years",
  col = "blue",
  breaks = 20
)

# Histogram of total GMAT score (about 40 suggested cells).
hist(
  store$gmat_tot,
  main = "Gmat Score distribution",
  xlab = "score out of 800",
  col = "blue",
  breaks = 40
)

# Bar chart of the number of respondents per first-language code.
# `frstlang` is a numeric code (values 1 and 2); plot() on the raw column
# would draw an index scatter plot, so the codes are tabulated first to show
# the actual distribution.
# NOTE(review): the meaning of codes 1 and 2 is not shown here - confirm
# against the data dictionary before relabelling the bars.
barplot(
  table(store$frstlang),
  main = "First Language Distribution",
  col = "red"
)

# Keep only rows whose satisfaction rating lies on the 1-7 scale; the column
# also contains much larger values (its maximum is 998), which are excluded.
# The comparison is numeric: comparing against the string '7' would coerce
# the column to character and compare lexicographically (e.g. "10" <= "7").
newdata <- store[which(store$satis <= 7), ]

# One bar per rating: half-integer break points centre each cell on an
# integer, so ratings 1 and 2 are not merged into a single first cell.
hist(
  newdata$satis,
  breaks = seq(0.5, 7.5, by = 1),
  col = "magenta",
  xlab = "Degree of Satisfaction,1=low 7=high",
  main = "Satisfaction  distribution"
)

SCATTER PLOT

library(car)
## 
## Attaching package: 'car'
## The following object is masked from 'package:psych':
## 
##     logit
# Scatter-plot matrix of salary against the other variables of interest.
pairs(
  ~ salary + sex + age + gmat_tpc + frstlang + satis + work_yrs,
  main = "Comparision of Salary and other variables",
  data = store
)

# Scatter plot of salary against age with a dashed smoother.
# This chunk previously appeared twice verbatim (including a repeated
# library(car) call), drawing the identical plot two times; it is kept once.
# NOTE(review): `spread` and `smoother.args` follow the older car interface;
# newer car releases appear to take these through `smooth = list(...)` -
# confirm against the installed version.
# NOTE(review): salary here still contains the 998/999 codes - confirm
# whether they should be excluded from this plot.
library(car)
scatterplot(salary ~ age, data = store,
            spread = FALSE, smoother.args = list(lty = 2),
            main = "Scatter plot of salary vs age",
            xlab = "age",
            ylab = "salary")

# Drop rows whose salary holds the placeholder codes 998 or 999 so they do
# not appear as real salaries. The filter compares numbers with numbers;
# comparing against the strings "998"/"999" relied on implicit coercion of
# the numeric column to character. Rows with a missing salary are dropped
# as well, matching the previous which()-based subset.
# NOTE(review): salaries of 0 are kept - confirm whether they are genuine.
newdata1 <- store[!is.na(store$salary) & !(store$salary %in% c(998, 999)), ]

# Salary against years of work experience on the filtered rows.
scatterplot(salary ~ work_yrs, data = newdata1,
            main = "Scatter plot of salary vs Work exp.",
            xlab = "Work experience in years",
            ylab = "salary")

# Correlogram of the filtered data: shaded cells below the diagonal, pie
# glyphs above it, variable names on the diagonal, variables reordered.
library(corrgram)
corrgram(
  newdata1,
  main = "MBA starting salary analysis Correlogram",
  order = TRUE,
  text.panel = panel.txt,
  upper.panel = panel.pie,
  lower.panel = panel.shade
)

CONTINGENCY TABLES

SALARY AND AGE

# Mean salary for each distinct age.
aggregate(salary ~ age, data = store, FUN = mean)
##    age    salary
## 1   22  42500.00
## 2   23  57282.00
## 3   24  49342.24
## 4   25  43395.55
## 5   26  35982.07
## 6   27  31499.37
## 7   28  39809.00
## 8   29  28067.95
## 9   30  55291.25
## 10  31  40599.40
## 11  32  13662.25
## 12  33 118000.00
## 13  34  26250.00
## 14  35      0.00
## 15  36      0.00
## 16  37      0.00
## 17  39  56000.00
## 18  40 183000.00
## 19  42      0.00
## 20  43      0.00
## 21  48      0.00
# Pearson chi-squared test on the cross-tabulation of salary by age.
# NOTE(review): both columns are tabulated by their distinct raw values, so
# the table is presumably very sparse; the warning recorded below says the
# chi-squared approximation may be incorrect, so read the p-value with care.
chisq.test(table(store$salary,store$age))
## Warning in chisq.test(table(store$salary, store$age)): Chi-squared
## approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  table(store$salary, store$age)
## X-squared = 1114.2, df = 880, p-value = 1.178e-07

SALARY AND EXPERIENCE

# Mean salary for each distinct number of years of work experience.
# The `store$` prefixes are kept in the formula because they become the
# column headers of the printed result.
aggregate(store$salary ~ store$work_yrs, FUN = mean)
##    store$work_yrs store$salary
## 1               0     31999.67
## 2               1     34677.08
## 3               2     45531.24
## 4               3     38494.21
## 5               4     27510.81
## 6               5     34476.10
## 7               6     62041.33
## 8               7     11221.78
## 9               8     60156.86
## 10              9       499.50
## 11             10     59000.00
## 12             11         0.00
## 13             12         0.00
## 14             13         0.00
## 15             15    183000.00
## 16             16     72333.33
## 17             18         0.00
## 18             22         0.00
# Pearson chi-squared test on the cross-tabulation of salary by years of
# work experience.
# NOTE(review): the warning recorded below says the chi-squared
# approximation may be incorrect (the table is presumably sparse), so read
# the p-value with care.
chisq.test(table(store$salary,store$work_yrs))
## Warning in chisq.test(table(store$salary, store$work_yrs)): Chi-squared
## approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  table(store$salary, store$work_yrs)
## X-squared = 846.05, df = 748, p-value = 0.007162

SALARY AND GMAT SCORES

# Mean salary for each distinct total GMAT score.
# The `store$` prefixes are kept in the formula because they become the
# column headers of the printed result.
aggregate(store$salary ~ store$gmat_tot, FUN = mean)
##    store$gmat_tot store$salary
## 1             450      499.000
## 2             460      998.000
## 3             480        0.000
## 4             500   105833.000
## 5             510        0.000
## 6             520    78256.000
## 7             530    39800.000
## 8             540    41600.000
## 9             550    42213.625
## 10            560    36047.238
## 11            570    40610.889
## 12            580    53466.333
## 13            590    21999.333
## 14            600    48849.350
## 15            610    26944.000
## 16            620    62664.800
## 17            630    38885.636
## 18            640     9582.667
## 19            650    44562.125
## 20            660    33456.500
## 21            670    41793.471
## 22            680    51332.917
## 23            690      998.500
## 24            700    73400.000
## 25            710    40699.700
## 26            720    21499.500
## 27            730      499.500
## 28            740      748.750
## 29            750        0.000
## 30            760        0.000
## 31            790      999.000
# Pearson chi-squared test on the cross-tabulation of salary by total GMAT
# score.
# NOTE(review): the warning recorded below says the chi-squared
# approximation may be incorrect (the table is presumably sparse), so read
# the p-value with care.
chisq.test(table(store$salary,store$gmat_tot))
## Warning in chisq.test(table(store$salary, store$gmat_tot)): Chi-squared
## approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  table(store$salary, store$gmat_tot)
## X-squared = 1267.7, df = 1320, p-value = 0.8456

REGRESSION ANALYSIS

MODEL 1

# Model 1: linear regression of salary on the four GMAT columns
# (gmat_tot, gmat_qpc, gmat_vpc, gmat_tpc), followed by its summary table.
mod1 <- lm(
  formula = salary ~ gmat_tot + gmat_qpc + gmat_vpc + gmat_tpc,
  data = store
)
summary(mod1)
## 
## Call:
## lm(formula = salary ~ gmat_tot + gmat_qpc + gmat_vpc + gmat_tpc, 
##     data = store)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -48199 -41195 -33034  56735 182897 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept) 141539.0    59303.9   2.387   0.0177 *
## gmat_tot      -369.7      222.7  -1.660   0.0980 .
## gmat_qpc       465.7      615.2   0.757   0.4497  
## gmat_vpc       573.4      563.0   1.018   0.3094  
## gmat_tpc       523.2      443.0   1.181   0.2386  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 50900 on 269 degrees of freedom
## Multiple R-squared:  0.01651,    Adjusted R-squared:  0.001889 
## F-statistic: 1.129 on 4 and 269 DF,  p-value: 0.343

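This shows that no GMAT predictor is significantly related to the MBA salary at the 5% level; gmat_tot comes closest (p = 0.098), and only the intercept is significant. Also, the overall p-value (0.343) is greater than 0.05, so this model is not a good model. The R-squared value shows that the model explains only about 1.7% of the variance in salary (adjusted R-squared about 0.2%), and the residual standard error is 50900.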

MODEL 2

# Model 2: linear regression of salary on satis, work_yrs and frstlang,
# followed by its summary table.
mod2 <- lm(
  formula = salary ~ satis + work_yrs + frstlang,
  data = store
)
summary(mod2)
## 
## Call:
## lm(formula = salary ~ satis + work_yrs + frstlang, data = store)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -49117 -47235  -2246  49225 187005 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 59455.437  11221.844   5.298 2.43e-07 ***
## satis         -45.735      7.912  -5.780 2.05e-08 ***
## work_yrs     -458.980    907.095  -0.506    0.613    
## frstlang    -9650.834   9087.065  -1.062    0.289    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 48150 on 270 degrees of freedom
## Multiple R-squared:  0.1168, Adjusted R-squared:  0.107 
## F-statistic: 11.91 on 3 and 270 DF,  p-value: 2.4e-07

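This shows that only satisfaction is significantly (linearly) related to the MBA salary (p < 0.001); work_yrs and frstlang are not significant. Also, the overall p-value (2.4e-07) is less than 0.05, so this model is better than Model 1. The R-squared value shows that the model explains about 11.7% of the variance in salary (adjusted R-squared 10.7%), and the residual standard error is 48150.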

MODEL 3

 # Model 3: linear regression of salary on age and sex, followed by its
 # summary table.
 mod3 <- lm(
   formula = salary ~ age + sex,
   data = store
 )
 summary(mod3)
## 
## Call:
## lm(formula = salary ~ age + sex, data = store)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -48601 -38030 -35529  54440 185565 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept)  51953.6    24832.5   2.092   0.0374 *
## age           -833.3      830.9  -1.003   0.3168  
## sex           7906.6     7124.9   1.110   0.2681  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 50920 on 271 degrees of freedom
## Multiple R-squared:  0.008421,   Adjusted R-squared:  0.001103 
## F-statistic: 1.151 on 2 and 271 DF,  p-value: 0.3179

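This shows that no variable is significantly related to the MBA salary: neither age (p = 0.317) nor sex (p = 0.268) is significant. Also, the overall p-value (0.318) is greater than 0.05, so this model is not a good model. The R-squared value shows that the model explains only about 0.8% of the variance in salary (adjusted R-squared about 0.1%), and the residual standard error is 50920.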