Reading Data

# Load the MBA starting-salaries CSV into the data frame `store`.
store <- read.csv("~/MBA Starting Salaries Data.csv")
# View() opens a spreadsheet-style viewer and needs an interactive session;
# skip it when the script is run with Rscript or knitted into a report.
if (interactive()) {
  View(store)
}

Describing Data

library(psych)
# Per-column summary statistics from psych::describe(), printed explicitly.
# NOTE(review): salary and satis still contain the 998/999 codes that are
# filtered out before plotting further down, so their mean/sd/median here
# include those codes.
print(describe(store))
##          vars   n     mean       sd median  trimmed     mad min    max
## age         1 274    27.36     3.71     27    26.76    2.97  22     48
## sex         2 274     1.25     0.43      1     1.19    0.00   1      2
## gmat_tot    3 274   619.45    57.54    620   618.86   59.30 450    790
## gmat_qpc    4 274    80.64    14.87     83    82.31   14.83  28     99
## gmat_vpc    5 274    78.32    16.86     81    80.33   14.83  16     99
## gmat_tpc    6 274    84.20    14.02     87    86.12   11.86   0     99
## s_avg       7 274     3.03     0.38      3     3.03    0.44   2      4
## f_avg       8 274     3.06     0.53      3     3.09    0.37   0      4
## quarter     9 274     2.48     1.11      2     2.47    1.48   1      4
## work_yrs   10 274     3.87     3.23      3     3.29    1.48   0     22
## frstlang   11 274     1.12     0.32      1     1.02    0.00   1      2
## salary     12 274 39025.69 50951.56    999 33607.86 1481.12   0 220000
## satis      13 274   172.18   371.61      6    91.50    1.48   1    998
##           range  skew kurtosis      se
## age          26  2.16     6.45    0.22
## sex           1  1.16    -0.66    0.03
## gmat_tot    340 -0.01     0.06    3.48
## gmat_qpc     71 -0.92     0.30    0.90
## gmat_vpc     83 -1.04     0.74    1.02
## gmat_tpc     99 -2.28     9.02    0.85
## s_avg         2 -0.06    -0.38    0.02
## f_avg         4 -2.08    10.85    0.03
## quarter       3  0.02    -1.35    0.07
## work_yrs     22  2.78     9.80    0.20
## frstlang      1  2.37     3.65    0.02
## salary   220000  0.70    -1.05 3078.10
## satis       997  1.77     1.13   22.45

Plots

# Histogram of the age column.
hist(
  store$age,
  breaks = 20,
  col = "blue",
  main = "Age  distribution",
  xlab = "Age in years"
)

# `sex` is stored as a numeric code (1/2), so plot() on the raw column draws a
# value-vs-row-index scatter rather than counts. Tabulate first so each bar
# shows how many rows fall in each category, as the title promises.
barplot(
  table(store$sex),
  main = "Graph showing number of Males and Females",
  col = "pink"
)

# Histogram of years of work experience.
hist(
  store$work_yrs,
  breaks = 20,
  col = "blue",
  main = "Work experience distribution",
  xlab = "Work Experience in years"
)

# Histogram of the total GMAT score.
hist(
  store$gmat_tot,
  breaks = 40,
  col = "blue",
  main = "Gmat Score distribution",
  xlab = "score out of 800"
)

# `frstlang` is a numeric code (1/2); plot() on the raw column would scatter
# the codes against row index. Tabulate so the bars show the count per code.
barplot(
  table(store$frstlang),
  main = "First Language Distribution",
  col = "red"
)

# Keep only rows whose satisfaction rating is at most 7 (the top of the scale
# named in the axis label). The comparison is numeric on purpose: comparing
# against the string '7' makes R compare text, under which a value such as 10
# would pass because "10" sorts before "7".
newdata <- store[which(store$satis <= 7), ]
hist(newdata$satis, breaks = 5, col = "magenta",
     xlab = "Degree of Satisfaction,1=low 7=high",
     main = "Satisfaction  distribution")

# Drop rows whose salary is one of the codes 998 / 999 before plotting.
# Compare against numbers rather than strings so the filter does not depend on
# how R formats each salary when it coerces the column to text.
# which() also drops rows where salary is NA.
newdata1 <- store[which(store$salary != 998 & store$salary != 999), ]
hist(newdata1$salary, breaks = 10, col = "magenta",
     xlab = "starting salary",
     main = "Salary  distribution")

Scatter Plots

library(car)
## 
## Attaching package: 'car'
## The following object is masked from 'package:psych':
## 
##     logit
# Salary against each candidate predictor, using the rows in `newdata1`
# (salary codes 998/999 already removed).

# Salary vs age; the extra arguments are passed through to car::scatterplot().
scatterplot(
  salary ~ age,
  data = newdata1,
  spread = FALSE,
  smoother.args = list(lty = 2),
  xlab = "age",
  ylab = "salary",
  main = "Scatter plot of salary vs age"
)

# Salary vs first-language code.
scatterplot(
  salary ~ frstlang,
  data = newdata1,
  xlab = "first language",
  ylab = "salary",
  main = "Scatter plot of salary vs first language"
)

# Salary vs total GMAT score.
scatterplot(
  salary ~ gmat_tot,
  data = newdata1,
  xlab = "Gmat score",
  ylab = "salary",
  main = "Scatter plot of salary vs Gmat total"
)

# Salary vs years of work experience.
scatterplot(
  salary ~ work_yrs,
  data = newdata1,
  xlab = "Work experience in years",
  ylab = "salary",
  main = "Scatter plot of salary vs Work exp."
)

# Salary vs satisfaction rating.
scatterplot(
  salary ~ satis,
  data = newdata1,
  xlab = "Degree of satisfaction",
  ylab = "salary",
  main = "Scatter plot of salary vs satisfaction"
)

Corrgram

library(corrgram)
# Correlogram of the columns in `newdata1`: shaded cells below the diagonal,
# pie glyphs above it, variable names on the diagonal, variables reordered.
corrgram(
  newdata1,
  order = TRUE,
  lower.panel = panel.shade,
  upper.panel = panel.pie,
  text.panel = panel.txt,
  main = "MBA starting salary analysis Correlogram"
)

Chi-Square Test

# A chi-squared test of independence needs a contingency table of counts for
# two categorical variables. Passing the whole data frame makes chisq.test()
# treat every raw value (ages, GMAT scores, salaries, ...) as a cell count of a
# 274 x 13 table, so the statistic it reports is not interpretable.
# Cross-tabulate two coded columns and test that table instead.
# NOTE(review): sex and frstlang are used because both take only the values
# 1 and 2 in this data; confirm these are the variables of interest.
sex_by_frstlang <- table(sex = store$sex, frstlang = store$frstlang)
chisq.test(sex_by_frstlang)
# Previously recorded output (X-squared = 8451300, df = 3276) came from the
# whole-data-frame call and does not apply to this test; re-run to regenerate.

Contingency Tables

# One-way frequency tables for three columns of `store`.
# NOTE(review): the scratch variable is called `salary` but holds counts of
# work_yrs, sex and gmat_tot in turn; the name is kept so later code sees the
# same global, but a neutral name would be clearer.

# Counts per value of work_yrs.
salary <- xtabs(~ store$work_yrs)
print(salary)
## store$work_yrs
##  0  1  2  3  4  5  6  7  8  9 10 11 12 13 15 16 18 22 
##  3 24 82 56 43 21 12  9  7  2  2  2  2  1  2  3  1  2
# Counts per sex code.
salary <- xtabs(~ store$sex)
print(salary)
## store$sex
##   1   2 
## 206  68
# Counts per total GMAT score.
salary <- xtabs(~ store$gmat_tot)
print(salary)
## store$gmat_tot
## 450 460 480 500 510 520 530 540 550 560 570 580 590 600 610 620 630 640 
##   2   1   1   3   2   1   5   5   8  21  18  15   9  20  18  20  22  12 
## 650 660 670 680 690 700 710 720 730 740 750 760 790 
##  16  14  17  12   4   5  10   4   2   4   1   1   1

Regression Analysis

# Fit the salary model on rows with usable values only. Elsewhere in this
# script salary values 998/999 and satis values above 7 are filtered out
# before plotting, i.e. they are treated as codes rather than measurements.
# Leaving them in `data` makes lm() regress those codes as if they were
# salaries and ratings.
# NOTE(review): salary == 0 rows are kept, matching the plotting filter;
# confirm whether 0 is a real salary in this data.
model_data <- store[which(store$salary != 998 & store$salary != 999 &
                            store$satis <= 7), ]
m1 <- lm(salary ~ age + sex + gmat_tot + gmat_qpc + gmat_vpc + gmat_tpc +
           work_yrs + satis,
         data = model_data)
summary(m1)
# The summary recorded previously (fit on all 274 rows, multiple R-squared
# 0.1528) was produced from the unfiltered data, so it is not reproduced here;
# re-run the script to regenerate the coefficient table.