# Directory holding the input CSV. Kept in one variable so the location can be
# changed in a single place without calling setwd(), which would alter the
# working directory for everything that runs after this line.
data_dir <- "C:/Users/Prabha Shankar/Desktop/Winter Internship/R file"
# Read the data set into var1.df (one row per observation, 13 columns in the
# summary printed below).
var1.df <- read.csv(file.path(data_dir, "MBA Starting Salaries Data.csv"))
library(psych)
## Warning: package 'psych' was built under R version 3.3.3
# Per-column summary statistics for the whole data frame (psych package).
psych::describe(x = var1.df)
##          vars   n     mean       sd median  trimmed     mad min    max
## age         1 274    27.36     3.71     27    26.76    2.97  22     48
## sex         2 274     1.25     0.43      1     1.19    0.00   1      2
## gmat_tot    3 274   619.45    57.54    620   618.86   59.30 450    790
## gmat_qpc    4 274    80.64    14.87     83    82.31   14.83  28     99
## gmat_vpc    5 274    78.32    16.86     81    80.33   14.83  16     99
## gmat_tpc    6 274    84.20    14.02     87    86.12   11.86   0     99
## s_avg       7 274     3.03     0.38      3     3.03    0.44   2      4
## f_avg       8 274     3.06     0.53      3     3.09    0.37   0      4
## quarter     9 274     2.48     1.11      2     2.47    1.48   1      4
## work_yrs   10 274     3.87     3.23      3     3.29    1.48   0     22
## frstlang   11 274     1.12     0.32      1     1.02    0.00   1      2
## salary     12 274 39025.69 50951.56    999 33607.86 1481.12   0 220000
## satis      13 274   172.18   371.61      6    91.50    1.48   1    998
##           range  skew kurtosis      se
## age          26  2.16     6.45    0.22
## sex           1  1.16    -0.66    0.03
## gmat_tot    340 -0.01     0.06    3.48
## gmat_qpc     71 -0.92     0.30    0.90
## gmat_vpc     83 -1.04     0.74    1.02
## gmat_tpc     99 -2.28     9.02    0.85
## s_avg         2 -0.06    -0.38    0.02
## f_avg         4 -2.08    10.85    0.03
## quarter       3  0.02    -1.35    0.07
## work_yrs     22  2.78     9.80    0.20
## frstlang      1  2.37     3.65    0.02
## salary   220000  0.70    -1.05 3078.10
## satis       997  1.77     1.13   22.45

Sex_ratio Description

# Count the observations for each code of the sex column, then draw the
# counts as a bar chart.
table(var1.df[["sex"]])
## 
##   1   2 
## 206  68
barplot(height = table(var1.df[["sex"]]), ylab = "frequency", xlab = "sex")

gmat_total Description

# Summary statistics for the gmat_tot column.
describe(var1.df$gmat_tot)
##    vars   n   mean    sd median trimmed  mad min max range  skew kurtosis
## X1    1 274 619.45 57.54    620  618.86 59.3 450 790   340 -0.01     0.06
##      se
## X1 3.48
# Histogram of gmat_tot with explicit axis labels and title
# (title spelling corrected from "Distribuion").
hist(var1.df$gmat_tot, xlab = "marks", ylab = "frequency",
     main = "Marks Distribution")

# Horizontal boxplot of the same column.
boxplot(var1.df$gmat_tot, horizontal = TRUE, xlab = "gmat_total")

A. The variable gmat_total is approximately normally distributed (skew -0.01, kurtosis 0.06). B. The mean of gmat_total is 619.45. C. The median of gmat_total is 620.

gmat_qpc

# Summary statistics for the gmat_qpc column.
describe(var1.df$gmat_qpc)
##    vars   n  mean    sd median trimmed   mad min max range  skew kurtosis
## X1    1 274 80.64 14.87     83   82.31 14.83  28  99    71 -0.92      0.3
##     se
## X1 0.9
# Histogram of gmat_qpc; no labels are given, so hist() derives the title and
# x-axis label from the argument expression as written.
hist(var1.df$gmat_qpc)

# Horizontal boxplot of the same column.
boxplot(var1.df$gmat_qpc ,horizontal = TRUE)

A.gmat_qpc is not normally distributed , histogram is left skewed .

gmat_vpc

# Summary statistics for gmat_vpc, the column this section's heading names.
# (The section previously repeated the gmat_qpc analysis by mistake.)
# The transcript rows below are taken from the gmat_vpc row of the
# full-data-frame describe() table; re-knit to regenerate them.
describe(var1.df$gmat_vpc)
##    vars   n  mean    sd median trimmed   mad min max range  skew kurtosis
## X1    1 274 78.32 16.86     81   80.33 14.83  16  99    83 -1.04     0.74
##      se
## X1 1.02
# Histogram of gmat_vpc; title and x-axis label default to the expression text.
hist(var1.df$gmat_vpc)

# Horizontal boxplot of the same column.
boxplot(var1.df$gmat_vpc ,horizontal = TRUE)

gmat_tpc

# Summary statistics for the gmat_tpc column.
describe(var1.df$gmat_tpc)
##    vars   n mean    sd median trimmed   mad min max range  skew kurtosis
## X1    1 274 84.2 14.02     87   86.12 11.86   0  99    99 -2.28     9.02
##      se
## X1 0.85
# Histogram of gmat_tpc; title and x-axis label default to the expression text.
hist(var1.df$gmat_tpc)

# Horizontal boxplot of the same column.
boxplot(var1.df$gmat_tpc ,horizontal = TRUE)

Most values in the data set are higher than average.

S_avg

# Summary statistics for the s_avg column.
describe(var1.df$s_avg)
##    vars   n mean   sd median trimmed  mad min max range  skew kurtosis
## X1    1 274 3.03 0.38      3    3.03 0.44   2   4     2 -0.06    -0.38
##      se
## X1 0.02
# Histogram of s_avg; title and x-axis label default to the expression text.
hist(var1.df$s_avg)

# Horizontal boxplot of the same column.
boxplot(var1.df$s_avg ,horizontal = TRUE)

f_avg

# Summary statistics for the f_avg column.
describe(var1.df$f_avg)
##    vars   n mean   sd median trimmed  mad min max range  skew kurtosis
## X1    1 274 3.06 0.53      3    3.09 0.37   0   4     4 -2.08    10.85
##      se
## X1 0.03
# Histogram of f_avg; title and x-axis label default to the expression text.
hist(var1.df$f_avg)

# Horizontal boxplot of the same column.
boxplot(var1.df$f_avg ,horizontal = TRUE)

quarter

# Count the observations in each quarter code, then chart the counts.
table(var1.df[["quarter"]])
## 
##  1  2  3  4 
## 69 70 70 65
barplot(height = table(var1.df[["quarter"]]), ylab = "frequency", xlab = "quarter")

Working_years

# Summary statistics for work_yrs, followed by a horizontal boxplot.
describe(var1.df[["work_yrs"]])
##    vars   n mean   sd median trimmed  mad min max range skew kurtosis  se
## X1    1 274 3.87 3.23      3    3.29 1.48   0  22    22 2.78      9.8 0.2
boxplot(x = var1.df[["work_yrs"]], horizontal = TRUE)

First_Language

# Count the observations for each frstlang code, then chart the counts.
table(var1.df[["frstlang"]])
## 
##   1   2 
## 242  32
barplot(height = table(var1.df[["frstlang"]]))

# Recode the placeholder codes as missing so they do not distort the
# statistics: 998 and 999 in salary, 998 in satis.
# NOTE(review): the meaning of each code is not stated in this file — confirm
# against the data dictionary.
# %in% never yields NA, so the logical index stays NA-free even when the
# column already contains missing values.
var1.df$salary[var1.df$salary %in% c(998, 999)] <- NA
var1.df$satis[var1.df$satis %in% 998] <- NA

salary

# Summary statistics for salary with missing values removed (n = 193 in the
# output below).
describe(var1.df$salary ,na.rm = TRUE)
##    vars   n     mean       sd median  trimmed   mad min    max  range skew
## X1    1 193 54985.32 53152.39  85000 52726.81 51891   0 220000 220000  0.1
##    kurtosis      se
## X1    -1.45 3825.99
# Histogram of salary; title and x-axis label default to the expression text.
hist(var1.df$salary )

# Horizontal boxplot of salary.
# NOTE(review): na.rm is not a named boxplot() parameter — confirm whether it
# has any effect here or can be dropped.
boxplot(var1.df$salary,na.rm=TRUE,horizontal = TRUE)

satis

# Summary statistics for satis with missing values removed, followed by a bar
# chart of how often each satis value occurs.
describe(var1.df[["satis"]], na.rm = TRUE)
##    vars   n mean   sd median trimmed  mad min max range  skew kurtosis
## X1    1 228 5.57 0.98      6    5.64 1.48   1   7     6 -0.92     2.07
##      se
## X1 0.06
barplot(height = table(var1.df[["satis"]]))

Pearson's Correlation Test

Correlation between salary and satisfaction.

# Pearson correlation test between salary and satis. The "data:" label in the
# printed result is the text of these two argument expressions.
cor.test(var1.df$salary,var1.df$satis)
## 
##  Pearson's product-moment correlation
## 
## data:  var1.df$salary and var1.df$satis
## t = 2.189, df = 191, p-value = 0.02981
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.01554274 0.29124439
## sample estimates:
##       cor 
## 0.1564395
library(car)
## Warning: package 'car' was built under R version 3.3.3
## 
## Attaching package: 'car'
## The following object is masked from 'package:psych':
## 
##     logit
# Scatterplot (car package) with salary as the first argument and satis as the
# second. NOTE(review): axis labels presumably default to the argument
# expressions — confirm before rewriting them.
scatterplot(var1.df$salary,var1.df$satis)

Correlation between salary and gmat_total.

# Pearson correlation test between gmat_tot and salary. The "data:" label in
# the printed result is the text of these two argument expressions.
cor.test(var1.df$gmat_tot,var1.df$salary)
## 
##  Pearson's product-moment correlation
## 
## data:  var1.df$gmat_tot and var1.df$salary
## t = -0.00078582, df = 191, p-value = 0.9994
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.1412959  0.1411844
## sample estimates:
##           cor 
## -5.685962e-05
# Scatterplot (car package) with gmat_tot as the first argument and salary as
# the second.
scatterplot(var1.df$gmat_tot,var1.df$salary)

Correlation between salary and gmat_qpc.

# Pearson correlation test between gmat_qpc and salary. The "data:" label in
# the printed result is the text of these two argument expressions.
cor.test(var1.df$gmat_qpc,var1.df$salary)
## 
##  Pearson's product-moment correlation
## 
## data:  var1.df$gmat_qpc and var1.df$salary
## t = 0.39254, df = 191, p-value = 0.6951
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.1133029  0.1689543
## sample estimates:
##        cor 
## 0.02839164
# Scatterplot (car package) with gmat_qpc as the first argument and salary as
# the second.
scatterplot(var1.df$gmat_qpc,var1.df$salary)

Correlation between salary and gmat_vpc.

# Pearson correlation test between gmat_vpc and salary. The "data:" label in
# the printed result is the text of these two argument expressions.
cor.test(var1.df$gmat_vpc,var1.df$salary)
## 
##  Pearson's product-moment correlation
## 
## data:  var1.df$gmat_vpc and var1.df$salary
## t = 0.046851, df = 191, p-value = 0.9627
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.1379162  0.1445609
## sample estimates:
##         cor 
## 0.003389965
# Scatterplot (car package) with gmat_vpc as the first argument and salary as
# the second.
scatterplot(var1.df$gmat_vpc,var1.df$salary)

Correlation between salary and gmat_tpc.

# Pearson correlation test between gmat_tpc and salary. The "data:" label in
# the printed result is the text of these two argument expressions.
cor.test(var1.df$gmat_tpc,var1.df$salary)
## 
##  Pearson's product-moment correlation
## 
## data:  var1.df$gmat_tpc and var1.df$salary
## t = 0.84384, df = 191, p-value = 0.3998
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.08099267  0.20045926
## sample estimates:
##        cor 
## 0.06094464
# Scatterplot (car package) of gmat_tpc against salary, matching the
# correlation test in this section. (The plot previously used gmat_vpc, a
# copy of the preceding section's call.)
scatterplot(var1.df$gmat_tpc,var1.df$salary)

Correlation between salary and s_avg.

# Pearson correlation test between s_avg and salary. The "data:" label in the
# printed result is the text of these two argument expressions.
cor.test(var1.df$s_avg,var1.df$salary)
## 
##  Pearson's product-moment correlation
## 
## data:  var1.df$s_avg and var1.df$salary
## t = 1.3374, df = 191, p-value = 0.1827
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.04553553  0.23437561
## sample estimates:
##        cor 
## 0.09632412
# Scatterplot (car package) with s_avg as the first argument and salary as the
# second.
scatterplot(var1.df$s_avg,var1.df$salary)

correlation between salary and f_avg .

# Pearson correlation test between f_avg and salary. The "data:" label in the
# printed result is the text of these two argument expressions.
cor.test(var1.df$f_avg,var1.df$salary)
## 
##  Pearson's product-moment correlation
## 
## data:  var1.df$f_avg and var1.df$salary
## t = 0.12227, df = 191, p-value = 0.9028
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.1325591  0.1498995
## sample estimates:
##         cor 
## 0.008846655
# Scatterplot (car package) with f_avg as the first argument and salary as the
# second.
scatterplot(var1.df$f_avg,var1.df$salary)

Correlation between working years and salary.

# Pearson correlation test between work_yrs and salary. The "data:" label in
# the printed result is the text of these two argument expressions.
cor.test(var1.df$work_yrs,var1.df$salary)
## 
##  Pearson's product-moment correlation
## 
## data:  var1.df$work_yrs and var1.df$salary
## t = -0.73721, df = 191, p-value = 0.4619
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.19305455  0.08864017
## sample estimates:
##         cor 
## -0.05326685
# Scatterplot (car package) with work_yrs as the first argument and salary as
# the second.
scatterplot(var1.df$work_yrs,var1.df$salary)

Correlogram

library(corrgram)
## Warning: package 'corrgram' was built under R version 3.3.3
# Correlogram of all columns of var1.df: shaded cells below the diagonal, pie
# glyphs above it, variable names on the diagonal. order = TRUE lets corrgram
# reorder the variables. (corrgram is already attached by the library() call
# just above, so it is not loaded a second time here.)
corrgram(var1.df, order=TRUE, lower.panel=panel.shade,
         upper.panel=panel.pie, text.panel=panel.txt,
         main="MBA starting salary analysis Correlogram")

# Correlation matrix of the first 13 columns, rounded to 2 decimals. Each
# pairwise correlation uses every row where both columns are non-missing; the
# option name is spelled out instead of relying on partial matching of "pair".
round(cor(var1.df[, 1:13], use = "pairwise.complete.obs"), 2)
##            age   sex gmat_tot gmat_qpc gmat_vpc gmat_tpc s_avg f_avg
## age       1.00 -0.03    -0.15    -0.22    -0.04    -0.17  0.15 -0.02
## sex      -0.03  1.00    -0.05    -0.16     0.07    -0.01  0.13  0.09
## gmat_tot -0.15 -0.05     1.00     0.72     0.75     0.85  0.11  0.10
## gmat_qpc -0.22 -0.16     0.72     1.00     0.15     0.65 -0.03  0.07
## gmat_vpc -0.04  0.07     0.75     0.15     1.00     0.67  0.20  0.08
## gmat_tpc -0.17 -0.01     0.85     0.65     0.67     1.00  0.12  0.08
## s_avg     0.15  0.13     0.11    -0.03     0.20     0.12  1.00  0.55
## f_avg    -0.02  0.09     0.10     0.07     0.08     0.08  0.55  1.00
## quarter  -0.05 -0.13    -0.09     0.04    -0.17    -0.08 -0.76 -0.45
## work_yrs  0.86 -0.01    -0.18    -0.24    -0.07    -0.17  0.13 -0.04
## frstlang  0.06  0.00    -0.14     0.14    -0.39    -0.10 -0.14 -0.04
## salary   -0.13  0.02     0.00     0.03     0.00     0.06  0.10  0.01
## satis    -0.07  0.01     0.03    -0.10     0.19     0.09  0.06 -0.04
##          quarter work_yrs frstlang salary satis
## age        -0.05     0.86     0.06  -0.13 -0.07
## sex        -0.13    -0.01     0.00   0.02  0.01
## gmat_tot   -0.09    -0.18    -0.14   0.00  0.03
## gmat_qpc    0.04    -0.24     0.14   0.03 -0.10
## gmat_vpc   -0.17    -0.07    -0.39   0.00  0.19
## gmat_tpc   -0.08    -0.17    -0.10   0.06  0.09
## s_avg      -0.76     0.13    -0.14   0.10  0.06
## f_avg      -0.45    -0.04    -0.04   0.01 -0.04
## quarter     1.00    -0.09     0.10  -0.15 -0.03
## work_yrs   -0.09     1.00    -0.03  -0.05  0.03
## frstlang    0.10    -0.03     1.00   0.01 -0.27
## salary     -0.15    -0.05     0.01   1.00  0.16
## satis      -0.03     0.03    -0.27   0.16  1.00

T-Test

Null hypothesis: the mean salary is equal to 0. (This is what the one-sample t-test below actually tests; it does not test whether salary depends on any factor.)

# One-sample t-test of the mean salary against 0.
# The call previously passed var1.df$PricePremium as a second sample, but no
# PricePremium column appears in the describe() summary of var1.df, so that
# argument was NULL and the test was already one-sample (see the
# "One Sample t-test" header in the output). The call now says so explicitly.
# NOTE(review): this does not test whether salary depends on any predictor;
# choose a grouping variable or rely on the regression below for that question.
t.test(var1.df$salary, mu = 0)
## 
##  One Sample t-test
## 
## data:  var1.df$salary
## t = 14.372, df = 192, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  47438.94 62531.69
## sample estimates:
## mean of x 
##  54985.32

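The null hypothesis that the mean salary is 0 is rejected because the t-test gives a very low p-value (< 2.2e-16). This result says nothing about whether salary depends on other factors; that question is examined in the regression below.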

Regression Analysis

# Fit a linear model of salary on the GMAT scores, the two averages and
# work_yrs. Rows with a missing value in any model variable are dropped by lm().
var2 <- lm(salary ~ gmat_tot + gmat_qpc + gmat_vpc + gmat_tpc + s_avg + f_avg +
             work_yrs, data = var1.df)
# Print the fitted model. (Previously the fitted object was passed back into
# lm() as its formula argument just to print it, which is why the transcript
# showed "lm(formula = var2)"; the Call lines below now show the real formula.)
var2
## 
## Call:
## lm(formula = salary ~ gmat_tot + gmat_qpc + gmat_vpc + gmat_tpc + 
##     s_avg + f_avg + work_yrs, data = var1.df)
## 
## Coefficients:
## (Intercept)     gmat_tot     gmat_qpc     gmat_vpc     gmat_tpc  
##    126361.6       -423.0        702.9        510.7        709.7  
##       s_avg        f_avg     work_yrs  
##     20250.0      -7713.2      -1055.9
# Coefficient table, residual summary and fit statistics for the model var2.
print(summary(object = var2))
## 
## Call:
## lm(formula = salary ~ gmat_tot + gmat_qpc + gmat_vpc + gmat_tpc + 
##     s_avg + f_avg + work_yrs, data = var1.df)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -73977 -52045  29449  43658 190233 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept) 126361.6    86635.2   1.459    0.146
## gmat_tot      -423.0      310.6  -1.362    0.175
## gmat_qpc       702.9      850.6   0.826    0.410
## gmat_vpc       510.7      801.1   0.637    0.525
## gmat_tpc       709.7      658.0   1.079    0.282
## s_avg        20250.0    12652.7   1.600    0.111
## f_avg        -7713.2     8891.8  -0.867    0.387
## work_yrs     -1055.9     1099.1  -0.961    0.338
## 
## Residual standard error: 53210 on 185 degrees of freedom
##   (81 observations deleted due to missingness)
## Multiple R-squared:  0.03439,    Adjusted R-squared:  -0.002146 
## F-statistic: 0.9413 on 7 and 185 DF,  p-value: 0.476

Summary

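  1. Salary does not depend significantly on any predictor in this model: every coefficient has a p-value above 0.1 (for example f_avg: p = 0.387, s_avg: p = 0.111).
  2. The model salary ~ gmat_tot+gmat_qpc+gmat_vpc+gmat_tpc+s_avg+f_avg+work_yrs does not fit the data well (adjusted R-squared = -0.002; F-test p-value = 0.476).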