# Switch the working directory to the folder that holds the data file.
# NOTE(review): machine-specific absolute path; consider a project-relative
# path instead of setwd() so the script runs elsewhere.
setwd("C:/Users/Prabha Shankar/Desktop/Winter Internship/R file")
# Load the CSV into the data frame used by the rest of the visible script.
var1.df <- read.csv("MBA Starting Salaries Data.csv")
library(psych)
## Warning: package 'psych' was built under R version 3.3.3
# Summary statistics for every column of the data frame.
describe(var1.df)
## vars n mean sd median trimmed mad min max
## age 1 274 27.36 3.71 27 26.76 2.97 22 48
## sex 2 274 1.25 0.43 1 1.19 0.00 1 2
## gmat_tot 3 274 619.45 57.54 620 618.86 59.30 450 790
## gmat_qpc 4 274 80.64 14.87 83 82.31 14.83 28 99
## gmat_vpc 5 274 78.32 16.86 81 80.33 14.83 16 99
## gmat_tpc 6 274 84.20 14.02 87 86.12 11.86 0 99
## s_avg 7 274 3.03 0.38 3 3.03 0.44 2 4
## f_avg 8 274 3.06 0.53 3 3.09 0.37 0 4
## quarter 9 274 2.48 1.11 2 2.47 1.48 1 4
## work_yrs 10 274 3.87 3.23 3 3.29 1.48 0 22
## frstlang 11 274 1.12 0.32 1 1.02 0.00 1 2
## salary 12 274 39025.69 50951.56 999 33607.86 1481.12 0 220000
## satis 13 274 172.18 371.61 6 91.50 1.48 1 998
## range skew kurtosis se
## age 26 2.16 6.45 0.22
## sex 1 1.16 -0.66 0.03
## gmat_tot 340 -0.01 0.06 3.48
## gmat_qpc 71 -0.92 0.30 0.90
## gmat_vpc 83 -1.04 0.74 1.02
## gmat_tpc 99 -2.28 9.02 0.85
## s_avg 2 -0.06 -0.38 0.02
## f_avg 4 -2.08 10.85 0.03
## quarter 3 0.02 -1.35 0.07
## work_yrs 22 2.78 9.80 0.20
## frstlang 1 2.37 3.65 0.02
## salary 220000 0.70 -1.05 3078.10
## satis 997 1.77 1.13 22.45
# Frequency of each sex code, as a table and as a bar chart.
table(var1.df$sex)
##
## 1 2
## 206 68
barplot(table(var1.df$sex), xlab = "sex", ylab = "frequency")
# Summary statistics, histogram and horizontal box plot of gmat_tot.
describe(var1.df$gmat_tot)
## vars n mean sd median trimmed mad min max range skew kurtosis
## X1 1 274 619.45 57.54 620 618.86 59.3 450 790 340 -0.01 0.06
## se
## X1 3.48
# Title spelling corrected ("Distribuion" -> "Distribution").
hist(var1.df$gmat_tot, xlab = "marks", ylab = "frequency",
     main = "Marks Distribution")
boxplot(var1.df$gmat_tot, horizontal = TRUE, xlab = "gmat_total")
A. The variable gmat_tot is approximately normally distributed. B. The mean of gmat_tot is 619.45. C. The median of gmat_tot is 620.
# Summary statistics, histogram and horizontal box plot of gmat_qpc.
describe(var1.df$gmat_qpc)
## vars n mean sd median trimmed mad min max range skew kurtosis
## X1 1 274 80.64 14.87 83 82.31 14.83 28 99 71 -0.92 0.3
## se
## X1 0.9
# No main/xlab given, so hist() labels the plot with the argument expression.
hist(var1.df$gmat_qpc)
boxplot(var1.df$gmat_qpc ,horizontal = TRUE)
A. gmat_qpc is not normally distributed; its histogram is left-skewed.
# Summary statistics, histogram and horizontal box plot of gmat_vpc.
# This section previously repeated the gmat_qpc analysis verbatim, leaving
# gmat_vpc as the only GMAT column without its own summary. The statistics
# below are the gmat_vpc row of the full describe(var1.df) table.
describe(var1.df$gmat_vpc)
## vars n mean sd median trimmed mad min max range skew kurtosis
## X1 1 274 78.32 16.86 81 80.33 14.83 16 99 83 -1.04 0.74
## se
## X1 1.02
hist(var1.df$gmat_vpc)
boxplot(var1.df$gmat_vpc, horizontal = TRUE)
# Summary statistics, histogram and horizontal box plot of gmat_tpc.
# NOTE(review): the reported minimum is 0 - confirm whether that is a real
# percentile or a data-entry/missing-value code.
describe(var1.df$gmat_tpc)
## vars n mean sd median trimmed mad min max range skew kurtosis
## X1 1 274 84.2 14.02 87 86.12 11.86 0 99 99 -2.28 9.02
## se
## X1 0.85
hist(var1.df$gmat_tpc)
boxplot(var1.df$gmat_tpc ,horizontal = TRUE)
Most gmat_tpc values lie above the mean (median 87 vs. mean 84.2), i.e. the distribution is left-skewed.
# Summary statistics, histogram and horizontal box plot of s_avg.
describe(var1.df$s_avg)
## vars n mean sd median trimmed mad min max range skew kurtosis
## X1 1 274 3.03 0.38 3 3.03 0.44 2 4 2 -0.06 -0.38
## se
## X1 0.02
hist(var1.df$s_avg)
boxplot(var1.df$s_avg ,horizontal = TRUE)
# Same three views for f_avg.
# NOTE(review): the reported minimum is 0 - confirm whether that is a real
# average or a placeholder for a missing value.
describe(var1.df$f_avg)
## vars n mean sd median trimmed mad min max range skew kurtosis
## X1 1 274 3.06 0.53 3 3.09 0.37 0 4 4 -2.08 10.85
## se
## X1 0.03
hist(var1.df$f_avg)
boxplot(var1.df$f_avg ,horizontal = TRUE)
# Frequency of each quarter code, as a table and as a bar chart.
table(var1.df$quarter)
##
## 1 2 3 4
## 69 70 70 65
barplot(table(var1.df$quarter) , xlab = "quarter" , ylab = "frequency")
# Summary statistics and horizontal box plot of work_yrs.
describe(var1.df$work_yrs)
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 274 3.87 3.23 3 3.29 1.48 0 22 22 2.78 9.8 0.2
boxplot(var1.df$work_yrs ,horizontal = TRUE)
# Frequency of each frstlang code, as a table and as a bar chart.
table(var1.df$frstlang)
##
## 1 2
## 242 32
barplot(table(var1.df$frstlang))
# Recode the sentinel values to NA so they do not distort the statistics:
# 998 and 999 in salary, 998 in satis.
# %in% never returns NA, so a single mask covers both salary codes without
# comparing against values that are already NA.
var1.df$salary[var1.df$salary %in% c(998, 999)] <- NA
var1.df$satis[var1.df$satis %in% 998] <- NA
# Summary statistics, histogram and horizontal box plot of the cleaned salary.
# NOTE(review): salary still has a minimum of 0 after recoding - confirm
# whether 0 is a genuine salary or another code that should be excluded.
describe(var1.df$salary, na.rm = TRUE)
## vars n mean sd median trimmed mad min max range skew
## X1 1 193 54985.32 53152.39 85000 52726.81 51891 0 220000 220000 0.1
## kurtosis se
## X1 -1.45 3825.99
hist(var1.df$salary)
# boxplot() ignores missing values on its own and has no na.rm argument.
boxplot(var1.df$salary, horizontal = TRUE)
# Summary statistics and bar chart of the cleaned satis scores.
describe(var1.df$satis, na.rm = TRUE)
## vars n mean sd median trimmed mad min max range skew kurtosis
## X1 1 228 5.57 0.98 6 5.64 1.48 1 7 6 -0.92 2.07
## se
## X1 0.06
barplot(table(var1.df$satis))
Correlation between salary and satisfaction.
# Pearson correlation test between salary and satis (rows with NA in either
# variable are left out, hence df = 191).
cor.test(var1.df$salary,var1.df$satis)
##
## Pearson's product-moment correlation
##
## data: var1.df$salary and var1.df$satis
## t = 2.189, df = 191, p-value = 0.02981
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.01554274 0.29124439
## sample estimates:
## cor
## 0.1564395
# car provides scatterplot(), used for all scatter plots below.
library(car)
## Warning: package 'car' was built under R version 3.3.3
##
## Attaching package: 'car'
## The following object is masked from 'package:psych':
##
## logit
# Scatter plot with salary on the x-axis and satis on the y-axis.
scatterplot(var1.df$salary,var1.df$satis)
Correlation between salary and gmat_tot.
# Pearson correlation test between gmat_tot and salary.
cor.test(var1.df$gmat_tot,var1.df$salary)
##
## Pearson's product-moment correlation
##
## data: var1.df$gmat_tot and var1.df$salary
## t = -0.00078582, df = 191, p-value = 0.9994
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.1412959 0.1411844
## sample estimates:
## cor
## -5.685962e-05
# Scatter plot with gmat_tot on the x-axis and salary on the y-axis.
scatterplot(var1.df$gmat_tot,var1.df$salary)
Correlation between salary and gmat_qpc.
# Pearson correlation test between gmat_qpc and salary.
cor.test(var1.df$gmat_qpc,var1.df$salary)
##
## Pearson's product-moment correlation
##
## data: var1.df$gmat_qpc and var1.df$salary
## t = 0.39254, df = 191, p-value = 0.6951
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.1133029 0.1689543
## sample estimates:
## cor
## 0.02839164
# Scatter plot with gmat_qpc on the x-axis and salary on the y-axis.
scatterplot(var1.df$gmat_qpc,var1.df$salary)
Correlation between salary and gmat_vpc.
# Pearson correlation test between gmat_vpc and salary.
cor.test(var1.df$gmat_vpc,var1.df$salary)
##
## Pearson's product-moment correlation
##
## data: var1.df$gmat_vpc and var1.df$salary
## t = 0.046851, df = 191, p-value = 0.9627
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.1379162 0.1445609
## sample estimates:
## cor
## 0.003389965
# Scatter plot with gmat_vpc on the x-axis and salary on the y-axis.
scatterplot(var1.df$gmat_vpc,var1.df$salary)
Correlation between salary and gmat_tpc.
# Pearson correlation test between gmat_tpc and salary.
cor.test(var1.df$gmat_tpc, var1.df$salary)
##
## Pearson's product-moment correlation
##
## data: var1.df$gmat_tpc and var1.df$salary
## t = 0.84384, df = 191, p-value = 0.3998
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.08099267 0.20045926
## sample estimates:
## cor
## 0.06094464
# Scatter plot with gmat_tpc on the x-axis and salary on the y-axis.
# Previously this plotted gmat_vpc, which did not match the test above.
scatterplot(var1.df$gmat_tpc, var1.df$salary)
Correlation between salary and s_avg.
# Pearson correlation test between s_avg and salary.
cor.test(var1.df$s_avg,var1.df$salary)
##
## Pearson's product-moment correlation
##
## data: var1.df$s_avg and var1.df$salary
## t = 1.3374, df = 191, p-value = 0.1827
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.04553553 0.23437561
## sample estimates:
## cor
## 0.09632412
# Scatter plot with s_avg on the x-axis and salary on the y-axis.
scatterplot(var1.df$s_avg,var1.df$salary)
Correlation between salary and f_avg.
# Pearson correlation test between f_avg and salary.
cor.test(var1.df$f_avg,var1.df$salary)
##
## Pearson's product-moment correlation
##
## data: var1.df$f_avg and var1.df$salary
## t = 0.12227, df = 191, p-value = 0.9028
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.1325591 0.1498995
## sample estimates:
## cor
## 0.008846655
# Scatter plot with f_avg on the x-axis and salary on the y-axis.
scatterplot(var1.df$f_avg,var1.df$salary)
Correlation between working years and salary.
# Pearson correlation test between work_yrs and salary.
cor.test(var1.df$work_yrs,var1.df$salary)
##
## Pearson's product-moment correlation
##
## data: var1.df$work_yrs and var1.df$salary
## t = -0.73721, df = 191, p-value = 0.4619
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.19305455 0.08864017
## sample estimates:
## cor
## -0.05326685
# Scatter plot with work_yrs on the x-axis and salary on the y-axis.
scatterplot(var1.df$work_yrs,var1.df$salary)
library(corrgram)
## Warning: package 'corrgram' was built under R version 3.3.3
library(corrgram)
# Correlogram of all columns: shaded cells below the diagonal, pie glyphs
# above it, variable names on the diagonal.
corrgram(var1.df,
         order = TRUE,
         lower.panel = panel.shade,
         upper.panel = panel.pie,
         text.panel = panel.txt,
         main = "MBA starting salary analysis Correlogram")
# Pairwise correlation matrix of the 13 columns, rounded to 2 decimals.
# Each pair uses the rows where both values are present, so the NAs in
# salary and satis do not blank out the whole matrix. The `use` value is
# spelled out in full rather than relying on partial matching of "pair".
round(cor(var1.df[, 1:13], use = "pairwise.complete.obs"), 2)
## age sex gmat_tot gmat_qpc gmat_vpc gmat_tpc s_avg f_avg
## age 1.00 -0.03 -0.15 -0.22 -0.04 -0.17 0.15 -0.02
## sex -0.03 1.00 -0.05 -0.16 0.07 -0.01 0.13 0.09
## gmat_tot -0.15 -0.05 1.00 0.72 0.75 0.85 0.11 0.10
## gmat_qpc -0.22 -0.16 0.72 1.00 0.15 0.65 -0.03 0.07
## gmat_vpc -0.04 0.07 0.75 0.15 1.00 0.67 0.20 0.08
## gmat_tpc -0.17 -0.01 0.85 0.65 0.67 1.00 0.12 0.08
## s_avg 0.15 0.13 0.11 -0.03 0.20 0.12 1.00 0.55
## f_avg -0.02 0.09 0.10 0.07 0.08 0.08 0.55 1.00
## quarter -0.05 -0.13 -0.09 0.04 -0.17 -0.08 -0.76 -0.45
## work_yrs 0.86 -0.01 -0.18 -0.24 -0.07 -0.17 0.13 -0.04
## frstlang 0.06 0.00 -0.14 0.14 -0.39 -0.10 -0.14 -0.04
## salary -0.13 0.02 0.00 0.03 0.00 0.06 0.10 0.01
## satis -0.07 0.01 0.03 -0.10 0.19 0.09 0.06 -0.04
## quarter work_yrs frstlang salary satis
## age -0.05 0.86 0.06 -0.13 -0.07
## sex -0.13 -0.01 0.00 0.02 0.01
## gmat_tot -0.09 -0.18 -0.14 0.00 0.03
## gmat_qpc 0.04 -0.24 0.14 0.03 -0.10
## gmat_vpc -0.17 -0.07 -0.39 0.00 0.19
## gmat_tpc -0.08 -0.17 -0.10 0.06 0.09
## s_avg -0.76 0.13 -0.14 0.10 0.06
## f_avg -0.45 -0.04 -0.04 0.01 -0.04
## quarter 1.00 -0.09 0.10 -0.15 -0.03
## work_yrs -0.09 1.00 -0.03 -0.05 0.03
## frstlang 0.10 -0.03 1.00 0.01 -0.27
## salary -0.15 -0.05 0.01 1.00 0.16
## satis -0.03 0.03 -0.27 0.16 1.00
Null hypothesis: salary does not depend on any of the factors.
# One-sample t-test of H0: mean salary = 0.
# The earlier call passed var1.df$PricePremium as a second sample, but the
# data frame has no such column, so that argument was NULL and t.test()
# silently ran this same one-sample test (var.equal and paired had no
# effect). The call now states explicitly what is computed; the result is
# unchanged.
# NOTE(review): this does not test whether salary depends on the other
# variables - the overall F-test of the regression is the relevant test.
t.test(var1.df$salary, mu = 0)
##
## One Sample t-test
##
## data: var1.df$salary
## t = 14.372, df = 192, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## 47438.94 62531.69
## sample estimates:
## mean of x
## 54985.32
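The t-test gives a very low p-value (p < 2.2e-16), so its null hypothesis is rejected. Note, however, that the output above is a one-sample t-test of whether the mean salary equals 0, so this result alone does not show that salary depends on the other factors.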
# Linear regression of salary on the GMAT scores, grade averages and years
# of work experience. Rows with a missing salary are dropped by lm().
var2 <- lm(
  salary ~ gmat_tot + gmat_qpc + gmat_vpc + gmat_tpc + s_avg + f_avg + work_yrs,
  data = var1.df
)
# Print the fitted model. Previously this was lm(var2), which refitted the
# same model from the fitted object just to display it and showed
# "lm(formula = var2)" as the call.
var2
##
## Call:
## lm(formula = salary ~ gmat_tot + gmat_qpc + gmat_vpc + gmat_tpc +
## s_avg + f_avg + work_yrs, data = var1.df)
##
## Coefficients:
## (Intercept) gmat_tot gmat_qpc gmat_vpc gmat_tpc
## 126361.6 -423.0 702.9 510.7 709.7
## s_avg f_avg work_yrs
## 20250.0 -7713.2 -1055.9
# Coefficient table, residual summary, R-squared and overall F-test.
summary(var2)
##
## Call:
## lm(formula = salary ~ gmat_tot + gmat_qpc + gmat_vpc + gmat_tpc +
## s_avg + f_avg + work_yrs, data = var1.df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -73977 -52045 29449 43658 190233
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 126361.6 86635.2 1.459 0.146
## gmat_tot -423.0 310.6 -1.362 0.175
## gmat_qpc 702.9 850.6 0.826 0.410
## gmat_vpc 510.7 801.1 0.637 0.525
## gmat_tpc 709.7 658.0 1.079 0.282
## s_avg 20250.0 12652.7 1.600 0.111
## f_avg -7713.2 8891.8 -0.867 0.387
## work_yrs -1055.9 1099.1 -0.961 0.338
##
## Residual standard error: 53210 on 185 degrees of freedom
## (81 observations deleted due to missingness)
## Multiple R-squared: 0.03439, Adjusted R-squared: -0.002146
## F-statistic: 0.9413 on 7 and 185 DF, p-value: 0.476