# Load the MBA data set from the current working directory.
# The file name is a plain literal, so no paste() wrapper is needed.
sal.df <- read.csv("mba.csv")

# View() opens an interactive data viewer; only call it in an interactive
# session so that batch runs (Rscript, knitting) are not affected by it.
if (interactive()) {
  View(sal.df)
}
# Keep only the rows whose salary is neither of the codes 998 / 999.
# NOTE(review): rows with salary == 0 remain in abc.df, so they contribute to
# the group means computed below -- confirm that this is intended.
abc.df <- sal.df[with(sal.df, salary != 998 & salary != 999), ]

# Mean salary for each value of `sex`; the grouping column is labelled "Gender".
aggregate(
  abc.df$salary,
  by = list(Gender = abc.df$sex),
  FUN = mean
)
## Gender x
## 1 1 54373.45
## 2 2 56560.30
# Split the filtered data by the sex code; which() drops rows where the
# comparison is NA instead of returning all-NA rows.
male <- abc.df[which(abc.df$sex == "1"), ]
female <- abc.df[which(abc.df$sex == "2"), ]

# Side-by-side box plots of salary for the two subsets.
boxplot(
  male$salary,
  female$salary,
  names = c("Male", "Female"),
  main = "Starting Salary of male vs female",
  col = c("green", "purple")
)
# Number of records per sex code, over the full data set.
barplot(
  table(sal.df$sex),
  col = c("green", "purple"),
  xlab = "1=Male 2=Female",
  main = "Distribution of data(on gender)"
)

# Number of records per satisfaction rating, over the full data set
# (the 998 code is kept and shown as its own bar).
barplot(
  table(sal.df$satis),
  col = "blue",
  xlab = "998=not attended survey",
  main = "Satisfaction of students(ratings by them)"
)

# Histogram of total GMAT score, over the full data set.
hist(
  sal.df$gmat_tot,
  breaks = 40,
  col = "green",
  xlab = "score out of 800",
  main = "Gmat Score distribution"
)

# Histogram of salary, using the rows with the 998 / 999 codes removed.
hist(
  abc.df$salary,
  breaks = 10,
  col = "pink",
  xlab = "starting salary",
  main = "Salary distribution"
)
library(car)
library(psych)
##
## Attaching package: 'psych'
## The following object is masked from 'package:car':
##
## logit
# Scatter plots of salary against individual predictors, drawn with
# car::scatterplot() on the data with the 998 / 999 salary codes removed.
# NOTE(review): `spread` and `smoother.args` look like arguments of the older
# car interface; newer releases take smoother options through
# `smooth = list(...)` -- confirm against the installed car version.

# Salary vs age.
scatterplot(
  salary ~ age,
  data = abc.df,
  spread = FALSE,
  smoother.args = list(lty = 2),
  main = "Scatter plot of salary vs age",
  xlab = "Age",
  ylab = "Salary"
)

# Salary vs sex code.
scatterplot(
  salary ~ sex,
  data = abc.df,
  spread = FALSE,
  smoother.args = list(lty = 2),
  main = "Scatter plot of salary vs sex",
  xlab = "Sex",
  ylab = "Salary"
)

# Salary vs total GMAT score.
scatterplot(
  salary ~ gmat_tot,
  data = abc.df,
  main = "Scatter plot of salary vs Gmat total",
  xlab = "GMAT score",
  ylab = "Salary"
)

# Salary vs years of work experience.
scatterplot(
  salary ~ work_yrs,
  data = abc.df,
  main = "Scatter plot of salary vs Work exp.",
  xlab = "Work experience in years",
  ylab = "Salary"
)

# Salary vs satisfaction rating.
scatterplot(
  salary ~ satis,
  data = abc.df,
  main = "Scatter plot of salary vs satisfaction",
  xlab = "Degree of satisfaction",
  ylab = "Salary"
)
# Pearson chi-squared test applied to the entire data frame.
# NOTE(review): chisq.test() treats a matrix-like argument as a contingency
# table of counts, whereas sal.df holds raw per-record measurements, so this
# statistic is probably not meaningful -- confirm the intent, or test a
# cross-tabulation of two specific columns instead.
chisq.test(sal.df)
## Warning in chisq.test(sal.df): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: sal.df
## X-squared = 8451300, df = 3276, p-value < 2.2e-16
# One-way frequency table of the sex codes over the full data set.
x <- xtabs(~ sal.df$sex)
print(x)
## sal.df$sex
## 1 2
## 206 68
# Restrict to records with a non-zero salary that is not one of the 998 / 999
# codes. which() also discards rows for which the condition is NA.
mba <- sal.df[
  which(with(sal.df, salary != 998 & salary != 999 & salary != 0)),
]

# Linear model of salary on age, the GMAT measures, satisfaction and
# first-language code, fitted on the subset above.
m2 <- lm(
  salary ~ age + gmat_tot + gmat_qpc + gmat_vpc + gmat_tpc + satis + frstlang,
  data = mba
)
summary(m2)
##
## Call:
## lm(formula = salary ~ age + gmat_tot + gmat_qpc + gmat_vpc +
## gmat_tpc + satis + frstlang, data = mba)
##
## Residuals:
## Min 1Q Median 3Q Max
## -27442 -9074 -26 5449 65805
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 51877.83 47122.04 1.101 0.2737
## age 2720.36 507.19 5.364 5.73e-07 ***
## gmat_tot -27.96 162.79 -0.172 0.8640
## gmat_qpc 841.99 471.63 1.785 0.0774 .
## gmat_vpc 567.02 477.05 1.189 0.2376
## gmat_tpc -1309.36 699.44 -1.872 0.0643 .
## satis -1688.22 2036.28 -0.829 0.4091
## frstlang 4176.03 6703.97 0.623 0.5348
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15270 on 95 degrees of freedom
## Multiple R-squared: 0.3198, Adjusted R-squared: 0.2697
## F-statistic: 6.38 on 7 and 95 DF, p-value: 3.716e-06
# Estimated coefficients of the fitted model m2.
coef(m2)
## (Intercept) age gmat_tot gmat_qpc gmat_vpc gmat_tpc
## 51877.82882 2720.35989 -27.95661 841.99273 567.02250 -1309.35553
## satis frstlang
## -1688.21827 4176.02630
# Employment indicator derived from salary: 0 where salary is 0, 1 otherwise.
# It is built in a single vectorised step, so no placeholder NAs are created
# and no data-frame-wide NA replacement is required. The earlier blanket
# `abc.df[is.na(abc.df)] <- 1` would also have overwritten genuine missing
# values in every other column of abc.df.
abc.df$employed <- as.numeric(abc.df$salary != 0)
# Cross-tabulate the employment indicator against s_avg and test the two
# for independence with a Pearson chi-squared test.
tb <- xtabs(
  ~ employed + s_avg,
  data = abc.df
)
chisq.test(tb)
## Warning in chisq.test(tb): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: tb
## X-squared = 33.09, df = 30, p-value = 0.3187
# Cross-tabulate the employment indicator against f_avg and test the two
# for independence with a Pearson chi-squared test.
tb1 <- xtabs(
  ~ employed + f_avg,
  data = abc.df
)
chisq.test(tb1)
## Warning in chisq.test(tb1): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: tb1
## X-squared = 14.576, df = 18, p-value = 0.6908
# Cross-tabulate the employment indicator against salary and run a Pearson
# chi-squared test on the table.
# NOTE(review): `employed` is computed from `salary` earlier in this script,
# so an association between the two is expected by construction -- confirm
# that this test is meant to be reported.
tb2 <- xtabs(
  ~ employed + salary,
  data = abc.df
)
chisq.test(tb2)
## Warning in chisq.test(tb2): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: tb2
## X-squared = 193, df = 42, p-value < 2.2e-16
# Cross-tabulate the employment indicator against quarter and test the two
# for independence with a Pearson chi-squared test.
tb3 <- xtabs(
  ~ employed + quarter,
  data = abc.df
)
chisq.test(tb3)
##
## Pearson's Chi-squared test
##
## data: tb3
## X-squared = 4.9172, df = 3, p-value = 0.178
# psych is already attached earlier in this script, so this repeated call
# does not change the session.
library(psych)

# Descriptive statistics for every column of the full data set.
# NOTE(review): sal.df still contains the 998 / 999 codes, so the summaries
# for salary and satis include them -- confirm that is intended.
psych::describe(sal.df)
## vars n mean sd median trimmed mad min max
## age 1 274 27.36 3.71 27 26.76 2.97 22 48
## sex 2 274 1.25 0.43 1 1.19 0.00 1 2
## gmat_tot 3 274 619.45 57.54 620 618.86 59.30 450 790
## gmat_qpc 4 274 80.64 14.87 83 82.31 14.83 28 99
## gmat_vpc 5 274 78.32 16.86 81 80.33 14.83 16 99
## gmat_tpc 6 274 84.20 14.02 87 86.12 11.86 0 99
## s_avg 7 274 3.03 0.38 3 3.03 0.44 2 4
## f_avg 8 274 3.06 0.53 3 3.09 0.37 0 4
## quarter 9 274 2.48 1.11 2 2.47 1.48 1 4
## work_yrs 10 274 3.87 3.23 3 3.29 1.48 0 22
## frstlang 11 274 1.12 0.32 1 1.02 0.00 1 2
## salary 12 274 39025.69 50951.56 999 33607.86 1481.12 0 220000
## satis 13 274 172.18 371.61 6 91.50 1.48 1 998
## range skew kurtosis se
## age 26 2.16 6.45 0.22
## sex 1 1.16 -0.66 0.03
## gmat_tot 340 -0.01 0.06 3.48
## gmat_qpc 71 -0.92 0.30 0.90
## gmat_vpc 83 -1.04 0.74 1.02
## gmat_tpc 99 -2.28 9.02 0.85
## s_avg 2 -0.06 -0.38 0.02
## f_avg 4 -2.08 10.85 0.03
## quarter 3 0.02 -1.35 0.07
## work_yrs 22 2.78 9.80 0.20
## frstlang 1 2.37 3.65 0.02
## salary 220000 0.70 -1.05 3078.10
## satis 997 1.77 1.13 22.45
This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
# Column-wise summary of the built-in `cars` data set (R Markdown template example).
summary(datasets::cars)
## speed dist
## Min. : 4.0 Min. : 2.00
## 1st Qu.:12.0 1st Qu.: 26.00
## Median :15.0 Median : 36.00
## Mean :15.4 Mean : 42.98
## 3rd Qu.:19.0 3rd Qu.: 56.00
## Max. :25.0 Max. :120.00
You can also embed plots, for example:
Note that the `echo = FALSE` parameter was added to the code chunk to prevent printing of the R code that generated the plot.