Read in data:
# Load the faculty salary data from a comma-separated file with a header row.
# NOTE(review): the path is absolute and machine-specific.
faculty <- read.csv(
  "C:/Users/Alan/Desktop/faculty.csv",
  sep = ",",
  header = TRUE # spelled out: T is an ordinary, reassignable variable
)
1) What are the column names: how many obs? how many vars?:
colnames(faculty) # variable names, in file order
## [1] "AYSALARY" "R1" "R2" "R7" "PRIOREXP" "YRBG"
## [7] "YRRANK" "TERMDEG" "YRDG" "EMINENT" "FEMALE"
dim(faculty)[1L] # number of observations (rows)
## [1] 725
dim(faculty)[2L] # number of variables (columns)
## [1] 11
2) Is annual salary normally distributed?:
# Quartiles, mean and extremes of annual salary; these bound the histogram bins.
summary(object = faculty[["AYSALARY"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 23800 36800 46700 47800 57600 103000
# Bins are 1000 wide and span 23000 to 110000, covering the range shown above.
hist(
  x = faculty[["AYSALARY"]],
  breaks = seq(from = 23000, to = 110000, by = 1000),
  main = "Histogram of Faculty Annual Salary",
  xlab = "Annual Salary"
)
not normally distributed, skewed right (mean 47800 > median 46700, with a long upper tail out to 103000)
3) does it appear that male and female faculty members make the same annual salary?:
# Mean annual salary within each level of FEMALE.
aggregate(x = faculty$AYSALARY, by = list(faculty$FEMALE), FUN = mean)
## Group.1 x
## 1 0 51453
## 2 1 39080
# The same two means, computed one group at a time by logical subsetting.
mean(faculty[["AYSALARY"]][faculty[["FEMALE"]] == 1]) # FEMALE == 1
## [1] 39080
mean(faculty[["AYSALARY"]][faculty[["FEMALE"]] == 0]) # FEMALE == 0
## [1] 51453
# The FEMALE == 1 group has the lower mean, so test the difference formally.
# var.equal = FALSE is t.test's default and gives the Welch (unequal
# variances) test reported below.
t.test(
  x = faculty$AYSALARY[faculty$FEMALE == 1],
  y = faculty$AYSALARY[faculty$FEMALE == 0],
  var.equal = FALSE
)
##
## Welch Two Sample t-test
##
## data: faculty$AYSALARY[faculty$FEMALE == 1] and faculty$AYSALARY[faculty$FEMALE == 0]
## t = -13.28, df = 523.2, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -14203 -10543
## sample estimates:
## mean of x mean of y
## 39080 51453
reject the null hypothesis of no difference in means
women make significantly less than men on average
4) does there appear to be a relationship between salary and number of years of employment:
# Correlation between annual salary and YRBG (cor() defaults to Pearson).
salary.yrs.cor <- cor(faculty$AYSALARY, faculty$YRBG)
# Scatterplot of salary (y) against YRBG (x), drawn with solid points.
plot(
  AYSALARY ~ YRBG,
  data = faculty,
  main = "Scatterplot, Salary vs. No. Years Employed",
  xlab = "No. Yrs Employed",
  ylab = "Salary",
  pch = 19
)
# Overlay the least-squares line for salary as a linear function of YRBG.
abline(lm(AYSALARY ~ YRBG, data = faculty), col = "red")
print(salary.yrs.cor)
## [1] 0.6166
yes, correlation = 0.62
5) combine R1, R2, R7 into one categorical variable; does one category appear to have higher salaries?:
# Create RANK up front so rows with none of the three flags set hold an
# explicit NA, instead of relying on indexed assignment into a missing column.
faculty$RANK <- NA_real_
# Later assignments overwrite earlier ones: R7 takes precedence over R2,
# and R2 over R1, for any row with more than one flag set.
faculty$RANK[faculty$R1 == 1] <- 3
faculty$RANK[faculty$R2 == 1] <- 2
faculty$RANK[faculty$R7 == 1] <- 1
# Mean annual salary within each RANK code.
aggregate(x = faculty$AYSALARY, by = list(faculty$RANK), FUN = mean)
## Group.1 x
## 1 1 28374
## 2 2 48058
## 3 3 61622
# One box per RANK code; the x-axis label spells out what each code means.
boxplot(
  AYSALARY ~ RANK,
  data = faculty,
  col = rainbow(3),
  main = "Annual Salary by Rank",
  xlab = "1 = Instructor/Lecturer, 2 = Associate Professor, 3 = Full Professor",
  ylab = "Annual Salary ($)"
)
full professors have higher salaries