library(dplyr)
library(lattice)
library(plyr)
library(ggplot2)
# Load the professor salary data.
# NOTE(review): this is an absolute, machine-specific path; the script only
# runs on a machine where this exact file exists.
data <- read.csv(
  "C://Users/Samyam/OneDrive - University of Georgia/Documents/Samyam_AAEC_8610//Salaries.csv"
)

# First rows and overall dimensions.
head(data)
nrow(data)
ncol(data)

# Missing-value audit: NA count for every column, returned as one named
# vector, followed by the number of rows with no NA in any column.
colSums(is.na(data))
sum(complete.cases(data))
The data set comprises individual-level information on 397 professors, including their rank, discipline, years since Ph.D., years of service, sex, and salary. The data have dimension 397 x 7. No variable has a missing value for any observation. This sentence is unnecessary but hey, the letters are in italics.
# Univariate summaries, one column at a time and in the original order:
# frequency counts via table() for rank, discipline and sex; summary() for
# the two experience measures and for salary.
table(data[["rank"]])
table(data[["discipline"]])
summary(data[["yrs.since.phd"]])
summary(data[["yrs.service"]])
table(data[["sex"]])
summary(data[["salary"]])
# Bin salary into three brackets. With right = FALSE each interval is closed
# on the left: [-Inf, 100000), [100000, 200000), [200000, Inf]. These are the
# same thresholds as before, so a salary of exactly 200,000 still falls in the
# "More than 200,000" bracket. The result is a factor whose levels are in
# ascending salary order, so tables list the brackets from low to high rather
# than alphabetically.
data$salarybracket <- cut(
  data$salary,
  breaks = c(-Inf, 100000, 200000, Inf),
  labels = c("Less than 100,000", "100,000 - 200,000", "More than 200,000"),
  right = FALSE,
  include.lowest = TRUE
)

# Cross-tabulate sex (rows) against salary bracket (columns).
table(data$sex, data$salarybracket)
##
## Less than 100,000 100,000 - 200,000 More than 200,000
## Female 18 21 0
## Male 122 233 3
# Two-way frequency table: sex in rows, academic rank in columns.
table(data[["sex"]], data[["rank"]])
##
## AssocProf AsstProf Prof
## Female 10 11 18
## Male 54 56 248
# Histograms of years since Ph.D., years of service and salary.
# The first argument is deliberately written as `data$<column>`: hist() builds
# its default x-axis label from that expression.
hist(
  data$yrs.since.phd,
  main = "Years since Ph.D.",
  col = "dodgerblue3"
)
hist(
  data$yrs.service,
  main = "Years of service",
  col = "gold2"
)
hist(
  data$salary,
  main = "Annual salary (US$)",
  col = "darkolivegreen3"
)

# Scatter plots of salary against each experience measure; both axis labels
# are set explicitly.
plot(
  x = data[["yrs.since.phd"]],
  y = data[["salary"]],
  xlab = "Years since Ph.D.",
  ylab = "Annual salary (US$)",
  col = "firebrick3"
)
plot(
  x = data[["yrs.service"]],
  y = data[["salary"]],
  xlab = "Years of service",
  ylab = "Annual salary (US$)",
  col = "cadetblue4"
)
# Density of salary, one curve per sex. after_stat(scaled) rescales each
# group's density estimate so that its peak equals 1; it replaces the
# `..scaled..` notation, which ggplot2 deprecated in version 3.4.0. The y axis
# is therefore a scaled density, not a percentage, and is labelled as such.
# The legend title is set explicitly so it does not show the raw `factor(sex)`
# expression.
ggplot(data, aes(salary, colour = factor(sex))) +
  geom_density(aes(y = after_stat(scaled))) +
  labs(x = "Salary", y = "Scaled density", colour = "Sex")