library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(scales)
library(readr, quietly = TRUE)
# Read the salary table from disk, then lower-case every column name so the
# rest of the script can use names such as `annual_salary` whatever the
# capitalisation of the CSV header is.
salaries <- read_csv(file = "C:\\Users\\USER\\Desktop\\Sam Houston Salaries.csv")
salaries <- setNames(salaries, tolower(names(salaries)))
# Histogram of annual salaries in $10,000-wide bins, with the x axis shown as
# dollar amounts. ggplot2 and scales are already attached at the top of the
# script, so they are not loaded again here.
ggplot(data = salaries, mapping = aes(x = annual_salary)) +
  geom_histogram(binwidth = 10000) +
  labs(
    title = "Distribution of Annual Salaries",
    x = "Annual Salaries",
    y = "Total Number of Faculty"
  ) +
  scale_x_continuous(labels = label_dollar())

# Bin every salary into one of three bands.
#
# cut() is used with open-ended breaks (-Inf / Inf) so that the bands do not
# depend on the smallest and largest salary present in the current file: a
# refreshed data set with a lower minimum or a higher maximum is still
# classified instead of silently becoming NA. Intervals are right-closed, so a
# salary of exactly 75000 lands in "<75000" and exactly 125000 lands in
# "75000 - 125000"; each value belongs to exactly one band. Missing salaries
# stay NA.
#
# The labels are given in ascending order, so the factor levels (and therefore
# every table below) read low -> middle -> high rather than alphabetically.
salaries$salcat <- cut(
  salaries$annual_salary,
  breaks = c(-Inf, 75000, 125000, Inf),
  labels = c("<75000", "75000 - 125000", ">125000")
)
summary(salaries)
## position_title home_organization_desc annual_salary
## Length:2225 Length:2225 Min. : 6012
## Class :character Class :character 1st Qu.: 40950
## Mode :character Mode :character Median : 55485
## Mean : 63822
## 3rd Qu.: 76770
## Max. :456216
## salcat
## <75000 :1635
## 75000 - 125000: 473
## >125000 : 117
##
##
##
# Head count per salary band and its share of all rows, in percent.
salaries %>%
  count(salcat) %>%
  mutate(percentage = n / sum(n) * 100)
## # A tibble: 3 x 3
## salcat n percentage
## <fct> <int> <dbl>
## 1 <75000 1635 73.5
## 2 75000 - 125000 473 21.3
## 3 >125000 117 5.26
# Second look at the salary distribution: $10,000-wide bins, dollar-formatted
# x axis, default y-axis label. ggplot2 and scales are already attached at the
# top of the script.
ggplot(data = salaries, mapping = aes(x = annual_salary)) +
  geom_histogram(binwidth = 10000) +
  labs(title = "Distribution of Salaries", x = "Salaries") +
  scale_x_continuous(labels = label_dollar())

# Simulate salaries from a normal distribution that has the same sample size,
# mean and standard deviation as the observed salaries, so the simulated and
# real distributions can be compared side by side.
#
# The three parameters are computed from the data instead of being typed in
# by hand, so they cannot drift out of sync if the CSV changes. Missing
# salaries are dropped before the statistics are taken.
observed_salary <- salaries$annual_salary[!is.na(salaries$annual_salary)]

# Fix the random seed so the simulated sample, and the plots built from it,
# are the same on every run.
set.seed(2225)
y <- rnorm(
  n = length(observed_salary),
  mean = mean(observed_salary),
  sd = sd(observed_salary)
)

# Same $10,000 bin width and dollar axis as the histograms of the real data,
# which also avoids the "`stat_bin()` using `bins = 30`" message.
ggplot(mapping = aes(x = y)) +
  geom_histogram(binwidth = 10000) +
  ggtitle(label = "Distribution of Simulated Salaries") +
  xlab(label = "Salaries") +
  scale_x_continuous(labels = label_dollar())

# Normal Q-Q plot of the simulated sample (reference: what normal data looks
# like) followed by the same plot for the observed salaries.
qqnorm(y, main = "Normal Q-Q Plot: Simulated Salaries")

qqnorm(salaries$annual_salary, main = "Normal Q-Q Plot: Observed Salaries")
