library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(scales)
library(readr, quietly = TRUE)
# Read the Sam Houston salary CSV and lower-case its column names so the rest
# of the script can refer to them in one consistent form (e.g. `annual_salary`).
salaries <- read_csv(
  file = "C:\\Users\\USER\\Desktop\\Sam Houston Salaries.csv"
)
names(salaries) <- tolower(names(salaries))
# Histogram of the actual annual salaries using 10,000-wide bins, with the
# x axis formatted as dollar amounts. dplyr, ggplot2 and scales are already
# attached at the top of the file, so they are not loaded again here.
ggplot(salaries, aes(x = annual_salary)) +
  geom_histogram(binwidth = 10000) +
  labs(
    title = "Distribution of Annual Salaries",
    x = "Annual Salaries",
    y = "Total Number of Faculty"
  ) +
  scale_x_continuous(labels = label_dollar())

# Bin `annual_salary` into three salary bands, stored as the factor `salcat`,
# then print a summary of every column.
#
# The outer bounds use car's `lo` / `hi` keywords instead of the literal
# minimum and maximum of the current data, so a refreshed data set with a
# smaller or larger salary is still categorised rather than silently falling
# through to `else=NA`.
#
# The ranges overlap at 75000 and 125000. car::Recode applies the first
# matching specification (left to right), so a salary of exactly 75000 is
# labelled '<75000' and exactly 125000 is labelled '75000 - 125000'.
# Factor levels are left in car's default (sorted label) order.
salaries$salcat <- car::Recode(
  salaries$annual_salary,
  recodes = "lo:75000 ='<75000'; 75000:125000 ='75000 - 125000'; 125000:hi  ='>125000'; else=NA",
  as.factor = TRUE
)
summary(salaries)
##  position_title     home_organization_desc annual_salary   
##  Length:2225        Length:2225            Min.   :  6012  
##  Class :character   Class :character       1st Qu.: 40950  
##  Mode  :character   Mode  :character       Median : 55485  
##                                            Mean   : 63822  
##                                            3rd Qu.: 76770  
##                                            Max.   :456216  
##             salcat    
##  <75000        :1635  
##  >125000       : 117  
##  75000 - 125000: 473  
##                       
##                       
## 
# Number of rows in each salary band (`n`) and that band's share of all rows,
# expressed as a percentage.
salaries %>%
  count(salcat) %>%
  mutate(percentage = n / sum(n) * 100)
## # A tibble: 3 x 3
##   salcat             n percentage
##   <fct>          <int>      <dbl>
## 1 <75000          1635      73.5 
## 2 >125000          117       5.26
## 3 75000 - 125000   473      21.3
# Same 10,000-wide-bin histogram of actual salaries, with shorter title and
# axis text and the default y-axis label. ggplot2 and scales are already
# attached at the top of the file.
ggplot(salaries, aes(x = annual_salary)) +
  geom_histogram(binwidth = 10000) +
  labs(title = "Distribution of Salaries", x = "Salaries") +
  scale_x_continuous(labels = label_dollar())

# Simulate normally distributed "salaries" for comparison with the real data.
# n = 2225 matches the number of rows reported by summary(salaries) and the
# mean matches the reported mean salary; the sd is presumably the sample
# standard deviation of annual_salary -- TODO confirm.
# No seed is set, so the draws (and both plots below) differ from run to run.
y <- rnorm(n = 2225, mean = 63821.58, sd = 36170.47)

# Histogram of the simulated values. The bin width is set explicitly to the
# same 10,000 used for the real-salary histograms, so the plots are directly
# comparable and stat_bin() no longer falls back to its default of 30 bins
# (which emits a "Pick better value with `binwidth`" message).
ggplot(mapping = aes(x = y)) +
  geom_histogram(binwidth = 10000) +
  labs(title = "Distribution of Simulated Salaries", x = "Salaries")

# Normal Q-Q plot of the simulated sample.
qqnorm(y)

# Draw a fresh simulated sample, overwriting the previous `y`; it is not used
# by the call below. Then draw the normal Q-Q plot of the actual annual
# salaries.
y <- rnorm(n = 2225, mean = 63821.58, sd = 36170.47)
qqnorm(salaries[["annual_salary"]])