library(dplyr)         # for manipulating data
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)       # for making graphs
library(knitr)         # for nicer table formatting
library(summarytools)  # for frequency distribution tables
## Registered S3 method overwritten by 'pryr':
##   method      from
##   print.bytes Rcpp
## Warning in fun(libname, pkgname): couldn't connect to display ":0"
## system has no X11 capabilities, therefore only ascii graphs will be produced by dfSummary()
load("Datasets/OPM94.RData")
opm94AIF <- opm94 %>% filter(race == "American Indian", male == "female")   # subset data
opm94AIF %>% pander::pander(split.table = Inf)                                # print the resulting dataset nicely formatted
x sal grade patco major age male vet handvet hand yos edyrs promo exit supmgr race minority grade4 promo01 supmgr01 male01 exit01
256 49401 13 Administrative 42 female no no no 14 13 no no yes American Indian 1 grades 13 to 16 0 1 0 0
257 25672 5 Technical 31 female no no no 6 13 no no no American Indian 1 grades 5 to 8 0 0 0 0
258 23316 5 Clerical 46 female no no no 16 12 no no no American Indian 1 grades 5 to 8 0 0 0 0
259 45697 12 Administrative 53 female no no yes 23 15 no no yes American Indian 1 grades 9 to 12 0 1 0 0
260 45383 9 Professional 57 female no no no 36 12 no no no American Indian 1 grades 9 to 12 0 0 0 0
261 24576 5 Technical 62 female no no no 38 10 no no no American Indian 1 grades 5 to 8 0 0 0 0
262 20166 5 Clerical 33 female no no no 6 13 no no no American Indian 1 grades 5 to 8 0 0 0 0
263 42751 11 Professional PUBAF 43 female no no no 16 18 no no no American Indian 1 grades 9 to 12 0 0 0 0
264 24585 6 Administrative 53 female no no no 18 15 yes no no American Indian 1 grades 5 to 8 1 0 0 0
265 20796 5 Technical 32 female no no no 10 13 no no no American Indian 1 grades 5 to 8 0 0 0 0
opm94AIF <- opm94AIF %>% select("age", "edyrs", "grade", "promo01", "supmgr01")
opm94AIF
##    age edyrs grade promo01 supmgr01
## 1   42    13    13       0        1
## 2   31    13     5       0        0
## 3   46    12     5       0        0
## 4   53    15    12       0        1
## 5   57    12     9       0        0
## 6   62    10     5       0        0
## 7   33    13     5       0        0
## 8   43    18    11       0        0
## 9   53    15     6       1        0
## 10  32    13     5       0        0
opm94AIF %>% select("age", "edyrs", "grade", "promo01", "supmgr01") %>% summary()
##       age            edyrs           grade         promo01       supmgr01  
##  Min.   :31.00   Min.   :10.00   Min.   : 5.0   Min.   :0.0   Min.   :0.0  
##  1st Qu.:35.25   1st Qu.:12.25   1st Qu.: 5.0   1st Qu.:0.0   1st Qu.:0.0  
##  Median :44.50   Median :13.00   Median : 5.5   Median :0.0   Median :0.0  
##  Mean   :45.20   Mean   :13.40   Mean   : 7.6   Mean   :0.1   Mean   :0.2  
##  3rd Qu.:53.00   3rd Qu.:14.50   3rd Qu.:10.5   3rd Qu.:0.0   3rd Qu.:0.0  
##  Max.   :62.00   Max.   :18.00   Max.   :13.0   Max.   :1.0   Max.   :1.0
descr::descr(opm94AIF)
## 
## age
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   31.00   35.25   44.50   45.20   53.00   62.00 
## 
## edyrs
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   10.00   12.25   13.00   13.40   14.50   18.00 
## 
## grade
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     5.0     5.0     5.5     7.6    10.5    13.0 
## 
## promo01
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.0     0.0     0.0     0.1     0.0     1.0 
## 
## supmgr01
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.0     0.0     0.0     0.2     0.0     1.0
summarytools::descr(opm94AIF)
## Descriptive Statistics  
## opm94AIF  
## N: 10  
## 
##                        age    edyrs    grade   promo01   supmgr01
## ----------------- -------- -------- -------- --------- ----------
##              Mean    45.20    13.40     7.60      0.10       0.20
##           Std.Dev    10.97     2.17     3.31      0.32       0.42
##               Min    31.00    10.00     5.00      0.00       0.00
##                Q1    33.00    12.00     5.00      0.00       0.00
##            Median    44.50    13.00     5.50      0.00       0.00
##                Q3    53.00    15.00    11.00      0.00       0.00
##               Max    62.00    18.00    13.00      1.00       1.00
##               MAD    14.83     1.48     0.74      0.00       0.00
##               IQR    17.75     2.25     5.50      0.00       0.00
##                CV     0.24     0.16     0.44      3.16       2.11
##          Skewness     0.02     0.59     0.53      2.28       1.28
##       SE.Skewness     0.69     0.69     0.69      0.69       0.69
##          Kurtosis    -1.62    -0.29    -1.66      3.57      -0.37
##           N.Valid    10.00    10.00    10.00     10.00      10.00
##         Pct.Valid   100.00   100.00   100.00    100.00     100.00
#Which of the three outputs for descriptive statistics do you find the most useful? Explain
  # I found the last output to be the most useful.

age <- opm94AIF$age  # save the values in a new variable with the name `age` for less typing

table(c(42, 31, 46, 53, 57, 62, 33, 43, 53, 32))    # figure out the mode from the table or use which.max()
## 
## 31 32 33 42 43 46 53 57 62 
##  1  1  1  1  1  1  2  1  1
which.max(table(c(42, 31, 46, 53, 57, 62, 33, 43, 53, 32)))
## 53 
##  7
#Median age
sort(c(42, 31, 46, 53, 57, 62, 33, 43, 53, 32)) # find the median from the ordered vector or use R function median()
##  [1] 31 32 33 42 43 46 53 53 57 62
median(opm94AIF$age)
## [1] 44.5
#Mean age:
  (42+31+46+53+57+62+33+43+53+32)/10        # or
## [1] 45.2
sum(opm94AIF$age)/length(opm94AIF$age)
## [1] 45.2
mean(opm94AIF$age)
## [1] 45.2
#Range for age:
  sort(c(42, 31, 46, 53, 57, 62, 33, 43, 53, 32))   
##  [1] 31 32 33 42 43 46 53 53 57 62
range(opm94AIF$age)
## [1] 31 62
#Variance = SSD/(n-1)
age
##  [1] 42 31 46 53 57 62 33 43 53 32
age - mean(age)
##  [1]  -3.2 -14.2   0.8   7.8  11.8  16.8 -12.2  -2.2   7.8 -13.2
(age - mean(age))^2
##  [1]  10.24 201.64   0.64  60.84 139.24 282.24 148.84   4.84  60.84 174.24
sum((age - mean(age))^2)/(10-1) 
## [1] 120.4
var(age)
## [1] 120.4
sd(age)
## [1] 10.97269
sqrt(sum((age - mean(age))^2)/(10-1) )
## [1] 10.97269
# Do the manually calcualted results match the descriptive statistics in the tables above in section 1.1?
  #yes the calculated results match the values.


#Similarly, compute (as appropriate) the mode, median, mean, range, variance, and standard deviation for variables edyrs and supmgr01 (opm94AIF$edyrs: 13 13 12 15 12 10 13 18 15 13, opm94AIF$supmgr01: 1 0 0 1 0 0 0 0 0 0 ) listed for American Indian females. Check your results against the output in 1.1.

edyrs <- (opm94AIF$edyrs)
which.max(edyrs)
## [1] 8
median(edyrs)
## [1] 13
mean(edyrs)
## [1] 13.4
range(edyrs)
## [1] 10 18
var(edyrs)
## [1] 4.711111
sd(edyrs)
## [1] 2.170509
supmgr01 <- (opm94AIF$supmgr01)
which.max(supmgr01)
## [1] 1
median(supmgr01)
## [1] 0
mean(supmgr01)
## [1] 0.2
range(supmgr01)
## [1] 0 1
var(supmgr01)
## [1] 0.1777778
sd(supmgr01)
## [1] 0.421637
summarytools::freq(opm94$edyrs)   # grouped data
## Frequencies  
## opm94$edyrs  
## 
##               Freq   % Valid   % Valid Cum.   % Total   % Total Cum.
## ----------- ------ --------- -------------- --------- --------------
##          10     12      1.20           1.20      1.20           1.20
##          12    330     33.00          34.20     33.00          34.20
##          13    101     10.10          44.30     10.10          44.30
##          14     98      9.80          54.10      9.80          54.10
##          15     39      3.90          58.00      3.90          58.00
##          16    290     29.00          87.00     29.00          87.00
##          18    112     11.20          98.20     11.20          98.20
##          20     18      1.80         100.00      1.80         100.00
##        <NA>      0                               0.00         100.00
##       Total   1000    100.00         100.00    100.00         100.00
summary(opm94$edyrs)  # summary statistics by R
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   10.00   12.00   14.00   14.37   16.00   20.00
summary(opm94$supmgr01)  # summary statistics by R
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   0.000   0.000   0.179   0.000   1.000
table(opm94$male01) %>% prop.table()*100  
## 
##    0    1 
## 48.8 51.2
mean(opm94$male01)   
## [1] 0.512
freq(opm94$grade4)
## Frequencies  
## opm94$grade4  
## Type: Factor  
## 
##                         Freq   % Valid   % Valid Cum.   % Total   % Total Cum.
## --------------------- ------ --------- -------------- --------- --------------
##         grades 1 to 4     70      7.00           7.00      7.00           7.00
##       grades 13 to 16    223     22.30          29.30     22.30          29.30
##         grades 5 to 8    299     29.90          59.20     29.90          59.20
##        grades 9 to 12    408     40.80         100.00     40.80         100.00
##                  <NA>      0                               0.00         100.00
##                 Total   1000    100.00         100.00    100.00         100.00
opm94$race %>% table()
## .
## American Indian           Asian           Black        Hispanic           White 
##              17              31             175              49             728
opm94 %>% filter(race == "White") %>% select(sal) %>% summarise(mean_sal_white = mean(sal, na.rm = T))
##   mean_sal_white
## 1       43294.39
opm94 %>% filter(race == "Black") %>% select(sal) %>% summarise(mean_sal_black = mean(sal, na.rm = T))
##   mean_sal_black
## 1       32712.78
opm94 %>% filter(race == "White") %>% select(edyrs) %>% summarise(mean_edyrs_white = mean(edyrs, na.rm = T))
##   mean_edyrs_white
## 1         14.57692
opm94 %>% filter(race == "Black") %>% select(edyrs) %>% summarise(mean_edyrs_black = mean(edyrs, na.rm = T))
##   mean_edyrs_black
## 1             13.6
opm94 %>% select(race, sal) %>% group_by(race) %>% summarise(mean_sal = mean(sal, na.rm = T))
## # A tibble: 5 x 2
##   race            mean_sal
##   <fct>              <dbl>
## 1 American Indian   32846.
## 2 Asian             38440.
## 3 Black             32713.
## 4 Hispanic          36500.
## 5 White             43294.
opm94 %>% select(race, edyrs) %>% group_by(race) %>% summarise(mean_edyrs = mean(edyrs, na.rm = T))
## # A tibble: 5 x 2
##   race            mean_edyrs
##   <fct>                <dbl>
## 1 American Indian       13.5
## 2 Asian                 14.7
## 3 Black                 13.6
## 4 Hispanic              14.1
## 5 White                 14.6
#opm94 %>% select(race, grade) %>% group_by(race) %>% summarise(mean_grade = mean(grade, na.rm = T))
#opm94 %>% select(race, promo01) %>% group_by(race) %>%  summarise(mean_promo01 = mean(promo01, na.rm = T))
#opm94 %>% select(race, supmgr01) %>% group_by(race) %>%  summarise(mean_supmgr01 = mean(supmgr01, na.rm = T))


#Do whites receive higher rewards (e.g., salaries, grades, supervisory status, promotions) than minorities? Do differences in education and federal experience seem to be partly responsible for these patterns? Write a paragraph discussing differences between the groups (be specific about which groups you compare).
  # Whites do receive higher rewards. If we examine all the variables, includng salary and grades, we can see that white people have greater access to education, whereas in racially black groups, people don't receive the same benefits.