library(dplyr) # for manipulating data
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2) # for making graphs
library(knitr) # for nicer table formatting
library(summarytools) # for frequency distribution tables
## Registered S3 method overwritten by 'pryr':
## method from
## print.bytes Rcpp
## Warning in fun(libname, pkgname): couldn't connect to display ":0"
## system has no X11 capabilities, therefore only ascii graphs will be produced by dfSummary()
load("Datasets/OPM94.RData")
opm94AIF <- opm94 %>% filter(race == "American Indian", male == "female") # subset data
opm94AIF %>% pander::pander(split.table = Inf) # print the resulting dataset nicely formatted
| 256 |
49401 |
13 |
Administrative |
|
42 |
female |
no |
no |
no |
14 |
13 |
no |
no |
yes |
American Indian |
1 |
grades 13 to 16 |
0 |
1 |
0 |
0 |
| 257 |
25672 |
5 |
Technical |
|
31 |
female |
no |
no |
no |
6 |
13 |
no |
no |
no |
American Indian |
1 |
grades 5 to 8 |
0 |
0 |
0 |
0 |
| 258 |
23316 |
5 |
Clerical |
|
46 |
female |
no |
no |
no |
16 |
12 |
no |
no |
no |
American Indian |
1 |
grades 5 to 8 |
0 |
0 |
0 |
0 |
| 259 |
45697 |
12 |
Administrative |
|
53 |
female |
no |
no |
yes |
23 |
15 |
no |
no |
yes |
American Indian |
1 |
grades 9 to 12 |
0 |
1 |
0 |
0 |
| 260 |
45383 |
9 |
Professional |
|
57 |
female |
no |
no |
no |
36 |
12 |
no |
no |
no |
American Indian |
1 |
grades 9 to 12 |
0 |
0 |
0 |
0 |
| 261 |
24576 |
5 |
Technical |
|
62 |
female |
no |
no |
no |
38 |
10 |
no |
no |
no |
American Indian |
1 |
grades 5 to 8 |
0 |
0 |
0 |
0 |
| 262 |
20166 |
5 |
Clerical |
|
33 |
female |
no |
no |
no |
6 |
13 |
no |
no |
no |
American Indian |
1 |
grades 5 to 8 |
0 |
0 |
0 |
0 |
| 263 |
42751 |
11 |
Professional |
PUBAF |
43 |
female |
no |
no |
no |
16 |
18 |
no |
no |
no |
American Indian |
1 |
grades 9 to 12 |
0 |
0 |
0 |
0 |
| 264 |
24585 |
6 |
Administrative |
|
53 |
female |
no |
no |
no |
18 |
15 |
yes |
no |
no |
American Indian |
1 |
grades 5 to 8 |
1 |
0 |
0 |
0 |
| 265 |
20796 |
5 |
Technical |
|
32 |
female |
no |
no |
no |
10 |
13 |
no |
no |
no |
American Indian |
1 |
grades 5 to 8 |
0 |
0 |
0 |
0 |
opm94AIF <- opm94AIF %>% select("age", "edyrs", "grade", "promo01", "supmgr01")
opm94AIF
## age edyrs grade promo01 supmgr01
## 1 42 13 13 0 1
## 2 31 13 5 0 0
## 3 46 12 5 0 0
## 4 53 15 12 0 1
## 5 57 12 9 0 0
## 6 62 10 5 0 0
## 7 33 13 5 0 0
## 8 43 18 11 0 0
## 9 53 15 6 1 0
## 10 32 13 5 0 0
opm94AIF %>% select("age", "edyrs", "grade", "promo01", "supmgr01") %>% summary()
## age edyrs grade promo01 supmgr01
## Min. :31.00 Min. :10.00 Min. : 5.0 Min. :0.0 Min. :0.0
## 1st Qu.:35.25 1st Qu.:12.25 1st Qu.: 5.0 1st Qu.:0.0 1st Qu.:0.0
## Median :44.50 Median :13.00 Median : 5.5 Median :0.0 Median :0.0
## Mean :45.20 Mean :13.40 Mean : 7.6 Mean :0.1 Mean :0.2
## 3rd Qu.:53.00 3rd Qu.:14.50 3rd Qu.:10.5 3rd Qu.:0.0 3rd Qu.:0.0
## Max. :62.00 Max. :18.00 Max. :13.0 Max. :1.0 Max. :1.0
descr::descr(opm94AIF)
##
## age
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 31.00 35.25 44.50 45.20 53.00 62.00
##
## edyrs
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 10.00 12.25 13.00 13.40 14.50 18.00
##
## grade
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 5.0 5.0 5.5 7.6 10.5 13.0
##
## promo01
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 0.0 0.0 0.1 0.0 1.0
##
## supmgr01
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 0.0 0.0 0.2 0.0 1.0
summarytools::descr(opm94AIF)
## Descriptive Statistics
## opm94AIF
## N: 10
##
## age edyrs grade promo01 supmgr01
## ----------------- -------- -------- -------- --------- ----------
## Mean 45.20 13.40 7.60 0.10 0.20
## Std.Dev 10.97 2.17 3.31 0.32 0.42
## Min 31.00 10.00 5.00 0.00 0.00
## Q1 33.00 12.00 5.00 0.00 0.00
## Median 44.50 13.00 5.50 0.00 0.00
## Q3 53.00 15.00 11.00 0.00 0.00
## Max 62.00 18.00 13.00 1.00 1.00
## MAD 14.83 1.48 0.74 0.00 0.00
## IQR 17.75 2.25 5.50 0.00 0.00
## CV 0.24 0.16 0.44 3.16 2.11
## Skewness 0.02 0.59 0.53 2.28 1.28
## SE.Skewness 0.69 0.69 0.69 0.69 0.69
## Kurtosis -1.62 -0.29 -1.66 3.57 -0.37
## N.Valid 10.00 10.00 10.00 10.00 10.00
## Pct.Valid 100.00 100.00 100.00 100.00 100.00
#Which of the three outputs for descriptive statistics do you find the most useful? Explain
# I found the last output to be the most useful.
age <- opm94AIF$age # save the values in a new variable with the name `age` for less typing
table(c(42, 31, 46, 53, 57, 62, 33, 43, 53, 32)) # figure out the mode from the table or use which.max()
##
## 31 32 33 42 43 46 53 57 62
## 1 1 1 1 1 1 2 1 1
which.max(table(c(42, 31, 46, 53, 57, 62, 33, 43, 53, 32)))
## 53
## 7
#Median age
sort(c(42, 31, 46, 53, 57, 62, 33, 43, 53, 32)) # find the median from the ordered vector or use R function median()
## [1] 31 32 33 42 43 46 53 53 57 62
median(opm94AIF$age)
## [1] 44.5
#Mean age:
(42+31+46+53+57+62+33+43+53+32)/10 # or
## [1] 45.2
sum(opm94AIF$age)/length(opm94AIF$age)
## [1] 45.2
mean(opm94AIF$age)
## [1] 45.2
#Range for age:
sort(c(42, 31, 46, 53, 57, 62, 33, 43, 53, 32))
## [1] 31 32 33 42 43 46 53 53 57 62
range(opm94AIF$age)
## [1] 31 62
#Variance = SSD/(n-1)
age
## [1] 42 31 46 53 57 62 33 43 53 32
age - mean(age)
## [1] -3.2 -14.2 0.8 7.8 11.8 16.8 -12.2 -2.2 7.8 -13.2
(age - mean(age))^2
## [1] 10.24 201.64 0.64 60.84 139.24 282.24 148.84 4.84 60.84 174.24
sum((age - mean(age))^2)/(10-1)
## [1] 120.4
var(age)
## [1] 120.4
sd(age)
## [1] 10.97269
sqrt(sum((age - mean(age))^2)/(10-1) )
## [1] 10.97269
# Do the manually calcualted results match the descriptive statistics in the tables above in section 1.1?
#yes the calculated results match the values.
#Similarly, compute (as appropriate) the mode, median, mean, range, variance, and standard deviation for variables edyrs and supmgr01 (opm94AIF$edyrs: 13 13 12 15 12 10 13 18 15 13, opm94AIF$supmgr01: 1 0 0 1 0 0 0 0 0 0 ) listed for American Indian females. Check your results against the output in 1.1.
edyrs <- (opm94AIF$edyrs)
which.max(edyrs)
## [1] 8
median(edyrs)
## [1] 13
mean(edyrs)
## [1] 13.4
range(edyrs)
## [1] 10 18
var(edyrs)
## [1] 4.711111
sd(edyrs)
## [1] 2.170509
supmgr01 <- (opm94AIF$supmgr01)
which.max(supmgr01)
## [1] 1
median(supmgr01)
## [1] 0
mean(supmgr01)
## [1] 0.2
range(supmgr01)
## [1] 0 1
var(supmgr01)
## [1] 0.1777778
sd(supmgr01)
## [1] 0.421637
summarytools::freq(opm94$edyrs) # grouped data
## Frequencies
## opm94$edyrs
##
## Freq % Valid % Valid Cum. % Total % Total Cum.
## ----------- ------ --------- -------------- --------- --------------
## 10 12 1.20 1.20 1.20 1.20
## 12 330 33.00 34.20 33.00 34.20
## 13 101 10.10 44.30 10.10 44.30
## 14 98 9.80 54.10 9.80 54.10
## 15 39 3.90 58.00 3.90 58.00
## 16 290 29.00 87.00 29.00 87.00
## 18 112 11.20 98.20 11.20 98.20
## 20 18 1.80 100.00 1.80 100.00
## <NA> 0 0.00 100.00
## Total 1000 100.00 100.00 100.00 100.00
summary(opm94$edyrs) # summary statistics by R
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 10.00 12.00 14.00 14.37 16.00 20.00
summary(opm94$supmgr01) # summary statistics by R
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 0.000 0.179 0.000 1.000
table(opm94$male01) %>% prop.table()*100
##
## 0 1
## 48.8 51.2
mean(opm94$male01)
## [1] 0.512
freq(opm94$grade4)
## Frequencies
## opm94$grade4
## Type: Factor
##
## Freq % Valid % Valid Cum. % Total % Total Cum.
## --------------------- ------ --------- -------------- --------- --------------
## grades 1 to 4 70 7.00 7.00 7.00 7.00
## grades 13 to 16 223 22.30 29.30 22.30 29.30
## grades 5 to 8 299 29.90 59.20 29.90 59.20
## grades 9 to 12 408 40.80 100.00 40.80 100.00
## <NA> 0 0.00 100.00
## Total 1000 100.00 100.00 100.00 100.00
opm94$race %>% table()
## .
## American Indian Asian Black Hispanic White
## 17 31 175 49 728
opm94 %>% filter(race == "White") %>% select(sal) %>% summarise(mean_sal_white = mean(sal, na.rm = T))
## mean_sal_white
## 1 43294.39
opm94 %>% filter(race == "Black") %>% select(sal) %>% summarise(mean_sal_black = mean(sal, na.rm = T))
## mean_sal_black
## 1 32712.78
opm94 %>% filter(race == "White") %>% select(edyrs) %>% summarise(mean_edyrs_white = mean(edyrs, na.rm = T))
## mean_edyrs_white
## 1 14.57692
opm94 %>% filter(race == "Black") %>% select(edyrs) %>% summarise(mean_edyrs_black = mean(edyrs, na.rm = T))
## mean_edyrs_black
## 1 13.6
opm94 %>% select(race, sal) %>% group_by(race) %>% summarise(mean_sal = mean(sal, na.rm = T))
## # A tibble: 5 x 2
## race mean_sal
## <fct> <dbl>
## 1 American Indian 32846.
## 2 Asian 38440.
## 3 Black 32713.
## 4 Hispanic 36500.
## 5 White 43294.
opm94 %>% select(race, edyrs) %>% group_by(race) %>% summarise(mean_edyrs = mean(edyrs, na.rm = T))
## # A tibble: 5 x 2
## race mean_edyrs
## <fct> <dbl>
## 1 American Indian 13.5
## 2 Asian 14.7
## 3 Black 13.6
## 4 Hispanic 14.1
## 5 White 14.6
#opm94 %>% select(race, grade) %>% group_by(race) %>% summarise(mean_grade = mean(grade, na.rm = T))
#opm94 %>% select(race, promo01) %>% group_by(race) %>% summarise(mean_promo01 = mean(promo01, na.rm = T))
#opm94 %>% select(race, supmgr01) %>% group_by(race) %>% summarise(mean_supmgr01 = mean(supmgr01, na.rm = T))
#Do whites receive higher rewards (e.g., salaries, grades, supervisory status, promotions) than minorities? Do differences in education and federal experience seem to be partly responsible for these patterns? Write a paragraph discussing differences between the groups (be specific about which groups you compare).
# Whites do receive higher rewards. If we examine all the variables, includng salary and grades, we can see that white people have greater access to education, whereas in racially black groups, people don't receive the same benefits.