#Setup
library(dplyr) # for manipulating data
## Warning: package 'dplyr' was built under R version 3.6.2
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2) # for making graphs
## Warning: package 'ggplot2' was built under R version 3.6.2
library(knitr) # for nicer table formatting
## Warning: package 'knitr' was built under R version 3.6.2
library(summarytools) # for frequency distribution tables
## Warning: package 'summarytools' was built under R version 3.6.2
## Registered S3 method overwritten by 'pryr':
## method from
## print.bytes Rcpp
setwd("C:/Users/ramin/Desktop/2020 winter/Data Analysis/computer asssignment 3/dataset")
load("OPM94.RData")
opm94AIF <- opm94 %>% filter(race == "American Indian", male == "female") # subset data
opm94AIF %>% pander::pander(split.table = Inf) # print the resulting dataset nicely formatted
| 256 |
49401 |
13 |
Administrative |
|
42 |
female |
no |
no |
no |
14 |
13 |
no |
no |
yes |
American Indian |
1 |
grades 13 to 16 |
0 |
1 |
0 |
0 |
| 257 |
25672 |
5 |
Technical |
|
31 |
female |
no |
no |
no |
6 |
13 |
no |
no |
no |
American Indian |
1 |
grades 5 to 8 |
0 |
0 |
0 |
0 |
| 258 |
23316 |
5 |
Clerical |
|
46 |
female |
no |
no |
no |
16 |
12 |
no |
no |
no |
American Indian |
1 |
grades 5 to 8 |
0 |
0 |
0 |
0 |
| 259 |
45697 |
12 |
Administrative |
|
53 |
female |
no |
no |
yes |
23 |
15 |
no |
no |
yes |
American Indian |
1 |
grades 9 to 12 |
0 |
1 |
0 |
0 |
| 260 |
45383 |
9 |
Professional |
|
57 |
female |
no |
no |
no |
36 |
12 |
no |
no |
no |
American Indian |
1 |
grades 9 to 12 |
0 |
0 |
0 |
0 |
| 261 |
24576 |
5 |
Technical |
|
62 |
female |
no |
no |
no |
38 |
10 |
no |
no |
no |
American Indian |
1 |
grades 5 to 8 |
0 |
0 |
0 |
0 |
| 262 |
20166 |
5 |
Clerical |
|
33 |
female |
no |
no |
no |
6 |
13 |
no |
no |
no |
American Indian |
1 |
grades 5 to 8 |
0 |
0 |
0 |
0 |
| 263 |
42751 |
11 |
Professional |
PUBAF |
43 |
female |
no |
no |
no |
16 |
18 |
no |
no |
no |
American Indian |
1 |
grades 9 to 12 |
0 |
0 |
0 |
0 |
| 264 |
24585 |
6 |
Administrative |
|
53 |
female |
no |
no |
no |
18 |
15 |
yes |
no |
no |
American Indian |
1 |
grades 5 to 8 |
1 |
0 |
0 |
0 |
| 265 |
20796 |
5 |
Technical |
|
32 |
female |
no |
no |
no |
10 |
13 |
no |
no |
no |
American Indian |
1 |
grades 5 to 8 |
0 |
0 |
0 |
0 |
opm94AIF <- opm94AIF %>% select("age", "edyrs", "grade", "promo01", "supmgr01")
opm94AIF
## age edyrs grade promo01 supmgr01
## 1 42 13 13 0 1
## 2 31 13 5 0 0
## 3 46 12 5 0 0
## 4 53 15 12 0 1
## 5 57 12 9 0 0
## 6 62 10 5 0 0
## 7 33 13 5 0 0
## 8 43 18 11 0 0
## 9 53 15 6 1 0
## 10 32 13 5 0 0
opm94AIF %>% select("age", "edyrs", "grade", "promo01", "supmgr01") %>% summary()
## age edyrs grade promo01 supmgr01
## Min. :31.00 Min. :10.00 Min. : 5.0 Min. :0.0 Min. :0.0
## 1st Qu.:35.25 1st Qu.:12.25 1st Qu.: 5.0 1st Qu.:0.0 1st Qu.:0.0
## Median :44.50 Median :13.00 Median : 5.5 Median :0.0 Median :0.0
## Mean :45.20 Mean :13.40 Mean : 7.6 Mean :0.1 Mean :0.2
## 3rd Qu.:53.00 3rd Qu.:14.50 3rd Qu.:10.5 3rd Qu.:0.0 3rd Qu.:0.0
## Max. :62.00 Max. :18.00 Max. :13.0 Max. :1.0 Max. :1.0
descr::descr(opm94AIF)
##
## age
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 31.00 35.25 44.50 45.20 53.00 62.00
##
## edyrs
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 10.00 12.25 13.00 13.40 14.50 18.00
##
## grade
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 5.0 5.0 5.5 7.6 10.5 13.0
##
## promo01
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 0.0 0.0 0.1 0.0 1.0
##
## supmgr01
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 0.0 0.0 0.2 0.0 1.0
summarytools::descr(opm94AIF)
## Descriptive Statistics
## opm94AIF
## N: 10
##
## age edyrs grade promo01 supmgr01
## ----------------- -------- -------- -------- --------- ----------
## Mean 45.20 13.40 7.60 0.10 0.20
## Std.Dev 10.97 2.17 3.31 0.32 0.42
## Min 31.00 10.00 5.00 0.00 0.00
## Q1 33.00 12.00 5.00 0.00 0.00
## Median 44.50 13.00 5.50 0.00 0.00
## Q3 53.00 15.00 11.00 0.00 0.00
## Max 62.00 18.00 13.00 1.00 1.00
## MAD 14.83 1.48 0.74 0.00 0.00
## IQR 17.75 2.25 5.50 0.00 0.00
## CV 0.24 0.16 0.44 3.16 2.11
## Skewness 0.02 0.59 0.53 2.28 1.28
## SE.Skewness 0.69 0.69 0.69 0.69 0.69
## Kurtosis -1.62 -0.29 -1.66 3.57 -0.37
## N.Valid 10.00 10.00 10.00 10.00 10.00
## Pct.Valid 100.00 100.00 100.00 100.00 100.00
#1. Calculating the mode, median, mean, range, variance, and standard deviation
###QUESTION 1.1: Which of the three outputs for descriptive statistics do you find the most useful? Explain
##I find summarytools::descr(opm94AIF) most useful, because it seems to be the most well organized.
age <- opm94AIF$age # save the values in a new variable with the name `age` for less typing
table(c(42, 31, 46, 53, 57, 62, 33, 43, 53, 32)) # figure out the mode from the table or use which.max()
##
## 31 32 33 42 43 46 53 57 62
## 1 1 1 1 1 1 2 1 1
which.max(table(c(42, 31, 46, 53, 57, 62, 33, 43, 53, 32)))
## 53
## 7
sort(c(42, 31, 46, 53, 57, 62, 33, 43, 53, 32)) # find the median from the ordered vector or use R function median()
## [1] 31 32 33 42 43 46 53 53 57 62
median(opm94AIF$age)
## [1] 44.5
(42+31+46+53+57+62+33+43+53+32)/10 # or
## [1] 45.2
sum(opm94AIF$age)/length(opm94AIF$age)
## [1] 45.2
mean(opm94AIF$age)
## [1] 45.2
sort(c(42, 31, 46, 53, 57, 62, 33, 43, 53, 32)) # or
## [1] 31 32 33 42 43 46 53 53 57 62
range(opm94AIF$age)
## [1] 31 62
##Variance = SSD/(n-1)
age
## [1] 42 31 46 53 57 62 33 43 53 32
age - mean(age)
## [1] -3.2 -14.2 0.8 7.8 11.8 16.8 -12.2 -2.2 7.8 -13.2
(age - mean(age))^2
## [1] 10.24 201.64 0.64 60.84 139.24 282.24 148.84 4.84 60.84 174.24
sum((age - mean(age))^2)/(10-1)
## [1] 120.4
var(age)
## [1] 120.4
###SD = sqrt(var)
sqrt(sum((age - mean(age))^2)/(10-1) )
## [1] 10.97269
sd(age)
## [1] 10.97269
##QUESTION 1.2: Do the manually calcualted results match the descriptive statistics in the tables above in section 1.1?
##Yes, they all match.
###QUESTION 1.3: Similarly, compute (as appropriate) the mode, median, mean, range, variance,
#and standard deviation for variables
# edyrs and supmgr01
#(opm94AIF$edyrs: 13 13 12 15 12 10 13 18 15 13,
#opm94AIF$supmgr01: 1 0 0 1 0 0 0 0 0 0 ) listed for American Indian females.
#Check your results against the output in 1.1.
# edyrs - (10,12,12,13,13,13,13,15,15,18) range = 10-18, median=13, mode =13, mean = 13.4 std dev = 2.06 var = 4.24.
#supmgr01 - (0,0,0,0,0,0,0,0,1,1) range = 0-1, median = 0, mode = 0, mean = .2, std dev = .310, var =.096, the results are the same.
# 2. Calculating mode, median, mean for grouped data
#Let’s generate grouped data (frequency table) that you will use for calculating statistics (mode, median, mean) for variable edyrs from the full dataset opm94:
summarytools::freq(opm94$edyrs) # grouped data
## Frequencies
## opm94$edyrs
##
## Freq % Valid % Valid Cum. % Total % Total Cum.
## ----------- ------ --------- -------------- --------- --------------
## 10 12 1.20 1.20 1.20 1.20
## 12 330 33.00 34.20 33.00 34.20
## 13 101 10.10 44.30 10.10 44.30
## 14 98 9.80 54.10 9.80 54.10
## 15 39 3.90 58.00 3.90 58.00
## 16 290 29.00 87.00 29.00 87.00
## 18 112 11.20 98.20 11.20 98.20
## 20 18 1.80 100.00 1.80 100.00
## <NA> 0 0.00 100.00
## Total 1000 100.00 100.00 100.00 100.00
summary(opm94$edyrs) # summary statistics by R
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 10.00 12.00 14.00 14.37 16.00 20.00
## QUESTION 2.1: Similarlly to the example above, find the mode, median, mean
#for variables yos and supmgr01 using the grouped data:
# supmgr01 - mode =0 , median =0 , mean =.2
#3. Calculating mode, median, mean for grouped data (dummy variables)
#QUESTION 3: Male01 and exit01 are dummy variables.
#(They only have two possible values, o and 1.)
#For each, compare its mean to the percentage of cases with the value 1.
#How are these two measures related?
#Percentage of cases with the value 0 and 1 for male01:
table(opm94$male01) %>% prop.table()*100
##
## 0 1
## 48.8 51.2
mean(opm94$male01)
## [1] 0.512
# these two measures are the exact same because the .512 is the same as 51.2 if you put it in percent.
#4. Calcualting the mean for grouped data formulas for intervals
# QUESTION 4: Using the Frequencies output for the entire data set
#(and the grouped data formulas for intervals), calculate the mean
#grade, using GRADE4 instead of grade.Calculate means using the midpoint
#of each interval of grade4
freq(opm94$grade4)
## Frequencies
## opm94$grade4
## Type: Factor
##
## Freq % Valid % Valid Cum. % Total % Total Cum.
## --------------------- ------ --------- -------------- --------- --------------
## grades 1 to 4 70 7.00 7.00 7.00 7.00
## grades 13 to 16 223 22.30 29.30 22.30 29.30
## grades 5 to 8 299 29.90 59.20 29.90 59.20
## grades 9 to 12 408 40.80 100.00 40.80 100.00
## <NA> 0 0.00 100.00
## Total 1000 100.00 100.00 100.00 100.00
#since the NA is 0%, and for grade 4 the total % is 7% the mean =7%.
#5. Comparing means for different groups
#Let’s calculate mmeans of a variety of variables
#for black and white workers so that you can describe
#differences between the two groups of workers:
opm94$race %>% table()
## .
## American Indian Asian Black Hispanic White
## 17 31 175 49 728
opm94 %>% filter(race == "White") %>% select(sal) %>% summarise(mean_sal_white = mean(sal, na.rm = T))
## mean_sal_white
## 1 43294.39
opm94 %>% filter(race == "Black") %>% select(sal) %>% summarise(mean_sal_black = mean(sal, na.rm = T))
## mean_sal_black
## 1 32712.78
opm94 %>% filter(race == "White") %>% select(edyrs) %>% summarise(mean_edyrs_white = mean(edyrs, na.rm = T))
## mean_edyrs_white
## 1 14.57692
opm94 %>% filter(race == "Black") %>% select(edyrs) %>% summarise(mean_edyrs_black = mean(edyrs, na.rm = T))
## mean_edyrs_black
## 1 13.6
opm94 %>% select(race, sal) %>% group_by(race) %>% summarise(mean_sal = mean(sal, na.rm = T))
## # A tibble: 5 x 2
## race mean_sal
## <fct> <dbl>
## 1 American Indian 32846.
## 2 Asian 38440.
## 3 Black 32713.
## 4 Hispanic 36500.
## 5 White 43294.
opm94 %>% select(race, edyrs) %>% group_by(race) %>% summarise(mean_edyrs = mean(edyrs, na.rm = T))
## # A tibble: 5 x 2
## race mean_edyrs
## <fct> <dbl>
## 1 American Indian 13.5
## 2 Asian 14.7
## 3 Black 13.6
## 4 Hispanic 14.1
## 5 White 14.6
#opm94 %>% select(race, grade) %>% group_by(race) %>% summarise(mean_grade = mean(grade, na.rm = T))
#opm94 %>% select(race, promo01) %>% group_by(race) %>% summarise(mean_promo01 = mean(promo01, na.rm = T))
#opm94 %>% select(race, supmgr01) %>% group_by(race) %>% summarise(mean_supmgr01 = mean(supmgr01, na.rm = T))
#Question 5: Do whites receive higher rewards (e.g., salaries, grades, supervisory status,
# promotions) than minorities? Do differences in education and federal experience seem to be
#partly responsible for these patterns? Write a paragraph discussing differences between the
#groups (be specific about which groups you compare).
#Whites do receive higher rewards, because they have the highest mean salary. Blacks come in to be the
#second most salary. Education could have something to do with this as well as the enviornment.
#The people who are more educated will get more rewards and the people who are more educated
#are in a better enviornment.