Computer Assignment #3

#Setup

library(dplyr)         # for manipulating data

## Warning: package 'dplyr' was built under R version 3.6.2

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2)       # for making graphs

## Warning: package 'ggplot2' was built under R version 3.6.2

library(knitr)         # for nicer table formatting

## Warning: package 'knitr' was built under R version 3.6.2

library(summarytools)  # for frequency distribution tables

## Warning: package 'summarytools' was built under R version 3.6.2

## Registered S3 method overwritten by 'pryr':
##   method      from
##   print.bytes Rcpp

# Load the OPM 1994 workspace directly from its folder instead of calling
# setwd(), so running the script does not change the session's working
# directory as a side effect. The workspace is expected to supply the `opm94`
# data frame that the rest of the script uses.
data_dir <- "C:/Users/ramin/Desktop/2020 winter/Data Analysis/computer asssignment 3/dataset"
load(file.path(data_dir, "OPM94.RData"))

# Keep only the American Indian women from opm94 (both conditions must hold),
# then print that subset as a pander table with split.table = Inf.
opm94AIF <- filter(opm94, race == "American Indian" & male == "female")
pander::pander(opm94AIF, split.table = Inf)

x	sal	grade	patco	major	age	male	vet	handvet	hand	yos	edyrs	promo	exit	supmgr	race	minority	grade4	promo01	supmgr01
256	49401	13	Administrative		42	female	no	no	no	14	13	no	no	yes	American Indian	1	grades 13 to 16	0	1
257	25672	5	Technical		31	female	no	no	no	6	13	no	no	no	American Indian	1	grades 5 to 8	0	0
258	23316	5	Clerical		46	female	no	no	no	16	12	no	no	no	American Indian	1	grades 5 to 8	0	0
259	45697	12	Administrative		53	female	no	no	yes	23	15	no	no	yes	American Indian	1	grades 9 to 12	0	1
260	45383	9	Professional		57	female	no	no	no	36	12	no	no	no	American Indian	1	grades 9 to 12	0	0
261	24576	5	Technical		62	female	no	no	no	38	10	no	no	no	American Indian	1	grades 5 to 8	0	0
262	20166	5	Clerical		33	female	no	no	no	6	13	no	no	no	American Indian	1	grades 5 to 8	0	0
263	42751	11	Professional	PUBAF	43	female	no	no	no	16	18	no	no	no	American Indian	1	grades 9 to 12	0	0
264	24585	6	Administrative		53	female	no	no	no	18	15	yes	no	no	American Indian	1	grades 5 to 8	1	0
265	20796	5	Technical		32	female	no	no	no	10	13	no	no	no	American Indian	1	grades 5 to 8	0	0

# Narrow the subset to the five variables analysed below and display it.
opm94AIF <- select(opm94AIF, age, edyrs, grade, promo01, supmgr01)
opm94AIF

##    age edyrs grade promo01 supmgr01
## 1   42    13    13       0        1
## 2   31    13     5       0        0
## 3   46    12     5       0        0
## 4   53    15    12       0        1
## 5   57    12     9       0        0
## 6   62    10     5       0        0
## 7   33    13     5       0        0
## 8   43    18    11       0        0
## 9   53    15     6       1        0
## 10  32    13     5       0        0

# opm94AIF was already reduced to age, edyrs, grade, promo01 and supmgr01, so
# summary() can be applied to it directly without selecting the columns again.
summary(opm94AIF)

##       age            edyrs           grade         promo01       supmgr01  
##  Min.   :31.00   Min.   :10.00   Min.   : 5.0   Min.   :0.0   Min.   :0.0  
##  1st Qu.:35.25   1st Qu.:12.25   1st Qu.: 5.0   1st Qu.:0.0   1st Qu.:0.0  
##  Median :44.50   Median :13.00   Median : 5.5   Median :0.0   Median :0.0  
##  Mean   :45.20   Mean   :13.40   Mean   : 7.6   Mean   :0.1   Mean   :0.2  
##  3rd Qu.:53.00   3rd Qu.:14.50   3rd Qu.:10.5   3rd Qu.:0.0   3rd Qu.:0.0  
##  Max.   :62.00   Max.   :18.00   Max.   :13.0   Max.   :1.0   Max.   :1.0

# Per-variable summary (Min, quartiles, Median, Mean, Max) from the descr package.
descr::descr(opm94AIF)

## 
## age
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   31.00   35.25   44.50   45.20   53.00   62.00 
## 
## edyrs
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   10.00   12.25   13.00   13.40   14.50   18.00 
## 
## grade
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     5.0     5.0     5.5     7.6    10.5    13.0 
## 
## promo01
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.0     0.0     0.0     0.1     0.0     1.0 
## 
## supmgr01
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.0     0.0     0.0     0.2     0.0     1.0

# Extended descriptives from summarytools: besides the mean and quartiles it
# reports Std.Dev, MAD, IQR, CV, skewness, kurtosis and valid-case counts.
summarytools::descr(opm94AIF)

## Descriptive Statistics  
## opm94AIF  
## N: 10  
## 
##                        age    edyrs    grade   promo01   supmgr01
## ----------------- -------- -------- -------- --------- ----------
##              Mean    45.20    13.40     7.60      0.10       0.20
##           Std.Dev    10.97     2.17     3.31      0.32       0.42
##               Min    31.00    10.00     5.00      0.00       0.00
##                Q1    33.00    12.00     5.00      0.00       0.00
##            Median    44.50    13.00     5.50      0.00       0.00
##                Q3    53.00    15.00    11.00      0.00       0.00
##               Max    62.00    18.00    13.00      1.00       1.00
##               MAD    14.83     1.48     0.74      0.00       0.00
##               IQR    17.75     2.25     5.50      0.00       0.00
##                CV     0.24     0.16     0.44      3.16       2.11
##          Skewness     0.02     0.59     0.53      2.28       1.28
##       SE.Skewness     0.69     0.69     0.69      0.69       0.69
##          Kurtosis    -1.62    -0.29    -1.66      3.57      -0.37
##           N.Valid    10.00    10.00    10.00     10.00      10.00
##         Pct.Valid   100.00   100.00   100.00    100.00     100.00

#1. Calculating the mode, median, mean, range, variance, and standard deviation

###QUESTION 1.1: Which of the three outputs for descriptive statistics do you find the most useful? Explain

##I find summarytools::descr(opm94AIF) most useful, because it seems to be the most well organized.

age <- opm94AIF$age  # save the values in a new variable with the name `age` for less typing

# Tabulate the saved vector rather than retyping the ten ages by hand, so the
# table cannot drift from the data. dnn = NULL keeps the header line blank.
age_counts <- table(age, dnn = NULL)
age_counts    # figure out the mode from the table or use which.max()

## 
## 31 32 33 42 43 46 53 57 62 
##  1  1  1  1  1  1  2  1  1

# which.max() returns the POSITION of the largest count (7 here) and carries
# the age only as its name, so convert that name back to a number to get the
# mode itself. If several ages tie, only the first one is reported.
as.numeric(names(which.max(age_counts)))

## [1] 53

# Median of age. With an even number of cases (10) it is the average of the
# 5th and 6th ordered values: (43 + 46) / 2 = 44.5. Uses the saved `age`
# vector instead of a retyped copy of the data.
sort(age) # find the median from the ordered vector or use R function median()

##  [1] 31 32 33 42 43 46 53 53 57 62

median(age)

## [1] 44.5

# Mean of age obtained three equivalent ways: hand arithmetic on the ten
# values, sum divided by the number of cases, and the built-in mean().
sum(42, 31, 46, 53, 57, 62, 33, 43, 53, 32) / 10        # or

## [1] 45.2

sum(age) / length(age)

## [1] 45.2

mean(age)

## [1] 45.2

# Range of age: read the smallest and largest values off the ends of the
# ordered vector, or let range() return them. Uses the saved `age` vector
# instead of a retyped copy of the data.
sort(age)   # or

##  [1] 31 32 33 42 43 46 53 53 57 62

range(age)

## [1] 31 62

##Variance = SSD/(n-1)

# Step-by-step sample variance: raw values, deviations from the mean, squared
# deviations, then their sum divided by n - 1. n is taken from length(age)
# rather than hard-coded as 10, so the formula stays correct for any subset.
age

##  [1] 42 31 46 53 57 62 33 43 53 32

age - mean(age)

##  [1]  -3.2 -14.2   0.8   7.8  11.8  16.8 -12.2  -2.2   7.8 -13.2

(age - mean(age))^2

##  [1]  10.24 201.64   0.64  60.84 139.24 282.24 148.84   4.84  60.84 174.24

sum((age - mean(age))^2) / (length(age) - 1)

## [1] 120.4

var(age)

## [1] 120.4

###SD = sqrt(var)

# Sample standard deviation by hand (square root of SSD / (n - 1)), checked
# against sd(). n comes from length(age) instead of a hard-coded 10.
sqrt(sum((age - mean(age))^2) / (length(age) - 1))

## [1] 10.97269

sd(age)

## [1] 10.97269

##QUESTION 1.2: Do the manually calculated results match the descriptive statistics in the tables above in section 1.1?
  
  ##Yes, they all match.

###QUESTION 1.3: Similarly, compute (as appropriate) the mode, median, mean, range, variance, 
#and standard deviation for variables 
# edyrs and supmgr01 
#(opm94AIF$edyrs: 13 13 12 15 12 10 13 18 15 13, 
#opm94AIF$supmgr01: 1 0 0 1 0 0 0 0 0 0 ) listed for American Indian females. 
#Check your results against the output in 1.1.

# edyrs - (10,12,12,13,13,13,13,15,15,18) range = 10-18, median = 13, mode = 13, mean = 13.4, var = 42.4/9 = 4.71, std dev = 2.17.
#supmgr01 - (0,0,0,0,0,0,0,0,1,1) range = 0-1, median = 0, mode = 0, mean = .2, var = 1.6/9 = .178, std dev = .42. Using the sample formula SSD/(n-1), the results match the output in 1.1.

# 2. Calculating mode, median, mean for grouped data
#Let’s generate grouped data (frequency table) that you will use for calculating statistics (mode, median, mean) for variable edyrs from the full dataset opm94:
  
  # Frequency table (counts, percentages, cumulative percentages) of years of
  # education for the full opm94 dataset; this is the grouped data used below.
  summarytools::freq(opm94$edyrs)   # grouped data

## Frequencies  
## opm94$edyrs  
## 
##               Freq   % Valid   % Valid Cum.   % Total   % Total Cum.
## ----------- ------ --------- -------------- --------- --------------
##          10     12      1.20           1.20      1.20           1.20
##          12    330     33.00          34.20     33.00          34.20
##          13    101     10.10          44.30     10.10          44.30
##          14     98      9.80          54.10      9.80          54.10
##          15     39      3.90          58.00      3.90          58.00
##          16    290     29.00          87.00     29.00          87.00
##          18    112     11.20          98.20     11.20          98.20
##          20     18      1.80         100.00      1.80         100.00
##        <NA>      0                               0.00         100.00
##       Total   1000    100.00         100.00    100.00         100.00

  # Min, quartiles, median, mean and max of years of education, for checking
  # the values worked out from the frequency table.
  summary(opm94[["edyrs"]])  # summary statistics by R

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   10.00   12.00   14.00   14.37   16.00   20.00

  ##  QUESTION 2.1: Similarly to the example above, find the mode, median, mean 
  #for variables yos and supmgr01 using the grouped data:
    
    # supmgr01 - mode =0 , median =0 , mean =.2

#3. Calculating mode, median, mean for grouped data (dummy variables)
  #QUESTION 3: Male01 and exit01 are dummy variables. 
  #(They only have two possible values, o and 1.) 
  #For each, compare its mean to the percentage of cases with the value 1. 
  #How are these two measures related?
    
    #Percentage of cases with the value 0 and 1 for male01:
  # Percentage of cases at each value of the male01 dummy: counts are turned
  # into proportions first and then scaled to percent.
  prop.table(table(opm94$male01)) * 100

## 
##    0    1 
## 48.8 51.2

  # Mean of the same dummy, for comparison with the percentage coded 1.
  mean(opm94$male01)

## [1] 0.512

#  These two measures are the same quantity: the mean of a 0/1 dummy variable is the proportion of cases coded 1, so the mean of .512 is the 51.2% expressed as a proportion.

#4. Calculating the mean for grouped data formulas for intervals
 # QUESTION 4: Using the Frequencies output for the entire data set 
  #(and the grouped data formulas for intervals), calculate the mean 
  #grade, using GRADE4 instead of grade.Calculate means using the midpoint
  #of each interval of grade4
  
  # Frequency table of the grade4 intervals. The factor levels print in
  # alphabetical order ("grades 13 to 16" comes before "grades 5 to 8"), so the
  # cumulative percentages are not in grade order.
  freq(opm94$grade4)

## Frequencies  
## opm94$grade4  
## Type: Factor  
## 
##                         Freq   % Valid   % Valid Cum.   % Total   % Total Cum.
## --------------------- ------ --------- -------------- --------- --------------
##         grades 1 to 4     70      7.00           7.00      7.00           7.00
##       grades 13 to 16    223     22.30          29.30     22.30          29.30
##         grades 5 to 8    299     29.90          59.20     29.90          59.20
##        grades 9 to 12    408     40.80         100.00     40.80         100.00
##                  <NA>      0                               0.00         100.00
##                 Total   1000    100.00         100.00    100.00         100.00

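#Using the interval midpoints (2.5, 6.5, 10.5, 14.5) weighted by the frequencies above: mean = (70*2.5 + 299*6.5 + 408*10.5 + 223*14.5)/1000 = 9636/1000 = 9.64. There are no missing cases (NA = 0), so all 1000 cases are used.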

#5. Comparing means for different groups
  #Let’s calculate means of a variety of variables 
  #for black and white workers so that you can describe 
  #differences between the two groups of workers:
  
  # Number of cases in each race category; because the vector is piped into
  # table(), the printed header is "." rather than a variable name.
  opm94$race %>% table()

## .
## American Indian           Asian           Black        Hispanic           White 
##              17              31             175              49             728

  # Mean salary and mean years of education for White and for Black workers.
  # na.rm is spelled TRUE (T is an ordinary variable that can be reassigned),
  # and the intermediate select() is dropped because summarise() already
  # returns only the summary column.
  opm94 %>%
    filter(race == "White") %>%
    summarise(mean_sal_white = mean(sal, na.rm = TRUE))

##   mean_sal_white
## 1       43294.39

  opm94 %>%
    filter(race == "Black") %>%
    summarise(mean_sal_black = mean(sal, na.rm = TRUE))

##   mean_sal_black
## 1       32712.78

  opm94 %>%
    filter(race == "White") %>%
    summarise(mean_edyrs_white = mean(edyrs, na.rm = TRUE))

##   mean_edyrs_white
## 1         14.57692

  opm94 %>%
    filter(race == "Black") %>%
    summarise(mean_edyrs_black = mean(edyrs, na.rm = TRUE))

##   mean_edyrs_black
## 1             13.6

  # Mean salary and mean years of education for every race group at once.
  # na.rm is spelled TRUE (T is an ordinary variable that can be reassigned),
  # and the select() step is dropped because group_by() + summarise() already
  # return only the grouping column and the summary column.
  opm94 %>%
    group_by(race) %>%
    summarise(mean_sal = mean(sal, na.rm = TRUE))

## # A tibble: 5 x 2
##   race            mean_sal
##   <fct>              <dbl>
## 1 American Indian   32846.
## 2 Asian             38440.
## 3 Black             32713.
## 4 Hispanic          36500.
## 5 White             43294.

  opm94 %>%
    group_by(race) %>%
    summarise(mean_edyrs = mean(edyrs, na.rm = TRUE))

## # A tibble: 5 x 2
##   race            mean_edyrs
##   <fct>                <dbl>
## 1 American Indian       13.5
## 2 Asian                 14.7
## 3 Black                 13.6
## 4 Hispanic              14.1
## 5 White                 14.6

  #opm94 %>% select(race, grade) %>% group_by(race) %>% summarise(mean_grade = mean(grade, na.rm = T))
  #opm94 %>% select(race, promo01) %>% group_by(race) %>%  summarise(mean_promo01 = mean(promo01, na.rm = T))
  #opm94 %>% select(race, supmgr01) %>% group_by(race) %>%  summarise(mean_supmgr01 = mean(supmgr01, na.rm = T))  
  
  #Question 5: Do whites receive higher rewards (e.g., salaries, grades, supervisory status, 
 # promotions) than minorities? Do differences in education and federal experience seem to be 
#partly responsible for these patterns? Write a paragraph discussing differences between the 
#groups (be specific about which groups you compare).
  
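  #Whites receive the highest mean salary ($43,294), followed by Asians ($38,440) and Hispanics
  #($36,500); American Indians ($32,846) and Blacks ($32,713) have the lowest mean salaries.
  #Education could be partly responsible: Whites average 14.6 years versus 13.6 for Blacks and
  #13.5 for American Indians, but Asians average slightly more education (14.7 years) than
  #Whites and still earn less, so education alone does not explain the gap. Environment may
  #also play a role: people who are more educated tend to get more rewards and tend to be
  #in a better environment.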

Computer Assignment #3

Ramin Alibeik

2/19/2020