# Amit Patel, Jose Magallanes, Scott Atherley
# Location and name of the course data set (a Stata file).
folder <- "~/Documents/CoursesMPP/AdvStats/s1"
filename <- "salary.dta"
library(foreign) # provides read.dta() for Stata .dta files
setwd(folder)
data <- read.dta(filename)
# NOTE: attach() exposes the columns as they are at this point; columns added
# to `data` afterwards are only reachable as data$<name>.
attach(data)
library(dplyr)
##
## Attaching package: 'dplyr'
##
## The following objects are masked from 'package:stats':
##
## filter, lag
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Compact overview of every column: name, type and first values.
dplyr::glimpse(data)
## Variables:
## $ id (int) 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16...
## $ salbeg (int) 8400, 24000, 10200, 8700, 17400, 12996, 6900, 5400, 5...
## $ sex (fctr) males, males, males, males, males, males, males, mal...
## $ time (int) 81, 73, 83, 93, 83, 80, 79, 67, 96, 77, 84, 88, 93, 9...
## $ age (dbl) 28.50, 40.33, 31.08, 31.17, 41.92, 29.50, 28.00, 28.7...
## $ salnow (int) 16080, 41400, 21960, 19200, 28350, 27250, 16080, 1410...
## $ edlevel (int) 16, 16, 15, 16, 19, 18, 15, 15, 15, 12, 15, 12, 17, 1...
## $ work (dbl) 0.25, 12.50, 4.08, 1.83, 13.00, 2.42, 3.17, 0.50, 1.1...
## $ jobcat (fctr) college trainee, exempt employee, exempt employee, c...
## $ minority (fctr) white, white, white, white, white, white, white, whi...
## $ sexrace (fctr) white males, white males, white males, white males, ...
library(Hmisc)
## Loading required package: grid
## Loading required package: lattice
## Loading required package: survival
## Loading required package: splines
## Loading required package: Formula
##
## Attaching package: 'Hmisc'
##
## The following objects are masked from 'package:dplyr':
##
## src, summarize
##
## The following objects are masked from 'package:base':
##
## format.pval, round.POSIXt, trunc.POSIXt, units
# Per-variable summary (n, missing, unique values, quantiles or frequencies).
Hmisc::describe(data)
## data
##
## 11 Variables 474 Observations
## ---------------------------------------------------------------------------
## id
## n missing unique Mean .05 .10 .25 .50 .75
## 474 0 474 237.5 24.65 48.30 119.25 237.50 355.75
## .90 .95
## 426.70 450.35
##
## lowest : 1 2 3 4 5, highest: 470 471 472 473 474
## ---------------------------------------------------------------------------
## salbeg
## n missing unique Mean .05 .10 .25 .50 .75
## 474 0 90 6806 4080 4380 4995 6000 6996
## .90 .95
## 11000 13200
##
## lowest : 3600 3900 4020 4080 4200
## highest: 18000 18996 21000 24000 31992
## ---------------------------------------------------------------------------
## sex
## n missing unique
## 474 0 2
##
## males (258, 54%), females (216, 46%)
## ---------------------------------------------------------------------------
## time
## n missing unique Mean .05 .10 .25 .50 .75
## 474 0 36 81.11 65 67 72 81 90
## .90 .95
## 94 97
##
## lowest : 63 64 65 66 67, highest: 94 95 96 97 98
## ---------------------------------------------------------------------------
## age
## n missing unique Mean .05 .10 .25 .50 .75
## 474 0 259 37.19 24.42 25.19 28.50 32.00 45.98
## .90 .95
## 56.84 60.67
##
## lowest : 23.00 23.25 23.33 23.42 23.58
## highest: 63.75 63.83 63.92 64.25 64.50
## ---------------------------------------------------------------------------
## salnow
## n missing unique Mean .05 .10 .25 .50 .75
## 474 0 221 13768 7797 8418 9600 11550 14775
## .90 .95
## 23757 28000
##
## lowest : 6300 6360 6480 6540 6600
## highest: 40000 41400 41500 44250 54000
## ---------------------------------------------------------------------------
## edlevel
## n missing unique Mean .05 .10 .25 .50 .75
## 474 0 10 13.49 8 8 12 12 15
## .90 .95
## 17 19
##
## 8 12 14 15 16 17 18 19 20 21
## Frequency 53 190 6 116 59 11 9 27 2 1
## % 11 40 1 24 12 2 2 6 0 0
## ---------------------------------------------------------------------------
## work
## n missing unique Mean .05 .10 .25 .50 .75
## 474 0 208 7.989 0.1105 0.4200 1.6025 4.5800 11.5600
## .90 .95
## 21.6750 26.7855
##
## lowest : 0.00 0.17 0.25 0.33 0.42
## highest: 36.50 37.00 37.58 38.33 39.67
## ---------------------------------------------------------------------------
## jobcat
## n missing unique
## 474 0 7
##
## clerical (227, 48%), office trainee (136, 29%)
## security officer (27, 6%)
## college trainee (41, 9%)
## exempt employee (32, 7%), mba trainee (5, 1%)
## technical (6, 1%)
## ---------------------------------------------------------------------------
## minority
## n missing unique
## 474 0 2
##
## white (370, 78%), nonwhite (104, 22%)
## ---------------------------------------------------------------------------
## sexrace
## n missing unique
## 474 0 4
##
## white males (194, 41%), minority males (64, 14%)
## white females (176, 37%)
## minority females (40, 8%)
## ---------------------------------------------------------------------------
# Absolute frequency of each job category.
with(data, table(jobcat))
## jobcat
## clerical office trainee security officer college trainee
## 227 136 27 41
## exempt employee mba trainee technical
## 32 5 6
# Cumulative frequency across job categories, in factor-level order.
with(data, cumsum(table(jobcat)))
## clerical office trainee security officer college trainee
## 227 363 390 431
## exempt employee mba trainee technical
## 463 468 474
# Relative frequency (share of all observations) of each job category.
with(data, prop.table(table(jobcat)))
## jobcat
## clerical office trainee security officer college trainee
## 0.47890 0.28692 0.05696 0.08650
## exempt employee mba trainee technical
## 0.06751 0.01055 0.01266
# One table with absolute, cumulative and relative frequencies side by side.
# The tabulation is done once and reused for all three columns.
with(data, {
  job_counts <- table(jobcat)
  cbind(
    Freq = job_counts,
    Cumul = cumsum(job_counts),
    relative = prop.table(job_counts)
  )
})
## Freq Cumul relative
## clerical 227 227 0.47890
## office trainee 136 363 0.28692
## security officer 27 390 0.05696
## college trainee 41 431 0.08650
## exempt employee 32 463 0.06751
## mba trainee 5 468 0.01055
## technical 6 474 0.01266
# Stata: graph box salnow, over(jobcat)
# Current salary by job category; jobcat is a factor, so plot() draws one
# box per category.
plot(salnow ~ jobcat, data = data)
# Stata: gen jobcat_L = (jobcat == 1 | jobcat == 2 | jobcat == 3)
# 0/1 indicators built from the integer codes of the jobcat factor:
#   jobcat_L -> codes 1 to 3, jobcat_M -> codes 4 to 6, jobcat_H -> code 7.
data$jobcat_L <- as.numeric(as.integer(data$jobcat) <= 3)
data$jobcat_M <- as.numeric(as.integer(data$jobcat) %in% 4:6)
data$jobcat_H <- as.numeric(as.integer(data$jobcat) == 7)
# Check the low-group indicator: counts of 0 and 1.
table(data[["jobcat_L"]])
##
## 0 1
## 84 390
# Check the middle-group indicator: counts of 0 and 1.
table(data[["jobcat_M"]])
##
## 0 1
## 396 78
# Check the high-group indicator: counts of 0 and 1.
table(data[["jobcat_H"]])
##
## 0 1
## 468 6
# Regression ----
# Stata: regress salnow sex age work minority salbeg time jobcat_L jobcat_M
# Linear model of current salary; jobcat_H is not among the regressors, so
# the jobcat_L and jobcat_M coefficients are relative to that group.
reg1 <- lm(
  salnow ~ sex + age + work + minority + salbeg + time + jobcat_L + jobcat_M,
  data = data
)
summary(reg1)
##
## Call:
## lm(formula = salnow ~ sex + age + work + minority + salbeg +
## time + jobcat_L + jobcat_M, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10508 -1307 -272 994 18624
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.18e+03 2.38e+03 2.60 0.00958 **
## sexfemales -1.08e+03 3.17e+02 -3.42 0.00069 ***
## age -3.73e+01 1.94e+01 -1.92 0.05500 .
## work -4.94e+01 2.68e+01 -1.85 0.06549 .
## minoritynonwhite -3.27e+02 3.21e+02 -1.02 0.30789
## salbeg 1.36e+00 8.09e-02 16.84 < 2e-16 ***
## time 6.42e+01 1.29e+01 4.97 9.2e-07 ***
## jobcat_L -5.42e+03 1.59e+03 -3.40 0.00073 ***
## jobcat_M -5.66e+02 1.36e+03 -0.42 0.67754
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2780 on 465 degrees of freedom
## Multiple R-squared: 0.837, Adjusted R-squared: 0.835
## F-statistic: 299 on 8 and 465 DF, p-value: <2e-16
# Descriptive Analysis Practice ----
# by sex, sort : ttest salnow == 0
# One-sample t-test of current salary against a mean of 0, males only.
# `salnow` and `sex` are the attached columns of `data`; the subsetting
# expression is printed verbatim on the "data:" line of the output.
t.test(salnow[sex == "males"], mu = 0)
##
## One Sample t-test
##
## data: salnow[sex == "males"]
## t = 34.14, df = 257, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## 15620 17533
## sample estimates:
## mean of x
## 16577
# One-sample t-test of current salary against a mean of 0, females only.
# Together with the males test this covers both groups of
# `by sex, sort : ttest salnow == 0`.
t.test(salnow[sex == "females"], mu = 0)
## NOTE(review): the output recorded at this point was an exact copy of the
## males result, because this call selected "males" a second time. Re-run to
## regenerate it. With 216 females it should report df = 215 and a
## mean of x of about 10413 (the female group mean in the two-sample test).
# Stata: ttest salnow, by(sex)
# Two-sample t-test of current salary between the two sex groups, testing a
# difference in means of 0.
t.test(salnow ~ sex, data = data, mu = 0)
##
## Welch Two Sample t-test
##
## data: salnow by sex
## t = 11.69, df = 344.3, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 5127 7201
## sample estimates:
## mean in group males mean in group females
## 16577 10413
# Stata: tabulate sex jobcat, chi2
library(gmodels) # provides CrossTable()
# Sex by job category, with Pearson's chi-squared test. TRUE is spelled out
# because T is an ordinary variable that can be reassigned.
CrossTable(sex, jobcat, chisq = TRUE)
## Warning: Chi-squared approximation may be incorrect
##
##
## Cell Contents
## |-------------------------|
## | N |
## | Chi-square contribution |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 474
##
##
## | jobcat
## sex | clerical | office trainee | security officer | college trainee | exempt employee | mba trainee | technical | Row Total |
## -------------|------------------|------------------|------------------|------------------|------------------|------------------|------------------|------------------|
## males | 110 | 47 | 27 | 34 | 30 | 4 | 6 | 258 |
## | 1.488 | 9.866 | 10.301 | 6.117 | 9.089 | 0.601 | 2.289 | |
## | 0.426 | 0.182 | 0.105 | 0.132 | 0.116 | 0.016 | 0.023 | 0.544 |
## | 0.485 | 0.346 | 1.000 | 0.829 | 0.938 | 0.800 | 1.000 | |
## | 0.232 | 0.099 | 0.057 | 0.072 | 0.063 | 0.008 | 0.013 | |
## -------------|------------------|------------------|------------------|------------------|------------------|------------------|------------------|------------------|
## females | 117 | 89 | 0 | 7 | 2 | 1 | 0 | 216 |
## | 1.777 | 11.785 | 12.304 | 7.306 | 10.857 | 0.717 | 2.734 | |
## | 0.542 | 0.412 | 0.000 | 0.032 | 0.009 | 0.005 | 0.000 | 0.456 |
## | 0.515 | 0.654 | 0.000 | 0.171 | 0.062 | 0.200 | 0.000 | |
## | 0.247 | 0.188 | 0.000 | 0.015 | 0.004 | 0.002 | 0.000 | |
## -------------|------------------|------------------|------------------|------------------|------------------|------------------|------------------|------------------|
## Column Total | 227 | 136 | 27 | 41 | 32 | 5 | 6 | 474 |
## | 0.479 | 0.287 | 0.057 | 0.086 | 0.068 | 0.011 | 0.013 | |
## -------------|------------------|------------------|------------------|------------------|------------------|------------------|------------------|------------------|
##
##
## Statistics for All Table Factors
##
##
## Pearson's Chi-squared test
## ------------------------------------------------------------
## Chi^2 = 87.23 d.f. = 6 p = 1.139e-16
##
##
##
# Stata: tabulate sex minority, chi2
# Counts only (row, column and table proportions and the chi-square
# contributions are switched off), followed by the chi-squared test.
# The argument is written in full as `chisq` rather than relying on partial
# matching of `chi`.
CrossTable(sex, minority,
  prop.r = FALSE, prop.c = FALSE, prop.t = FALSE, prop.chisq = FALSE,
  chisq = TRUE
)
##
##
## Cell Contents
## |-------------------------|
## | N |
## |-------------------------|
##
##
## Total Observations in Table: 474
##
##
## | minority
## sex | white | nonwhite | Row Total |
## -------------|-----------|-----------|-----------|
## males | 194 | 64 | 258 |
## -------------|-----------|-----------|-----------|
## females | 176 | 40 | 216 |
## -------------|-----------|-----------|-----------|
## Column Total | 370 | 104 | 474 |
## -------------|-----------|-----------|-----------|
##
##
## Statistics for All Table Factors
##
##
## Pearson's Chi-squared test
## ------------------------------------------------------------
## Chi^2 = 2.714 d.f. = 1 p = 0.09948
##
## Pearson's Chi-squared test with Yates' continuity correction
## ------------------------------------------------------------
## Chi^2 = 2.359 d.f. = 1 p = 0.1245
##
##