1. Setting

Unemployment: dataframe consisting of a cross-section from 1993 of 452 observations of unemployed individuals.

Categorical IV: race = one of nonwhite, white. Categorical IV: reason = reason for unemployment, one of new (new entrant), lose (job loser), leave (job leaver), reentr (labor force reentrant). Continuous DV: duration = duration of first spell of unemployment, t, in weeks.

This data is from the Ecdat package. I downloaded this package along with Ecfun.

library("Ecdat")

## Loading required package: Ecfun

## 
## Attaching package: 'Ecfun'

## The following object is masked from 'package:base':
## 
##     sign

## 
## Attaching package: 'Ecdat'

## The following object is masked from 'package:datasets':
## 
##     Orange

data("Unemployment")

Below is the first 6 rows of data:

head(Unemployment)

##   duration spell     race    sex reason search pubemp ftp1 ftp2 ftp3 ftp4
## 1        4     1    white   male reentr    yes    yes    1    0    0    0
## 2        7     0    white   male   lose     no     no    1    1    1    1
## 3        1     0 nonwhite   male   lose     no     no    0    0    0    0
## 4        1     1 nonwhite   male reentr     no     no    0    1    0    0
## 5        3     1 nonwhite female reentr     no     no    0    0    0    0
## 6        1     1    white female reentr     no     no    0    0    0    0
##   nobs
## 1    1
## 2    2
## 3    1
## 4    1
## 5    1
## 6    1

Below is the summary of the Unemployment dataframe:

summary(Unemployment)

##     duration          spell              race         sex     
##  Min.   :  0.00   Min.   :0.0000   nonwhite:117   male  :242  
##  1st Qu.:  4.00   1st Qu.:0.0000   white   :335   female:210  
##  Median : 10.00   Median :1.0000                              
##  Mean   : 18.51   Mean   :0.5664                              
##  3rd Qu.: 22.25   3rd Qu.:1.0000                              
##  Max.   :117.00   Max.   :1.0000                              
##     reason    search    pubemp         ftp1             ftp2       
##  new   : 41   no :309   no :360   Min.   :0.0000   Min.   :0.0000  
##  lose  :171   yes:143   yes: 92   1st Qu.:0.0000   1st Qu.:0.0000  
##  leave : 92                       Median :1.0000   Median :0.0000  
##  reentr:148                       Mean   :0.6726   Mean   :0.4336  
##                                   3rd Qu.:1.0000   3rd Qu.:1.0000  
##                                   Max.   :1.0000   Max.   :1.0000  
##       ftp3            ftp4             nobs      
##  Min.   :0.000   Min.   :0.0000   Min.   :1.000  
##  1st Qu.:0.000   1st Qu.:0.0000   1st Qu.:1.000  
##  Median :0.000   Median :0.0000   Median :1.000  
##  Mean   :0.354   Mean   :0.3053   Mean   :1.788  
##  3rd Qu.:1.000   3rd Qu.:1.0000   3rd Qu.:2.000  
##  Max.   :1.000   Max.   :1.0000   Max.   :4.000

Factors and Levels

The following 2 factors will be used in the experiment: 1. race 2. reason

str(Unemployment)

## 'data.frame':    452 obs. of  12 variables:
##  $ duration: int  4 7 1 1 3 1 65 4 113 9 ...
##  $ spell   : int  1 0 0 1 1 1 0 0 0 1 ...
##  $ race    : Factor w/ 2 levels "nonwhite","white": 2 2 1 1 1 2 2 2 2 2 ...
##  $ sex     : Factor w/ 2 levels "male","female": 1 1 1 1 2 2 1 2 2 1 ...
##  $ reason  : Factor w/ 4 levels "new","lose","leave",..: 4 2 2 4 4 4 2 4 4 3 ...
##  $ search  : Factor w/ 2 levels "no","yes": 2 1 1 1 1 1 2 1 1 2 ...
##  $ pubemp  : Factor w/ 2 levels "no","yes": 2 1 1 1 1 1 2 1 1 1 ...
##  $ ftp1    : int  1 1 0 0 0 0 1 0 0 1 ...
##  $ ftp2    : int  0 1 0 1 0 0 1 0 0 0 ...
##  $ ftp3    : int  0 1 0 0 0 0 1 0 0 0 ...
##  $ ftp4    : int  0 1 0 0 0 0 1 0 0 1 ...
##  $ nobs    : int  1 2 1 1 1 1 4 1 3 1 ...

levels(Unemployment$race)

## [1] "nonwhite" "white"

levels(Unemployment$reason)

## [1] "new"    "lose"   "leave"  "reentr"

Continuous Variables

There is one continuous variable in this experiment, the number of weeks of unemployment, duration.

Response Variable

The response variable for this experiment is the number of weeks unemployed, duration.

The Data: How is it organized and what does it mean?

There are a total of 12 variables, but this experiment will focus on 3. Here is the first 6 rows of data and a summary of the data:

head(Unemployment)

##   duration spell     race    sex reason search pubemp ftp1 ftp2 ftp3 ftp4
## 1        4     1    white   male reentr    yes    yes    1    0    0    0
## 2        7     0    white   male   lose     no     no    1    1    1    1
## 3        1     0 nonwhite   male   lose     no     no    0    0    0    0
## 4        1     1 nonwhite   male reentr     no     no    0    1    0    0
## 5        3     1 nonwhite female reentr     no     no    0    0    0    0
## 6        1     1    white female reentr     no     no    0    0    0    0
##   nobs
## 1    1
## 2    2
## 3    1
## 4    1
## 5    1
## 6    1

summary(Unemployment)

##     duration          spell              race         sex     
##  Min.   :  0.00   Min.   :0.0000   nonwhite:117   male  :242  
##  1st Qu.:  4.00   1st Qu.:0.0000   white   :335   female:210  
##  Median : 10.00   Median :1.0000                              
##  Mean   : 18.51   Mean   :0.5664                              
##  3rd Qu.: 22.25   3rd Qu.:1.0000                              
##  Max.   :117.00   Max.   :1.0000                              
##     reason    search    pubemp         ftp1             ftp2       
##  new   : 41   no :309   no :360   Min.   :0.0000   Min.   :0.0000  
##  lose  :171   yes:143   yes: 92   1st Qu.:0.0000   1st Qu.:0.0000  
##  leave : 92                       Median :1.0000   Median :0.0000  
##  reentr:148                       Mean   :0.6726   Mean   :0.4336  
##                                   3rd Qu.:1.0000   3rd Qu.:1.0000  
##                                   Max.   :1.0000   Max.   :1.0000  
##       ftp3            ftp4             nobs      
##  Min.   :0.000   Min.   :0.0000   Min.   :1.000  
##  1st Qu.:0.000   1st Qu.:0.0000   1st Qu.:1.000  
##  Median :0.000   Median :0.0000   Median :1.000  
##  Mean   :0.354   Mean   :0.3053   Mean   :1.788  
##  3rd Qu.:1.000   3rd Qu.:1.0000   3rd Qu.:2.000  
##  Max.   :1.000   Max.   :1.0000   Max.   :4.000

2. (Experimental) Design

How will the experiment be organized and conducted to test the hypothesis?

This experiment will use two factors, each with 2 or more levels. The experiment will look at the effect of one of these factors on the duration of unemployment, and will block on the other factor. The null hypothesis of this experiment is that the duration of unemployment is not affected by race.

A Type I error rate α of 0.05 was used for this analysis, as it is the most common cutoff used to determine statistical significance. A Type II error rate β of 0.2 was used because we typically prefer a power (1 - β) of 0.8 or greater. Increasing the power increases the sample size required to draw a statistically significant conclusion.

What is the rationale for this design?

The rationale for this design is to practice blocking in an experiment and to explore some of the ways that we can evaluate an experiment without using NHST.

Randomize: What is the randomization scheme?

There was no information submitted with the data that gave insight to the randomization scheme. By looking at the data and the patterns we can visually see it seems as though the data was collected somewhat randomly where there are not large amounts of “groupings” of durations.

Replicate: Are there replicates and/or repeated measures

In this dataset there are repeated measures. The variables ftp1, ftp2, ftp3, and ftp4 each record whether the individual is searching for a job at survey 1, 2, 3, or 4, respectively.

Blocking: Did you use blocking

The data set had more than the three variables that I included in this experiment, so in that sense I used blocking to disregard these variables. There may be some variation in the response variable caused by one of the variables that is not one of the 3 variables I chose. This makes that variable a nuisance factor and therefore by not including it in the experiment, I am blocking.

Specifically out of the three variables that I focused on, I purposely blocked one of the variables for the sake of blocking.

Determining Sample Size Using GPower: In order to determine the sample size using GPower, we need to know the input parameters: effect size f, error probability α (.05), power (.8), and number of groups (2). We will estimate the effect size by using Cohen’s d, which we can find using the effsize package.

# effsize supplies cohen.d() for standardized mean differences.
library(effsize)
# Cohen's d for duration, grouped by the two-level race factor.
cohen.d(Unemployment$duration, Unemployment$race)

## 
## Cohen's d
## 
## d estimate: 0.02205959 (negligible)
## 95 percent confidence interval:
##        inf        sup 
## -0.1894572  0.2335763

Caption for the picture.

As you can see above, the sample size suggestion is extremely large. Because of this I decided to stay with the entire dataset to keep the power as high as possible.

If the output had been for a smaller sample size, of say 50 then we would run this code in order to select that sample size:

# Seeded draw of row indices, then subset the data frame with them.
sample_size <- 50
set.seed(2)
sampled_rows <- sample.int(nrow(Unemployment), size = sample_size)
Sample0 <- Unemployment[sampled_rows, ]

3. (Statistical) Analysis

(Exploratory Data Analysis) Graphics and Descriptive Summary

Below you can see the summary of our data again, as well as a histogram of the response variable, duration in order to get a better feel for its distribution.

summary(Unemployment)

##     duration          spell              race         sex     
##  Min.   :  0.00   Min.   :0.0000   nonwhite:117   male  :242  
##  1st Qu.:  4.00   1st Qu.:0.0000   white   :335   female:210  
##  Median : 10.00   Median :1.0000                              
##  Mean   : 18.51   Mean   :0.5664                              
##  3rd Qu.: 22.25   3rd Qu.:1.0000                              
##  Max.   :117.00   Max.   :1.0000                              
##     reason    search    pubemp         ftp1             ftp2       
##  new   : 41   no :309   no :360   Min.   :0.0000   Min.   :0.0000  
##  lose  :171   yes:143   yes: 92   1st Qu.:0.0000   1st Qu.:0.0000  
##  leave : 92                       Median :1.0000   Median :0.0000  
##  reentr:148                       Mean   :0.6726   Mean   :0.4336  
##                                   3rd Qu.:1.0000   3rd Qu.:1.0000  
##                                   Max.   :1.0000   Max.   :1.0000  
##       ftp3            ftp4             nobs      
##  Min.   :0.000   Min.   :0.0000   Min.   :1.000  
##  1st Qu.:0.000   1st Qu.:0.0000   1st Qu.:1.000  
##  Median :0.000   Median :0.0000   Median :1.000  
##  Mean   :0.354   Mean   :0.3053   Mean   :1.788  
##  3rd Qu.:1.000   3rd Qu.:1.0000   3rd Qu.:2.000  
##  Max.   :1.000   Max.   :1.0000   Max.   :4.000

# Histogram of the response variable to show its distribution.
hist(
  x = Unemployment$duration,
  main = "Frequencies of Duration of Unemployment",
  xlab = "Duration"
)

If you look at the histogram above you can see that this data set is clearly positively skewed (right-skewed). When we take into account what that means for this data (the majority of individuals had a short duration of unemployment), we can say this truly is a positive distribution (positive meaning good).

# Split the response by race once, then summarise each group.
duration_by_race <- split(Unemployment$duration, Unemployment$race)

# Group means and the main effect of race (white minus nonwhite).
m1 <- mean(duration_by_race[["nonwhite"]])
m2 <- mean(duration_by_race[["white"]])
me <- m2 - m1

# Group standard deviations.
sd1 <- sd(duration_by_race[["nonwhite"]])
sd2 <- sd(duration_by_race[["white"]])

I will be using boxplots in order to analyze the main effects visually. The box plots give a visual indication of whether a factor has an effect on the response.

# Side-by-side boxplots of duration for each race group.
boxplot(
  duration ~ race,
  data = Unemployment,
  xlab = "Race",
  ylab = "Duration of Unemployment",
  main = "Race: nonwhite, white"
)

As you can see from the calculation and the visual boxplot above, while there is not a huge change in an individual's duration of unemployment based on their race, there are overall more nonwhite individuals consistently staying unemployed for longer periods of time. That being said, we can still see that there are numerous outliers among whites who are also suffering from longer-lasting unemployment.

ANOVA

# Additive two-factor ANOVA: duration modelled by race plus reason (no interaction term).
MyAOV <- aov(Unemployment$duration ~ Unemployment$race+Unemployment$reason)
# Print the ANOVA table for the fitted model.
# NOTE(review): the table is sequential, so race is assessed before reason
# enters the model - confirm this term order is intended for a blocking factor.
anova(MyAOV)

## Analysis of Variance Table
## 
## Response: Unemployment$duration
##                      Df Sum Sq Mean Sq F value Pr(>F)
## Unemployment$race     1     23   22.54  0.0425 0.8368
## Unemployment$reason   3   3061 1020.42  1.9225 0.1251
## Residuals           447 237259  530.78

Model Adequacy Checking

A normal Q-Q plot and residuals plots will be able to help prove if the dataset meets the assumptions needed to understand our ANOVA model. As you can see in the plot below, the data deviates from the normal distribution assumption.

# Normal Q-Q plot of the ANOVA residuals with a reference line.
aov_residuals <- residuals(MyAOV)
qqnorm(aov_residuals, main = "Normal Q-Q Plot")
qqline(aov_residuals)

As you can see in the residuals plot below, there seems to be some systematic variation from the model in the form of linearity.

# Residuals against fitted values for the ANOVA model.
plot(x = fitted(MyAOV), y = residuals(MyAOV))

4. Alternatives to Null Hypothesis Statistical Testing

There are many alternatives to NHST; in this section we will look into multiple models and confidence intervals.

Multiple Models

Using multiple models means fitting different models to show that a result is significant beyond randomization. There should be other models where you do not receive the same result.

# Four distinct specifications for duration, each followed by a normal Q-Q plot
# and a residuals-vs-fitted plot.
# Note: in an R formula `^` means term crossing, not arithmetic, so squaring a
# factor (e.g. `(race)^2`) adds nothing beyond the factor itself. The models
# below therefore differ by which terms they contain.

# lm1: race, reason and their interaction.
lm1 <- lm(duration ~ race * reason, data = Unemployment)
qqnorm(residuals(lm1))

plot(fitted(lm1), residuals(lm1))

# lm2: race only (blocking factor left out).
lm2 <- lm(duration ~ race, data = Unemployment)
qqnorm(residuals(lm2))

plot(fitted(lm2), residuals(lm2))

# lm3: reason only (factor of interest left out).
lm3 <- lm(duration ~ reason, data = Unemployment)
qqnorm(residuals(lm3))

plot(fitted(lm3), residuals(lm3))

# lm4: additive model with both factors and no interaction.
lm4 <- lm(duration ~ race + reason, data = Unemployment)
qqnorm(residuals(lm4))

plot(fitted(lm4), residuals(lm4))

With all four of these different models, we still see a lack of a normal distribution in the residuals, and the residuals vary systematically in a clearly linear pattern.

Confidence Intervals

As stated in our class notes, confidence intervals are used to estimate the degree to which the observed value is likely to be the true value.

We will create a sample group of 50 observations.

# Seeded draw of row indices, then subset the data frame with them.
sample_size <- 50
set.seed(2)
ci_rows <- sample.int(nrow(Unemployment), size = sample_size)
sample1 <- Unemployment[ci_rows, ]

From this sample we will find the mean and standard deviation which are needed to calculate a confidence interval.

# Sample mean and standard deviation of duration.
a <- mean(sample1$duration)
s <- sd(sample1$duration)

# Take n from the sample itself so the interval stays correct if the sample
# size is changed.
n <- nrow(sample1)
# Normal-approximation 95% interval: mean +/- z(0.975) * standard error.
error <- qnorm(.975) * s / sqrt(n)
left <- a - error
right <- a + error
left

## [1] 9.619177

right

## [1] 20.58082

We are 95% confident that the true mean lies in the interval between 9.619177 and 20.58082. Solving for the true mean:

# Mean of the full data set. Stored under its own name so the variable does
# not mask base::mean().
true_mean <- mean(Unemployment$duration)

The true mean is 18.5110619469 which falls within the confidence interval.

My Conclusions

Based on my original model, the other various models I created, and the plots I created, my conclusion is to fail to reject the null hypothesis. I cannot say confidently that race or reason affected the duration of the subjects' unemployment. I found it interesting to work with a dataset where I failed to reject the null hypothesis. I feel that most of the projects will have data where they can reject the null, but I was able to start by using NHST and then also use two other methods to show that there is not an effect.

5. Appendix

Full R code:

#Read in unemployment data
library("Ecdat")
data("Unemployment")

# Preview the first rows and print per-column summaries
head(Unemployment)
summary(Unemployment)

# Inspect column types, then list the levels of the race and reason factors
str(Unemployment)
levels(Unemployment$race)
levels(Unemployment$reason)

# Use effsize to compute Cohen's d for duration, grouped by the race factor
library(effsize)
cohen.d(Unemployment$duration, Unemployment$race)

# Random draw of 50 rows (seeded so the draw is reproducible)
sample_size <- 50
set.seed(2)
sampled_rows <- sample.int(nrow(Unemployment), size = sample_size)
Sample0 <- Unemployment[sampled_rows, ]

# Histogram of the response variable
hist(
  x = Unemployment$duration,
  main = "Frequencies of Duration of Unemployment",
  xlab = "Duration"
)

# Split the response by race once, then summarise each group
duration_by_race <- split(Unemployment$duration, Unemployment$race)

# Main effect of race: difference in group means (white minus nonwhite)
m1 <- mean(duration_by_race[["nonwhite"]])
m2 <- mean(duration_by_race[["white"]])
me <- m2 - m1

# Standard deviation of duration within each race group
sd1 <- sd(duration_by_race[["nonwhite"]])
sd2 <- sd(duration_by_race[["white"]])

# Boxplot to visualize the main effect of race
boxplot(
  duration ~ race,
  data = Unemployment,
  xlab = "Race",
  ylab = "Duration of Unemployment",
  main = "Race: nonwhite, white"
)

# ANOVA: additive model of duration on race plus reason (no interaction term)
MyAOV <- aov(Unemployment$duration ~ Unemployment$race+Unemployment$reason)
# NOTE(review): the table is sequential, so race is assessed before reason
# enters the model - confirm this term order is intended for a blocking factor.
anova(MyAOV)

# Model adequacy: normal Q-Q plot of the ANOVA residuals with a reference
# line, then residuals against fitted values
aov_residuals <- residuals(MyAOV)
qqnorm(aov_residuals, main = "Normal Q-Q Plot")
qqline(aov_residuals)
plot(x = fitted(MyAOV), y = residuals(MyAOV))

# Multiple models: four distinct specifications, each followed by a normal
# Q-Q plot and a residuals-vs-fitted plot.
# Note: in an R formula `^` means term crossing, not arithmetic, so squaring a
# factor (e.g. `(race)^2`) adds nothing beyond the factor itself. The models
# below therefore differ by which terms they contain.

# lm1: race, reason and their interaction
lm1 <- lm(duration ~ race * reason, data = Unemployment)
qqnorm(residuals(lm1))
plot(fitted(lm1), residuals(lm1))

# lm2: race only (blocking factor left out)
lm2 <- lm(duration ~ race, data = Unemployment)
qqnorm(residuals(lm2))
plot(fitted(lm2), residuals(lm2))

# lm3: reason only (factor of interest left out)
lm3 <- lm(duration ~ reason, data = Unemployment)
qqnorm(residuals(lm3))
plot(fitted(lm3), residuals(lm3))

# lm4: additive model with both factors and no interaction
lm4 <- lm(duration ~ race + reason, data = Unemployment)
qqnorm(residuals(lm4))
plot(fitted(lm4), residuals(lm4))

# Random sample for the confidence interval (seeded so the draw is reproducible)
sample_size <- 50
set.seed(2)
sample1 <- Unemployment[sample(nrow(Unemployment), sample_size), ]
a <- mean(sample1$duration)
s <- sd(sample1$duration)
# Take n from the sample itself so it cannot drift from sample_size
n <- nrow(sample1)

# Calculate the confidence interval: normal-approximation 95% interval,
# mean +/- z(0.975) * standard error
error <- qnorm(.975) * s / sqrt(n)
left <- a - error
right <- a + error
left
right

# Mean of the full data set, stored under its own name so the variable does
# not mask base::mean()
true_mean <- mean(Unemployment$duration)

Project 2 - Null Hypothesis Statistical Testing

Kristen Cole

November 7, 2016