Data for this assignment came from the NLS Investigator, which can be accessed here.

An Analysis of Work Experience by Sex and Race Among 30 Year Olds

1.) As directed, I have created a new project in R and have downloaded the data files from the NLS into my project folder.

# Read the space-delimited NLS extract and name its columns with the NLS
# reference numbers.
new_data <- read.table("mtsassignment2.dat", sep = " ")
names(new_data) <- c(
  "R0000100", "R0536300", "R1482600",
  "T6651300", "Z9065500", "Z9065700"
)

# Handle missing values: recode every NLS non-response code to NA.
#   -1 Refused, -2 Dont know, -3 Invalid missing,
#   -4 Valid missing, -5 Non-interview
new_data[] <- lapply(new_data, function(column) {
  column[column %in% -5:-1] <- NA
  column
})

# Convert the raw numeric NLS codes into labelled factors.
#
# data: data frame holding the six reference-number columns (R0000100,
#   R0536300, R1482600, T6651300, Z9065500, Z9065700) as numeric codes.
# Returns: the same data frame with each of those columns replaced by a factor.
#
# If there are values not categorized they will be represented as NA
# (this includes values that are already NA).
vallabels <- function(data) {
  # Bins are closed on the left ([a, b)) because right = FALSE.
  # include.lowest = TRUE additionally closes the final bin on the right, so
  # the top value named in the last label (9999 / 99999999) is labelled
  # instead of silently becoming NA.
  id_breaks <- c(0, 1, seq(1000, 9000, by = 1000), 9999)
  id_labels <- c(
    "0", "1 TO 999", "1000 TO 1999", "2000 TO 2999", "3000 TO 3999",
    "4000 TO 4999", "5000 TO 5999", "6000 TO 6999", "7000 TO 7999",
    "8000 TO 8999", "9000 TO 9999"
  )

  # The two cumulative-hours variables share the same bins and labels.
  hours_breaks <- c(0, 1, seq(500, 5000, by = 500), 9.9999999E7)
  hours_labels <- c(
    "0", "1 TO 499", "500 TO 999", "1000 TO 1499", "1500 TO 1999",
    "2000 TO 2499", "2500 TO 2999", "3000 TO 3499", "3500 TO 3999",
    "4000 TO 4499", "4500 TO 4999", "5000 TO 99999999: 5000+"
  )

  data$R0000100 <- cut(
    data$R0000100, id_breaks,
    labels = id_labels, right = FALSE, include.lowest = TRUE
  )
  data$R0536300 <- factor(
    data$R0536300,
    levels = c(1, 2, 0),
    labels = c("Male", "Female", "No Information")
  )
  data$R1482600 <- factor(
    data$R1482600,
    levels = c(1, 2, 3, 4),
    labels = c(
      "Black", "Hispanic", "Mixed Race (Non-Hispanic)",
      "Non-Black / Non-Hispanic"
    )
  )
  data$T6651300 <- factor(
    data$T6651300,
    levels = 26:32,
    labels = c("26", "27", "28", "29", "30", "31", "32")
  )
  data$Z9065500 <- cut(
    data$Z9065500, hours_breaks,
    labels = hours_labels, right = FALSE, include.lowest = TRUE
  )
  data$Z9065700 <- cut(
    data$Z9065700, hours_breaks,
    labels = hours_labels, right = FALSE, include.lowest = TRUE
  )
  data
}

# NLS variable titles, listed in the same order as the column names assigned
# to new_data.
varlabels <- c(
  "PUBID - YTH ID CODE 1997",
  "KEY!SEX (SYMBOL) 1997",
  "KEY!RACE_ETHNICITY (SYMBOL) 1997",
  "CV_AGE_INT_DATE 2011",
  "CVC_HOURS_WK_TEEN",
  "CVC_HOURS_WK_ADULT"
)

# Use qnames rather than rnums
#
# data: data frame whose six columns are, in order, the respondent ID, sex,
#   race, age, teen hours and adult hours.
# Returns: a copy of data whose columns carry the short question names.
qnames <- function(data) {
  question_names <- c(
    "PUBID_1997", "sex", "race", "age", "teen_hours", "adult_hours"
  )
  setNames(data, question_names)
}

# ********************************************************************************************************

# Create a data frame called "categories" with value labels. This has to run
# before the rename below, because vallabels() looks the columns up by their
# reference numbers.
categories <- vallabels(new_data)

# Rename the variables in both data frames using Qnames instead of Reference Numbers
new_data <- qnames(new_data)
categories <- qnames(categories)

# Produce summaries for the raw (uncategorized) data file
summary(new_data)
##    PUBID_1997        sex             race            age       
##  Min.   :   1   Min.   :1.000   Min.   :1.000   Min.   :26.00  
##  1st Qu.:2249   1st Qu.:1.000   1st Qu.:1.000   1st Qu.:28.00  
##  Median :4502   Median :1.000   Median :4.000   Median :29.00  
##  Mean   :4504   Mean   :1.488   Mean   :2.788   Mean   :28.79  
##  3rd Qu.:6758   3rd Qu.:2.000   3rd Qu.:4.000   3rd Qu.:30.00  
##  Max.   :9022   Max.   :2.000   Max.   :4.000   Max.   :32.00  
##                                                 NA's   :1561   
##    teen_hours     adult_hours   
##  Min.   :    0   Min.   :    0  
##  1st Qu.: 1255   1st Qu.: 8190  
##  Median : 2741   Median :16396  
##  Mean   : 3105   Mean   :15595  
##  3rd Qu.: 4470   3rd Qu.:22418  
##  Max.   :18829   Max.   :63722  
##  NA's   :707     NA's   :1596
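# Remove the '#' before the summary() line to produce summaries for the "categories" data file.
# NOTE: "categories" was already created above. vallabels() expects the original reference-number
# column names, so the next line should stay commented out now that new_data has been renamed.
#categories <- vallabels(new_data)
#summary(categories)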

# ************************************************************************************************************

I also loaded a few packages that I anticipated needing to work with:

require(dplyr)
## Loading required package: dplyr
## 
## Attaching package: 'dplyr'
## 
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
require(magrittr)
## Loading required package: magrittr
require(ggvis)
## Loading required package: ggvis

In order to take a look at the data to see what I am working with, I created a table data frame and executed the ‘glimpse’ command to verify the variables and their names:

# Print the data as a dplyr table so that only the first rows are shown.
new_data %>% tbl_df()
## Source: local data frame [8,984 x 6]
## 
##    PUBID_1997   sex  race   age teen_hours adult_hours
##         (int) (int) (int) (int)      (int)       (int)
## 1           1     2     4    29       5831          NA
## 2           2     1     2    29         NA       29712
## 3           3     2     2    28       6489          NA
## 4           4     2     2    30       3292       23390
## 5           5     1     2    29        680       28056
## 6           6     2     2    29         NA       18379
## 7           7     1     2    28       1650          NA
## 8           8     2     4    30       2082       18419
## 9           9     1     4    29        864       19274
## 10         10     1     4    NA          0        6849
## ..        ...   ...   ...   ...        ...         ...
# Compact overview: one line per variable with its type and first few values.
glimpse(new_data)
## Observations: 8,984
## Variables: 6
## $ PUBID_1997  (int) 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,...
## $ sex         (int) 2, 1, 2, 2, 1, 2, 1, 2, 1, 1, 2, 1, 1, 1, 2, 1, 2,...
## $ race        (int) 4, 2, 2, 2, 2, 2, 2, 4, 4, 4, 2, 2, 2, 2, 2, 2, 2,...
## $ age         (int) 29, 29, 28, 30, 29, 29, 28, 30, 29, NA, 29, 29, 26...
## $ teen_hours  (int) 5831, NA, 6489, 3292, 680, NA, 1650, 2082, 864, 0,...
## $ adult_hours (int) NA, 29712, NA, 23390, 28056, 18379, NA, 18419, 192...

2.) Next I filtered the data to include data only from individuals that were 30 years old at the time that they were interviewed:

# Keep only respondents who were exactly 30 at the interview; which() drops
# rows whose age is missing.
thirty <- new_data[which(new_data$age == 30), , drop = FALSE]
tbl_df(thirty)
## Source: local data frame [1,546 x 6]
## 
##    PUBID_1997   sex  race   age teen_hours adult_hours
##         (int) (int) (int) (int)      (int)       (int)
## 1           4     2     2    30       3292       23390
## 2           8     2     4    30       2082       18419
## 3          26     1     1    30        760           0
## 4          27     1     1    30       1592       11060
## 5          32     2     4    30       4611       16981
## 6          33     2     4    30       1862       26551
## 7          38     2     4    30          0          NA
## 8          55     2     2    30       1368       16934
## 9          59     2     1    30       5648          NA
## 10         68     1     1    30       1316          NA
## ..        ...   ...   ...   ...        ...         ...

Note that there are 1,546 individuals that were 30 years of age when they were interviewed. This will be our sample for the remainder of the analysis.

3.) Here I tested the null hypothesis that there is no difference, by sex, in the cumulative hours that the 30 year olds worked between the ages of 14 and 19. My null and alternative hypotheses are stated below:

Null Hypothesis: There is no statistically significant difference, by sex, between the mean cumulative hours worked between the ages of 14 and 19 by respondents who were 30 years old at the time of the interview.

Alternate Hypothesis: Sex has a statistically significant impact on the mean cumulative hours worked between the ages of 14 and 19 by respondents who were 30 years old at the time of the interview.

I noticed that I had some missing data (which NLS has denoted with negative values) and some extreme values, so I trimmed the data in order to give myself the most accurate means. To help with this, I visualized the data with ggvis:

# Histogram of cumulative teen work hours for the 30-year-old sample.
layer_histograms(ggvis(thirty, ~teen_hours))
## Guessing width = 500 # range / 29

The magrittr package was already loaded, so I used piped commands to trim the data with the ‘filter’ command, keeping only values greater than zero (in order to filter out the missing data, which NLS has marked as negative) and cutting off extreme hours at 8000 and above. Note that this also drops respondents who reported exactly zero hours.

# Keep respondents with positive teen hours below the 8000-hour cutoff;
# filter() also drops rows where teen_hours is NA.
trim_thirty <- filter(thirty, teen_hours > 0, teen_hours < 8000)
summary(trim_thirty)
##    PUBID_1997        sex             race           age       teen_hours  
##  Min.   :   4   Min.   :1.000   Min.   :1.00   Min.   :30   Min.   :  35  
##  1st Qu.:2513   1st Qu.:1.000   1st Qu.:1.00   1st Qu.:30   1st Qu.:1526  
##  Median :4747   Median :2.000   Median :4.00   Median :30   Median :2917  
##  Mean   :4653   Mean   :1.511   Mean   :2.76   Mean   :30   Mean   :3103  
##  3rd Qu.:6879   3rd Qu.:2.000   3rd Qu.:4.00   3rd Qu.:30   3rd Qu.:4360  
##  Max.   :9006   Max.   :2.000   Max.   :4.00   Max.   :30   Max.   :7912  
##                                                                           
##   adult_hours   
##  Min.   :    0  
##  1st Qu.:13691  
##  Median :20821  
##  Mean   :19559  
##  3rd Qu.:25928  
##  Max.   :63722  
##  NA's   :240

Next I tested the null hypothesis, setting alpha to .05:

# Pooled-variance two-sample t test of teen hours by sex (1 = Male, 2 = Female).
t.test(teen_hours ~ sex, data = trim_thirty, var.equal = TRUE)
## 
##  Two Sample t-test
## 
## data:  teen_hours by sex
## t = 2.4699, df = 1304, p-value = 0.01364
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##   52.92109 461.59563
## sample estimates:
## mean in group 1 mean in group 2 
##        3234.297        2977.039

The t test results supported rejecting the null hypothesis. The t statistic of 2.47 indicated that the difference between the means is greater than the error in estimating it. Also, the p value of .01 was less than the specified alpha of .05, satisfying the allowance for type 1 error risk.

4.) I tested the null hypothesis that there is no difference, by sex, between the means of cumulative adult work hours using a similar process, first trimming the data to result in the most accurate means:

# Histogram of cumulative adult work hours for the 30-year-old sample.
layer_histograms(ggvis(thirty, ~adult_hours))
## Guessing width = 2000 # range / 32

# Keep rows with positive adult_hours below 8000; filter() also drops rows
# where adult_hours is NA.
# NOTE(review): 8000 is the same cutoff that was applied to teen_hours, yet the
# earlier summaries put the median of adult_hours at roughly 16,000-21,000, so
# this keeps only the lowest part of the distribution. Confirm the cutoff is
# intended before relying on the adult-hours results.
adult_trim <- thirty %>% filter(adult_hours >0 & adult_hours < 8000)   
summary(adult_trim)
##    PUBID_1997        sex            race           age       teen_hours   
##  Min.   :  78   Min.   :1.00   Min.   :1.00   Min.   :30   Min.   :    0  
##  1st Qu.:3252   1st Qu.:1.00   1st Qu.:1.00   1st Qu.:30   1st Qu.:  590  
##  Median :5580   Median :2.00   Median :2.00   Median :30   Median : 1800  
##  Mean   :5163   Mean   :1.52   Mean   :2.44   Mean   :30   Mean   : 2405  
##  3rd Qu.:7470   3rd Qu.:2.00   3rd Qu.:4.00   3rd Qu.:30   3rd Qu.: 3829  
##  Max.   :9004   Max.   :2.00   Max.   :4.00   Max.   :30   Max.   :11753  
##                                                            NA's   :9      
##   adult_hours  
##  Min.   :  80  
##  1st Qu.:2621  
##  Median :4753  
##  Mean   :4488  
##  3rd Qu.:6667  
##  Max.   :7975  
## 

The null and alternate hypotheses are stated below:

Null Hypothesis: There is no statistically significant difference, by sex, between the means of cumulative work hour data of the 30-year-old respondents as measured from age 20 and up.

Alternative Hypothesis: Sex has a statistically significant influence on the mean cumulative work hours of the 30-year-old respondents as measured from age 20 and up.

Once again, alpha was set to .05.

# Pooled-variance two-sample t test of adult hours by sex (1 = Male, 2 = Female).
t.test(adult_hours ~ sex, data = adult_trim, var.equal = TRUE)
## 
##  Two Sample t-test
## 
## data:  adult_hours by sex
## t = -2.6575, df = 148, p-value = 0.008737
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -1810.595  -266.251
## sample estimates:
## mean in group 1 mean in group 2 
##        3947.833        4986.256

The results of the t test allow us to reject the null hypothesis. In this test the t value is -2.66 (2.66 in absolute value), which indicates that the difference between the means is greater than the error in estimating it. Our p value is .009, which is less than alpha (.05), enabling us to reject the null hypothesis.

5.) In this analysis I tested the null hypothesis that there is no difference, by race, between the means of cumulative work hours between the ages of 14 and 19. Note that ‘no difference by race’ means no difference between “Non-Black, Non-Hispanic” and “Mixed Race, Black and Hispanic”. I used my trimmed data for this analysis.

First I recoded the race variables so that I can compare the correct means. I also printed a table to double-check that my original 1-4 variables had been recoded to 0s and 1s.

# Collapse race to two groups: 0 = Non-Black / Non-Hispanic (code 4),
# 1 = Black, Hispanic or Mixed Race (codes 1-3).
trim_thirty$race <- as.numeric(trim_thirty$race <= 3)
table(trim_thirty$race)
## 
##   0   1 
## 664 642

The null and alternative hypotheses are listed below:

Null Hypothesis: There is no statistically significant difference, by race/ethnicity, between the mean hours worked between the ages of 14 and 19 by interviewees 30 years of age.

Alternative Hypothesis: Race/ethnicity has a statistically significant impact on the mean hours worked between the ages of 14 and 19 by interviewees 30 years of age.

# Pooled-variance two-sample t test of teen hours by the recoded 0/1 race group.
t.test(teen_hours ~ race, data = trim_thirty, var.equal = TRUE)
## 
##  Two Sample t-test
## 
## data:  teen_hours by race
## t = 5.4823, df = 1304, p-value = 5.037e-08
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  363.3486 768.3017
## sample estimates:
## mean in group 0 mean in group 1 
##        3381.057        2815.232

The t test results indicated that the difference between the means was greater than the error in estimating it (as evidenced by the t statistic of 5.48), and our p value is well below our alpha of .05 with a value of 5.037e-08. Once again we are able to reject the null hypothesis.

6.) Lastly I looked for a relationship between race/ethnicity and mean cumulative work hours worked as an adult. My null and alternative hypotheses are below:

Null Hypothesis: There is no statistically significant difference, by race/ethnicity, between the mean hours worked after age 20 by interviewees 30 years of age.

Alternative Hypothesis: Race/ethnicity will have a statistically significant influence on mean hours worked after the age of 20 by interviewees 30 years of age.

Once again I set alpha to .05 and ran a t test:

# Collapse race to two groups (0 = Non-Black / Non-Hispanic, code 4;
# 1 = Black, Hispanic or Mixed Race, codes 1-3), then run a pooled-variance
# two-sample t test of adult hours by that group.
adult_trim$race <- as.numeric(adult_trim$race <= 3)
t.test(adult_hours ~ race, data = adult_trim, var.equal = TRUE)
## 
##  Two Sample t-test
## 
## data:  adult_hours by race
## t = 3.4798, df = 148, p-value = 0.0006595
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##   582.970 2115.245
## sample estimates:
## mean in group 0 mean in group 1 
##        5252.308        3903.200

The t value of 3.4798 tells us that the difference between the means is larger than the error in estimating it. We see that the p value is .0006595, smaller than our designated alpha, giving us considerable confidence that the difference between the means is not due to chance. The results of this t test enabled us to reject the null hypothesis.