# Q1 Download the following variables and associated RScript necessary from the NLSY97 data set accessible through https://www.nlsinfo.org/investigator/pages/login.jsp -
#• R05363.00 [KEY!SEX], KEY!SEX, R'S GENDER
#• R14826.00 [KEY!RACE_ETHNICITY], KEY!RACE_ETHNICITY, COMBINED RACE AND ETHNICITY (SYMBOL)
#• T66513.00 [CV_AGE_INT_DATE], R'S AGE AT INTERVIEW DATE (at the time of the 2011 interview)
#• Z90655.00 [CVC_HOURS_WK_TEEN], CUMULATIVE HOURS WORKED FROM AGE 14 THROUGH AGE 19
#• Z90657.00 [CVC_HOURS_WK_ADULT_ALL], CUMULATIVE HOURS R WORKED FROM AGE 20
#Create a project in R with these data. Use the dplyr command tbl_df to create a table data frame. Then, show a glimpse of the data. The inclusion of the correct variables in this glimpse is the focal point for assessment of successful completion of this item in the assignment.
# Report (not change) the working directory; the relative path below is
# resolved against it.
getwd()
## [1] "/Users/woongbae/Downloads/Assignment2"
# Read the space-delimited extract and name its columns by reference number.
new_data <- read.table("Assignment2.dat", sep = " ")
names(new_data) <- c(
  "R0000100", "R0536300", "R1482600",
  "T6651300", "Z9065500", "Z9065700"
)
# Recode every negative missing-value code to NA, one code per pass.
missing_codes <- c(
  -1, # Refused
  -2, # Don't know
  -3, # Invalid missing
  -4, # Valid missing
  -5  # Non-interview
)
for (missing_code in missing_codes) {
  new_data[new_data == missing_code] <- NA
}
# If there are values not categorized they will be represented as NA
# Attach value labels to the raw extract.
#
# Args:
#   data: data frame with numeric columns R0000100, R0536300, R1482600,
#     T6651300, Z9065500 and Z9065700.
#
# Returns:
#   `data` with those six columns replaced by factors. A value that matches no
#   category (NA included) becomes NA.
vallabels <- function(data) {
  # Bins are left-closed, [lower, upper). include.lowest = TRUE additionally
  # closes the last bin on the right, so the top value named in its label
  # (9999 or 99999999) is labelled instead of falling outside every bin.
  bin_left_closed <- function(x, breaks, labels) {
    cut(x, breaks = breaks, labels = labels, right = FALSE,
        include.lowest = TRUE)
  }

  id_breaks <- c(0, 1, 1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000,
                 9999)
  id_labels <- c("0", "1 TO 999", "1000 TO 1999", "2000 TO 2999",
                 "3000 TO 3999", "4000 TO 4999", "5000 TO 5999",
                 "6000 TO 6999", "7000 TO 7999", "8000 TO 8999",
                 "9000 TO 9999")

  # Both cumulative-hours columns share the same bins and labels.
  hours_breaks <- c(0, 1, 500, 1000, 1500, 2000, 2500, 3000, 3500, 4000, 4500,
                    5000, 99999999)
  hours_labels <- c("0", "1 TO 499", "500 TO 999", "1000 TO 1499",
                    "1500 TO 1999", "2000 TO 2499", "2500 TO 2999",
                    "3000 TO 3499", "3500 TO 3999", "4000 TO 4499",
                    "4500 TO 4999", "5000 TO 99999999: 5000+")

  data$R0000100 <- bin_left_closed(data$R0000100, id_breaks, id_labels)
  data$R0536300 <- factor(data$R0536300, levels = c(1, 2, 0),
                          labels = c("Male", "Female", "No Information"))
  data$R1482600 <- factor(data$R1482600, levels = c(1, 2, 3, 4),
                          labels = c("Black", "Hispanic",
                                     "Mixed Race (Non-Hispanic)",
                                     "Non-Black / Non-Hispanic"))
  data$T6651300 <- factor(data$T6651300, levels = 26:32,
                          labels = c("26", "27", "28", "29", "30", "31", "32"))
  data$Z9065500 <- bin_left_closed(data$Z9065500, hours_breaks, hours_labels)
  data$Z9065700 <- bin_left_closed(data$Z9065700, hours_breaks, hours_labels)
  data
}
# Descriptive labels, one per extract column.
varlabels <- c(
  "PUBID - YTH ID CODE 1997",
  "KEY!SEX (SYMBOL) 1997",
  "KEY!RACE_ETHNICITY (SYMBOL) 1997",
  "CV_AGE_INT_DATE 2011",
  "CVC_HOURS_WK_TEEN",
  "CVC_HOURS_WK_ADULT"
)
# Replace the reference-number column names with question names.
#
# Args:
#   data: object whose six names are overwritten, in order.
#
# Returns:
#   `data` carrying the names PUBID_1997, gender, race, age, hours_teen and
#   hours_adult.
qnames <- function(data) {
  question_names <- c(
    "PUBID_1997", "gender", "race", "age", "hours_teen", "hours_adult"
  )
  stats::setNames(data, question_names)
}
#********************************************************************************************************
# Build "categories": a value-labelled copy of the data, with question names.
categories <- qnames(vallabels(new_data))
# Rename the raw data from reference numbers to question names as well.
new_data <- qnames(new_data)
# Produce summaries for the raw (uncategorized) data file
summary(new_data)
## PUBID_1997 gender race age
## Min. : 1 Min. :1.000 Min. :1.000 Min. :26.00
## 1st Qu.:2249 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:28.00
## Median :4502 Median :1.000 Median :4.000 Median :29.00
## Mean :4504 Mean :1.488 Mean :2.788 Mean :28.79
## 3rd Qu.:6758 3rd Qu.:2.000 3rd Qu.:4.000 3rd Qu.:30.00
## Max. :9022 Max. :2.000 Max. :4.000 Max. :32.00
## NA's :1561
## hours_teen hours_adult
## Min. : 0 Min. : 0
## 1st Qu.: 1255 1st Qu.: 8190
## Median : 2741 Median :16396
## Mean : 3105 Mean :15595
## 3rd Qu.: 4470 3rd Qu.:22418
## Max. :18829 Max. :63722
## NA's :707 NA's :1596
#************************************************************************************************************
# library() stops with an error when dplyr is not installed; require() only
# returns FALSE, which would postpone the failure to the first dplyr call.
library(dplyr)
##
## Attaching package: 'dplyr'
##
## The following objects are masked from 'package:stats':
##
## filter, lag
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Convert the renamed data to a tibble. as_tibble() (re-exported by dplyr) is
# the supported replacement for the deprecated tbl_df() named in the
# assignment text, and returns the same kind of object.
wexperience <- as_tibble(new_data)
# Compact listing of each column with its type and first values.
glimpse(wexperience)
## Observations: 8,984
## Variables: 6
## $ PUBID_1997 (int) 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,...
## $ gender (int) 2, 1, 2, 2, 1, 2, 1, 2, 1, 1, 2, 1, 1, 1, 2, 1, 2,...
## $ race (int) 4, 2, 2, 2, 2, 2, 2, 4, 4, 4, 2, 2, 2, 2, 2, 2, 2,...
## $ age (int) 29, 29, 28, 30, 29, 29, 28, 30, 29, NA, 29, 29, 26...
## $ hours_teen (int) 5831, NA, 6489, 3292, 680, NA, 1650, 2082, 864, 0,...
## $ hours_adult (int) NA, 29712, NA, 23390, 28056, 18379, NA, 18419, 192...
# Per-column summary statistics (including NA counts) for the tibble.
summary(wexperience)
## PUBID_1997 gender race age
## Min. : 1 Min. :1.000 Min. :1.000 Min. :26.00
## 1st Qu.:2249 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:28.00
## Median :4502 Median :1.000 Median :4.000 Median :29.00
## Mean :4504 Mean :1.488 Mean :2.788 Mean :28.79
## 3rd Qu.:6758 3rd Qu.:2.000 3rd Qu.:4.000 3rd Qu.:30.00
## Max. :9022 Max. :2.000 Max. :4.000 Max. :32.00
## NA's :1561
## hours_teen hours_adult
## Min. : 0 Min. : 0
## 1st Qu.: 1255 1st Qu.: 8190
## Median : 2741 Median :16396
## Mean : 3105 Mean :15595
## 3rd Qu.: 4470 3rd Qu.:22418
## Max. :18829 Max. :63722
## NA's :707 NA's :1596
# Q2 Filter the data in your project to include only people who were 30 years of age at the time of the 2011 interview. Calculate the number of people included. Perform the remainder of the calculations in this assignment using data from these included people 30 years of age in 2011.
# library() fails immediately if magrittr is unavailable, whereas require()
# would only return FALSE.
library(magrittr)
# Count the respondents aged 30; the count column is named "wexperience".
wexperience %>%
  filter(age == 30) %>%
  summarize(wexperience = n())
## Source: local data frame [1 x 1]
##
## wexperience
## (int)
## 1 1546
# Q3 Test the null hypothesis that there is no difference by sex between the mean cumulative hours work from age 14 through age 19.
# Keep only the respondents aged 30 (this overwrites wexperience, so every
# later analysis uses the subset), then print the filtered table.
wexperience <- filter(wexperience, age == 30)
wexperience
## Source: local data frame [1,546 x 6]
##
## PUBID_1997 gender race age hours_teen hours_adult
## (int) (int) (int) (int) (int) (int)
## 1 4 2 2 30 3292 23390
## 2 8 2 4 30 2082 18419
## 3 26 1 1 30 760 0
## 4 27 1 1 30 1592 11060
## 5 32 2 4 30 4611 16981
## 6 33 2 4 30 1862 26551
## 7 38 2 4 30 0 NA
## 8 55 2 2 30 1368 16934
## 9 59 2 1 30 5648 NA
## 10 68 1 1 30 1316 NA
## .. ... ... ... ... ... ...
# No var.equal argument is given, so t.test() runs its default Welch
# (unequal-variance) two-sample test of hours_teen across the two gender codes.
# The later tests in this script pass var.equal=TRUE instead.
t.test(wexperience$hours_teen ~wexperience$gender)
##
## Welch Two Sample t-test
##
## data: wexperience$hours_teen by wexperience$gender
## t = 4.423, df = 1339.6, p-value = 1.052e-05
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 306.4014 794.8262
## sample estimates:
## mean in group 1 mean in group 2
## 3528.906 2978.293
# Q4 Test the null hypothesis that there is no difference by sex between the mean cumulative hours work from age 20 and older.
# var.equal=TRUE requests the pooled-variance two-sample t-test of hours_adult
# across the two gender codes.
t.test(wexperience$hours_adult ~wexperience$gender, var.equal=TRUE)
##
## Two Sample t-test
##
## data: wexperience$hours_adult by wexperience$gender
## t = 4.6742, df = 1210, p-value = 3.283e-06
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 1404.148 3435.568
## sample estimates:
## mean in group 1 mean in group 2
## 20671.98 18252.12
# Q5 Test the null hypothesis that there is no difference by race/ethnicity between the mean cumulative hours work from age 14 through age 19. In this analysis — and in the analysis for item (6) — code race/ethnicity as "1" if race/ethnicity is "Non-Black, Non-Hispanic" and "0" otherwise.
# Collapse race/ethnicity to an indicator: 1 when the code is 4
# (Non-Black / Non-Hispanic), 0 for any other code. This overwrites the
# original column, so it must be run only once per session.
wexperience$race <- as.numeric(wexperience$race == 4)
# Pooled-variance t-test of teen hours across the two indicator groups.
t.test(wexperience$hours_teen ~ wexperience$race, var.equal = TRUE)
##
## Two Sample t-test
##
## data: wexperience$hours_teen by wexperience$race
## t = -6.3596, df = 1414, p-value = 2.725e-10
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -1027.4393 -543.0211
## sample estimates:
## mean in group 0 mean in group 1
## 2860.372 3645.603
#Q6 Test the null hypothesis that there is no difference by race/ethnicity between the mean cumulative hours work from age 20 and older.
# Pooled-variance t-test of adult hours; race is by now the 0/1 indicator
# created for Q5.
t.test(wexperience$hours_adult~wexperience$race, var.equal=TRUE)
##
## Two Sample t-test
##
## data: wexperience$hours_adult by wexperience$race
## t = -4.239, df = 1210, p-value = 2.416e-05
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -3215.729 -1180.859
## sample estimates:
## mean in group 0 mean in group 1
## 18321.36 20519.65
#In items (3) through (6), (a) apply a probability of Type 1 error of 0.05 and (b) state the results of null hypothesis tests. Section 4.44 of the Publication Manual of the American Psychological Association (6th ed.) provides standards for reporting the results of t–tests within text.
#3. p-value = .00001052
#The Welch t-test of the null hypothesis that there is no difference by sex between the mean cumulative hours worked from age 14 through age 19 was statistically significant at the .05 level; therefore, the null hypothesis was rejected. t = 4.423, df = 1339.6, p-value = 0.00001052, 95% CI [306.4014, 794.8262].
#4. p-value = .000003283
#The t-test of the null hypothesis that there is no difference by sex between the mean cumulative hours worked from age 20 was statistically significant at the .05 level; therefore, the null hypothesis was rejected. t = 4.6742, df = 1210, p-value = 0.000003283, 95% CI [1404.148, 3435.568].
#5. p-value = .0000000002725
#The t-test of the null hypothesis that there is no difference by race/ethnicity between the mean cumulative hours worked from age 14 through age 19 was statistically significant at the .05 level; therefore, the null hypothesis was rejected. t = -6.3596, df = 1414, p-value = 0.0000000002725, 95% CI [-1027.4393, -543.0211].
#6. p-value = .00002416
#The t-test of the null hypothesis that there is no difference by race/ethnicity between the mean cumulative hours worked from age 20 was statistically significant at the .05 level; therefore, the null hypothesis was rejected. t = -4.239, df = 1210, p-value = 0.00002416, 95% CI [-3215.729, -1180.859].