First I download and run the R file from NLSY97, which executes the following code.

#Set working directory
#setwd()

# Read the space-delimited NLSY97 extract and name its four columns with
# their NLSY reference numbers.
new_data <- read.table("class11.dat", sep = " ")
names(new_data) <- c("R0000100", "R0536300", "R1482600", "Z9122800")

# Recode each negative missing-data code to NA, one code at a time:
#   -1 Refused, -2 Don't know, -3 Invalid missing,
#   -4 Valid missing, -5 Non-interview
for (missing_code in c(-1, -2, -3, -4, -5)) {
  new_data[new_data == missing_code] <- NA
}

# Attach value labels to the four reference-number columns.
#
# Args:
#   data: data frame with numeric columns R0000100, R0536300, R1482600 and
#     Z9122800.
#
# Returns:
#   `data` with those four columns converted to factors. Values that do not
#   fall in any labelled category are represented as NA.
vallabels <- function(data) {
  # Bins are left-closed: [lower, upper). include.lowest = TRUE additionally
  # closes the final bin on the right, so a value of exactly 9999 is labelled
  # "9000 TO 9999" instead of silently becoming NA.
  data$R0000100 <- cut(
    data$R0000100,
    breaks = c(
      0.0, 1.0, 1000.0, 2000.0, 3000.0, 4000.0, 5000.0,
      6000.0, 7000.0, 8000.0, 9000.0, 9999.0
    ),
    labels = c(
      "0", "1 TO 999", "1000 TO 1999", "2000 TO 2999", "3000 TO 3999",
      "4000 TO 4999", "5000 TO 5999", "6000 TO 6999", "7000 TO 7999",
      "8000 TO 8999", "9000 TO 9999"
    ),
    right = FALSE,
    include.lowest = TRUE
  )
  data$R0536300 <- factor(
    data$R0536300,
    levels = c(1.0, 2.0, 0.0),
    labels = c("Male", "Female", "No Information")
  )
  data$R1482600 <- factor(
    data$R1482600,
    levels = c(1.0, 2.0, 3.0, 4.0),
    labels = c(
      "Black", "Hispanic", "Mixed Race (Non-Hispanic)",
      "Non-Black / Non-Hispanic"
    )
  )
  data$Z9122800 <- factor(
    data$Z9122800,
    levels = c(1.0, 0.0),
    labels = c("Yes", "No")
  )
  data
}

# Descriptive titles for the four variables, one per column.
varlabels <- c(
  "PUBID - YTH ID CODE 1997",
  "KEY!SEX (SYMBOL) 1997",
  "KEY!RACE_ETHNICITY (SYMBOL) 1997",
  "CVC_TRN_CERT"
)

# Replace the reference-number column names with short question names.
#
# Args:
#   data: data frame whose four columns are, in order, the respondent ID,
#     sex, race/ethnicity and certificate variables.
#
# Returns:
#   `data` with its names set to "PUBID_1997", "Sex", "Race", "Cert".
qnames <- function(data) {
  stats::setNames(data, c("PUBID_1997", "Sex", "Race", "Cert"))
}

#********************************************************************************************************

# Build "categories": the data with value labels applied, then renamed with
# question names. vallabels() has to run before the rename because it looks
# the columns up by their reference numbers.
categories <- qnames(vallabels(new_data))

# Rename the raw (numeric) data with question names as well.
new_data <- qnames(new_data)

# Produce summaries for the raw (uncategorized) data file
summary(new_data)
##    PUBID_1997        Sex             Race            Cert      
##  Min.   :   1   Min.   :1.000   Min.   :1.000   Min.   :0.000  
##  1st Qu.:2249   1st Qu.:1.000   1st Qu.:1.000   1st Qu.:0.000  
##  Median :4502   Median :1.000   Median :4.000   Median :1.000  
##  Mean   :4504   Mean   :1.488   Mean   :2.788   Mean   :0.675  
##  3rd Qu.:6758   3rd Qu.:2.000   3rd Qu.:4.000   3rd Qu.:1.000  
##  Max.   :9022   Max.   :2.000   Max.   :4.000   Max.   :1.000  
##                                                 NA's   :3872
#************************************************************************************************************

Then I use the require function to load dplyr and magrittr (which will be used for piping commands later), and use the tbl_df command to create a new data frame.

require(dplyr)
## Loading required package: dplyr
## 
## Attaching package: 'dplyr'
## 
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
require(magrittr)
## Loading required package: magrittr
# Convert the renamed raw data with dplyr's tbl_df().
# NOTE(review): tbl_df() is reported as deprecated in recent dplyr releases in
# favour of as_tibble() -- confirm against the installed version before
# switching.
new_dataframe <- tbl_df(new_data)
# Here is a glimpse of the resulting data frame:
glimpse(new_dataframe)
## Observations: 8,984
## Variables: 4
## $ PUBID_1997 (int) 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, ...
## $ Sex        (int) 2, 1, 2, 2, 1, 2, 1, 2, 1, 1, 2, 1, 1, 1, 2, 1, 2, ...
## $ Race       (int) 4, 2, 2, 2, 2, 2, 2, 4, 4, 4, 2, 2, 2, 2, 2, 2, 2, ...
## $ Cert       (int) NA, 1, NA, 1, 1, 1, 1, NA, NA, 1, 1, 1, NA, NA, NA,...
# Summarise every column of the converted data frame; Cert still contains
# NA values at this point.
summary(new_dataframe)
##    PUBID_1997        Sex             Race            Cert      
##  Min.   :   1   Min.   :1.000   Min.   :1.000   Min.   :0.000  
##  1st Qu.:2249   1st Qu.:1.000   1st Qu.:1.000   1st Qu.:0.000  
##  Median :4502   Median :1.000   Median :4.000   Median :1.000  
##  Mean   :4504   Mean   :1.488   Mean   :2.788   Mean   :0.675  
##  3rd Qu.:6758   3rd Qu.:2.000   3rd Qu.:4.000   3rd Qu.:1.000  
##  Max.   :9022   Max.   :2.000   Max.   :4.000   Max.   :1.000  
##                                                 NA's   :3872

Next, I clean the data and remove rows that contain NA values.

# Keep only respondents with a valid code for all three variables.
# filter() drops any row where a condition evaluates to NA, so rows with a
# missing Sex, Race or Cert are removed by the comparisons themselves.
# filter() has no na.rm argument: a named input such as na.rm = TRUE is
# treated as one more condition and is rejected by current dplyr, so it is
# not passed here.
new_dataDF <- new_dataframe %>%
  filter(Sex >= 1, Race >= 1, Cert >= 0)
new_dataDF
## Source: local data frame [5,112 x 4]
## 
##    PUBID_1997   Sex  Race  Cert
##         (int) (int) (int) (int)
## 1           2     1     2     1
## 2           4     2     2     1
## 3           5     1     2     1
## 4           6     2     2     1
## 5           7     1     2     1
## 6          10     1     4     1
## 7          11     2     2     1
## 8          12     1     2     1
## 9          18     1     1     0
## 10         19     1     1     0
## ..        ...   ...   ...   ...
# Re-check the column summaries after filtering to confirm that no column
# reports NA values any more.
summary(new_dataDF)
##    PUBID_1997        Sex             Race            Cert       
##  Min.   :   2   Min.   :1.000   Min.   :1.000   Min.   :0.0000  
##  1st Qu.:2322   1st Qu.:1.000   1st Qu.:1.000   1st Qu.:0.0000  
##  Median :4590   Median :2.000   Median :4.000   Median :1.0000  
##  Mean   :4567   Mean   :1.501   Mean   :2.729   Mean   :0.6747  
##  3rd Qu.:6852   3rd Qu.:2.000   3rd Qu.:4.000   3rd Qu.:1.0000  
##  Max.   :9022   Max.   :2.000   Max.   :4.000   Max.   :1.0000

I run frequency distributions of each variable to make sure there aren’t any unexpected or invalid values.

# Frequency distribution of the Sex codes in the filtered data.
sexfd <- table(new_dataDF[["Sex"]])
print(sexfd)
## 
##    1    2 
## 2549 2563
# Frequency distribution of the Race codes in the filtered data.
racefd <- table(new_dataDF[["Race"]])
print(racefd)
## 
##    1    2    3    4 
## 1442 1058   53 2559
# Frequency distribution of the Cert codes in the filtered data.
certfd <- table(new_dataDF[["Cert"]])
print(certfd)
## 
##    0    1 
## 1663 3449

I use the xtabs function to create a crosstabulation of certificate completion by sex and by race.

# Three-way contingency table of Cert x Sex x Race. The formula has no
# left-hand side, so xtabs() counts the rows for each combination of the
# terms. The terms are written as new_dataDF$..., which is why the printed
# dimension labels carry the "new_dataDF$" prefix.
# NOTE(review): passing data = new_dataDF with bare column names would
# presumably shorten those labels, but it would change the recorded output.
cert_sex_race <- xtabs(~new_dataDF$Cert + new_dataDF$Sex + new_dataDF$Race)
cert_sex_race
## , , new_dataDF$Race = 1
## 
##                new_dataDF$Sex
## new_dataDF$Cert   1   2
##               0 255 255
##               1 419 513
## 
## , , new_dataDF$Race = 2
## 
##                new_dataDF$Sex
## new_dataDF$Cert   1   2
##               0 157 169
##               1 370 362
## 
## , , new_dataDF$Race = 3
## 
##                new_dataDF$Sex
## new_dataDF$Cert   1   2
##               0   8  11
##               1  15  19
## 
## , , new_dataDF$Race = 4
## 
##                new_dataDF$Sex
## new_dataDF$Cert   1   2
##               0 410 398
##               1 915 836

I ran ftable for a better view of the table.

# Print the three-way table as one flat table: Cert and Sex label the rows,
# Race labels the columns.
ftable(cert_sex_race)
##                                new_dataDF$Race   1   2   3   4
## new_dataDF$Cert new_dataDF$Sex                                
## 0               1                              255 157   8 410
##                 2                              255 169  11 398
## 1               1                              419 370  15 915
##                 2                              513 362  19 836

I use the summary function to run a chi-square test of independence on the three variables, which tests the null hypothesis stated below.

# summary() on the xtabs table reports the number of cases, the number of
# factors, and a chi-squared test for independence of all factors.
summary(cert_sex_race)
## Call: xtabs(formula = ~new_dataDF$Cert + new_dataDF$Sex + new_dataDF$Race)
## Number of cases in table: 5112 
## Number of factors: 3 
## Test for independence of all factors:
##  Chisq = 22.38, df = 10, p-value = 0.01328

NULL HYPOTHESIS The null hypothesis being tested is that Sex, Race and Certificate completion are independent. Stated another way, knowing a person’s sex, race or certificate completion does not help to predict the other variables – and vice versa. I will set alpha, the probability of a Type I error, equal to 0.05.

ALTERNATIVE HYPOTHESIS Sex, Race and Certificate Completion are not mutually independent. At least one of the variables is associated with another, so knowing one can help to predict another.

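Results of hypothesis test: The p-value produced for a chi-square value of 22.38 (df = 10) is 0.01328, which is less than alpha (0.05). Therefore, I reject the null hypothesis. The data provide evidence that the three variables are not mutually independent, although this test alone does not show which of them are associated.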

Below is an APA-compliant table showing the results.