First I download and run the R file from NLSY97, which executes the following code.
# Load the space-delimited NLSY97 extract from the working directory.
# (Call setwd() first if class11.dat lives somewhere else.)
new_data <- read.table("class11.dat", sep = " ")
names(new_data) <- c("R0000100", "R0536300", "R1482600", "Z9122800")

# Recode the negative missing-data codes to NA in a single pass.
new_data[
  new_data == -1 |   # Refused
    new_data == -2 | # Don't know
    new_data == -3 | # Invalid missing
    new_data == -4 | # Valid missing
    new_data == -5   # Non-interview
] <- NA
# Attach value labels to the raw reference-number columns.
#
# Args:
#   data: data frame with columns R0000100, R0536300, R1482600 and Z9122800
#         holding the numeric codes.
# Returns:
#   `data` with those four columns converted to factors. Any value that does
#   not match a listed code or interval is represented as NA.
vallabels <- function(data) {
  # Intervals are left-closed, [lo, hi). include.lowest = TRUE also closes the
  # last interval on the right, so the value 9999 is labelled "9000 TO 9999"
  # instead of silently turning into NA.
  data$R0000100 <- cut(
    data$R0000100,
    breaks = c(0, 1, 1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000, 9999),
    labels = c(
      "0", "1 TO 999", "1000 TO 1999", "2000 TO 2999", "3000 TO 3999",
      "4000 TO 4999", "5000 TO 5999", "6000 TO 6999", "7000 TO 7999",
      "8000 TO 8999", "9000 TO 9999"
    ),
    right = FALSE,
    include.lowest = TRUE
  )
  data$R0536300 <- factor(
    data$R0536300,
    levels = c(1, 2, 0),
    labels = c("Male", "Female", "No Information")
  )
  data$R1482600 <- factor(
    data$R1482600,
    levels = c(1, 2, 3, 4),
    labels = c(
      "Black", "Hispanic", "Mixed Race (Non-Hispanic)",
      "Non-Black / Non-Hispanic"
    )
  )
  data$Z9122800 <- factor(
    data$Z9122800,
    levels = c(1, 0),
    labels = c("Yes", "No")
  )
  data
}
# Descriptive labels for the four extract columns, listed in column order.
varlabels <- c(
  "PUBID - YTH ID CODE 1997",
  "KEY!SEX (SYMBOL) 1997",
  "KEY!RACE_ETHNICITY (SYMBOL) 1997",
  "CVC_TRN_CERT"
)
# Replace the reference-number column names with short question names.
#
# Args:
#   data: data frame whose columns are to be renamed, in order, to
#         PUBID_1997, Sex, Race and Cert.
# Returns:
#   A copy of `data` carrying the new column names.
qnames <- function(data) {
  question_names <- c("PUBID_1997", "Sex", "Race", "Cert")
  stats::setNames(data, question_names)
}
#********************************************************************************************************
# Build "categories": the value-labelled copy of the data, with its columns
# renamed from reference numbers to question names.
categories <- qnames(vallabels(new_data))
# Rename the raw (unlabelled) data the same way.
new_data <- qnames(new_data)
# Summarise the raw (uncategorized) data file.
summary(new_data)
## PUBID_1997 Sex Race Cert
## Min. : 1 Min. :1.000 Min. :1.000 Min. :0.000
## 1st Qu.:2249 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:0.000
## Median :4502 Median :1.000 Median :4.000 Median :1.000
## Mean :4504 Mean :1.488 Mean :2.788 Mean :0.675
## 3rd Qu.:6758 3rd Qu.:2.000 3rd Qu.:4.000 3rd Qu.:1.000
## Max. :9022 Max. :2.000 Max. :4.000 Max. :1.000
## NA's :3872
#************************************************************************************************************
Then I use the require function to load dplyr and magrittr (which will be used for piping commands later), and use the tbl_df command to create a new data frame.
require(dplyr)
## Loading required package: dplyr
##
## Attaching package: 'dplyr'
##
## The following objects are masked from 'package:stats':
##
## filter, lag
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
require(magrittr)
## Loading required package: magrittr
# Convert new_data with dplyr's tbl_df() and keep the result as new_dataframe.
# NOTE(review): tbl_df() is deprecated in dplyr >= 1.0.0 in favour of
# as_tibble() — confirm against the installed dplyr version before switching.
new_dataframe <- tbl_df(new_data)
# Show each column's type and first few values.
glimpse(new_dataframe)
## Observations: 8,984
## Variables: 4
## $ PUBID_1997 (int) 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, ...
## $ Sex (int) 2, 1, 2, 2, 1, 2, 1, 2, 1, 1, 2, 1, 1, 1, 2, 1, 2, ...
## $ Race (int) 4, 2, 2, 2, 2, 2, 2, 4, 4, 4, 2, 2, 2, 2, 2, 2, 2, ...
## $ Cert (int) NA, 1, NA, 1, 1, 1, 1, NA, NA, 1, 1, 1, NA, NA, NA,...
summary(new_dataframe)
## PUBID_1997 Sex Race Cert
## Min. : 1 Min. :1.000 Min. :1.000 Min. :0.000
## 1st Qu.:2249 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:0.000
## Median :4502 Median :1.000 Median :4.000 Median :1.000
## Mean :4504 Mean :1.488 Mean :2.788 Mean :0.675
## 3rd Qu.:6758 3rd Qu.:2.000 3rd Qu.:4.000 3rd Qu.:1.000
## Max. :9022 Max. :2.000 Max. :4.000 Max. :1.000
## NA's :3872
Next, I clean the data by removing the rows that contain NA values.
# Keep only rows with valid (non-missing) Sex, Race and Cert codes.
# filter() already drops rows where a condition evaluates to NA, so no na.rm
# argument is needed. filter() has no such argument: it would treat
# `na.rm = TRUE` as an extra named condition, which current dplyr rejects
# with an error.
new_dataDF <- new_dataframe %>%
  filter(Sex >= 1, Race >= 1, Cert >= 0)
new_dataDF
## Source: local data frame [5,112 x 4]
##
## PUBID_1997 Sex Race Cert
## (int) (int) (int) (int)
## 1 2 1 2 1
## 2 4 2 2 1
## 3 5 1 2 1
## 4 6 2 2 1
## 5 7 1 2 1
## 6 10 1 4 1
## 7 11 2 2 1
## 8 12 1 2 1
## 9 18 1 1 0
## 10 19 1 1 0
## .. ... ... ... ...
summary(new_dataDF)
## PUBID_1997 Sex Race Cert
## Min. : 2 Min. :1.000 Min. :1.000 Min. :0.0000
## 1st Qu.:2322 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:0.0000
## Median :4590 Median :2.000 Median :4.000 Median :1.0000
## Mean :4567 Mean :1.501 Mean :2.729 Mean :0.6747
## 3rd Qu.:6852 3rd Qu.:2.000 3rd Qu.:4.000 3rd Qu.:1.0000
## Max. :9022 Max. :2.000 Max. :4.000 Max. :1.0000
I run frequency distributions of each variable to make sure there aren’t any unnecessary values.
sexfd <- table(new_dataDF$Sex)
sexfd
##
## 1 2
## 2549 2563
racefd <- table(new_dataDF$Race)
racefd
##
## 1 2 3 4
## 1442 1058 53 2559
certfd <- table(new_dataDF$Cert)
certfd
##
## 0 1
## 1663 3449
I use the xtabs function to create a crosstabulation of certificate completion by sex and by race.
# Three-way contingency table of counts: Cert x Sex x Race.
# NOTE(review): xtabs(~ Cert + Sex + Race, data = new_dataDF) should give the
# same counts with shorter dimension labels — confirm before switching, since
# the printed labels would change.
cert_sex_race <- xtabs(~new_dataDF$Cert + new_dataDF$Sex + new_dataDF$Race)
cert_sex_race
## , , new_dataDF$Race = 1
##
## new_dataDF$Sex
## new_dataDF$Cert 1 2
## 0 255 255
## 1 419 513
##
## , , new_dataDF$Race = 2
##
## new_dataDF$Sex
## new_dataDF$Cert 1 2
## 0 157 169
## 1 370 362
##
## , , new_dataDF$Race = 3
##
## new_dataDF$Sex
## new_dataDF$Cert 1 2
## 0 8 11
## 1 15 19
##
## , , new_dataDF$Race = 4
##
## new_dataDF$Sex
## new_dataDF$Cert 1 2
## 0 410 398
## 1 915 836
I run ftable for a better view of the table.
ftable(cert_sex_race)
## new_dataDF$Race 1 2 3 4
## new_dataDF$Cert new_dataDF$Sex
## 0 1 255 157 8 410
## 2 255 169 11 398
## 1 1 419 370 15 915
## 2 513 362 19 836
I use the summary function to complete a Chi-square test of three variables to test the null hypothesis.
summary(cert_sex_race)
## Call: xtabs(formula = ~new_dataDF$Cert + new_dataDF$Sex + new_dataDF$Race)
## Number of cases in table: 5112
## Number of factors: 3
## Test for independence of all factors:
## Chisq = 22.38, df = 10, p-value = 0.01328
NULL HYPOTHESIS The null hypothesis being tested is that Sex, Race and Certificate Completion are mutually independent. Stated another way, knowing a person’s sex, race or certificate completion does not help to predict either of the other variables. I will set alpha, the probability of a Type I error, equal to 0.05.
ALTERNATIVE HYPOTHESIS Sex, Race and Certificate Completion are not mutually independent. At least one of the variables is associated with another, so knowing it helps to predict the other.
Results of hypothesis test: The p-value produced for a chi-square value of 22.38 is 0.01328, which is less than alpha. Therefore, we reject the null hypothesis.
Below is an APA compliant table showing the results.