Class 11 practice

First I download and run the R file from NLSY97, which executes the following code.

#Set working directory
#setwd()

# Read the space-delimited extract and name its columns by reference number.
new_data <- read.table(file = "class11.dat", sep = " ")
names(new_data) <- c("R0000100", "R0536300", "R1482600", "Z9122800")

# Handle missing values: blank out each negative non-response code in turn.
#   -1 Refused, -2 Dont know, -3 Invalid missing, -4 Valid missing,
#   -5 Non-interview
new_data <- Reduce(
  f = function(frame, code) {
    frame[frame == code] <- NA
    frame
  },
  x = c(-1, -2, -3, -4, -5),
  new_data
)

# Attach value labels to the four reference-number columns.
#
# Args:
#   data: data frame containing the numeric code columns R0000100, R0536300,
#     R1482600 and Z9122800.
#
# Returns:
#   `data` with those four columns replaced by factors. Any value that matches
#   no listed level or interval is represented as NA.
vallabels <- function(data) {
  id_breaks <- c(0, 1, seq(1000, 9000, by = 1000), 9999)
  id_labels <- c(
    "0", "1 TO 999", "1000 TO 1999", "2000 TO 2999", "3000 TO 3999",
    "4000 TO 4999", "5000 TO 5999", "6000 TO 6999", "7000 TO 7999",
    "8000 TO 8999", "9000 TO 9999"
  )
  # right = FALSE makes every bin [lower, upper). include.lowest = TRUE closes
  # the final bin so that 9999 itself is labelled "9000 TO 9999" rather than
  # silently becoming NA.
  data$R0000100 <- cut(
    data$R0000100,
    breaks = id_breaks,
    labels = id_labels,
    right = FALSE,
    include.lowest = TRUE
  )
  data$R0536300 <- factor(
    data$R0536300,
    levels = c(1, 2, 0),
    labels = c("Male", "Female", "No Information")
  )
  data$R1482600 <- factor(
    data$R1482600,
    levels = c(1, 2, 3, 4),
    labels = c(
      "Black", "Hispanic", "Mixed Race (Non-Hispanic)",
      "Non-Black / Non-Hispanic"
    )
  )
  data$Z9122800 <- factor(
    data$Z9122800,
    levels = c(1, 0),
    labels = c("Yes", "No")
  )
  data
}

# Descriptive labels for the four variables, listed in column order.
varlabels <- c(
  "PUBID - YTH ID CODE 1997",
  "KEY!SEX (SYMBOL) 1997",
  "KEY!RACE_ETHNICITY (SYMBOL) 1997",
  "CVC_TRN_CERT"
)

# Replace the reference-number column names with readable question names.
# Names are assigned by position, so `data` must have its four columns in the
# original order. Returns the renamed copy; the input is not modified.
qnames <- function(data) {
  readable <- c("PUBID_1997", "Sex", "Race", "Cert")
  stats::setNames(data, readable)
}

#********************************************************************************************************

# Build "categories", the value-labelled copy of the data. vallabels() must run
# while the columns still carry their reference numbers, so the readable
# question names are applied afterwards.
categories <- qnames(vallabels(new_data))

# Rename the raw data the same way, using Qnames instead of Reference Numbers.
new_data <- qnames(new_data)

# Summaries for the raw (uncategorized) data file.
summary(new_data)

##    PUBID_1997        Sex             Race            Cert      
##  Min.   :   1   Min.   :1.000   Min.   :1.000   Min.   :0.000  
##  1st Qu.:2249   1st Qu.:1.000   1st Qu.:1.000   1st Qu.:0.000  
##  Median :4502   Median :1.000   Median :4.000   Median :1.000  
##  Mean   :4504   Mean   :1.488   Mean   :2.788   Mean   :0.675  
##  3rd Qu.:6758   3rd Qu.:2.000   3rd Qu.:4.000   3rd Qu.:1.000  
##  Max.   :9022   Max.   :2.000   Max.   :4.000   Max.   :1.000  
##                                                 NA's   :3872

#************************************************************************************************************

Then I use the require function to load dplyr and magrittr (which will be used for piping commands later), and use the tbl_df command to create a new data frame.

require(dplyr)

## Loading required package: dplyr
## 
## Attaching package: 'dplyr'
## 
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

require(magrittr)

## Loading required package: magrittr

# as_tibble() is the maintained replacement for tbl_df(), which dplyr has
# deprecated; it produces the same kind of data frame table.
new_dataframe <- as_tibble(new_data)
# Here is a glimpse of the required data frame:
glimpse(new_dataframe)

## Observations: 8,984
## Variables: 4
## $ PUBID_1997 (int) 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, ...
## $ Sex        (int) 2, 1, 2, 2, 1, 2, 1, 2, 1, 1, 2, 1, 1, 1, 2, 1, 2, ...
## $ Race       (int) 4, 2, 2, 2, 2, 2, 2, 4, 4, 4, 2, 2, 2, 2, 2, 2, 2, ...
## $ Cert       (int) NA, 1, NA, 1, 1, 1, 1, NA, NA, 1, 1, 1, NA, NA, NA,...

# Summary statistics for every column of the converted data frame.
summary(new_dataframe)

##    PUBID_1997        Sex             Race            Cert      
##  Min.   :   1   Min.   :1.000   Min.   :1.000   Min.   :0.000  
##  1st Qu.:2249   1st Qu.:1.000   1st Qu.:1.000   1st Qu.:0.000  
##  Median :4502   Median :1.000   Median :4.000   Median :1.000  
##  Mean   :4504   Mean   :1.488   Mean   :2.788   Mean   :0.675  
##  3rd Qu.:6758   3rd Qu.:2.000   3rd Qu.:4.000   3rd Qu.:1.000  
##  Max.   :9022   Max.   :2.000   Max.   :4.000   Max.   :1.000  
##                                                 NA's   :3872

Next, I clean data and remove NA values.

# filter() has no na.rm argument: a named `na.rm = TRUE` is read as one more
# filtering expression, and current dplyr rejects named inputs with an error.
# The comparisons alone are enough, because filter() keeps only rows where
# every condition is TRUE, so any row with an NA in Sex, Race or Cert is
# dropped.
new_dataDF <- new_dataframe %>%
  filter(Sex >= 1, Race >= 1, Cert >= 0)
new_dataDF

## Source: local data frame [5,112 x 4]
## 
##    PUBID_1997   Sex  Race  Cert
##         (int) (int) (int) (int)
## 1           2     1     2     1
## 2           4     2     2     1
## 3           5     1     2     1
## 4           6     2     2     1
## 5           7     1     2     1
## 6          10     1     4     1
## 7          11     2     2     1
## 8          12     1     2     1
## 9          18     1     1     0
## 10         19     1     1     0
## ..        ...   ...   ...   ...

# Re-check the column summaries after filtering; no NA's row should remain.
summary(new_dataDF)

##    PUBID_1997        Sex             Race            Cert       
##  Min.   :   2   Min.   :1.000   Min.   :1.000   Min.   :0.0000  
##  1st Qu.:2322   1st Qu.:1.000   1st Qu.:1.000   1st Qu.:0.0000  
##  Median :4590   Median :2.000   Median :4.000   Median :1.0000  
##  Mean   :4567   Mean   :1.501   Mean   :2.729   Mean   :0.6747  
##  3rd Qu.:6852   3rd Qu.:2.000   3rd Qu.:4.000   3rd Qu.:1.0000  
##  Max.   :9022   Max.   :2.000   Max.   :4.000   Max.   :1.0000

I run frequency distributions of each variable to make sure there aren’t any unnecessary values.

# Frequency distribution of the Sex codes in the cleaned data.
sexfd <- table(new_dataDF[["Sex"]])
sexfd

## 
##    1    2 
## 2549 2563

# Frequency distribution of the Race codes in the cleaned data.
racefd <- table(new_dataDF[["Race"]])
racefd

## 
##    1    2    3    4 
## 1442 1058   53 2559

# Frequency distribution of the Cert codes in the cleaned data.
certfd <- table(new_dataDF[["Cert"]])
certfd

## 
##    0    1 
## 1663 3449

I use the xtabs function to create a cross-tabulation of certificate completion by sex and by race.

# Three-way contingency table of Cert by Sex by Race.
# The columns are written as new_dataDF$... inside the formula, so those full
# expressions become the dimension labels in the printed table and appear in
# the call that summary() echoes back.
cert_sex_race <- xtabs(~new_dataDF$Cert + new_dataDF$Sex + new_dataDF$Race)
cert_sex_race

## , , new_dataDF$Race = 1
## 
##                new_dataDF$Sex
## new_dataDF$Cert   1   2
##               0 255 255
##               1 419 513
## 
## , , new_dataDF$Race = 2
## 
##                new_dataDF$Sex
## new_dataDF$Cert   1   2
##               0 157 169
##               1 370 362
## 
## , , new_dataDF$Race = 3
## 
##                new_dataDF$Sex
## new_dataDF$Cert   1   2
##               0   8  11
##               1  15  19
## 
## , , new_dataDF$Race = 4
## 
##                new_dataDF$Sex
## new_dataDF$Cert   1   2
##               0 410 398
##               1 915 836

I ran ftable for a better view of the table.

# Flatten the three-way table into one layout: Cert and Sex index the rows,
# Race the columns.
ftable(cert_sex_race)

##                                new_dataDF$Race   1   2   3   4
## new_dataDF$Cert new_dataDF$Sex                                
## 0               1                              255 157   8 410
##                 2                              255 169  11 398
## 1               1                              419 370  15 915
##                 2                              513 362  19 836

I use the summary function to complete a Chi-square test of three variables to test the null hypothesis.

# For this contingency table, summary() reports the number of cases and
# factors plus a chi-squared test for independence of all factors.
summary(cert_sex_race)

## Call: xtabs(formula = ~new_dataDF$Cert + new_dataDF$Sex + new_dataDF$Race)
## Number of cases in table: 5112 
## Number of factors: 3 
## Test for independence of all factors:
##  Chisq = 22.38, df = 10, p-value = 0.01328

NULL HYPOTHESIS: The null hypothesis being tested is that Sex, Race and Certificate completion are independent. Stated another way, knowing a person’s sex, race or certificate completion does not help to predict the other variables – and vice versa. I will set alpha, the probability of a Type I error, equal to 0.05.

ALTERNATIVE HYPOTHESIS: Sex, Race and Certificate Completion are not all independent. At least one of the variables can be used to help predict another.

Results of hypothesis test: The p-value produced for a chi-square value of 22.38 (df = 10) is 0.01328, which is less than alpha (0.05). Therefore, we reject the null hypothesis.

Below is an APA compliant table showing the results.

Class 11 practice

Michael Zigner

November 5, 2015