Class11

# Set working directory
# getwd()

# Read the space-delimited extract, then label its columns with the survey
# reference numbers.
new_data <- read.table(file = "class11.dat", sep = " ")
names(new_data) <- c(
  "R0000100",
  "R0536300",
  "R1482600",
  "Z0520200"
)

# Handle missing values: the five negative survey codes are recoded to NA.
#   -1 Refused, -2 Dont know, -3 Invalid missing, -4 Valid missing,
#   -5 Non-interview
new_data[
  new_data == -1 | new_data == -2 | new_data == -3 |
    new_data == -4 | new_data == -5
] <- NA

# Attach value labels to the coded survey variables.
#
# Args:
#   data: data frame with columns R0000100, R0536300, R1482600 and Z0520200
#         holding the numeric survey codes.
# Returns:
#   `data` with those four columns converted to factors. If there are values
#   not categorized (including NA) they will be represented as NA.
vallabels <- function(data) {
  id_breaks <- c(0, 1, seq(1000, 9000, by = 1000), 9999)
  id_labels <- c(
    "0", "1 TO 999", "1000 TO 1999", "2000 TO 2999", "3000 TO 3999",
    "4000 TO 4999", "5000 TO 5999", "6000 TO 6999", "7000 TO 7999",
    "8000 TO 8999", "9000 TO 9999"
  )
  # Intervals are closed on the left ([lo, hi)). include.lowest = TRUE closes
  # the final interval on the right as well, so an ID of exactly 9999 lands in
  # "9000 TO 9999" instead of silently becoming NA.
  data$R0000100 <- cut(
    data$R0000100,
    breaks = id_breaks,
    labels = id_labels,
    right = FALSE,
    include.lowest = TRUE
  )
  data$R0536300 <- factor(
    data$R0536300,
    levels = c(1, 2, 0),
    labels = c("Male", "Female", "No Information")
  )
  data$R1482600 <- factor(
    data$R1482600,
    levels = c(1, 2, 3, 4),
    labels = c(
      "Black", "Hispanic", "Mixed Race (Non-Hispanic)",
      "Non-Black / Non-Hispanic"
    )
  )
  data$Z0520200 <- factor(
    data$Z0520200,
    levels = c(1, 0),
    labels = c("YES", "NO")
  )
  data
}

# Descriptive labels for the four variables, in column order.
varlabels <- c(
  "PUBID - YTH ID CODE 1997",
  "KEY!SEX (SYMBOL) 1997",
  "KEY!RACE_ETHNICITY (SYMBOL) 1997",
  "SP/PAR STILL PAYING OFF EDUC LOANS?"
)

# Use qnames rather than rnums.
#
# Args:
#   data: data frame whose columns are in the order of the four survey
#         variables.
# Returns:
#   `data` with its column names replaced by the question names.
qnames <- function(data) {
  question_names <- c(
    "PUBID_1997",
    "KEY_SEX_1997",
    "KEY_RACE_ETHNICITY_1997",
    "YAST30-5040_NEW_COMB_XRND"
  )
  names(data) <- question_names
  data
}

#********************************************************************************************************

# Build "categories": a copy of the data with value labels applied. Labels are
# attached before renaming because vallabels() addresses the columns by
# reference number.
categories <- qnames(vallabels(new_data))

# Rename the raw data's variables to Qnames instead of Reference Numbers.
new_data <- qnames(new_data)

# Produce summaries for the raw (uncategorized) data file
summary(new_data)

##    PUBID_1997    KEY_SEX_1997   KEY_RACE_ETHNICITY_1997
##  Min.   :   1   Min.   :1.000   Min.   :1.000          
##  1st Qu.:2249   1st Qu.:1.000   1st Qu.:1.000          
##  Median :4502   Median :1.000   Median :4.000          
##  Mean   :4504   Mean   :1.488   Mean   :2.788          
##  3rd Qu.:6758   3rd Qu.:2.000   3rd Qu.:4.000          
##  Max.   :9022   Max.   :2.000   Max.   :4.000          
##                                                        
##  YAST30-5040_NEW_COMB_XRND
##  Min.   :0.000            
##  1st Qu.:0.000            
##  Median :0.000            
##  Mean   :0.237            
##  3rd Qu.:0.000            
##  Max.   :1.000            
##  NA's   :5627

# Remove the '#' before the following lines to produce summaries for the "categories" data file.
#categories <- vallabels(new_data)
#summary(categories)

#************************************************************************************************************
require(dplyr)

## Loading required package: dplyr
## 
## Attaching package: 'dplyr'
## 
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

# Convert the raw data frame to a tibble. as_tibble() (re-exported by dplyr)
# replaces tbl_df(), which has been deprecated since dplyr 1.0.0.
nls <- as_tibble(new_data)
summary(nls)

##    PUBID_1997    KEY_SEX_1997   KEY_RACE_ETHNICITY_1997
##  Min.   :   1   Min.   :1.000   Min.   :1.000          
##  1st Qu.:2249   1st Qu.:1.000   1st Qu.:1.000          
##  Median :4502   Median :1.000   Median :4.000          
##  Mean   :4504   Mean   :1.488   Mean   :2.788          
##  3rd Qu.:6758   3rd Qu.:2.000   3rd Qu.:4.000          
##  Max.   :9022   Max.   :2.000   Max.   :4.000          
##                                                        
##  YAST30-5040_NEW_COMB_XRND
##  Min.   :0.000            
##  1st Qu.:0.000            
##  Median :0.000            
##  Mean   :0.237            
##  3rd Qu.:0.000            
##  Max.   :1.000            
##  NA's   :5627

require(magrittr)

## Loading required package: magrittr

# Keep only respondents with a recorded loan answer; filter() retains rows
# where the condition is TRUE, so rows whose answer is NA are dropped.
nls <- filter(nls, `YAST30-5040_NEW_COMB_XRND` >= 0)
summary(nls)

##    PUBID_1997    KEY_SEX_1997  KEY_RACE_ETHNICITY_1997
##  Min.   :   3   Min.   :1.00   Min.   :1.000          
##  1st Qu.:2321   1st Qu.:1.00   1st Qu.:2.000          
##  Median :4372   Median :2.00   Median :4.000          
##  Mean   :4395   Mean   :1.52   Mean   :2.988          
##  3rd Qu.:6390   3rd Qu.:2.00   3rd Qu.:4.000          
##  Max.   :9022   Max.   :2.00   Max.   :4.000          
##  YAST30-5040_NEW_COMB_XRND
##  Min.   :0.0000           
##  1st Qu.:0.0000           
##  Median :0.0000           
##  Mean   :0.2371           
##  3rd Qu.:0.0000           
##  Max.   :1.0000

1. Compute in R a three-way contingency table that is a crosstabulation of the three categorical variables you selected.

# Three-way contingency table: sex x race/ethnicity x loan answer.
povtable <- xtabs(
  ~ nls$KEY_SEX_1997 +
    nls$KEY_RACE_ETHNICITY_1997 +
    nls$`YAST30-5040_NEW_COMB_XRND`
)
# Flatten the three-way table into a single two-dimensional display.
ftable(povtable)

##                                              nls$`YAST30-5040_NEW_COMB_XRND`   0   1
## nls$KEY_SEX_1997 nls$KEY_RACE_ETHNICITY_1997                                        
## 1                1                                                           241  89
##                  2                                                           276  77
##                  3                                                            12   3
##                  4                                                           618 297
## 2                1                                                           235  55
##                  2                                                           344  55
##                  3                                                            16   1
##                  4                                                           819 219

# summary() of the cross-tabulation reports the number of cases and a
# chi-squared test for independence of all factors.
summary(povtable)

## Call: xtabs(formula = ~nls$KEY_SEX_1997 + nls$KEY_RACE_ETHNICITY_1997 + 
##     nls$`YAST30-5040_NEW_COMB_XRND`)
## Number of cases in table: 3357 
## Number of factors: 3 
## Test for independence of all factors:
##  Chisq = 81.37, df = 10, p-value = 2.709e-13
##  Chi-squared approximation may be incorrect

Test the statistical null hypothesis that the three variables you selected are independent. Follow the six steps for null hypothesis testing that we described during class meetings.

State the null hypothesis and its alternative. H0: χ² = 0 (sex, race/ethnicity, and education-loan repayment status are mutually independent). H1: χ² ≠ 0 (the three variables are not mutually independent).
Select a test statistic. Use the chi-squared statistic from the three-way cross-tabulation of the variables.
Establish a level of α — the probability of a Type I error — that is acceptable. Set α = .05.
Collect the data and calculate the test statistic. Chisq = 81.37, df = 10, p-value = 2.709e-13; alternative hypothesis: χ² is not equal to 0.
Make a decision based on the value of the test statistic. The p-value associated with a chi-squared of 81.37 equals 2.709e-13, which is < α. Therefore, the null hypothesis is rejected at α = .05.
Report the decision to reject or fail to reject the null hypothesis. We reject the null hypothesis: sex, race, and education loan status are not independent.

Class11

Yunsoo Lee

11/05/2015