#WFED540 Class 12 Practice
##By: Andrew Leigey
##Date: November 6, 2015



require(dplyr)
## Loading required package: dplyr
## 
## Attaching package: 'dplyr'
## 
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
require(ggvis)
## Loading required package: ggvis
require(magrittr)
## Loading required package: magrittr
# Read the space-separated extract and name its columns by reference number.
new_data <- read.table("Class12_3_Variables.dat", sep = " ")
names(new_data) <- c("R0000100", "R0536300", "R9859700", "R9859900")

# Handle missing values: every missing-value code is recoded to NA.
new_data[new_data == -1] <- NA  # -1: Refused
new_data[new_data == -2] <- NA  # -2: Don't know
new_data[new_data == -3] <- NA  # -3: Invalid missing
new_data[new_data == -4] <- NA  # -4: Valid missing
new_data[new_data == -5] <- NA  # -5: Non-interview

# Attach value labels to the raw numeric codes.
#
# Args:
#   data: data frame with the columns R0000100, R0536300, R9859700 and
#         R9859900 holding numeric codes.
#
# Returns:
#   `data` with R0000100 binned into labelled ranges and the other three
#   columns converted to labelled factors.
#
# If there are values not categorized they will be represented as NA
vallabels <- function(data) {
  # Both transcript-concentrator columns share the same codes and labels.
  conc_codes <- c(-8, -7, -6, 0, 1)
  conc_labels <- c(
    "Invalid Carnegie credits or only pre-hs courses",
    "All required courses taken but at least one course has missing credit info",
    "Criterion for one subject not met",
    "Does not meet all of the criteria",
    "Meets all of the criteria"
  )

  # Left-closed bins [0, 1), [1, 1000), ..., [9000, 9999].
  # include.lowest = TRUE closes the final bin on the right, so 9999 is
  # labelled "9000 TO 9999" instead of falling outside every interval.
  data$R0000100 <- cut(
    data$R0000100,
    breaks = c(0, 1, 1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000, 9999),
    labels = c(
      "0", "1 TO 999", "1000 TO 1999", "2000 TO 2999", "3000 TO 3999",
      "4000 TO 4999", "5000 TO 5999", "6000 TO 6999", "7000 TO 7999",
      "8000 TO 8999", "9000 TO 9999"
    ),
    right = FALSE,
    include.lowest = TRUE
  )
  data$R0536300 <- factor(
    data$R0536300,
    levels = c(1, 2, 0),
    labels = c("Male", "Female", "No Information")
  )
  data$R9859700 <- factor(data$R9859700, levels = conc_codes, labels = conc_labels)
  data$R9859900 <- factor(data$R9859900, levels = conc_codes, labels = conc_labels)
  data
}

# Descriptive variable labels, one per data column.
varlabels <- c(
  "PUBID - YTH ID CODE 1997",
  "KEY!SEX (SYMBOL) 1997",
  "TRANS_ACAD_CONC HSTR",
  "TRANS_VOC_CONC HSTR"
)

# Use qnames rather than rnums
#
# Replaces the column names of `data` with the question names and returns
# the renamed data frame.
qnames <- function(data) {
  question_names <- c("PUBID_1997", "sex", "acdconc", "vocconc")
  stats::setNames(data, question_names)
}

#*************************************************************************************

# To create a data file called "categories" with value labels, uncomment the
# next line. It has to run before the rename below, because vallabels() looks
# the columns up by their reference numbers.
#categories <- vallabels(new_data)

# Rename variables using Qnames instead of Reference Numbers
# (uncomment the second line to rename "categories" as well).
new_data <- new_data %>% qnames()
#categories <- qnames(categories)

# Produce summaries for the raw (uncategorized) data file
new_data %>% summary()
##    PUBID_1997        sex           acdconc           vocconc       
##  Min.   :   1   Min.   :1.000   Min.   :-8.0000   Min.   :-8.0000  
##  1st Qu.:2249   1st Qu.:1.000   1st Qu.: 0.0000   1st Qu.: 0.0000  
##  Median :4502   Median :1.000   Median : 0.0000   Median : 0.0000  
##  Mean   :4504   Mean   :1.488   Mean   :-0.1847   Mean   : 0.1171  
##  3rd Qu.:6758   3rd Qu.:2.000   3rd Qu.: 1.0000   3rd Qu.: 1.0000  
##  Max.   :9022   Max.   :2.000   Max.   : 1.0000   Max.   : 1.0000  
##                                 NA's   :2752      NA's   :2752
# Remove the '#' before the following lines to produce summaries for the "categories" data file.
#categories <- vallabels(new_data)
#summary(categories)

#************************************************************************************************************
# Take a first look at the data: glimpse() lists every column with its type
# and leading values.
glimpse(new_data)
## Observations: 8,984
## Variables: 4
## $ PUBID_1997 (int) 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, ...
## $ sex        (int) 2, 1, 2, 2, 1, 2, 1, 2, 1, 1, 2, 1, 1, 1, 2, 1, 2, ...
## $ acdconc    (int) 1, -7, NA, 0, 1, 0, NA, 1, 1, 1, 0, 0, 0, NA, 0, 0,...
## $ vocconc    (int) 0, 1, NA, 1, 0, 0, NA, 0, 0, 0, 0, 0, 0, NA, 1, 0, ...
# summarise() is called here without any summary expressions, so the result
# recorded below is empty.
new_data %>% summarise()
## data frame with 0 columns and 0 rows
# View the data as a small table.
# NOTE(review): tbl_df() is reported as deprecated in newer dplyr releases in
# favour of as_tibble() -- confirm against the installed version.
new_data %>% tbl_df()
## Source: local data frame [8,984 x 4]
## 
##    PUBID_1997   sex acdconc vocconc
##         (int) (int)   (int)   (int)
## 1           1     2       1       0
## 2           2     1      -7       1
## 3           3     2      NA      NA
## 4           4     2       0       1
## 5           5     1       1       0
## 6           6     2       0       0
## 7           7     1      NA      NA
## 8           8     2       1       0
## 9           9     1       1       0
## 10         10     1       1       0
## ..        ...   ...     ...     ...
# Keep only the rows usable for the analysis:
#   * sex > 0 drops invalid responses on the gender variable;
#   * acdconc >= 0 and vocconc >= 0 drop every negative code on the academic
#     and vocational concentrator variables, leaving only 0 and 1.
# filter() also drops rows where a condition evaluates to NA, so the NA's are
# removed by the same call. Testing ">= 0" states the intent directly instead
# of relying on the codes -1 to -5 having been recoded to NA earlier.
new_datafltr <- new_data %>%
  filter(sex > 0, acdconc >= 0, vocconc >= 0)

# Look at the filtered data.
tbl_df(new_datafltr)
## Source: local data frame [5,697 x 4]
## 
##    PUBID_1997   sex acdconc vocconc
##         (int) (int)   (int)   (int)
## 1           1     2       1       0
## 2           4     2       0       1
## 3           5     1       1       0
## 4           6     2       0       0
## 5           8     2       1       0
## 6           9     1       1       0
## 7          10     1       1       0
## 8          11     2       0       0
## 9          12     1       0       0
## 10         13     1       0       0
## ..        ...   ...     ...     ...
# Cross-tabulate the three variables into a contingency table for a
# chi-square test.
new_datafltrtable <- xtabs(
  ~ new_datafltr$sex + new_datafltr$acdconc + new_datafltr$vocconc
)

# Look at the new table, flattened for printing.
new_datafltrtable %>% ftable()
##                                       new_datafltr$vocconc    0    1
## new_datafltr$sex new_datafltr$acdconc                               
## 1                0                                         1122  623
##                  1                                          655  434
## 2                0                                         1143  412
##                  1                                          939  369
# Perform the chi-square test on the hypothesis: summary() of the table
# reports the test for independence of all factors.
new_datafltrtable %>% summary()
## Call: xtabs(formula = ~new_datafltr$sex + new_datafltr$acdconc + new_datafltr$vocconc)
## Number of cases in table: 5697 
## Number of factors: 3 
## Test for independence of all factors:
##  Chisq = 99.92, df = 4, p-value = 1.021e-20

The null hypothesis is that the three variables sex, academic concentrator status, and vocational concentrator status are independent of each other. In other words, knowing any one of these variables doesn’t help to predict any of the others. The alternative hypothesis is that the three variables are not all independent of each other, so knowing one or two of them may help to predict another.

The result of the chi-square test is a p-value of 1.021e-20, which is far smaller than the alpha level of .05; therefore, we reject the null hypothesis.

```