#WFED540 Class 12 Practice
##By: Andrew Leigey
##Date: November 6, 2015



require(dplyr)
## Loading required package: dplyr
## 
## Attaching package: 'dplyr'
## 
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
require(ggvis)
## Loading required package: ggvis
require(magrittr)
## Loading required package: magrittr
# Read the space-delimited extract and name its four columns with the
# reference numbers that vallabels() expects.
new_data <- read.table(file = "Class12_3_Variables.dat", sep = " ")
names(new_data) <- c("R0000100", "R0536300", "R9859700", "R9859900")

# Handle missing values: every cell holding one of these negative codes is
# replaced by NA, one code at a time and in this order.
missing_codes <- c(
  -1, # Refused
  -2, # Dont know
  -3, # Invalid missing
  -4, # Valid missing
  -5  # Non-interview
)
for (code in missing_codes) {
  new_data[new_data == code] <- NA
}

# Convert the raw numeric codes of the four variables into labelled factors.
#
# data:    data frame holding the columns R0000100, R0536300, R9859700 and
#          R9859900 (reference-number names, i.e. before qnames() is applied).
# returns: the same data frame with those four columns replaced by factors.
#
# If there are values not categorized they will be represented as NA.
vallabels <- function(data) {
  # Bins are closed on the left: [0,1), [1,1000), ..., [9000,9999].
  # include.lowest = TRUE closes the final bin on the right as well, so that
  # a value of exactly 9999 is labelled "9000 TO 9999" rather than becoming NA.
  id_breaks <- c(0, 1, seq(1000, 9000, by = 1000), 9999)
  id_labels <- c(
    "0", "1 TO 999", "1000 TO 1999", "2000 TO 2999", "3000 TO 3999",
    "4000 TO 4999", "5000 TO 5999", "6000 TO 6999", "7000 TO 7999",
    "8000 TO 8999", "9000 TO 9999"
  )
  data$R0000100 <- cut(
    data$R0000100,
    breaks = id_breaks,
    labels = id_labels,
    right = FALSE,
    include.lowest = TRUE
  )

  data$R0536300 <- factor(
    data$R0536300,
    levels = c(1, 2, 0),
    labels = c("Male", "Female", "No Information")
  )

  # Both transcript-concentration variables share one coding scheme.
  conc_levels <- c(-8, -7, -6, 0, 1)
  conc_labels <- c(
    "Invalid Carnegie credits or only pre-hs courses",
    "All required courses taken but at least one course has missing credit info",
    "Criterion for one subject not met",
    "Does not meet all of the criteria",
    "Meets all of the criteria"
  )
  data$R9859700 <- factor(data$R9859700, levels = conc_levels, labels = conc_labels)
  data$R9859900 <- factor(data$R9859900, levels = conc_levels, labels = conc_labels)

  data
}

# Descriptive labels for the four variables, listed in the same order as the
# column names assigned to the data.
varlabels <- c(
  "PUBID - YTH ID CODE 1997",
  "KEY!SEX (SYMBOL) 1997",
  "TRANS_ACAD_CONC HSTR",
  "TRANS_VOC_CONC HSTR"
)

# Use qnames rather than rnums: give the columns their readable question
# names in place of the reference numbers.
#
# data:    data frame whose columns are to be renamed.
# returns: the same data frame carrying the new column names.
qnames <- function(data) {
  question_names <- c("PUBID_1997", "sex", "acdconc", "vocconc")
  setNames(data, question_names)
}

#*************************************************************************************

# Remove the '#' before the following line to create a data file called "categories" with value labels. 
#categories <- vallabels(new_data)

# Rename the variables using Qnames instead of Reference Numbers.
# Remove the '#' before the second line to rename "categories" as well.
new_data <- new_data %>% qnames()
#categories <- qnames(categories)

# Print per-column summary statistics for the raw (uncategorized) data file
summary(new_data)
##    PUBID_1997        sex           acdconc           vocconc       
##  Min.   :   1   Min.   :1.000   Min.   :-8.0000   Min.   :-8.0000  
##  1st Qu.:2249   1st Qu.:1.000   1st Qu.: 0.0000   1st Qu.: 0.0000  
##  Median :4502   Median :1.000   Median : 0.0000   Median : 0.0000  
##  Mean   :4504   Mean   :1.488   Mean   :-0.1847   Mean   : 0.1171  
##  3rd Qu.:6758   3rd Qu.:2.000   3rd Qu.: 1.0000   3rd Qu.: 1.0000  
##  Max.   :9022   Max.   :2.000   Max.   : 1.0000   Max.   : 1.0000  
##                                 NA's   :2752      NA's   :2752
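# Remove the '#' before the following lines to produce summaries for the "categories" data file.
# NOTE: vallabels() reads the original reference-number columns, so the next line only works
# if new_data has not yet been renamed by the qnames() call above. Otherwise create "categories"
# before that call and uncomment only summary(categories).
#categories <- vallabels(new_data)
#summary(categories)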

#************************************************************************************************************

# Quick structural look at the renamed data: one line per column showing its
# type and leading values (the recorded output reports 8,984 observations).
glimpse(new_data)
## Observations: 8,984
## Variables: 4
## $ PUBID_1997 (int) 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, ...
## $ sex        (int) 2, 1, 2, 2, 1, 2, 1, 2, 1, 1, 2, 1, 1, 1, 2, 1, 2, ...
## $ acdconc    (int) 1, -7, NA, 0, 1, 0, NA, 1, 1, 1, 0, 0, 0, NA, 0, 0,...
## $ vocconc    (int) 0, 1, NA, 1, 0, 0, NA, 0, 0, 0, 0, 0, 0, NA, 1, 0, ...
# NOTE(review): summarise() is called here without any summary expressions, so
# it yields the empty result recorded below; summary(new_data) may have been
# what was intended - confirm.
summarise(new_data)
## data frame with 0 columns and 0 rows
# Print the data as a dplyr local data frame; only the first 10 rows are shown.
# NOTE(review): tbl_df() is reported as deprecated in newer dplyr releases in
# favour of as_tibble() - check against the installed dplyr version.
tbl_df(new_data)
## Source: local data frame [8,984 x 4]
## 
##    PUBID_1997   sex acdconc vocconc
##         (int) (int)   (int)   (int)
## 1           1     2       1       0
## 2           2     1      -7       1
## 3           3     2      NA      NA
## 4           4     2       0       1
## 5           5     1       1       0
## 6           6     2       0       0
## 7           7     1      NA      NA
## 8           8     2       1       0
## 9           9     1       1       0
## 10         10     1       1       0
## ..        ...   ...     ...     ...
# Keep only rows with a positive sex code and with both concentration codes
# above -6, which removes the -8/-7/-6 transcript codes. Rows in which any of
# these comparisons is NA are not kept (compare rows 3 and 7 of the raw data
# with the output below).
new_datafltr <- new_data %>%
  filter(
    sex > 0,
    acdconc > -6,
    vocconc > -6
  )

# Show the first rows of the filtered data.
tbl_df(new_datafltr)
## Source: local data frame [5,697 x 4]
## 
##    PUBID_1997   sex acdconc vocconc
##         (int) (int)   (int)   (int)
## 1           1     2       1       0
## 2           4     2       0       1
## 3           5     1       1       0
## 4           6     2       0       0
## 5           8     2       1       0
## 6           9     1       1       0
## 7          10     1       1       0
## 8          11     2       0       0
## 9          12     1       0       0
## 10         13     1       0       0
## ..        ...   ...     ...     ...
# Three-way contingency table of sex by academic concentration by vocational
# concentration, built from the filtered rows.
# NOTE(review): the formula refers to the columns as new_datafltr$..., so the
# table's dimension names carry that prefix (see the output below). Passing
# data = new_datafltr with bare column names would presumably give shorter
# labels - presentation only, left unchanged here.
new_datafltrtable<-xtabs(~new_datafltr$sex + new_datafltr$acdconc + new_datafltr$vocconc)

# Print the 3-way table in a flat layout: sex and acdconc as rows, vocconc as
# columns.
ftable(new_datafltrtable)
##                                       new_datafltr$vocconc    0    1
## new_datafltr$sex new_datafltr$acdconc                               
## 1                0                                         1122  623
##                  1                                          655  434
## 2                0                                         1143  412
##                  1                                          939  369
# Report the number of cases and the chi-squared test for independence of all
# three factors.
summary(new_datafltrtable)
## Call: xtabs(formula = ~new_datafltr$sex + new_datafltr$acdconc + new_datafltr$vocconc)
## Number of cases in table: 5697 
## Number of factors: 3 
## Test for independence of all factors:
##  Chisq = 99.92, df = 4, p-value = 1.021e-20