# Getting the data for Assignment 3, WF ED 540
# This RScript is available as ReadGSSdata.R in 
# the Class 12 folder at https://goo.gl/lSYV9c

# Load the packages that you will need
require(foreign)
## Loading required package: foreign
require(dplyr)
## Loading required package: dplyr
## 
## Attaching package: 'dplyr'
## 
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
require(ggvis)
## Loading required package: ggvis
require(magrittr)
## Loading required package: magrittr
# The packages must be installed, if not already 
# installed, in your instance of R
# Read the SPSS .SAV file into R. With to.data.frame = FALSE, read.spss
# returns a list of variables rather than a data frame.
# use.missings is given as an explicit FALSE: the original call passed
# `use.missings=to.data.frame`, and an explicitly supplied argument is
# evaluated in the calling environment, where this script defines no
# object named `to.data.frame`. FALSE is the value the function default
# would have produced for to.data.frame = FALSE, and it is consistent
# with the manual recoding of -1, 98 and 99 in HRS1 further down.
GSSdata <- read.spss("GSS.SAV",
                     max.value.labels = TRUE, to.data.frame = FALSE,
                     trim.factor.names = FALSE,
                     reencode = NA, use.missings = FALSE)
# Turn the list returned by read.spss into a data frame
GSSdata <- data.frame(GSSdata)
# Convert the data frame to a tibble. as_tibble() is used in place of
# tbl_df(), which is deprecated in current dplyr releases.
GSSdata <- as_tibble(GSSdata)
# Sanity check on the import: mean, standard deviation and number of
# cases for HRS1, an important variable used below. These figures are
# computed before any HRS1 codes are recoded to NA.
summarise(
  GSSdata,
  mean_hours = mean(HRS1),
  sd_hours = sd(HRS1),
  n = n()
)
## Source: local data frame [1 x 3]
## 
##   mean_hours sd_hours     n
##        (dbl)    (dbl) (int)
## 1   24.07573 24.19891  4820
# Change the HRS1 values -1, 98 and 99 to NA in a single pass
GSSdata$HRS1 <- ifelse(GSSdata$HRS1 %in% c(-1, 98, 99), NA, GSSdata$HRS1)
# Add the natural logarithm of HRS1 as a new column, logHRS1
GSSdata <- GSSdata %>%
  mutate(logHRS1 = log(HRS1))
# Show the first few values of the new column
GSSdata$logHRS1 %>%
  head()
## [1] 2.708050 3.401197 4.094345       NA       NA       NA

1. Visualizations. Using the ggvis package, create two plots ~

a. One histogram of HRS1; and

# Histogram of HRS1 with a titled x axis
ggvis(GSSdata, x = ~HRS1) %>%
  layer_histograms() %>%
  add_axis(type = "x", title = "Number of hours worked", title_offset = 50)
## Guessing width = 5 # range / 18

b. Another histogram of the natural logarithm (ln) of HRS1, or ln(HRS1).

# Histogram of ln(HRS1), taken from the logHRS1 column
ggvis(GSSdata, x = ~logHRS1) %>%
  layer_histograms()
## Guessing width = 0.2 # range / 23

2. Analyses. Conduct two ordinary least squares regression analyses ~

a. Use HRS1 as the dependent variable and at least three variables from the GSS 2012 Cross-Section and Panel Combined data set as independent variables; and

# OLS regression of HRS1 on SEX, INCOME and CHILDS, followed by the
# coefficient table and fit statistics.
# NOTE(review): the descriptive statistics later in this file report
# INCOME with mean 15.1 and SD 18.4, which suggests non-substantive
# codes may still be present in INCOME (only HRS1 was recoded above).
# Confirm against the codebook before interpreting this coefficient.
rg1 <- lm(HRS1 ~ SEX + INCOME + CHILDS, data = GSSdata)
rg1 %>%
  summary()
## 
## Call:
## lm(formula = HRS1 ~ SEX + INCOME + CHILDS, data = GSSdata)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -43.187  -6.158   1.352   6.878  52.185 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 51.76283    0.96729  53.513  < 2e-16 ***
## SEX         -6.83234    0.56506 -12.091  < 2e-16 ***
## INCOME      -0.06313    0.01897  -3.329 0.000884 ***
## CHILDS      -0.17509    0.18161  -0.964 0.335080    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.06 on 2850 degrees of freedom
##   (1966 observations deleted due to missingness)
## Multiple R-squared:  0.05383,    Adjusted R-squared:  0.05284 
## F-statistic: 54.05 on 3 and 2850 DF,  p-value: < 2.2e-16
# 95% confidence intervals for the coefficients of rg1
confint(rg1, level = 0.95)
##                  2.5 %      97.5 %
## (Intercept) 49.8661762 53.65949281
## SEX         -7.9403083 -5.72437518
## INCOME      -0.1003202 -0.02594343
## CHILDS      -0.5311864  0.18100984

b. Use ln(HRS1) as the dependent variable and the same set of independent variables that you chose for 2.a.

# OLS regression of ln(HRS1) (column logHRS1) on the same predictors as
# rg1, followed by the coefficient table and fit statistics
rg2 <- lm(logHRS1 ~ SEX + INCOME + CHILDS, data = GSSdata)
rg2 %>%
  summary()
## 
## Call:
## lm(formula = logHRS1 ~ SEX + INCOME + CHILDS, data = GSSdata)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.7083 -0.0535  0.1376  0.2627  1.0106 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.9450589  0.0348306 113.264  < 2e-16 ***
## SEX         -0.2036731  0.0203469 -10.010  < 2e-16 ***
## INCOME      -0.0026367  0.0006829  -3.861 0.000115 ***
## CHILDS      -0.0093400  0.0065395  -1.428 0.153328    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5425 on 2850 degrees of freedom
##   (1966 observations deleted due to missingness)
## Multiple R-squared:  0.04075,    Adjusted R-squared:  0.03974 
## F-statistic: 40.35 on 3 and 2850 DF,  p-value: < 2.2e-16
# 95% confidence intervals for the coefficients of rg2
confint(rg2, level = 0.95)
##                    2.5 %      97.5 %
## (Intercept)  3.876763157  4.01335462
## SEX         -0.243569275 -0.16377695
## INCOME      -0.003975814 -0.00129762
## CHILDS      -0.022162555  0.00348253

Hypothesis tests

A.HRS1

Interpretation

Point estimate

The estimate of the population coefficient for CHILDS is -0.18 (rounded). This means that the estimated relationship between HRS1 and NUMBER OF CHILDREN is negative – as the number of children increases, hours worked decreases – although the coefficient is not statistically significant (p = .335).

Interval estimate

How accurate is the estimate of the relationship between number of children and hours worked? Examine the 95% confidence interval for CHILDS:

## 2.5 % 97.5 %

## CHILDS -0.5311864 0.18100984

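Our best estimate is that each additional child is associated with a reduction of 0.18 hours worked, holding SEX and INCOME constant. However, we are 95% confident that the change in hours worked per additional child lies between -0.53 and 0.18. Because this interval includes zero, we cannot rule out that there is no relationship.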

Hypothesis tests (Natural Logarithm)

B. Natural Logarithm of HRS1

Interpretation

Point estimate

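The estimate of the population coefficient for CHILDS is -0.01 (rounded). This means that the estimated relationship between ln(HRS1) and NUMBER OF CHILDREN is negative – as the number of children increases, the natural logarithm of hours worked decreases – although the coefficient is not statistically significant (p = .153).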

Interval estimate

How accurate is the estimate of the relationship between number of children and hours worked? Examine the 95% confidence interval for CHILDS:

## 2.5 % 97.5 %

## CHILDS -0.022162555 0.00348253

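Our best estimate is that each additional child is associated with a reduction of 0.01 in ln(HRS1), which corresponds to roughly 1% fewer hours worked, holding SEX and INCOME constant. However, we are 95% confident that the change in ln(HRS1) per additional child lies between -0.022 and 0.003 (roughly a 2.2% decrease to a 0.3% increase in hours worked). Because this interval includes zero, we cannot rule out that there is no relationship.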

3. Reports. Build two APA–correct tables ~

a. One table reporting the regression of HRS1 that you conducted in 2.a.; and

# Means of the variables used in regression rg1, for the report table.
# as_tibble() is used in place of tbl_df(), which is deprecated in
# current dplyr releases; rg11 is still assigned so later code that
# refers to it keeps working.
# NOTE(review): these means are taken over every row (only HRS1 drops
# NAs), whereas rg1 reports 1966 observations deleted due to
# missingness. Confirm which sample the table is meant to describe.
rg11 <- as_tibble(GSSdata)
rg11 %>%
  summarise(
    mean_HRS1 = mean(HRS1, na.rm = TRUE),
    mean_SEX = mean(SEX),
    mean_INC = mean(INCOME),
    mean_CH = mean(CHILDS)
  )
## Source: local data frame [1 x 4]
## 
##   mean_HRS1 mean_SEX mean_INC  mean_CH
##       (dbl)    (dbl)    (dbl)    (dbl)
## 1  40.26384 1.557676 15.11846 1.931535
# Standard deviations of the same four variables, for the report table
summarise(
  rg11,
  sd_HRS1 = sd(HRS1, na.rm = TRUE),
  sd_SEX = sd(SEX),
  sd_INCOME = sd(INCOME),
  sd_CHILDS = sd(CHILDS)
)
## Source: local data frame [1 x 4]
## 
##    sd_HRS1    sd_SEX sd_INCOME sd_CHILDS
##      (dbl)     (dbl)     (dbl)     (dbl)
## 1 15.47928 0.4967138  18.42523  1.686427

The table:

b. Another table reporting the regression of ln(HRS1) that you conducted in 2.b.

# Means of the variables used in regression rg2, for the report table.
# as_tibble() is used in place of tbl_df(), which is deprecated in
# current dplyr releases; rg22 is still assigned so later code that
# refers to it keeps working.
# The first output column keeps its original name, mean_HRS1, but it
# holds the mean of logHRS1, i.e. ln(HRS1).
rg22 <- as_tibble(GSSdata)
rg22 %>%
  summarise(
    mean_HRS1 = mean(logHRS1, na.rm = TRUE),
    mean_SEX = mean(SEX),
    mean_INC = mean(INCOME),
    mean_CH = mean(CHILDS)
  )
## Source: local data frame [1 x 4]
## 
##   mean_HRS1 mean_SEX mean_INC  mean_CH
##       (dbl)    (dbl)    (dbl)    (dbl)
## 1  3.584611 1.557676 15.11846 1.931535
# Standard deviations for the rg2 report table. The sd_HRS1 column
# holds the standard deviation of logHRS1, i.e. ln(HRS1).
summarise(
  rg22,
  sd_HRS1 = sd(logHRS1, na.rm = TRUE),
  sd_SEX = sd(SEX),
  sd_INCOME = sd(INCOME),
  sd_CHILDS = sd(CHILDS)
)
## Source: local data frame [1 x 4]
## 
##     sd_HRS1    sd_SEX sd_INCOME sd_CHILDS
##       (dbl)     (dbl)     (dbl)     (dbl)
## 1 0.5535699 0.4967138  18.42523  1.686427

The table: