# Getting the data for Assignment 3, WF ED 540
# This RScript is available as ReadGSSdata.R in
# the Class 12 folder at https://goo.gl/lSYV9c
# Load the packages that you will need
require(foreign)
## Loading required package: foreign
require(dplyr)
## Loading required package: dplyr
##
## Attaching package: 'dplyr'
##
## The following objects are masked from 'package:stats':
##
## filter, lag
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
require(ggvis)
## Loading required package: ggvis
require(magrittr)
## Loading required package: magrittr
# The packages must be installed, if not already
# installed, in your instance of R
# Read the SPSS .SAV file into R.
# read.spss() is asked for a plain list of columns (to.data.frame = FALSE).
# use.missings is given as an explicit FALSE: writing
# `use.missings = to.data.frame` in the call (as in the function's usage
# line) makes R look for a variable named to.data.frame in the calling
# environment instead of reusing the argument above. With FALSE, SPSS
# user-defined missing codes stay in the data as ordinary values; the codes
# in HRS1 are set to NA by hand further down.
# NOTE(review): max.value.labels is documented as a number; TRUE is
# presumably treated as 1 here -- confirm that this is intended.
GSSdata <- read.spss(
  "GSS.SAV",
  to.data.frame = FALSE,
  max.value.labels = TRUE,
  trim.factor.names = FALSE,
  reencode = NA,
  use.missings = FALSE
)
# Turn the list returned by read.spss() into a data frame, then wrap it
# with the dplyr function tbl_df
GSSdata <- tbl_df(data.frame(GSSdata))
# Sanity check on the import: mean, standard deviation and number of cases
# for HRS1, an important variable in the analysis below.
# NOTE(review): this runs before the -1/98/99 codes in HRS1 are set to NA
# further down, so those codes presumably still enter the mean and sd here.
summarise(
  GSSdata,
  mean_hours = mean(HRS1),
  sd_hours = sd(HRS1),
  n = n()
)
## Source: local data frame [1 x 3]
##
## mean_hours sd_hours n
## (dbl) (dbl) (int)
## 1 24.07573 24.19891 4820
# Changing values of -1, 98 and 99 in HRS1 to NA in a single pass
# (entries of HRS1 that are already NA stay NA)
GSSdata$HRS1 <- ifelse(GSSdata$HRS1 %in% c(99, 98, -1), NA, GSSdata$HRS1)
# Natural log of hours worked, stored as the new column logHRS1,
# followed by a look at its first few values
GSSdata[["logHRS1"]] <- log(GSSdata[["HRS1"]])
head(GSSdata[["logHRS1"]])
## [1] 2.708050 3.401197 4.094345 NA NA NA
# Histogram of hours worked (HRS1) with a labelled x axis
ggvis(GSSdata, ~HRS1) %>%
  layer_histograms() %>%
  add_axis("x", title = "Number of hours worked", title_offset = 50)
## Guessing width = 5 # range / 18
# Histogram of the log of hours worked (logHRS1)
ggvis(GSSdata, ~logHRS1) %>%
  layer_histograms()
## Guessing width = 0.2 # range / 23
# Model 1: linear regression of hours worked (HRS1) on SEX, INCOME and
# CHILDS, followed by the coefficient table.
# NOTE(review): only HRS1 is recoded to NA earlier in this script; SEX,
# INCOME and CHILDS are used as read -- confirm they carry no missing codes.
rg1 <- lm(HRS1 ~ SEX + INCOME + CHILDS, data = GSSdata)
rg1 %>% summary()
##
## Call:
## lm(formula = HRS1 ~ SEX + INCOME + CHILDS, data = GSSdata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -43.187 -6.158 1.352 6.878 52.185
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 51.76283 0.96729 53.513 < 2e-16 ***
## SEX -6.83234 0.56506 -12.091 < 2e-16 ***
## INCOME -0.06313 0.01897 -3.329 0.000884 ***
## CHILDS -0.17509 0.18161 -0.964 0.335080
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.06 on 2850 degrees of freedom
## (1966 observations deleted due to missingness)
## Multiple R-squared: 0.05383, Adjusted R-squared: 0.05284
## F-statistic: 54.05 on 3 and 2850 DF, p-value: < 2.2e-16
# 95% confidence intervals for the Model 1 coefficients
confint(rg1, level = 0.95)
## 2.5 % 97.5 %
## (Intercept) 49.8661762 53.65949281
## SEX -7.9403083 -5.72437518
## INCOME -0.1003202 -0.02594343
## CHILDS -0.5311864 0.18100984
# Model 2: the same predictors as Model 1, with the log of hours worked
# (logHRS1) as the outcome, followed by the coefficient table
rg2 <- lm(logHRS1 ~ SEX + INCOME + CHILDS, data = GSSdata)
rg2 %>% summary()
##
## Call:
## lm(formula = logHRS1 ~ SEX + INCOME + CHILDS, data = GSSdata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.7083 -0.0535 0.1376 0.2627 1.0106
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.9450589 0.0348306 113.264 < 2e-16 ***
## SEX -0.2036731 0.0203469 -10.010 < 2e-16 ***
## INCOME -0.0026367 0.0006829 -3.861 0.000115 ***
## CHILDS -0.0093400 0.0065395 -1.428 0.153328
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5425 on 2850 degrees of freedom
## (1966 observations deleted due to missingness)
## Multiple R-squared: 0.04075, Adjusted R-squared: 0.03974
## F-statistic: 40.35 on 3 and 2850 DF, p-value: < 2.2e-16
# 95% confidence intervals for the Model 2 coefficients
confint(rg2, level = 0.95)
## 2.5 % 97.5 %
## (Intercept) 3.876763157 4.01335462
## SEX -0.243569275 -0.16377695
## INCOME -0.003975814 -0.00129762
## CHILDS -0.022162555 0.00348253
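For the model of hours worked (rg1), the estimate of the population coefficient for CHILDS is -0.18 (rounded). This means that there is a negative relationship between HRS1 and NUMBER OF CHILDREN in the sample – as number of children increases, hours worked decreases. Note, however, that this coefficient is not statistically significant (p = 0.335).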
How accurate is the estimate of the relationship between number of children and hours worked? Examine the 95% confidence interval for CHILDS:
## 2.5 % 97.5 %
## CHILDS -0.5311864 0.18100984
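Our best estimate is that each additional child reduces hours worked by 0.18 hour. However, we are 95% confident that the change in hours worked per additional child is between -0.53 and 0.18 hour, that is, somewhere between a reduction of 0.53 hour and an increase of 0.18 hour. Because this interval includes zero, we cannot rule out that there is no relationship.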
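For the model of log hours worked (rg2), the estimate of the population coefficient for CHILDS is -0.01 (rounded). This means that there is a negative relationship between logHRS1 and NUMBER OF CHILDREN in the sample – as number of children increases, hours worked decreases. Because the outcome is the natural log of hours, the coefficient is in log units: each additional child is associated with roughly a 1% decrease in hours worked. This coefficient is also not statistically significant (p = 0.153).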
How accurate is the estimate of the relationship between number of children and hours worked? Examine the 95% confidence interval for CHILDS:
## 2.5 % 97.5 %
## CHILDS -0.022162555 0.00348253
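Our best estimate is that each additional child reduces the log of hours worked by 0.01, which corresponds to a reduction in hours worked of about 1% (not 0.01 hour, because the outcome is logged). However, we are 95% confident that the change in log hours per additional child is between -0.022 and 0.003, i.e. roughly between a 2.2% reduction and a 0.3% increase. Because this interval includes zero, we cannot rule out that there is no relationship.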
# Means of the variables in Model 1. rg11 is a tbl_df copy of GSSdata that
# the standard-deviation summary further down also uses. Only HRS1 is
# averaged with na.rm = TRUE.
rg11 <- tbl_df(GSSdata)
summarise(
  rg11,
  mean_HRS1 = mean(HRS1, na.rm = TRUE),
  mean_SEX = mean(SEX),
  mean_INC = mean(INCOME),
  mean_CH = mean(CHILDS)
)
## Source: local data frame [1 x 4]
##
## mean_HRS1 mean_SEX mean_INC mean_CH
## (dbl) (dbl) (dbl) (dbl)
## 1 40.26384 1.557676 15.11846 1.931535
# Standard deviations of the variables in Model 1 (NAs dropped for HRS1 only)
summarise(
  rg11,
  sd_HRS1 = sd(HRS1, na.rm = TRUE),
  sd_SEX = sd(SEX),
  sd_INCOME = sd(INCOME),
  sd_CHILDS = sd(CHILDS)
)
## Source: local data frame [1 x 4]
##
## sd_HRS1 sd_SEX sd_INCOME sd_CHILDS
## (dbl) (dbl) (dbl) (dbl)
## 1 15.47928 0.4967138 18.42523 1.686427
The table:
# Means of the variables in Model 2. rg22 is a tbl_df copy of GSSdata that
# the standard-deviation summary further down also uses. The first column
# keeps the name mean_HRS1 although it is computed from logHRS1.
rg22 <- tbl_df(GSSdata)
summarise(
  rg22,
  mean_HRS1 = mean(logHRS1, na.rm = TRUE),
  mean_SEX = mean(SEX),
  mean_INC = mean(INCOME),
  mean_CH = mean(CHILDS)
)
## Source: local data frame [1 x 4]
##
## mean_HRS1 mean_SEX mean_INC mean_CH
## (dbl) (dbl) (dbl) (dbl)
## 1 3.584611 1.557676 15.11846 1.931535
# Standard deviations of the variables in Model 2. The first column keeps
# the name sd_HRS1 although it is computed from logHRS1.
summarise(
  rg22,
  sd_HRS1 = sd(logHRS1, na.rm = TRUE),
  sd_SEX = sd(SEX),
  sd_INCOME = sd(INCOME),
  sd_CHILDS = sd(CHILDS)
)
## Source: local data frame [1 x 4]
##
## sd_HRS1 sd_SEX sd_INCOME sd_CHILDS
## (dbl) (dbl) (dbl) (dbl)
## 1 0.5535699 0.4967138 18.42523 1.686427
The table: