Nov 19, 2015
Getting the data for Assignment 3, WF ED 540
This RScript is available as ReadGSSdata.R in
Load the packages that you will need
require(foreign)
## Loading required package: foreign
require(dplyr)
## Loading required package: dplyr
##
## Attaching package: 'dplyr'
##
## The following objects are masked from 'package:stats':
##
## filter, lag
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
require(ggvis)
## Loading required package: ggvis
require(magrittr)
## Loading required package: magrittr
The packages must be installed, if not already
installed, in your instance of R
Read the SPSS .SAV file in to R
GSSdata <- read.spss("GSS.SAV",
max.value.labels=TRUE, to.data.frame=FALSE,
trim.factor.names=FALSE,
reencode=NA, use.missings=to.data.frame)
Read the newly-read data from GSS.SAV
into a data frame in R
Convert the data frame to a table frame through
the dplyr function, tbl_df
Just to be sure you have the data read correctly,
run some descriptive statistics (mean, standard
deviation, and count of cases) for an important
variable we will use, HRS1:
## Source: local data frame [1 x 3]
##
## mean_hours sd_hours n
## (dbl) (dbl) (int)
## 1 24.07573 24.19891 4820
1. Visualizations. Using the ggvis package, create two plots ~
a. One histogram of HRS1; and
hist(GSSdata$HRS1, main="Histogram on Hours per Week", xlab="Working Hour")

b. Another histogram of the natural logarithm (ln) of HRS1, or ln(HRS1).
## Source: local data frame [1 x 3]
##
## mean_hours sd_hours n
## (dbl) (dbl) (int)
## 1 24.87863 23.37249 4820
hist((GSSdata$HRS1log0), main="Log Scale of Histogram on Hours per Week", xlab="Working Hour_log")

Hoursweek <- table (GSSdata$HRS1)
Hoursweek
##
## -1 1 2 3 4 5 6 7 8 9 10 11 12 13 14
## 1935 9 2 1 10 10 6 5 19 95 20 2 20 5 5
## 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29
## 32 24 3 8 3 104 7 9 2 30 50 9 9 13 2
## 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44
## 105 6 69 8 10 84 50 36 43 9 831 15 39 17 35
## 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59
## 149 29 11 66 3 219 3 25 6 1 84 18 9 11 2
## 60 62 63 64 65 66 67 68 70 72 74 75 77 80 82
## 166 8 3 4 30 1 3 9 51 9 1 6 2 27 1
## 85 86 89 98 99
## 3 2 21 2 29
GSSdata$HRS1 <- ifelse (GSSdata$HRS1 == 99, NA, GSSdata$HRS1)
GSSdata$HRS1 <- ifelse (GSSdata$HRS1 == 98, NA, GSSdata$HRS1)
GSSdata$HRS1 <- ifelse (GSSdata$HRS1 == -1, NA, GSSdata$HRS1)
Hoursweek1 <- table (GSSdata$HRS1)
Hoursweek1
##
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
## 9 2 1 10 10 6 5 19 95 20 2 20 5 5 32 24 3 8
## 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36
## 3 104 7 9 2 30 50 9 9 13 2 105 6 69 8 10 84 50
## 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54
## 36 43 9 831 15 39 17 35 149 29 11 66 3 219 3 25 6 1
## 55 56 57 58 59 60 62 63 64 65 66 67 68 70 72 74 75 77
## 84 18 9 11 2 166 8 3 4 30 1 3 9 51 9 1 6 2
## 80 82 85 86 89
## 27 1 3 2 21
Income <- table (GSSdata$INCOME)
GSSdata$INCOME <- ifelse (GSSdata$INCOME == 13, NA, GSSdata$INCOME)
GSSdata$INCOME <- ifelse (GSSdata$INCOME == 98, NA,GSSdata$INCOME)
Income1<- table (GSSdata$INCOME)
Income1
##
## 1 2 3 4 5 6 7 8 9 10 11 12
## 58 46 35 28 23 44 52 98 314 248 303 3125
I reclassfied the “SEX”" category: Male=o, Female=1
Sex <- table(GSSdata$SEX)
Sex
##
## 1 2
## 2132 2688
GSSdata$Sexdicho <- 0
GSSdata$Sexdicho <- ifelse(GSSdata$SEX == 2, 1, GSSdata$Sexdicho)
Sexdicho <- table(GSSdata$Sexdicho)
Sexdicho
##
## 0 1
## 2132 2688
Race <- table(GSSdata$RACE)
Race
##
## 1 2 3
## 3700 722 398
I reclassfied the “RACE”" category: Black=1, Non-black=0
GSSdata$Racedicho <- 0
GSSdata$Racedicho <- ifelse(GSSdata$RACE == 2, 1, GSSdata$Racedicho)
Racedicho <- table(GSSdata$Racedicho)
Racedicho
##
## 0 1
## 4098 722
2. Analyses. Conduct two ordinary least squares regression analyses ~
a. Use HRS1 as the dependent variable and at least three variables from the
rg1<-lm (HRS1~ INCOME+Sexdicho+Racedicho, data=GSSdata)
summary(rg1)
##
## Call:
## lm(formula = HRS1 ~ INCOME + Sexdicho + Racedicho, data = GSSdata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -43.393 -4.844 1.572 7.023 53.413
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 30.0531 2.0854 14.411 < 2e-16 ***
## INCOME 1.1950 0.1769 6.757 1.73e-11 ***
## Sexdicho -6.4159 0.5752 -11.155 < 2e-16 ***
## Racedicho 0.4507 0.8345 0.540 0.589
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 14.82 on 2667 degrees of freedom
## (2149 observations deleted due to missingness)
## Multiple R-squared: 0.06237, Adjusted R-squared: 0.06132
## F-statistic: 59.14 on 3 and 2667 DF, p-value: < 2.2e-16
confint(rg1)
## 2.5 % 97.5 %
## (Intercept) 25.9640079 34.142242
## INCOME 0.8482067 1.541831
## Sexdicho -7.5436800 -5.288098
## Racedicho -1.1856423 2.087056
α= .05,then the p-value, 2.2e-16, is less than α. Therefore, we reject the null hypothesis that there is no relationship between the dependent variable and the entire sent of independent variables.
α= .05, then the p-value, 1.73e-11, is less than α. Therefore, we reject the null hypothesis that there is no relationship between ‘HRS1’ and ‘INCOME’.
α= .05, then the p-value, 2e-16, is less than α. Therefore, we reject the null hypothesis that there is no relationship between ‘HRS1’ and ‘Sexdicho’.
Our estimate is that for each 1 hour of additional hours per week, total family income increase by 1.20(rounded) . However, we are 95% confident that this increasing is between 0.85 and 1.54.
Our estimate is that women work less 6.42(rounded) hours per week than men. However, we are 95% confident that women work between 7.54 and 5.29 hours per week less than men.
b. Use ln(HRS1) as the dependent variable and the same set of independent variables that you chose for 2.a.
rg2<-lm(HRS1log0~INCOME+Sexdicho+Racedicho, data=GSSdata)
summary(rg2)
##
## Call:
## lm(formula = HRS1log0 ~ INCOME + Sexdicho + Racedicho, data = GSSdata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -32.049 -23.637 4.281 15.363 102.281
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.9907 1.8006 1.106 0.269
## INCOME 2.5381 0.1530 16.586 <2e-16 ***
## Sexdicho -7.8103 0.6754 -11.564 <2e-16 ***
## Racedicho 0.6012 0.9485 0.634 0.526
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 22.12 on 4370 degrees of freedom
## (446 observations deleted due to missingness)
## Multiple R-squared: 0.09198, Adjusted R-squared: 0.09136
## F-statistic: 147.6 on 3 and 4370 DF, p-value: < 2.2e-16
confint(rg2)
## 2.5 % 97.5 %
## (Intercept) -1.539289 5.520719
## INCOME 2.238061 2.838068
## Sexdicho -9.134319 -6.486186
## Racedicho -1.258299 2.460609
α= .05, then the p-value, 2.2e-16, is less than α. Therefore, we reject the null hypothesis that there is no relationship between the dependent variable and the entire sent of independent variables.
α= .05, then the p-value, <2e-16, is less than α. Therefore, we reject the null hypothesis that there is no relationship between ‘HRS1’(log) and ‘INCOME’.
α= .05, then the p-value, 2e-16, is less than α. Therefore, we reject the null hypothesis that there is no relationship between ‘HRS1’(log) and ‘Sexdicho’.
Our estimate is that for each 1 hour of additional hours per week, total family income increase by 2.54(rounded) . However, we are 95% confident that this increasing is between 2.24 and 2.84(rounded).
Our estimate is that women work less 7.81(rounded) hours per week than men. However, we are 95% confident that women work between 9.13 and 6.49 hours per week less than men.
3. Reports. Build two APA–correct tables ~
HRS1 <- tbl_df(GSSdata)
HRS1 %>%
summarise(mean_Hoursweek1=mean(Hoursweek1), mean_Income1=mean(Income1), mean_Sex=mean(Sexdicho),mean_Race = mean(Racedicho))
## Source: local data frame [1 x 4]
##
## mean_Hoursweek1 mean_Income1 mean_Sex mean_Race
## (dbl) (dbl) (dbl) (dbl)
## 1 37.06494 364.5 0.5576763 0.1497925
HRS1 %>%
summarise(sd_Hoursweek1=sd(Hoursweek1), sd_Income1=sd(Income1), sd_Sex=mean(Sexdicho),sd_Race = sd(Racedicho))
## Source: local data frame [1 x 4]
##
## sd_Hoursweek1 sd_Income1 sd_Sex sd_Race
## (dbl) (dbl) (dbl) (dbl)
## 1 100.0032 876.225 0.5576763 0.356905
Hoursweek1log <- ifelse(Hoursweek1 < 1, abs(Hoursweek1), Hoursweek1)
Hoursweek1log <- ifelse(Hoursweek1log == 0, (Hoursweek1log + .1), Hoursweek1log)
Income1log <- ifelse(Income1 < 1, abs(Income1), Income1)
Income1log <- ifelse(Income1log == 0, (Income1log + .1), Income1log)
Sexdicholog <- ifelse(Sexdicho < 1, abs(Sexdicho), Sexdicho)
Sexdicholog <- ifelse(Sexdicholog == 0, (Sexdicholog + .1), Sexdicholog)
Racedicholog <- ifelse(Racedicho < 1, abs(Sexdicho), Sexdicho)
Racedicholog <- ifelse(Racedicholog == 0, (Racedicholog + .1), Racedicholog)
HRS1 <- tbl_df(GSSdata)
HRS1 %>%
summarise(mean_Hoursweek1log=mean(Hoursweek1log), mean_Income1log=mean(Income1log), mean_Sexdicholog=mean(Sexdicholog),mean_Racedicholog = mean(Racedicholog))
## Source: local data frame [1 x 4]
##
## mean_Hoursweek1log mean_Income1log mean_Sexdicholog mean_Racedicholog
## (dbl) (dbl) (dbl) (dbl)
## 1 37.06494 364.5 2410 2410
HRS1 %>%
summarise(sd_Hoursweek1log=sd(Hoursweek1log), sd_Income1log=sd(Income1log), sd_Sexdicholog=sd(Sexdicholog),sd_Racedicholog = sd(Racedicholog))
## Source: local data frame [1 x 4]
##
## sd_Hoursweek1log sd_Income1log sd_Sexdicholog sd_Racedicholog
## (dbl) (dbl) (dbl) (dbl)
## 1 100.0032 876.225 393.1514 393.1514
a. One table reporting the regression of HRS1 that you conducted in 2.a.; and

b. Another table reporting the regression of ln(HRS1) that you conducted in 2.b.
