Assignment #3

Woongbae Park

Nov 19, 2015

Getting the data for Assignment 3, WF ED 540
This RScript is available as ReadGSSdata.R in
the Class 12 folder at https://goo.gl/lSYV9c
Load the packages that you will need
require(foreign)
## Loading required package: foreign
require(dplyr)
## Loading required package: dplyr
## 
## Attaching package: 'dplyr'
## 
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
require(ggvis)
## Loading required package: ggvis
require(magrittr)
## Loading required package: magrittr
The packages must be installed, if not already
installed, in your instance of R
Read the SPSS .SAV file in to R
# Read the SPSS file as a plain list (to.data.frame = FALSE).
# use.missings is written out as FALSE. The original passed
# `use.missings = to.data.frame`, but an argument supplied by the caller is
# evaluated in the calling environment, where no object named
# `to.data.frame` is defined in this script. FALSE is the value read.spss()
# would default to when to.data.frame = FALSE, so the raw codes (e.g. -1, 98
# and 99 in HRS1) stay in the data and are recoded to NA explicitly later.
# NOTE(review): max.value.labels is documented as a number; TRUE is
# presumably coerced to 1 here - confirm that this is intended.
GSSdata <- read.spss(
  "GSS.SAV",
  max.value.labels = TRUE,
  to.data.frame = FALSE,
  trim.factor.names = FALSE,
  reencode = NA,
  use.missings = FALSE
)
Read the newly-read data from GSS.SAV
into a data frame in R
Convert the data frame to a table frame through
the dplyr function, tbl_df
Just to be sure you have the data read correctly,
run some descriptive statistics (mean, standard
deviation, and count of cases) for an important
variable we will use, HRS1:
## Source: local data frame [1 x 3]
## 
##   mean_hours sd_hours     n
##        (dbl)    (dbl) (int)
## 1   24.07573 24.19891  4820
1. Visualizations. Using the ggvis package, create two plots ~
a. One histogram of HRS1; and
# Histogram of HRS1. The second "x" axis sits on top with no ticks, a white
# axis line and zero-size labels, so only its title is visible and it serves
# as the plot title.
ggvis(GSSdata, ~HRS1, fill := "#fff8dc") %>%
  layer_histograms() %>%
  add_axis(type = "x", title = "Hours Worked", title_offset = 50) %>%
  add_axis(type = "y", title = "Count", title_offset = 50) %>%
  add_axis(
    type = "x",
    orient = "top",
    ticks = 0,
    title = "Histogram on Hours per Week",
    properties = axis_props(
      labels = list(fontSize = 0),
      axis = list(stroke = "white")
    )
  )
## Guessing width = 5 # range / 21

b. Another histogram of the natural logarithm (ln) of HRS1, or ln(HRS1).
## Warning in log(GSSdata$HRS1): NaNs produced
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   -1.00   -1.00   25.00   24.08   40.00   99.00
# Histogram of LogHRS1. As in the HRS1 plot, the top "x" axis has no ticks, a
# white axis line and zero-size labels, so only its title is visible.
ggvis(GSSdata, ~LogHRS1, fill := "#fff8dc") %>%
  layer_histograms() %>%
  add_axis(type = "x", title = "Log Hours Worked", title_offset = 50) %>%
  add_axis(type = "y", title = "Count", title_offset = 50) %>%
  add_axis(
    type = "x",
    orient = "top",
    ticks = 0,
    title = "Log Scale of Histogram on Hours per Week",
    properties = axis_props(
      labels = list(fontSize = 0),
      axis = list(stroke = "white")
    )
  )
## Guessing width = 0.2 # range / 23

# Frequency table of the HRS1 codes; the outer parentheses print the result.
(Hoursweek <- table(GSSdata$HRS1))
## 
##   -1    1    2    3    4    5    6    7    8    9   10   11   12   13   14 
## 1935    9    2    1   10   10    6    5   19   95   20    2   20    5    5 
##   15   16   17   18   19   20   21   22   23   24   25   26   27   28   29 
##   32   24    3    8    3  104    7    9    2   30   50    9    9   13    2 
##   30   31   32   33   34   35   36   37   38   39   40   41   42   43   44 
##  105    6   69    8   10   84   50   36   43    9  831   15   39   17   35 
##   45   46   47   48   49   50   51   52   53   54   55   56   57   58   59 
##  149   29   11   66    3  219    3   25    6    1   84   18    9   11    2 
##   60   62   63   64   65   66   67   68   70   72   74   75   77   80   82 
##  166    8    3    4   30    1    3    9   51    9    1    6    2   27    1 
##   85   86   89   98   99 
##    3    2   21    2   29
# Set the HRS1 codes -1, 98 and 99 to NA in a single pass.
GSSdata$HRS1 <- ifelse(GSSdata$HRS1 %in% c(-1, 98, 99), NA, GSSdata$HRS1)
# Rebuild the log variable from the cleaned hours. LogHRS1 was computed
# before this recode (see the earlier "log(GSSdata$HRS1): NaNs produced"
# warning), so it still held log(98), log(99) and NaN for -1; that is why the
# ln(HRS1) regression kept 22 more cases than the HRS1 regression. Printed
# output further down that uses LogHRS1 predates this line; re-knit to
# refresh it.
GSSdata$LogHRS1 <- log(GSSdata$HRS1)
Hoursweek1 <- table(GSSdata$HRS1)
Hoursweek1
## 
##   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18 
##   9   2   1  10  10   6   5  19  95  20   2  20   5   5  32  24   3   8 
##  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36 
##   3 104   7   9   2  30  50   9   9  13   2 105   6  69   8  10  84  50 
##  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53  54 
##  36  43   9 831  15  39  17  35 149  29  11  66   3 219   3  25   6   1 
##  55  56  57  58  59  60  62  63  64  65  66  67  68  70  72  74  75  77 
##  84  18   9  11   2 166   8   3   4  30   1   3   9  51   9   1   6   2 
##  80  82  85  86  89 
##  27   1   3   2  21
# Keep a frequency table of the INCOME codes as read, then set codes 13 and
# 98 to NA and tabulate (and print) what remains.
Income <- table(GSSdata$INCOME)
GSSdata$INCOME <- ifelse(GSSdata$INCOME %in% c(13, 98), NA, GSSdata$INCOME)
(Income1 <- table(GSSdata$INCOME))
## 
##    1    2    3    4    5    6    7    8    9   10   11   12 
##   58   46   35   28   23   44   52   98  314  248  303 3125
I reclassified the “SEX” category: Male=0, Female=1
# Frequency table of the SEX codes; the outer parentheses print the result.
(Sex <- table(GSSdata$SEX))
## 
##    1    2 
## 2132 2688
# Dummy-code sex in one step: SEX == 2 becomes 1, every other code becomes 0.
GSSdata$Sexdicho <- ifelse(GSSdata$SEX == 2, 1, 0)

# Tabulate and print the dummy. The table is stored under the same name,
# Sexdicho, as the data column.
(Sexdicho <- table(GSSdata$Sexdicho))
## 
##    0    1 
## 2132 2688
# Frequency table of the RACE codes; the outer parentheses print the result.
(Race <- table(GSSdata$RACE))
## 
##    1    2    3 
## 3700  722  398
I reclassified the “RACE” category: Black=1, Non-black=0
# Dummy-code race in one step: RACE == 2 becomes 1, every other code 0.
GSSdata$Racedicho <- ifelse(GSSdata$RACE == 2, 1, 0)
# Tabulate and print the dummy. The table is stored under the same name,
# Racedicho, as the data column.
(Racedicho <- table(GSSdata$Racedicho))
## 
##    0    1 
## 4098  722
2. Analyses. Conduct two ordinary least squares regression analyses ~
a. Use HRS1 as the dependent variable and at least three variables from the
# OLS regression of HRS1 on INCOME and the two dummies, followed by the
# coefficient table and fit statistics.
rg1 <- lm(
  formula = HRS1 ~ INCOME + Sexdicho + Racedicho,
  data = GSSdata
)
summary(object = rg1)
## 
## Call:
## lm(formula = HRS1 ~ INCOME + Sexdicho + Racedicho, data = GSSdata)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -43.393  -4.844   1.572   7.023  53.413 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  30.0531     2.0854  14.411  < 2e-16 ***
## INCOME        1.1950     0.1769   6.757 1.73e-11 ***
## Sexdicho     -6.4159     0.5752 -11.155  < 2e-16 ***
## Racedicho     0.4507     0.8345   0.540    0.589    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 14.82 on 2667 degrees of freedom
##   (2149 observations deleted due to missingness)
## Multiple R-squared:  0.06237,    Adjusted R-squared:  0.06132 
## F-statistic: 59.14 on 3 and 2667 DF,  p-value: < 2.2e-16
# Confidence intervals for the rg1 coefficients; level = 0.95 is confint()'s
# default, written out here.
confint(rg1, level = 0.95)
##                  2.5 %    97.5 %
## (Intercept) 25.9640079 34.142242
## INCOME       0.8482067  1.541831
## Sexdicho    -7.5436800 -5.288098
## Racedicho   -1.1856423  2.087056
With α = .05, the p-value of the overall F-test (< 2.2e-16) is less than α. Therefore, we reject the null hypothesis that there is no relationship between the dependent variable and the entire set of independent variables.
α= .05, then the p-value, 1.73e-11, is less than α. Therefore, we reject the null hypothesis that there is no relationship between ‘HRS1’ and ‘INCOME’.
With α = .05, the p-value (< 2e-16) is less than α. Therefore, we reject the null hypothesis that there is no relationship between ‘HRS1’ and ‘Sexdicho’.
Our estimate is that each one-category increase in total family income (INCOME) is associated with 1.20 (rounded) additional hours worked per week, holding sex and race constant. We are 95% confident that this increase is between 0.85 and 1.54 hours.
Our estimate is that women work 6.42 (rounded) fewer hours per week than men, holding income and race constant. We are 95% confident that women work between 5.29 and 7.54 fewer hours per week than men.
b. Use ln(HRS1) as the dependent variable and the same set of independent variables that you chose for 2.a.
# OLS regression of LogHRS1 on the same predictors as rg1, followed by the
# coefficient table and fit statistics.
rg2 <- lm(
  formula = LogHRS1 ~ INCOME + Sexdicho + Racedicho,
  data = GSSdata
)
summary(object = rg2)
## 
## Call:
## lm(formula = LogHRS1 ~ INCOME + Sexdicho + Racedicho, data = GSSdata)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.7116 -0.0611  0.1132  0.2690  1.4604 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.283825   0.074352  44.166  < 2e-16 ***
## INCOME       0.035646   0.006305   5.653 1.74e-08 ***
## Sexdicho    -0.184760   0.020601  -8.968  < 2e-16 ***
## Racedicho    0.048889   0.029819   1.640    0.101    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5329 on 2689 degrees of freedom
##   (2127 observations deleted due to missingness)
## Multiple R-squared:  0.04193,    Adjusted R-squared:  0.04087 
## F-statistic: 39.23 on 3 and 2689 DF,  p-value: < 2.2e-16
# Confidence intervals for the rg2 coefficients; level = 0.95 is confint()'s
# default, written out here.
confint(rg2, level = 0.95)
##                    2.5 %      97.5 %
## (Intercept)  3.138032529  3.42961817
## INCOME       0.023282737  0.04800964
## Sexdicho    -0.225156229 -0.14436467
## Racedicho   -0.009581511  0.10735885
With α = .05, the p-value of the overall F-test (< 2.2e-16) is less than α. Therefore, we reject the null hypothesis that there is no relationship between the dependent variable and the entire set of independent variables.
With α = .05, the p-value (1.74e-08) is less than α. Therefore, we reject the null hypothesis that there is no relationship between ‘HRS1’(log) and ‘INCOME’.
With α = .05, the p-value (< 2e-16) is less than α. Therefore, we reject the null hypothesis that there is no relationship between ‘HRS1’(log) and ‘Sexdicho’.
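Our estimate is that each one-category increase in total family income (INCOME) is associated with an increase of 0.036 (rounded) in ln(HRS1), i.e. about 3.6% more hours worked per week, holding sex and race constant. We are 95% confident that this increase in ln(HRS1) is between 0.023 and 0.048 (rounded), i.e. about 2.4% to 4.9% more hours.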
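Our estimate is that ln(HRS1) is 0.185 (rounded) lower for women than for men, i.e. women work about 16.9% fewer hours per week than men, holding income and race constant. We are 95% confident that the difference in ln(HRS1) is between 0.144 and 0.225, i.e. women work about 13.4% to 20.2% fewer hours per week than men.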
3. Reports. Build two APA–correct tables ~
# Means of the variables used in the HRS1 regression.
# The summaries now read the HRS1 and INCOME columns of the data. The
# original took mean(Hoursweek1) and mean(Income1), but those objects are
# frequency tables built with table(), so it averaged cell counts (364.5 is
# 4374 / 12) rather than respondents' values. na.rm = TRUE skips the codes
# that were recoded to NA. The printed output below predates this fix;
# re-knit to refresh it.
HRS1 <- tbl_df(GSSdata)
HRS1 %>%
  summarise(
    mean_Hoursweek1 = mean(HRS1, na.rm = TRUE),
    mean_Income1 = mean(INCOME, na.rm = TRUE),
    mean_Sex = mean(Sexdicho, na.rm = TRUE),
    mean_Race = mean(Racedicho, na.rm = TRUE)
  )
## Source: local data frame [1 x 4]
## 
##   mean_Hoursweek1 mean_Income1  mean_Sex mean_Race
##             (dbl)        (dbl)     (dbl)     (dbl)
## 1        37.06494        364.5 0.5576763 0.1497925
# Standard deviations of the variables used in the HRS1 regression.
# Two fixes: (1) sd_Sex was computed with mean() instead of sd(); (2) the
# hours and income figures were taken from the frequency tables Hoursweek1
# and Income1 (cell counts) instead of the HRS1 and INCOME columns.
# na.rm = TRUE skips the codes that were recoded to NA. The printed output
# below predates this fix; re-knit to refresh it.
HRS1 %>%
  summarise(
    sd_Hoursweek1 = sd(HRS1, na.rm = TRUE),
    sd_Income1 = sd(INCOME, na.rm = TRUE),
    sd_Sex = sd(Sexdicho, na.rm = TRUE),
    sd_Race = sd(Racedicho, na.rm = TRUE)
  )
## Source: local data frame [1 x 4]
## 
##   sd_Hoursweek1 sd_Income1    sd_Sex  sd_Race
##           (dbl)      (dbl)     (dbl)    (dbl)
## 1      100.0032    876.225 0.5576763 0.356905
# For each object: values below 1 are replaced by their absolute value, then
# exact zeros are replaced by 0.1.
# NOTE(review): Hoursweek1, Income1, Sexdicho and Racedicho are the frequency
# tables created earlier with table(), so these statements adjust cell
# counts rather than respondents' values, and no logarithm is actually
# taken - confirm what was intended.
Hoursweek1log <- ifelse(Hoursweek1 < 1, abs(Hoursweek1), Hoursweek1)
Hoursweek1log <- ifelse(Hoursweek1log == 0, (Hoursweek1log + .1), Hoursweek1log)
Income1log <- ifelse(Income1 < 1, abs(Income1), Income1)
Income1log <- ifelse(Income1log == 0, (Income1log + .1), Income1log)

Sexdicholog <- ifelse(Sexdicho < 1, abs(Sexdicho), Sexdicho)
Sexdicholog <- ifelse(Sexdicholog == 0, (Sexdicholog + .1), Sexdicholog)

# Fixed copy-paste slip: the race object was built from Sexdicho, which made
# it an exact copy of Sexdicholog.
Racedicholog <- ifelse(Racedicho < 1, abs(Racedicho), Racedicho)
Racedicholog <- ifelse(Racedicholog == 0, (Racedicholog + .1), Racedicholog)



# Means of the variables used in the ln(HRS1) regression.
# The original summarised the global Hoursweek1log / Income1log /
# Sexdicholog / Racedicholog objects, which derive from frequency tables, so
# it reported averages of cell counts (2410 is (2132 + 2688) / 2). The
# summaries now read the data columns instead. Only the dependent variable is
# on the log scale, matching LogHRS1 ~ INCOME + Sexdicho + Racedicho; the
# output names are kept unchanged. log(HRS1) is taken from the recoded HRS1
# column, and na.rm = TRUE skips the NA values.
# NOTE(review): presumably these are the descriptives for the ln(HRS1)
# table - confirm. The printed output below predates this fix; re-knit to
# refresh it.
HRS1 <- tbl_df(GSSdata)
HRS1 %>%
  summarise(
    mean_Hoursweek1log = mean(log(HRS1), na.rm = TRUE),
    mean_Income1log = mean(INCOME, na.rm = TRUE),
    mean_Sexdicholog = mean(Sexdicho, na.rm = TRUE),
    mean_Racedicholog = mean(Racedicho, na.rm = TRUE)
  )
## Source: local data frame [1 x 4]
## 
##   mean_Hoursweek1log mean_Income1log mean_Sexdicholog mean_Racedicholog
##                (dbl)           (dbl)            (dbl)             (dbl)
## 1           37.06494           364.5             2410              2410
# Standard deviations of the variables used in the ln(HRS1) regression.
# The original summarised the global Hoursweek1log / Income1log /
# Sexdicholog / Racedicholog objects, which derive from frequency tables, so
# it reported the spread of cell counts. The summaries now read the data
# columns instead. Only the dependent variable is on the log scale, matching
# LogHRS1 ~ INCOME + Sexdicho + Racedicho; the output names are kept
# unchanged. na.rm = TRUE skips the NA values.
# NOTE(review): presumably these are the descriptives for the ln(HRS1)
# table - confirm. The printed output below predates this fix; re-knit to
# refresh it.
HRS1 %>%
  summarise(
    sd_Hoursweek1log = sd(log(HRS1), na.rm = TRUE),
    sd_Income1log = sd(INCOME, na.rm = TRUE),
    sd_Sexdicholog = sd(Sexdicho, na.rm = TRUE),
    sd_Racedicholog = sd(Racedicho, na.rm = TRUE)
  )
## Source: local data frame [1 x 4]
## 
##   sd_Hoursweek1log sd_Income1log sd_Sexdicholog sd_Racedicholog
##              (dbl)         (dbl)          (dbl)           (dbl)
## 1         100.0032       876.225       393.1514        393.1514
a. One table reporting the regression of HRS1 that you conducted in 2.a.; and

b. Another table reporting the regression of ln(HRS1) that you conducted in 2.b.