#Title: WFED540 Assignment-3
#Author: Andrew Leigey
#Date: November 17, 2015
#Output: html_document


# Loading packages
require(foreign)
## Loading required package: foreign
require(dplyr)
## Loading required package: dplyr
## 
## Attaching package: 'dplyr'
## 
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
require(ggvis)
## Loading required package: ggvis
require(magrittr)
## Loading required package: magrittr
require(knitr)
## Loading required package: knitr
require(rmarkdown)
## Loading required package: rmarkdown
# The packages must be installed, if not already 
# installed, in your instance of R

# Read the SPSS .SAV file into R. With to.data.frame = FALSE the result is
# a list of variables; it is converted to a data frame in a later step.
#
# use.missings is given as an explicit FALSE. The earlier form,
# `use.missings=to.data.frame`, is evaluated in the calling environment,
# where no object named `to.data.frame` exists, so it fails as soon as
# read.spss() needs the value. FALSE is what the function's own default
# (use.missings = to.data.frame) resolves to for this call.
#
# NOTE(review): max.value.labels is documented as a number; TRUE is
# presumably coerced to 1 here -- confirm that this is intended.
GSSdata <- read.spss("GSS.SAV",
                     max.value.labels = TRUE, to.data.frame = FALSE,
                     trim.factor.names = FALSE,
                     reencode = NA, use.missings = FALSE)


# Turn the object returned by read.spss() into a data frame and wrap it
# with dplyr's tbl_df() in one step.
GSSdata <- tbl_df(data.frame(GSSdata))

# Sanity check on the import: mean, standard deviation, and count of
# cases for HRS1, an important variable used below.
summarise(GSSdata,
          mean_hours = mean(HRS1),
          sd_hours = sd(HRS1),
          n = n())
## Source: local data frame [1 x 3]
## 
##   mean_hours sd_hours     n
##        (dbl)    (dbl) (int)
## 1   24.07573 24.19891  4820

Here I am taking a look at the HRS1 variable in GSSdata.

# Print the raw HRS1 values for inspection
GSSdata$HRS1

#1. A.) Here is the histogram for HRS1
hist(GSSdata$HRS1)

##The Picture of the histogram

# Here I am taking the natural logarithm of HRS1 for the next histogram.
# log() is only defined for positive values: applied to the whole column it
# warned "NaNs produced" (negative input) and would return -Inf for a zero.
# Non-positive values are therefore set to NA before taking the log, so
# they are handled as ordinary missing values by hist(), lm(), and the
# na.rm = TRUE summaries further down.
ln_HRS1 <- GSSdata$HRS1
ln_HRS1[which(ln_HRS1 <= 0)] <- NA
ln_HRS1 <- log(ln_HRS1)
#1. B.) Here is the histogram for the natural logarithm of HRS1
hist(ln_HRS1)

##The Picture of the histogram for the natural logarithm of HRS1

#2. A.) Ordinary least squares regression with HRS1 as the dependent
# variable and I_AGE, I_SEX, and I_RELIGION as the independent variables.
hrs_factors <- lm(formula = HRS1 ~ I_AGE + I_SEX + I_RELIGION, data = GSSdata)

# Fitted coefficients
print(hrs_factors)
## 
## Call:
## lm(formula = HRS1 ~ I_AGE + I_SEX + I_RELIGION, data = GSSdata)
## 
## Coefficients:
## (Intercept)        I_AGE        I_SEX   I_RELIGION  
##     56.6000      -6.8802      -8.5905       0.1637
# Full model summary: residuals, coefficient tests, R-squared, F-test
print(summary(hrs_factors))
## 
## Call:
## lm(formula = HRS1 ~ I_AGE + I_SEX + I_RELIGION, data = GSSdata)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -42.948 -19.942  -1.313  16.749  85.774 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  56.6000     1.5229  37.166   <2e-16 ***
## I_AGE        -6.8802     0.2850 -24.141   <2e-16 ***
## I_SEX        -8.5905     0.6540 -13.135   <2e-16 ***
## I_RELIGION    0.1637     0.2379   0.688    0.491    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 22.36 on 4752 degrees of freedom
##   (64 observations deleted due to missingness)
## Multiple R-squared:  0.1453, Adjusted R-squared:  0.1448 
## F-statistic: 269.3 on 3 and 4752 DF,  p-value: < 2.2e-16
# 95% confidence intervals for the coefficients
print(confint(hrs_factors))
##                  2.5 %     97.5 %
## (Intercept) 53.6144339 59.5856647
## I_AGE       -7.4389696 -6.3214998
## I_SEX       -9.8726597 -7.3083293
## I_RELIGION  -0.3026711  0.6301078
#2. B.) Ordinary least squares regression with the natural logarithm of
# HRS1 as the dependent variable and the same set of independent variables
# chosen in 2.A. ln_HRS1 is the vector computed earlier in this script; it
# is not a column of GSSdata.
ln_hrs_factors <- lm(
  formula = ln_HRS1 ~ I_AGE + I_SEX + I_RELIGION,
  data = GSSdata
)

# Fitted coefficients
print(ln_hrs_factors)
## 
## Call:
## lm(formula = ln_HRS1 ~ I_AGE + I_SEX + I_RELIGION, data = GSSdata)
## 
## Coefficients:
## (Intercept)        I_AGE        I_SEX   I_RELIGION  
##     4.02723     -0.04005     -0.20584     -0.01001
# Full model summary: residuals, coefficient tests, R-squared, F-test
print(summary(ln_hrs_factors))
## 
## Call:
## lm(formula = ln_HRS1 ~ I_AGE + I_SEX + I_RELIGION, data = GSSdata)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.7213 -0.0697  0.1334  0.2608  1.1496 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  4.027234   0.048433  83.151  < 2e-16 ***
## I_AGE       -0.040045   0.011034  -3.629 0.000289 ***
## I_SEX       -0.205845   0.020678  -9.955  < 2e-16 ***
## I_RELIGION  -0.010010   0.007311  -1.369 0.171058    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5513 on 2847 degrees of freedom
##   (1969 observations deleted due to missingness)
## Multiple R-squared:  0.03755,    Adjusted R-squared:  0.03653 
## F-statistic: 37.02 on 3 and 2847 DF,  p-value: < 2.2e-16
# 95% confidence intervals for the coefficients
print(confint(ln_hrs_factors))
##                   2.5 %       97.5 %
## (Intercept)  3.93226715  4.122200192
## I_AGE       -0.06167991 -0.018410186
## I_SEX       -0.24639054 -0.165298482
## I_RELIGION  -0.02434620  0.004325626
# Means and standard deviations of all variables in the HRS1 equation.
# Every summary passes na.rm = TRUE so that a missing value in any one
# column cannot turn its mean or SD into NA; this matches the summaries
# for the ln_HRS1 equation.
# NOTE(review): these statistics are computed over all rows of GSSdata,
# not only the rows lm() kept after dropping incomplete cases -- confirm
# which sample the assignment expects.

#HRS1
hrs_factorstbl <- tbl_df(GSSdata)

# Means
hrs_factorstbl %>%
  summarise(mean_HRS1 = mean(HRS1, na.rm = TRUE),
            mean_I_AGE = mean(I_AGE, na.rm = TRUE),
            mean_I_SEX = mean(I_SEX, na.rm = TRUE),
            mean_I_RELIGION = mean(I_RELIGION, na.rm = TRUE))
## Source: local data frame [1 x 4]
## 
##   mean_HRS1 mean_I_AGE mean_I_SEX mean_I_RELIGION
##       (dbl)      (dbl)      (dbl)           (dbl)
## 1  24.07573   2.831411   1.557676        2.150115
# Standard deviations
hrs_factorstbl %>%
  summarise(sd_HRS1 = sd(HRS1, na.rm = TRUE),
            sd_I_AGE = sd(I_AGE, na.rm = TRUE),
            sd_I_SEX = sd(I_SEX, na.rm = TRUE),
            sd_I_RELIGION = sd(I_RELIGION, na.rm = TRUE))
## Source: local data frame [1 x 4]
## 
##    sd_HRS1 sd_I_AGE  sd_I_SEX sd_I_RELIGION
##      (dbl)    (dbl)     (dbl)         (dbl)
## 1 24.19891 1.170855 0.4967138      1.404815
# Means and standard deviations of all variables in the equation for the
# logarithm of HRS1. ln_HRS1 is the vector computed earlier in this
# script; it is not a column of the table.

# Logarithm of HRS1
ln_hrs_factorstbl <- tbl_df(GSSdata)

# Means
summarise(
  ln_hrs_factorstbl,
  mean_ln_HRS1 = mean(ln_HRS1, na.rm = TRUE),
  mean_I_AGE = mean(I_AGE, na.rm = TRUE),
  mean_I_SEX = mean(I_SEX, na.rm = TRUE),
  mean_I_RELIGION = mean(I_RELIGION, na.rm = TRUE)
)
## Source: local data frame [1 x 4]
## 
##   mean_ln_HRS1 mean_I_AGE mean_I_SEX mean_I_RELIGION
##          (dbl)      (dbl)      (dbl)           (dbl)
## 1     3.595463   2.831411   1.557676        2.150115
# Standard deviations
summarise(
  ln_hrs_factorstbl,
  sd_ln_HRS1 = sd(ln_HRS1, na.rm = TRUE),
  sd_I_AGE = sd(I_AGE, na.rm = TRUE),
  sd_I_SEX = sd(I_SEX, na.rm = TRUE),
  sd_I_RELIGION = sd(I_RELIGION, na.rm = TRUE)
)
## Source: local data frame [1 x 4]
## 
##   sd_ln_HRS1 sd_I_AGE  sd_I_SEX sd_I_RELIGION
##        (dbl)    (dbl)     (dbl)         (dbl)
## 1   0.560348 1.170855 0.4967138      1.404815

Table1. Relationship Between HRS1 and I_AGE, I_SEX, and I_RELIGION

Null Hypothesis

There is no relationship between the dependent variable (HRS1) and the entire set of independent variables (I_AGE, I_SEX, and I_RELIGION).

Results: With alpha set at .05, the overall F-test gives F = 269.3 on 3 and 4752 degrees of freedom, with a p-value below 2.2e-16. Because the p-value is less than alpha, we reject the null hypothesis that there is no relationship between the dependent variable and the entire set of independent variables.

5.) The estimate of the population coefficient for I_AGE in the table is -6.88 (rounded). This means that there is a negative relationship between ‘HRS1’ and ‘I_AGE’: holding I_SEX and I_RELIGION constant, each one-unit increase in I_AGE is associated with a decrease of about 6.88 in HRS1.

Table2. Relationship Between ln_HRS1 and I_AGE, I_SEX, and I_RELIGION

Null Hypothesis

There is no relationship between the dependent variable (ln_HRS1) and the entire set of independent variables (I_AGE, I_SEX, and I_RELIGION).

Results: With alpha set at .05, the overall F-test gives F = 37.02 on 3 and 2847 degrees of freedom, with a p-value below 2.2e-16. Because the p-value is less than alpha, we reject the null hypothesis that there is no relationship between the dependent variable and the entire set of independent variables.

5.) The estimate of the population coefficient for I_SEX in the table is -0.21 (rounded). This means that there is a negative relationship between ‘ln_HRS1’ and ‘I_SEX’: holding I_AGE and I_RELIGION constant, a one-unit increase in I_SEX is associated with a decrease of about 0.21 in ln_HRS1.

```