##load libraries

> library(haven)
> library(readr)
> library(dplyr)

Attaching package: 'dplyr'
The following objects are masked from 'package:stats':

    filter, lag
The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union
> library(knitr)
> library(tidyverse)
-- Attaching packages ----------------------------------------------------------------------------------------------------------------------- tidyverse 1.3.0 --
v ggplot2 3.3.2     v purrr   0.3.4
v tibble  3.0.1     v stringr 1.4.0
v tidyr   1.1.0     v forcats 0.5.0
-- Conflicts -------------------------------------------------------------------------------------------------------------------------- tidyverse_conflicts() --
x dplyr::filter() masks stats::filter()
x dplyr::lag()    masks stats::lag()
> library(readxl)
> library(ggplot2)
> library(broom)
> library(nortest)
Warning: package 'nortest' was built under R version 4.0.3
> library(lmtest)
Loading required package: zoo

Attaching package: 'zoo'
The following objects are masked from 'package:base':

    as.Date, as.Date.numeric
> library(MASS)

Attaching package: 'MASS'
The following object is masked from 'package:dplyr':

    select
> library(readr)
> library(car)
Loading required package: carData

Attaching package: 'car'
The following object is masked from 'package:purrr':

    some
The following object is masked from 'package:dplyr':

    recode
> library(psych)

Attaching package: 'psych'
The following object is masked from 'package:car':

    logit
The following objects are masked from 'package:ggplot2':

    %+%, alpha
> library(haven)
> df <- read_dta("psid_cds.dta")
> View(df)
> 
> #1). Recode tenure so that 1=owning a house, 0=renting a house, and missing is set to missing (NA). (2 point)
> df1<- df %>%
+   mutate(
+     tenure2=ifelse(tenure==1,1,
+                          ifelse(tenure==5,0,NA)))
> #2). Check whether crace3 (child’s race) variable is a factor variable. (1 point)
> attach(df1)
> str(crace3)
 dbl+lbl [1:4243] 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
 @ label       : chr "RECODE of g3race"
 @ format.stata: chr "%9.0g"
 @ labels      : Named num [1:3] 1 2 3
  ..- attr(*, "names")= chr [1:3] "White" "Black" "other"
> df1$crace3f<-factor(crace3)
> 
> #3). Run a test to see if there are significant differences in body mass index among kids of different racial backgrounds. (3 points) 
> ob1<-aov(cbmi~crace3f, data=df1)
> ob1
Call:
   aov(formula = cbmi ~ crace3f, data = df1)

Terms:
                  crace3f Residuals
Sum of Squares     601.69 298229.59
Deg. of Freedom         2      3935

Residual standard error: 8.705686
Estimated effects may be unbalanced
305 observations deleted due to missingness
> anova(ob1)
Analysis of Variance Table

Response: cbmi
            Df Sum Sq Mean Sq F value  Pr(>F)  
crace3f      2    602 300.843  3.9695 0.01896 *
Residuals 3935 298230  75.789                  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
> #4). Run a test to see if there is a significant difference in body mass index between girls and boys. (3 points)
> t.test(cbmi~csex, data=df1)

    Welch Two Sample t-test

data:  cbmi by csex
t = 0.56235, df = 4125.5, p-value = 0.5739
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -0.3814424  0.6882736
sample estimates:
mean in group 1 mean in group 2 
       16.44939        16.29597 
> #5). Review relevant literature and identify three family socioeconomic variables from the variable list that are relevant to child’s body mass index. Be sure to cite at least TWO references to support your claim. (3 points)
> 
>     #1.- Ogden CL, Carroll MD, Fakhouri TH, et al. Prevalence of Obesity Among Youths by Household Income and Education Level of Head of Household — United     States 2011–2014. MMWR Morb Mortal Wkly Rep 2018;67:186–189. DOI: http://dx.doi.org/10.15585/mmwr.mm6706a3external icon.
> 
>     #2.- Ziol-Guest KM, Dunifon RE, Kalil A. Parental employment and children's body weight: Mothers, others, and mechanisms. Soc Sci Med. 2013;95:52-59.       doi:10.1016/j.socscimed.2012.09.004
> 
> #6). Examine the means, medians, and standard deviation for variables you identified in 5). (3 points)
> attach(df1)
The following objects are masked from df1 (pos = 3):

    adjfinc, adjwlth1, adjwlth2, age, cage, cbmi, crace3, csex, educ,
    emp, emp2, lbw, lifesat, marpi, smk, tenure, tenure2, tkids
>  describe(tenure2)
   vars    n mean  sd median trimmed mad min max range  skew kurtosis   se
X1    1 4023 0.51 0.5      1    0.51   0   0   1     1 -0.03       -2 0.01
>  describe(educ)
   vars    n mean   sd median trimmed  mad min max range skew kurtosis   se
X1    1 4198 13.6 3.05     13   13.37 1.48   0  20    20 0.33      1.1 0.05
>  describe(adjfinc)
   vars    n     mean       sd   median  trimmed      mad min     max   range
X1    1 4243 55378.59 110723.7 40088.82 44814.45 35028.41   0 4824656 4824656
    skew kurtosis      se
X1 26.94    982.2 1699.82
> #7). Estimate a regression model with all independent variables you identified in 5), and interpret the output of regression analysis. (6 points)
>  cbmi.lm <- lm(cbmi ~ factor(tenure2) + educ + adjfinc, data = df1)
> summary( cbmi.lm )

Call:
lm(formula = cbmi ~ factor(tenure2) + educ + adjfinc, data = df1)

Residuals:
    Min      1Q  Median      3Q     Max 
-17.473  -1.778   0.665   4.392  38.649 

Coefficients:
                   Estimate Std. Error t value Pr(>|t|)    
(Intercept)       1.718e+01  6.450e-01  26.633  < 2e-16 ***
factor(tenure2)1  2.973e-01  2.945e-01   1.010 0.312725    
educ             -5.510e-02  4.838e-02  -1.139 0.254744    
adjfinc          -4.347e-06  1.277e-06  -3.404 0.000672 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 8.738 on 3875 degrees of freedom
  (364 observations deleted due to missingness)
Multiple R-squared:  0.003854,  Adjusted R-squared:  0.003083 
F-statistic: 4.997 on 3 and 3875 DF,  p-value: 0.001847
>     #The result for tenure indicates that if family home is owned, expected childhood bmi increases by .3 relative to family home being rented (0), all else held constant. This is not significant since the p-value is high. 
>     #For every 1 year increase in mother's educational attainment, expected childhood bmi decreases by .06, all else held constant.This is not significant since the p-value is high. 
>     #For every 1 dollar increase in family income, expected childhood bmi decreases by approximately .000004, all else held constant.This is significant since the p-value <.05
> 
>     #The low R2 value suggest <1% of the variance in childhood bmi is explained by the model. 
>     #The F-stat is low, suggesting a weak relationship between childhood bmi and explanatory variables.
> 
> #8). Provide appropriate descriptive statistics for child’s age, sex, race, and low birth weight status.  (3 points)
> attach(df1)
The following objects are masked from df1 (pos = 3):

    adjfinc, adjwlth1, adjwlth2, age, cage, cbmi, crace3, crace3f,
    csex, educ, emp, emp2, lbw, lifesat, marpi, smk, tenure, tenure2,
    tkids
The following objects are masked from df1 (pos = 4):

    adjfinc, adjwlth1, adjwlth2, age, cage, cbmi, crace3, csex, educ,
    emp, emp2, lbw, lifesat, marpi, smk, tenure, tenure2, tkids
>  describe(cage)
   vars    n mean   sd median trimmed  mad min max range skew kurtosis   se
X1    1 4236 8.57 4.54      8    8.48 5.93   1  17    16 0.15    -1.08 0.07
>   gmean<-mean(df1$cbmi,na.rm = T)
> df1 %>%
+   group_by(csex)%>%
+   summarise(mean=mean(cbmi,na.rm = T), gmean=gmean)
`summarise()` ungrouping output (override with `.groups` argument)
# A tibble: 2 x 3
   csex  mean gmean
  <dbl> <dbl> <dbl>
1     1  16.4  16.4
2     2  16.3  16.4
> prop.table(table(csex))
csex
        1         2 
0.4930474 0.5069526 
> df1 %>%
+   group_by(crace3f)%>%
+   summarise(mean=mean(cbmi,na.rm = T), gmean=gmean)
`summarise()` ungrouping output (override with `.groups` argument)
# A tibble: 4 x 3
  crace3f  mean gmean
  <fct>   <dbl> <dbl>
1 1        16.2  16.4
2 2        16.8  16.4
3 3        15.4  16.4
4 <NA>     15.6  16.4
> prop.table(table(crace3f))
crace3f
         1          2          3 
0.54181098 0.40969817 0.04849085 
> df1 %>%
+   group_by(lbw)%>%
+   summarise(mean=mean(cbmi,na.rm = T), gmean=gmean)
`summarise()` ungrouping output (override with `.groups` argument)
# A tibble: 3 x 3
    lbw  mean gmean
  <dbl> <dbl> <dbl>
1     0  16.3  16.4
2     1  16.0  16.4
3    NA  17.5  16.4
> prop.table(table(lbw))
lbw
        0         1 
0.8754166 0.1245834 
> #9). Estimate another regression model with all the independent variables you identified in 5) and these child’s demographic variables. (3 points)
>  cbmi.lm2 <- lm(cbmi ~ factor(tenure2) + educ + adjfinc+cage+factor(csex)+crace3f+factor(lbw), data = df1)
> summary( cbmi.lm2 )

Call:
lm(formula = cbmi ~ factor(tenure2) + educ + adjfinc + cage + 
    factor(csex) + crace3f + factor(lbw), data = df1)

Residuals:
    Min      1Q  Median      3Q     Max 
-22.402  -1.605   1.333   4.035  41.242 

Coefficients:
                   Estimate Std. Error t value Pr(>|t|)    
(Intercept)       1.150e+01  7.548e-01  15.240  < 2e-16 ***
factor(tenure2)1 -1.686e-01  3.114e-01  -0.541  0.58820    
educ             -4.764e-03  4.797e-02  -0.099  0.92089    
adjfinc          -5.586e-06  1.208e-06  -4.623 3.91e-06 ***
cage              6.399e-01  3.115e-02  20.545  < 2e-16 ***
factor(csex)2    -1.415e-01  2.795e-01  -0.506  0.61288    
crace3f2          1.194e-01  3.205e-01   0.373  0.70951    
crace3f3         -1.717e+00  6.648e-01  -2.583  0.00982 ** 
factor(lbw)1     -3.160e-01  4.944e-01  -0.639  0.52274    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 8.174 on 3422 degrees of freedom
  (812 observations deleted due to missingness)
Multiple R-squared:  0.1149,    Adjusted R-squared:  0.1129 
F-statistic: 55.54 on 8 and 3422 DF,  p-value: < 2.2e-16
> #10). Interpret the coefficients of sex, child’s age and race variables from the model from 9). (8 points)
> 
>     #The result for sex indicates that for female children, expected childhood bmi is lower by .14 relative to males, all else held constant. This is not significant since the p-value is high. 
>     #For every 1 year increase in the child's age, expected childhood bmi increases by .6, all else held constant. This is significant since the p-value <.05
>     #The result for race indicates that for black children, expected childhood bmi is higher by .12 relative to white children, all else held constant. This is not significant since the p-value is high. 
>     #The result for race indicates that for other race children, expected childhood bmi is lower by 1.7 relative to white children, all else held constant. This is significant since the p-value <.05. 
> 
> #11). Compare the regression model from 7) to the more advanced model from 9), which model is preferred? Why? (Make sure you run proper test to support your claim.) (3 points)
> 
> df1_nona <- na.omit(df1)#I had to remove na from the data since ANOVA would not work with the original data the models were fitted :/
>  cbmi.lm_nona <- lm(cbmi ~ factor(tenure2) + educ + adjfinc, data = df1_nona)
>  cbmi.lm2_nona <- lm(cbmi ~ factor(tenure2) + educ + adjfinc+cage+factor(csex)+crace3f+factor(lbw), data = df1_nona)
> 
> anova(cbmi.lm_nona,cbmi.lm2_nona)
Analysis of Variance Table

Model 1: cbmi ~ factor(tenure2) + educ + adjfinc
Model 2: cbmi ~ factor(tenure2) + educ + adjfinc + cage + factor(csex) + 
    crace3f + factor(lbw)
  Res.Df    RSS Df Sum of Sq      F    Pr(>F)    
1   3396 254154                                  
2   3391 226094  5     28060 84.171 < 2.2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
>     #The results of the ANOVA test suggest the more complex model is significantly better than the simpler model as indicates by the very low p-value<.05
>     #The higher R2 and adjusted R2 further support the complex model as the better model.
> 
> #12). Interpret the R-square and adjusted R-square from the preferred model. (4 points)
> summary( cbmi.lm2 )

Call:
lm(formula = cbmi ~ factor(tenure2) + educ + adjfinc + cage + 
    factor(csex) + crace3f + factor(lbw), data = df1)

Residuals:
    Min      1Q  Median      3Q     Max 
-22.402  -1.605   1.333   4.035  41.242 

Coefficients:
                   Estimate Std. Error t value Pr(>|t|)    
(Intercept)       1.150e+01  7.548e-01  15.240  < 2e-16 ***
factor(tenure2)1 -1.686e-01  3.114e-01  -0.541  0.58820    
educ             -4.764e-03  4.797e-02  -0.099  0.92089    
adjfinc          -5.586e-06  1.208e-06  -4.623 3.91e-06 ***
cage              6.399e-01  3.115e-02  20.545  < 2e-16 ***
factor(csex)2    -1.415e-01  2.795e-01  -0.506  0.61288    
crace3f2          1.194e-01  3.205e-01   0.373  0.70951    
crace3f3         -1.717e+00  6.648e-01  -2.583  0.00982 ** 
factor(lbw)1     -3.160e-01  4.944e-01  -0.639  0.52274    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 8.174 on 3422 degrees of freedom
  (812 observations deleted due to missingness)
Multiple R-squared:  0.1149,    Adjusted R-squared:  0.1129 
F-statistic: 55.54 on 8 and 3422 DF,  p-value: < 2.2e-16
>     #The R2 is .114, suggesting approximately 11% of the variance in childhood bmi is explained by the model.
>     #The adjusted R2 is .112. The adjusted R2 is lower than the R2 since it adjusts for the multiple independent variables.
> 
> #13). Evaluate whether the preferred model violates any linear regression assumptions. If violation(s) exists in the preferred model, propose reasonable solution(s). (5 points)
> plot(cbmi.lm2,1)

>     #The plot of residuals vs fitted values indicates non-linearity exists in the model. Transformations of one or more of the continuous variables may help improve the fit of the model.
>   
> #14). Based on the analysis you’ve done so far, write a short paragraph to summarize your findings regarding the relationship between family socioeconomic background and child’s body mass index. (5 points)
> 
>     #According to the research outcomes, family income is the only socioeconomic factor considered that affects childhood bmi in a significant way. However, the effect is very small, a decrease in bmi of .04 for every $10,000 of family income. The model of only socioeconomic independent variables was much worse at explaining childhood bmi than the model including demographic variables. This may suggest demographic factors have stronger deterministic relationship with childhood bmi than socioeconomic factors. However, both model displayed R2 values below .12, indicating inadequacy of the models. More variables should be included in determining causes of childhood bmi than what was presented in these findings.