SECTION 5 STARTS HERE:

library(haven)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library (ggplot2)
library(psych)
## 
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
library(knitr)
library(stats)
library(lm.beta)
6-0 : READ AND CLEAN DATA
Data <- read_sav("/Users/misschelsita/Documents/Winter 2023/PSY 212/Week 1/MyData.sav")
 
  
colnames(Data) <- c("id","birth_year","education", "sex","marital_status","extraversion","openness","neuroticism","conscientiousness","agreeableness","depression")

Data$depression <- replace (Data$depression,Data$depression<=0, NA)
Data$agreeableness <- replace (Data$agreeableness,Data$agreeableness<=0, NA)
Data$neuroticism <- replace (Data$neuroticism,Data$neuroticism<=0, NA)
Data$conscientiousness <- replace (Data$conscientiousness,Data$conscientiousness<=0, NA)
Data$extraversion <- replace (Data$extraversion,Data$extraversion<=0, NA)
Data$openness <- replace (Data$openness,Data$openness<=0, NA)

Today, we only work with depression and neuroticism

df <- Data%>% dplyr::select(depression)
df$neuroticism <- Data$neuroticism
6-1 : NON-LINEAR RELATIONSHIPS

Graph the scatter-plot of depression on neuroticism + best linear line estimate # need to add geom_point

Data%>%ggplot(aes(x=neuroticism, y=depression))+
  geom_point(methods="lm", alpha=.3)
## Warning in geom_point(methods = "lm", alpha = 0.3): Ignoring unknown parameters:
## `methods`
## Don't know how to automatically pick scale for object of type
## <haven_labelled/vctrs_vctr/double>. Defaulting to continuous.
## Don't know how to automatically pick scale for object of type
## <haven_labelled/vctrs_vctr/double>. Defaulting to continuous.
## Warning: Removed 4047 rows containing missing values (`geom_point()`).

Graph the scatter-plot of depression on neuroticism + quadratic curve estimate. (to do so, add “formula=y ~ poly(x, 2)” as an argument to geom_smooth() method. poly(x,2) here means a polynomial with second power!)

Data%>%ggplot(aes(x=neuroticism, y=depression))+
  geom_smooth(formula=y ~ poly(x, 2))
## Don't know how to automatically pick scale for object of type
## <haven_labelled/vctrs_vctr/double>. Defaulting to continuous.
## Don't know how to automatically pick scale for object of type
## <haven_labelled/vctrs_vctr/double>. Defaulting to continuous.
## `geom_smooth()` using method = 'gam'
## Warning: Removed 4047 rows containing non-finite values (`stat_smooth()`).

Graph the scatter-plot of depression on neuroticism + cubic curve estimate

Data%>%ggplot(aes(x=neuroticism, y=depression))+
  geom_smooth(methods="lm", formula=y ~ poly(x, 3))
## Warning in geom_smooth(methods = "lm", formula = y ~ poly(x, 3)): Ignoring
## unknown parameters: `methods`
## Don't know how to automatically pick scale for object of type
## <haven_labelled/vctrs_vctr/double>. Defaulting to continuous.
## Don't know how to automatically pick scale for object of type
## <haven_labelled/vctrs_vctr/double>. Defaulting to continuous.
## `geom_smooth()` using method = 'gam'
## Warning: Removed 4047 rows containing non-finite values (`stat_smooth()`).

P6.1 - Starting with cubic model, create your cubic, quadratic, and linear model and test the significance of cubic effect, quadratic effect, and linear effect. Interpret the raw coefficients. Follow the steps below

step 1: create two new variables: neuroticism^2 and neuroticism

df$X <- df$neuroticism
df$X2 <-df$neuroticism^2
df$X3 <- df$neuroticism^3

b0 - value of Y when X is 0 #slope of change = b1 +2b2x+2b3x^2 step 2: create the cubic model. find the R-squared of this model

modelcubic <-lm(formula=df$depression ~ df$X + df$X2 +df$X3)
summary(modelcubic)
## 
## Call:
## lm(formula = df$depression ~ df$X + df$X2 + df$X3)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -30.458  -9.050  -3.231   5.696  99.950 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.509918   2.066934   0.247    0.805    
## df$X         6.048608   1.129294   5.356 8.81e-08 ***
## df$X2       -0.857762   0.180411  -4.754 2.03e-06 ***
## df$X3        0.047964   0.008656   5.541 3.13e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 14.5 on 6266 degrees of freedom
##   (4047 observations deleted due to missingness)
## Multiple R-squared:  0.1335, Adjusted R-squared:  0.1331 
## F-statistic: 321.8 on 3 and 6266 DF,  p-value: < 2.2e-16

Step 3: create the quadratic model. find the R-squared of this model.

modelquad <-lm(formula=df$depression ~ df$X + df$X2)
summary(modelquad)
## 
## Call:
## lm(formula = df$depression ~ df$X + df$X2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -28.376  -8.979  -3.212   5.516 101.021 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 10.74002    0.93140  11.531  < 2e-16 ***
## df$X         0.02148    0.30419   0.071    0.944    
## df$X2        0.13457    0.02182   6.166 7.45e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 14.53 on 6267 degrees of freedom
##   (4047 observations deleted due to missingness)
## Multiple R-squared:  0.1292, Adjusted R-squared:  0.129 
## F-statistic: 465.1 on 2 and 6267 DF,  p-value: < 2.2e-16

Step 4: test if the change in R-squared was significant.

anova(modelcubic,modelquad)
## Analysis of Variance Table
## 
## Model 1: df$depression ~ df$X + df$X2 + df$X3
## Model 2: df$depression ~ df$X + df$X2
##   Res.Df     RSS Df Sum of Sq      F    Pr(>F)    
## 1   6266 1316938                                  
## 2   6267 1323390 -1   -6452.6 30.701 3.132e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

if it is significant, what would it mean? what about when it is not significant?

Step 5: if n.s., create the linear model. what is the R-squared. it means the quadratic is not better than cubic - its eliminated - parimousous - if quadratic is significant then the quad is better - if it is not signignant linear is better but if it is then quad is better

modelline<-lm(formula=df$depression ~ df$X)
summary(modelline)
## 
## Call:
## lm(formula = df$depression ~ df$X)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -25.963  -9.103  -3.105   5.467 100.897 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  5.67335    0.43978   12.90   <2e-16 ***
## df$X         1.85745    0.06237   29.78   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 14.57 on 6268 degrees of freedom
##   (4047 observations deleted due to missingness)
## Multiple R-squared:  0.124,  Adjusted R-squared:  0.1238 
## F-statistic: 886.9 on 1 and 6268 DF,  p-value: < 2.2e-16

Step 6: test if the change from quadratic to linear model is or is not significant.

anova(modelquad,modelline) 
## Analysis of Variance Table
## 
## Model 1: df$depression ~ df$X + df$X2
## Model 2: df$depression ~ df$X
##   Res.Df     RSS Df Sum of Sq      F    Pr(>F)    
## 1   6267 1323390                                  
## 2   6268 1331419 -1   -8028.2 38.018 7.446e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Step 7. make a decision on the best model.

#the best model is the Quad 

Step 8. Interpret the raw coefficients.

#b0 - y when x =0 and b1 = 6 when x = 0 

P6.2 - What do you think about a spline regression. test a spline model with a joint at X=4. What is the R-squared? how would you interpret the raw coefficients?

Step 1: create a J1 variable, appropriate for this model.

Data$J1 <- ifelse(df$neuroticism<4,0, df$neuroticism-4)
summary(Data$J1)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   0.000   0.000   2.000   2.643   4.000   8.000    2084

Step 2: run your spline model.

SPINE <-lm(depression ~ neuroticism + J1, data = Data)
summary(SPINE)
## 
## Call:
## lm(formula = depression ~ neuroticism + J1, data = Data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -26.505  -8.812  -3.375   5.576 101.625 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   9.2490     1.2111   7.637 2.56e-14 ***
## neuroticism   0.7815     0.3453   2.264  0.02364 *  
## J1            1.2347     0.3897   3.168  0.00154 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 14.56 on 6267 degrees of freedom
##   (4047 observations deleted due to missingness)
## Multiple R-squared:  0.1254, Adjusted R-squared:  0.1251 
## F-statistic: 449.1 on 2 and 6267 DF,  p-value: < 2.2e-16

Step 3: compare it to your linear model. what is the increase in R-squared? is this increase significant?

summary(SPINE)
## 
## Call:
## lm(formula = depression ~ neuroticism + J1, data = Data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -26.505  -8.812  -3.375   5.576 101.625 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   9.2490     1.2111   7.637 2.56e-14 ***
## neuroticism   0.7815     0.3453   2.264  0.02364 *  
## J1            1.2347     0.3897   3.168  0.00154 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 14.56 on 6267 degrees of freedom
##   (4047 observations deleted due to missingness)
## Multiple R-squared:  0.1254, Adjusted R-squared:  0.1251 
## F-statistic: 449.1 on 2 and 6267 DF,  p-value: < 2.2e-16
summary(modelline)
## 
## Call:
## lm(formula = df$depression ~ df$X)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -25.963  -9.103  -3.105   5.467 100.897 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  5.67335    0.43978   12.90   <2e-16 ***
## df$X         1.85745    0.06237   29.78   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 14.57 on 6268 degrees of freedom
##   (4047 observations deleted due to missingness)
## Multiple R-squared:  0.124,  Adjusted R-squared:  0.1238 
## F-statistic: 886.9 on 1 and 6268 DF,  p-value: < 2.2e-16

Step 4: report and interpret the results.

#the spine model was better.