## -- Attaching packages ----------------------------------------------------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.0.1 v dplyr 1.0.0
## v tidyr 1.1.0 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.5.0
## -- Conflicts -------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x ggplot2::%+%() masks psych::%+%()
## x ggplot2::alpha() masks psych::alpha()
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
#CARROT Data
carrT1 <- read.csv("C:/Users/mmatt/Desktop/Projects/TADS/Data/T1/t1_carrot_c.csv")
#Get difference of cards sorted in reward condition
carrT1$cDif <- carrT1$cct3 - carrT1$cct2
mean(carrT1$cDif,na.rm=TRUE)
## [1] 1.977578
#Puberty Data
pdsT1 <- read.csv("C:/Users/mmatt/Desktop/Projects/TADS/Data/T1/t1_pds_c.csv")
table(pdsT1$chsex)
##
## 1 2
## 95 67
#merge cDif and sex
df <- merge(carrT1[c('id','cDif')], pdsT1[c('id','chsex')], by = 'id')
cards_on_sex <- lm(cDif ~ chsex, df)
summary(cards_on_sex)
##
## Call:
## lm(formula = cDif ~ chsex, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -17.3736 -2.3736 -0.8413 3.5095 20.6264
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.9060 1.6159 1.798 0.0741 .
## chsex -0.5324 1.0827 -0.492 0.6237
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.606 on 152 degrees of freedom
## (33 observations deleted due to missingness)
## Multiple R-squared: 0.001588, Adjusted R-squared: -0.004981
## F-statistic: 0.2417 on 1 and 152 DF, p-value: 0.6237
#Check mean of residuals approximates 0
mean(cards_on_sex$residuals)
## [1] -1.40017e-16
#Check assumptions
check_model(cards_on_sex)
For the Unstandardized regression, the mean of residuals does approximate zero. The normality of residuals slightly resembles a normal distribution, though it is leptokurtic. Also, there is not homogeneity of variance. Being female predicted having a smaller difference of cards sorted in reward vs. non-reward condition relative to being male (Beta = -0.52, p = .62, R^2 = .002)
#Standardize sex and cDif
df <- mutate(df, zcDif = ((cDif - mean(cDif, na.rm = TRUE)) / sd(cDif, na.rm = TRUE)),
zSex = ((chsex - mean(chsex, na.rm = TRUE)) / sd(chsex, na.rm = TRUE)))
#Check standardization
psych::describe(df[4:5])
## vars n mean sd median trimmed mad min max range skew kurtosis se
## zcDif 1 186 0 1 -0.14 -0.01 0.68 -2.58 3.21 5.80 0.26 1.10 0.07
## zSex 2 155 0 1 -0.82 -0.05 0.00 -0.82 1.20 2.03 0.38 -1.87 0.08
#Standardized regression
std_c_on_s <- lm(zcDif ~ zSex, df)
summary(std_c_on_s)
##
## Call:
## lm(formula = zcDif ~ zSex, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.6511 -0.3622 -0.1284 0.5355 3.1474
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.03384 0.08123 0.417 0.678
## zSex -0.04003 0.08141 -0.492 0.624
##
## Residual standard error: 1.008 on 152 degrees of freedom
## (33 observations deleted due to missingness)
## Multiple R-squared: 0.001588, Adjusted R-squared: -0.004981
## F-statistic: 0.2417 on 1 and 152 DF, p-value: 0.6237
mean(std_c_on_s$residuals)
## [1] 7.406368e-18
#Check assumptions
check_model(std_c_on_s)
Standardization did not appear to substantially alter assumptions of the regression.
#Check for influencing observations
CoS_check <- gvlma(std_c_on_s, alphalevel = 0.000001)
deletion.gvlma(CoS_check)
##
## Global test deletion statistics.
##
## Linear Model:
## lm(formula = zcDif ~ zSex, data = df)
##
## gvlma call:
## gvlma(x = std_c_on_s, alphalevel = 1e-06)
##
##
## Summary values:
## Min. 1st Qu. Median Mean
## DeltaGlobalStat -2.739476e+01 -4.131114e+00 -1.928080e+00 -3.494878e-01
## GStatpvalue 5.431028e-03 1.345549e-02 1.502547e-02 1.460487e-02
## DeltaStat1 -8.088605e+01 -5.469839e+00 0.000000e+00 5.863813e-01
## Stat1pvalue 7.590465e-02 1.911187e-01 2.001505e-01 2.030745e-01
## DeltaStat2 -2.032737e+01 -4.186064e+00 -2.016283e+00 -5.106629e-01
## Stat2pvalue 3.857236e-04 1.030071e-03 1.158422e-03 1.118822e-03
## DeltaStat3 -9.955385e+04 -4.558196e+02 -1.001258e+02 -1.106231e+03
## Stat3pvalue 9.999997e-01 1.000000e+00 1.000000e+00 1.000000e+00
## DeltaStat4 -9.935229e+01 -1.385804e+01 0.000000e+00 8.741940e-01
## Stat4pvalue 5.818378e-01 6.635525e-01 6.858036e-01 6.897019e-01
## 3rd Qu. Max.
## DeltaGlobalStat 1.083903e-01 1.665506e+01
## GStatpvalue 1.692472e-02 5.788250e-02
## DeltaStat1 4.124836e+00 9.195308e+01
## Stat1pvalue 2.129140e-01 5.754108e-01
## DeltaStat2 0.000000e+00 1.696316e+01
## Stat2pvalue 1.314704e-03 3.393330e-03
## DeltaStat3 1.382486e+02 8.883238e+03
## Stat3pvalue 1.000000e+00 1.000000e+00
## DeltaStat4 1.562460e+01 8.530012e+01
## Stat4pvalue 7.073061e-01 9.740264e-01
##
##
## Unusual observations for Global Stat:
## Delta Global Stat (%) Global Stat p-value
## 20 13.29795 0.006537792
## 33 14.29156 0.006189002
## 38 -26.72254 0.055907961
## 63 13.76325 0.006372122
## 78 -27.39476 0.057882496
## 79 12.86895 0.006694275
## 107 16.65506 0.005431028
## 108 13.12984 0.006598678
## 116 14.24853 0.006203722
## 125 14.60652 0.006082302
#Find IDs of outliers
outlier.info <- influence.measures(std_c_on_s)
rowid <- data.frame(seq(1,154,1))
outlier.info_df <- cbind(data.frame(outlier.info$infmat), data.frame(outlier.info$is.inf), rowid)
outlier.info_sum <- summary(influence.measures(std_c_on_s))
## Potentially influential observations of
## lm(formula = zcDif ~ zSex, data = df) :
##
## dfb.1_ dfb.zSex dffit cov.r cook.d hat
## 3 -0.19 0.16 -0.25 0.95_* 0.03 0.01
## 27 0.18 -0.15 0.24 0.96_* 0.03 0.01
## 33 -0.21 -0.26 -0.33 0.94_* 0.05 0.02
## 38 0.26 0.31 0.40_* 0.91_* 0.08 0.02
## 51 -0.22 0.18 -0.28 0.93_* 0.04 0.01
## 78 0.26 -0.22 0.34 0.90_* 0.05 0.01
## 116 -0.19 0.16 -0.25 0.95_* 0.03 0.01
## 184 0.25 -0.21 0.32 0.91_* 0.05 0.01
#Estimating the model without outliers
std_c_on_s_r <- lm(zcDif ~ zSex, std_c_on_s$model[-c(3, 27, 33, 38, 51, 78, 116, 184), ])
plot(std_c_on_s_r)
coefficients(summary(std_c_on_s_r))
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.04744734 0.07509991 0.6317895 0.5285192
## zSex -0.06711088 0.07529339 -0.8913250 0.3742315
CoS_check_r <- gvlma(std_c_on_s_r, alphalevel = 0.0001)
deletion.gvlma(CoS_check_r)
##
## Global test deletion statistics.
##
## Linear Model:
## lm(formula = zcDif ~ zSex, data = std_c_on_s$model[-c(3, 27, 33, 38, 51, 78, 116, 184), ])
##
## gvlma call:
## gvlma(x = std_c_on_s_r, alphalevel = 1e-04)
##
##
## Summary values:
## Min. 1st Qu. Median Mean
## DeltaGlobalStat -4.449556e+01 -3.750682e+00 -2.179607566 -0.511544010
## GStatpvalue 6.998844e-04 2.406153e-03 0.003332498 0.003706182
## DeltaStat1 -7.380414e+01 -3.778968e+00 -2.622280690 -0.307510070
## Stat1pvalue 2.128140e-02 6.870124e-02 0.077314977 0.076933720
## DeltaStat2 -4.645805e+01 -4.461875e+00 -4.080078620 -0.618996575
## Stat2pvalue 4.569142e-04 1.263614e-03 0.001749817 0.001649880
## DeltaStat3 -1.746712e+03 -2.380280e+02 -56.718108142 499.786870857
## Stat3pvalue 9.999997e-01 9.999999e-01 0.999999987 0.999999970
## DeltaStat4 -8.050263e+01 -5.986761e+00 0.033399419 -0.348091853
## Stat4pvalue 5.821549e-02 8.925806e-02 0.099475990 0.102835000
## 3rd Qu. Max.
## DeltaGlobalStat 2.359864e+00 1.938708e+01
## GStatpvalue 3.728734e-03 6.229961e-02
## DeltaStat1 3.410590e+00 6.550526e+01
## Stat1pvalue 7.909534e-02 3.595559e-01
## DeltaStat2 1.794675e+00 2.029113e+01
## Stat2pvalue 1.787318e-03 1.937283e-02
## DeltaStat3 6.027247e+02 2.371497e+04
## Stat3pvalue 1.000000e+00 1.000000e+00
## DeltaStat4 6.438799e+00 3.223307e+01
## Stat4pvalue 1.102526e-01 4.670414e-01
##
##
## Unusual observations for Global Stat:
## Delta Global Stat (%) Global Stat p-value
## 67 -38.79197 0.04263333
## 144 -44.49556 0.06229961
#Check Assumptions
check_model(std_c_on_s_r)
gvlma::gvlma(std_c_on_s_r)
##
## Call:
## lm(formula = zcDif ~ zSex, data = std_c_on_s$model[-c(3, 27,
## 33, 38, 51, 78, 116, 184), ])
##
## Coefficients:
## (Intercept) zSex
## 0.04745 -0.06711
##
##
## ASSESSMENT OF THE LINEAR MODEL ASSUMPTIONS
## USING THE GLOBAL TEST ON 4 DEGREES-OF-FREEDOM:
## Level of Significance = 0.05
##
## Call:
## gvlma::gvlma(x = std_c_on_s_r)
##
## Value p-value Decision
## Global Stat 1.613e+01 0.002851 Assumptions NOT satisfied!
## Skewness 3.204e+00 0.073436 Assumptions acceptable.
## Kurtosis 1.021e+01 0.001396 Assumptions NOT satisfied!
## Link Function 5.751e-16 1.000000 Assumptions acceptable.
## Heteroscedasticity 2.713e+00 0.099532 Assumptions acceptable.
Removing outliers did improve the model, but all assumptions are still not satisfied. Mainly, the distribution of the residuals remained leptokurtic.If we accept the model, it can be interpreted as “Being female predicted having a smaller difference of cards sorted in reward vs. non-reward condition relative to being male (Standardized Beta = -0.067, p = .37, R^2 = .005)”