HW 4 EDA 7400

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(readxl)
data <- read_excel("C:/Users/mgmetz/Downloads/onlinecourse.xlsx")

str(data)

## tibble [94 × 13] (S3: tbl_df/tbl/data.frame)
##  $ ID           : chr [1:94] "respondent_1" "respondent_2" "respondent_3" "respondent_4" ...
##  $ cstanding    : num [1:94] 3 3 2 3 1 3 3 2 4 3 ...
##  $ transfer     : chr [1:94] "yes" "yes" "no" "yes" ...
##  $ first        : chr [1:94] "yes" "yes" "no" "yes" ...
##  $ classpref    : num [1:94] 3 7 7 6 3 1 7 6 3 1 ...
##  $ stress       : num [1:94] 4 7 7 6 7 2 7 5 5 2 ...
##  $ academic     : num [1:94] 4 6 7 6 6 1 4 2 2 1 ...
##  $ relationships: num [1:94] 7 7 7 7 7 3 7 7 6 3 ...
##  $ wellbeing    : num [1:94] 5 6 7 7 3 1 5 7 1 1 ...
##  $ gender       : chr [1:94] "female" "female" "male" "female" ...
##  $ race         : chr [1:94] "latino" "asian" "latino" "latino" ...
##  $ employed     : chr [1:94] "unemployed" "part" "unemployed" "part" ...
##  $ age          : num [1:94] 21 19 19 19 25 21 20 19 21 21 ...

colnames(data)

##  [1] "ID"            "cstanding"     "transfer"      "first"        
##  [5] "classpref"     "stress"        "academic"      "relationships"
##  [9] "wellbeing"     "gender"        "race"          "employed"     
## [13] "age"

manifest_vars <- c("classpref", "stress", "academic", "relationships", "wellbeing")

data <- data %>%
  mutate(depd = rowSums(select(., all_of(manifest_vars)), na.rm = TRUE))
head(data)

## # A tibble: 6 × 14
##   ID           cstanding transfer first classpref stress academic relationships
##   <chr>            <dbl> <chr>    <chr>     <dbl>  <dbl>    <dbl>         <dbl>
## 1 respondent_1         3 yes      yes           3      4        4             7
## 2 respondent_2         3 yes      yes           7      7        6             7
## 3 respondent_3         2 no       no            7      7        7             7
## 4 respondent_4         3 yes      yes           6      6        6             7
## 5 respondent_5         1 yes      yes           3      7        6             7
## 6 respondent_6         3 yes      yes           1      2        1             3
## # ℹ 6 more variables: wellbeing <dbl>, gender <chr>, race <chr>,
## #   employed <chr>, age <dbl>, depd <dbl>

summary(data$depd)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    8.00   22.25   27.00   26.16   31.00   35.00

library(corrr)

## Warning: package 'corrr' was built under R version 4.4.2

correlation_data <- data %>%
  select(all_of(manifest_vars), depd)


correlation_matrix <- cor(correlation_data, use = "pairwise.complete.obs")


print(correlation_matrix)

##               classpref    stress  academic relationships wellbeing      depd
## classpref     1.0000000 0.5671495 0.5954152     0.6168661 0.5947816 0.8319547
## stress        0.5671495 1.0000000 0.6119831     0.3947068 0.6220138 0.7898455
## academic      0.5954152 0.6119831 1.0000000     0.4879970 0.7047065 0.8551581
## relationships 0.6168661 0.3947068 0.4879970     1.0000000 0.5860952 0.7153559
## wellbeing     0.5947816 0.6220138 0.7047065     0.5860952 1.0000000 0.8681474
## depd          0.8319547 0.7898455 0.8551581     0.7153559 0.8681474 1.0000000

correlation_table <- correlation_data %>%
  correlate() %>%
  fashion()

## Correlation computed with
## • Method: 'pearson'
## • Missing treated using: 'pairwise.complete.obs'

correlation_table

##            term classpref stress academic relationships wellbeing depd
## 1     classpref              .57      .60           .62       .59  .83
## 2        stress       .57             .61           .39       .62  .79
## 3      academic       .60    .61                    .49       .70  .86
## 4 relationships       .62    .39      .49                     .59  .72
## 5     wellbeing       .59    .62      .70           .59            .87
## 6          depd       .83    .79      .86           .72       .87

library(psych)

reliability_data <- data %>%
  select(all_of(manifest_vars))

cronbach_result <- psych::alpha(reliability_data)

cronbach_result

## 
## Reliability analysis   
## Call: psych::alpha(x = reliability_data)
## 
##   raw_alpha std.alpha G6(smc) average_r S/N  ase mean  sd median_r
##       0.87      0.87    0.86      0.58 6.9 0.02  5.2 1.4      0.6
## 
##     95% confidence boundaries 
##          lower alpha upper
## Feldt     0.82  0.87  0.91
## Duhachek  0.83  0.87  0.91
## 
##  Reliability if an item is dropped:
##               raw_alpha std.alpha G6(smc) average_r S/N alpha se  var.r med.r
## classpref          0.84      0.84    0.81      0.57 5.3    0.024 0.0121  0.60
## stress             0.84      0.86    0.83      0.60 5.9    0.024 0.0048  0.60
## academic           0.83      0.84    0.81      0.56 5.2    0.027 0.0073  0.59
## relationships      0.86      0.87    0.83      0.62 6.4    0.023 0.0022  0.60
## wellbeing          0.82      0.83    0.80      0.55 4.8    0.028 0.0077  0.58
## 
##  Item statistics 
##                n raw.r std.r r.cor r.drop mean  sd
## classpref     94  0.83  0.83  0.77   0.71  4.9 1.9
## stress        94  0.79  0.79  0.71   0.67  5.4 1.6
## academic      94  0.86  0.84  0.79   0.74  4.5 2.0
## relationships 94  0.72  0.76  0.67   0.62  6.3 1.1
## wellbeing     94  0.87  0.86  0.83   0.77  5.1 1.8
## 
## Non missing response frequency for each item
##                  1    2    3    4    5    6    7 miss
## classpref     0.05 0.07 0.18 0.07 0.14 0.22 0.26    0
## stress        0.00 0.06 0.10 0.13 0.14 0.23 0.34    0
## academic      0.09 0.14 0.07 0.17 0.16 0.17 0.20    0
## relationships 0.01 0.00 0.02 0.01 0.18 0.18 0.60    0
## wellbeing     0.05 0.09 0.06 0.10 0.20 0.19 0.31    0

cronbach_result$total$raw_alpha

## [1] 0.8670815

cronbach_result$total$std.alpha

## [1] 0.8726627

independent_vars <- data %>%
  select(classpref, stress, relationships)

head(independent_vars)

## # A tibble: 6 × 3
##   classpref stress relationships
##       <dbl>  <dbl>         <dbl>
## 1         3      4             7
## 2         7      7             7
## 3         7      7             7
## 4         6      6             7
## 5         3      7             7
## 6         1      2             3

descriptive_stats <- psych::describe(independent_vars)


descriptive_stats

##               vars  n mean   sd median trimmed  mad min max range  skew
## classpref        1 94 4.86 1.89      5    5.01 2.97   1   7     6 -0.47
## stress           2 94 5.40 1.60      6    5.58 1.48   2   7     5 -0.68
## relationships    3 94 6.28 1.10      7    6.46 0.00   1   7     6 -1.94
##               kurtosis   se
## classpref        -1.07 0.20
## stress           -0.76 0.16
## relationships     4.90 0.11

data <- data %>%
  mutate(depd = rowSums(select(., classpref, stress, relationships), na.rm = TRUE))

bivariate_data <- data %>%
  select(classpref, stress, relationships, depd)  # Include only relevant variables

head(bivariate_data)

## # A tibble: 6 × 4
##   classpref stress relationships  depd
##       <dbl>  <dbl>         <dbl> <dbl>
## 1         3      4             7    14
## 2         7      7             7    21
## 3         7      7             7    21
## 4         6      6             7    19
## 5         3      7             7    17
## 6         1      2             3     6

correlation_matrix <- cor(bivariate_data, use = "pairwise.complete.obs")

print(correlation_matrix)

##               classpref    stress relationships      depd
## classpref     1.0000000 0.5671495     0.6168661 0.9061107
## stress        0.5671495 1.0000000     0.3947068 0.8087657
## relationships 0.6168661 0.3947068     1.0000000 0.7553981
## depd          0.9061107 0.8087657     0.7553981 1.0000000

library(corrr)
bivariate_correlations <- bivariate_data %>%
  correlate()

## Correlation computed with
## • Method: 'pearson'
## • Missing treated using: 'pairwise.complete.obs'

bivariate_correlations

## # A tibble: 4 × 5
##   term          classpref stress relationships   depd
##   <chr>             <dbl>  <dbl>         <dbl>  <dbl>
## 1 classpref        NA      0.567         0.617  0.906
## 2 stress            0.567 NA             0.395  0.809
## 3 relationships     0.617  0.395        NA      0.755
## 4 depd              0.906  0.809         0.755 NA

ggplot(bivariate_data, aes(x = classpref, y = depd)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, color = "blue") +
  labs(title = "Classpref vs. depd", x = "Class Preference", y = "depd")

## `geom_smooth()` using formula = 'y ~ x'

ggplot(bivariate_data, aes(x = stress, y = depd)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, color = "red") +
  labs(title = "Stress vs. depd", x = "Stress", y = "depd")

## `geom_smooth()` using formula = 'y ~ x'

ggplot(bivariate_data, aes(x = relationships, y = depd)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, color = "green") +
  labs(title = "Relationships vs. depd", x = "Relationships", y = "depd")

## `geom_smooth()` using formula = 'y ~ x'

data <- data %>%
  mutate(depd = rowSums(select(., classpref, stress, relationships), na.rm = TRUE))

model_data <- data %>%
  select(classpref, stress, relationships, depd)

str(model_data)

## tibble [94 × 4] (S3: tbl_df/tbl/data.frame)
##  $ classpref    : num [1:94] 3 7 7 6 3 1 7 6 3 1 ...
##  $ stress       : num [1:94] 4 7 7 6 7 2 7 5 5 2 ...
##  $ relationships: num [1:94] 7 7 7 7 7 3 7 7 6 3 ...
##  $ depd         : num [1:94] 14 21 21 19 17 6 21 18 14 6 ...

model <- lm(depd ~ classpref + stress + relationships, data = model_data)

summary(model)

## Warning in summary.lm(model): essentially perfect fit: summary may be
## unreliable

## 
## Call:
## lm(formula = depd ~ classpref + stress + relationships, data = model_data)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -6.491e-15 -2.151e-15 -9.100e-16  2.100e-17  7.979e-14 
## 
## Coefficients:
##                Estimate Std. Error   t value Pr(>|t|)    
## (Intercept)   2.932e-15  5.718e-15 5.130e-01    0.609    
## classpref     1.000e+00  6.885e-16 1.453e+15   <2e-16 ***
## stress        1.000e+00  6.998e-16 1.429e+15   <2e-16 ***
## relationships 1.000e+00  1.061e-15 9.429e+14   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.845e-15 on 90 degrees of freedom
## Multiple R-squared:      1,  Adjusted R-squared:      1 
## F-statistic: 5.834e+30 on 3 and 90 DF,  p-value: < 2.2e-16

model_interaction <- lm(depd ~ classpref + stress * relationships, data = model_data)

summary(model_interaction)

## Warning in summary.lm(model_interaction): essentially perfect fit: summary may
## be unreliable

## 
## Call:
## lm(formula = depd ~ classpref + stress * relationships, data = model_data)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -6.405e-15 -2.020e-15 -8.550e-16  3.120e-16  7.952e-14 
## 
## Coefficients:
##                        Estimate Std. Error    t value Pr(>|t|)    
## (Intercept)          -5.130e-15  1.521e-14 -3.370e-01    0.737    
## classpref             1.000e+00  7.006e-16  1.427e+15   <2e-16 ***
## stress                1.000e+00  3.200e-15  3.125e+14   <2e-16 ***
## relationships         1.000e+00  2.589e-15  3.862e+14   <2e-16 ***
## stress:relationships -2.472e-16  5.245e-16 -4.710e-01    0.639    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.883e-15 on 89 degrees of freedom
## Multiple R-squared:      1,  Adjusted R-squared:      1 
## F-statistic: 4.338e+30 on 4 and 89 DF,  p-value: < 2.2e-16

ggplot(model_data, aes(x = stress, y = depd, color = relationships)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  labs(title = "Interaction Between Stress and Relationships",
       x = "Stress",
       y = "Dependent Variable (depd)")

## `geom_smooth()` using formula = 'y ~ x'

## Warning: The following aesthetics were dropped during statistical transformation:
## colour.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?

# 1. Residual Plot (Linearity and Homoscedasticity)
ggplot(model, aes(.fitted, .resid)) +
  geom_point() +
  geom_hline(yintercept = 0, color = "red", linetype = "dashed") +
  labs(title = "Residuals vs Fitted", x = "Fitted Values", y = "Residuals")

# 2. Normal Q-Q Plot (Normality of Residuals)
ggplot(model, aes(sample = .stdresid)) +
  stat_qq() +
  stat_qq_line() +
  labs(title = "Normal Q-Q Plot", x = "Theoretical Quantiles", y = "Standardized Residuals")

# 3. Variance Inflation Factor (Multicollinearity)
library(car)

## Warning: package 'car' was built under R version 4.4.2

## Loading required package: carData

## Warning: package 'carData' was built under R version 4.4.2

## 
## Attaching package: 'car'

## The following object is masked from 'package:psych':
## 
##     logit

## The following object is masked from 'package:dplyr':
## 
##     recode

vif(model)

## Warning in summary.lm(object, ...): essentially perfect fit: summary may be
## unreliable

##     classpref        stress relationships 
##      2.018644      1.481275      1.622032

r_squared <- summary(model)$r.squared

## Warning in summary.lm(model): essentially perfect fit: summary may be
## unreliable

adj_r_squared <- summary(model)$adj.r.squared

## Warning in summary.lm(model): essentially perfect fit: summary may be
## unreliable

cat("R-squared:", r_squared, "\n")

## R-squared: 1

cat("Adjusted R-squared:", adj_r_squared, "\n")

## Adjusted R-squared: 1

f_statistic <- summary(model)$fstatistic

## Warning in summary.lm(model): essentially perfect fit: summary may be
## unreliable

f_value <- f_statistic[1]
df1 <- f_statistic[2]
df2 <- f_statistic[3]
p_value <- pf(f_value, df1, df2, lower.tail = FALSE)


cat("F-Statistic:", f_value, "\n")

## F-Statistic: 5.834357e+30

cat("Degrees of Freedom 1 (Numerator):", df1, "\n")

## Degrees of Freedom 1 (Numerator): 3

cat("Degrees of Freedom 2 (Denominator):", df2, "\n")

## Degrees of Freedom 2 (Denominator): 90

cat("p-value:", p_value, "\n")

## p-value: 0

HW 4 EDA 7400

Missy Metz

2024-11-24