Kim_Eric_Research

Initial setup

packages <- c("tidyverse", "infer", "fst", "modelsummary", "effects", "survey", "MASS", "aod", "interactions", "flextable", "scales", "dplyr", "performance", "ggeffects", "marginaleffects", "ggplot2", "see" )

new_packages <- packages[!(packages %in% installed.packages()[,"Package"])]
if(length(new_packages)) install.packages(new_packages)

lapply(packages, library, character.only = TRUE)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

## Warning: 패키지 'infer'는 R 버전 4.3.3에서 작성되었습니다

## Warning: 패키지 'effects'는 R 버전 4.3.3에서 작성되었습니다

## 필요한 패키지를 로딩중입니다: carData

## Warning: 패키지 'carData'는 R 버전 4.3.3에서 작성되었습니다

## lattice theme set by effectsTheme()
## See ?effectsTheme for details.

## Warning: 패키지 'survey'는 R 버전 4.3.3에서 작성되었습니다

## 필요한 패키지를 로딩중입니다: grid
## 필요한 패키지를 로딩중입니다: Matrix
## 
## 다음의 패키지를 부착합니다: 'Matrix'
## 
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## 
## 필요한 패키지를 로딩중입니다: survival
## 
## 다음의 패키지를 부착합니다: 'survey'
## 
## The following object is masked from 'package:graphics':
## 
##     dotchart
## 
## 
## 다음의 패키지를 부착합니다: 'MASS'
## 
## The following object is masked from 'package:dplyr':
## 
##     select

## Warning: 패키지 'aod'는 R 버전 4.3.3에서 작성되었습니다

## 
## 다음의 패키지를 부착합니다: 'aod'
## 
## The following object is masked from 'package:survival':
## 
##     rats

## Warning: 패키지 'interactions'는 R 버전 4.3.3에서 작성되었습니다

## Warning: 패키지 'flextable'는 R 버전 4.3.3에서 작성되었습니다

## 
## 다음의 패키지를 부착합니다: 'flextable'
## 
## The following object is masked from 'package:purrr':
## 
##     compose
## 
## 
## 다음의 패키지를 부착합니다: 'scales'
## 
## The following object is masked from 'package:purrr':
## 
##     discard
## 
## The following object is masked from 'package:readr':
## 
##     col_factor

## Warning: 패키지 'performance'는 R 버전 4.3.3에서 작성되었습니다

## 
## 다음의 패키지를 부착합니다: 'ggeffects'
## 
## The following object is masked from 'package:interactions':
## 
##     johnson_neyman

## Warning: 패키지 'marginaleffects'는 R 버전 4.3.3에서 작성되었습니다

## Warning: 패키지 'see'는 R 버전 4.3.3에서 작성되었습니다

## [[1]]
##  [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
##  [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
## [13] "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[2]]
##  [1] "infer"     "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"    
##  [7] "readr"     "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"    
## [13] "graphics"  "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[3]]
##  [1] "fst"       "infer"     "lubridate" "forcats"   "stringr"   "dplyr"    
##  [7] "purrr"     "readr"     "tidyr"     "tibble"    "ggplot2"   "tidyverse"
## [13] "stats"     "graphics"  "grDevices" "utils"     "datasets"  "methods"  
## [19] "base"     
## 
## [[4]]
##  [1] "modelsummary" "fst"          "infer"        "lubridate"    "forcats"     
##  [6] "stringr"      "dplyr"        "purrr"        "readr"        "tidyr"       
## [11] "tibble"       "ggplot2"      "tidyverse"    "stats"        "graphics"    
## [16] "grDevices"    "utils"        "datasets"     "methods"      "base"        
## 
## [[5]]
##  [1] "effects"      "carData"      "modelsummary" "fst"          "infer"       
##  [6] "lubridate"    "forcats"      "stringr"      "dplyr"        "purrr"       
## [11] "readr"        "tidyr"        "tibble"       "ggplot2"      "tidyverse"   
## [16] "stats"        "graphics"     "grDevices"    "utils"        "datasets"    
## [21] "methods"      "base"        
## 
## [[6]]
##  [1] "survey"       "survival"     "Matrix"       "grid"         "effects"     
##  [6] "carData"      "modelsummary" "fst"          "infer"        "lubridate"   
## [11] "forcats"      "stringr"      "dplyr"        "purrr"        "readr"       
## [16] "tidyr"        "tibble"       "ggplot2"      "tidyverse"    "stats"       
## [21] "graphics"     "grDevices"    "utils"        "datasets"     "methods"     
## [26] "base"        
## 
## [[7]]
##  [1] "MASS"         "survey"       "survival"     "Matrix"       "grid"        
##  [6] "effects"      "carData"      "modelsummary" "fst"          "infer"       
## [11] "lubridate"    "forcats"      "stringr"      "dplyr"        "purrr"       
## [16] "readr"        "tidyr"        "tibble"       "ggplot2"      "tidyverse"   
## [21] "stats"        "graphics"     "grDevices"    "utils"        "datasets"    
## [26] "methods"      "base"        
## 
## [[8]]
##  [1] "aod"          "MASS"         "survey"       "survival"     "Matrix"      
##  [6] "grid"         "effects"      "carData"      "modelsummary" "fst"         
## [11] "infer"        "lubridate"    "forcats"      "stringr"      "dplyr"       
## [16] "purrr"        "readr"        "tidyr"        "tibble"       "ggplot2"     
## [21] "tidyverse"    "stats"        "graphics"     "grDevices"    "utils"       
## [26] "datasets"     "methods"      "base"        
## 
## [[9]]
##  [1] "interactions" "aod"          "MASS"         "survey"       "survival"    
##  [6] "Matrix"       "grid"         "effects"      "carData"      "modelsummary"
## [11] "fst"          "infer"        "lubridate"    "forcats"      "stringr"     
## [16] "dplyr"        "purrr"        "readr"        "tidyr"        "tibble"      
## [21] "ggplot2"      "tidyverse"    "stats"        "graphics"     "grDevices"   
## [26] "utils"        "datasets"     "methods"      "base"        
## 
## [[10]]
##  [1] "flextable"    "interactions" "aod"          "MASS"         "survey"      
##  [6] "survival"     "Matrix"       "grid"         "effects"      "carData"     
## [11] "modelsummary" "fst"          "infer"        "lubridate"    "forcats"     
## [16] "stringr"      "dplyr"        "purrr"        "readr"        "tidyr"       
## [21] "tibble"       "ggplot2"      "tidyverse"    "stats"        "graphics"    
## [26] "grDevices"    "utils"        "datasets"     "methods"      "base"        
## 
## [[11]]
##  [1] "scales"       "flextable"    "interactions" "aod"          "MASS"        
##  [6] "survey"       "survival"     "Matrix"       "grid"         "effects"     
## [11] "carData"      "modelsummary" "fst"          "infer"        "lubridate"   
## [16] "forcats"      "stringr"      "dplyr"        "purrr"        "readr"       
## [21] "tidyr"        "tibble"       "ggplot2"      "tidyverse"    "stats"       
## [26] "graphics"     "grDevices"    "utils"        "datasets"     "methods"     
## [31] "base"        
## 
## [[12]]
##  [1] "scales"       "flextable"    "interactions" "aod"          "MASS"        
##  [6] "survey"       "survival"     "Matrix"       "grid"         "effects"     
## [11] "carData"      "modelsummary" "fst"          "infer"        "lubridate"   
## [16] "forcats"      "stringr"      "dplyr"        "purrr"        "readr"       
## [21] "tidyr"        "tibble"       "ggplot2"      "tidyverse"    "stats"       
## [26] "graphics"     "grDevices"    "utils"        "datasets"     "methods"     
## [31] "base"        
## 
## [[13]]
##  [1] "performance"  "scales"       "flextable"    "interactions" "aod"         
##  [6] "MASS"         "survey"       "survival"     "Matrix"       "grid"        
## [11] "effects"      "carData"      "modelsummary" "fst"          "infer"       
## [16] "lubridate"    "forcats"      "stringr"      "dplyr"        "purrr"       
## [21] "readr"        "tidyr"        "tibble"       "ggplot2"      "tidyverse"   
## [26] "stats"        "graphics"     "grDevices"    "utils"        "datasets"    
## [31] "methods"      "base"        
## 
## [[14]]
##  [1] "ggeffects"    "performance"  "scales"       "flextable"    "interactions"
##  [6] "aod"          "MASS"         "survey"       "survival"     "Matrix"      
## [11] "grid"         "effects"      "carData"      "modelsummary" "fst"         
## [16] "infer"        "lubridate"    "forcats"      "stringr"      "dplyr"       
## [21] "purrr"        "readr"        "tidyr"        "tibble"       "ggplot2"     
## [26] "tidyverse"    "stats"        "graphics"     "grDevices"    "utils"       
## [31] "datasets"     "methods"      "base"        
## 
## [[15]]
##  [1] "marginaleffects" "ggeffects"       "performance"     "scales"         
##  [5] "flextable"       "interactions"    "aod"             "MASS"           
##  [9] "survey"          "survival"        "Matrix"          "grid"           
## [13] "effects"         "carData"         "modelsummary"    "fst"            
## [17] "infer"           "lubridate"       "forcats"         "stringr"        
## [21] "dplyr"           "purrr"           "readr"           "tidyr"          
## [25] "tibble"          "ggplot2"         "tidyverse"       "stats"          
## [29] "graphics"        "grDevices"       "utils"           "datasets"       
## [33] "methods"         "base"           
## 
## [[16]]
##  [1] "marginaleffects" "ggeffects"       "performance"     "scales"         
##  [5] "flextable"       "interactions"    "aod"             "MASS"           
##  [9] "survey"          "survival"        "Matrix"          "grid"           
## [13] "effects"         "carData"         "modelsummary"    "fst"            
## [17] "infer"           "lubridate"       "forcats"         "stringr"        
## [21] "dplyr"           "purrr"           "readr"           "tidyr"          
## [25] "tibble"          "ggplot2"         "tidyverse"       "stats"          
## [29] "graphics"        "grDevices"       "utils"           "datasets"       
## [33] "methods"         "base"           
## 
## [[17]]
##  [1] "see"             "marginaleffects" "ggeffects"       "performance"    
##  [5] "scales"          "flextable"       "interactions"    "aod"            
##  [9] "MASS"            "survey"          "survival"        "Matrix"         
## [13] "grid"            "effects"         "carData"         "modelsummary"   
## [17] "fst"             "infer"           "lubridate"       "forcats"        
## [21] "stringr"         "dplyr"           "purrr"           "readr"          
## [25] "tidyr"           "tibble"          "ggplot2"         "tidyverse"      
## [29] "stats"           "graphics"        "grDevices"       "utils"          
## [33] "datasets"        "methods"         "base"

Loading data for Italy and adjusting my dataset

italy_data <- read_fst("italy_data.fst")

rp <- italy_data # renaming italy_data to rp (research project) for convenience

rp$weight <- rp$dweight * rp$pweight
survey_design <- svydesign(ids = ~1, data = rp, weights = ~weight)

# cleaning each variable of interest in accordance with its maximum relevant value

rp$netusoft <- ifelse(rp$netusoft > 5, NA, rp$netusoft)
rp$happy <- ifelse(rp$happy > 10, NA, rp$happy)
rp$sclmeet <- ifelse(rp$sclmeet > 7, NA, rp$sclmeet)
rp$health <- ifelse(rp$health > 5, NA, rp$health)

rp$netusoft <- scales::rescale(rp$netusoft, to=c(1, 5), na.rm = TRUE)
rp$happy <- scales::rescale(rp$happy, to=c(0, 10), na.rm = TRUE)
rp$sclmeet <- scales::rescale(rp$sclmeet, to=c(1, 7), na.rm = TRUE)

# fliping this variable because it is originally higher value = worse health

rp$healthy <- scales::rescale(rp$health, to=c(5, 1), na.rm = TRUE)

# creating a new numeric variable that encompasses subjective well-being variables

rp$subwel <- scales::rescale(rp$happy + rp$sclmeet + rp$healthy, to=c(0, 100), na.rm = TRUE)

# making sure variables and subsets are cleaned

rp <- rp %>% filter(!is.na(netusoft))
rp <- rp %>% filter(!is.na(happy))
rp <- rp %>% filter(!is.na(sclmeet))
rp <- rp %>% filter(!is.na(healthy))
rp <- rp %>% filter(!is.na(subwel))

table(rp$netusoft)

## 
##    1    2    3    4    5 
## 1529  630  510 1104 4144

table(rp$happy)

## 
##    0    1    2    3    4    5    6    7    8    9   10 
##   62   25   96  187  269  541 1074 2083 2305  761  514

table(rp$sclmeet)

## 
##    1    2    3    4    5    6    7 
##  235  712  617 1793 1515 2170  875

table(rp$healthy)

## 
##    1    2    3    4    5 
##   72  363 1979 3617 1886

table(rp$subwel)

## 
##    0    5   10   15   20   25   30   35   40   45   50   55   60   65   70   75 
##    6    8   13   19   28   56   67  128  192  260  346  541  746  986 1090 1083 
##   80   85   90   95  100 
##  963  748  407  179   51

# re-coding the frequency of internet use into five categories

rp <- rp %>%
  mutate(
    freqnet = case_when(
      netusoft == 1 ~ "Never",
      netusoft == 2 ~ "Only occasionally",
      netusoft == 3 ~ "A few times a week",
      netusoft == 4 ~ "Most days",
      netusoft == 5 ~ "Every day",
      TRUE ~ as.character(netusoft)
      ),
    freqnet = fct_relevel(factor(freqnet), # putting categories in an descending order
                          "Never",
                          "Only occasionally",
                          "A few times a week",
                          "Most days",
                          "Every day")
  )


rp <- rp %>% filter(!is.na(freqnet)) # making sure the variable is cleaned

table(rp$freqnet)

## 
##              Never  Only occasionally A few times a week          Most days 
##               1529                630                510               1104 
##          Every day 
##               4144

# re-coding year of birth into four generations

rp <- rp %>%
  mutate(
    gen = case_when(
      yrbrn %in% 1900:1945 ~ "1",
      yrbrn %in% 1946:1964 ~ "2",
      yrbrn %in% 1965:1979 ~ "3",
      yrbrn %in% 1980:1996 ~ "4",
      TRUE ~ as.character(yrbrn)  
    ),
  gen = factor(gen,
    levels = c("1", "2", "3", "4"),
    labels = c("Interwar", "Baby Boomers", "Gen X", "Millennials")), # renaming 1, 2, 3, 4 to correlated generations
  )

rp <- rp %>% filter(!is.na(yrbrn)) # making sure the variable is cleaned
rp <- rp %>% filter(!is.na(gen)) # making sure the variable is cleaned

table(rp$gen)

## 
##     Interwar Baby Boomers        Gen X  Millennials 
##          984         2390         2070         1659

Running a modelsummary function, just to identify the number of observations contained in my sample after clearing variables and subset

model1 <- lm(subwel ~ freqnet, data = rp, weights = weight)

modelsummary(
 list(model1),
 fmt = 1,
 estimate  = c( "{estimate} ({std.error}){stars}"),
 statistic = NULL,
 title = "Simple Linear Regression Model With freqnet as Predictor",
 
 # not renaming for coefficients because the purpose of this table is to simply identify the number of observations contained in my sample
 
 output = "flextable")

Simple Linear Regression Model With freqnet as Predictor
	(1)
(Intercept)	57.5 (0.4)***
freqnetOnly occasionally	4.4 (0.7)***
freqnetA few times a week	6.8 (0.7)***
freqnetMost days	9.9 (0.6)***
freqnetEvery day	14.9 (0.4)***
Num.Obs.	7103
R2	0.155
R2 Adj.	0.154
AIC	57730.8
BIC	57772.0
Log.Lik.	-28859.378
RMSE	14.01

# Below is what I did for saving Table 1 as Word

# model1_sum <- modelsummary(
# list(model1),
# fmt = 1,
# estimate  = c( "{estimate} ({std.error}){stars}"),
# statistic = NULL,
# title = "Simple Linear Regression Model With netusoft as Predictor",
# output = "flextable")

# flextable::save_as_docx(model1_sum, path = "Kim_Eric_Research_Project_Table_0.docx",
#                       width = 7.0, height = 7.0)

I have 7,103 observations in my dataset.

Presenting a table of descriptive statistics for all my variables of interest with a title

rp <- rp %>%
  mutate(
    freqnet = netusoft,
    gen = yrbrn,
    )

rp <- rp %>% filter(!is.na(freqnet))
rp <- rp %>% filter(!is.na(gen))

table1 <- datasummary_skim(rp %>% dplyr::select(freqnet, gen, subwel), title = "Table 1: Descriptive Statistics of Variables of Interest", output = "flextable", histogram = FALSE, weights = weight)

table1

Table 1: Descriptive Statistics of Variables of Interest
	Unique (#)	Mean	SD	Min	Median	Max
freqnet	5	3.6	1.6	1.0	4.0	5.0
gen	74	1965.4	17.0	1921.0	1966.0	1996.0
subwel	21	67.1	15.2	0.0	70.0	100.0

# Below is what I did for saving Table 1 as Word

# flextable::save_as_docx(table1, path = "Kim_Eric_Research_Project_Table_1.docx", width = 7.0, height = 7.0)

Outputting null distribution

Before proceeding

# I have to re-do everything from the start

# This is because I turned the categorical variables freqnet and gen into numeric variables for providing Table 1

# This may seem overworking, but I wanted to prevent any differences to occur in my sample (e.g., number of observations, because if it changes, the mean, SD, etc., would get affected too)

italy_data <- read_fst("italy_data.fst")

rp <- italy_data # renaming italy_data to rp (research project) for convenience

rp$weight <- rp$dweight * rp$pweight
survey_design <- svydesign(ids = ~1, data = rp, weights = ~weight)

# cleaning each variable of interest in accordance with its maximum relevant value

rp$netusoft <- ifelse(rp$netusoft > 5, NA, rp$netusoft)
rp$happy <- ifelse(rp$happy > 10, NA, rp$happy)
rp$sclmeet <- ifelse(rp$sclmeet > 7, NA, rp$sclmeet)
rp$health <- ifelse(rp$health > 5, NA, rp$health)

rp$netusoft <- scales::rescale(rp$netusoft, to=c(1, 5), na.rm = TRUE)
rp$happy <- scales::rescale(rp$happy, to=c(0, 10), na.rm = TRUE)
rp$sclmeet <- scales::rescale(rp$sclmeet, to=c(1, 7), na.rm = TRUE)

# fliping this variable because it is originally higher value = worse health

rp$healthy <- scales::rescale(rp$health, to=c(5,1), na.rm = TRUE)

# creating a new numeric variable that encompasses subjective well-being variables

rp$subwel <- scales::rescale(rp$happy + rp$sclmeet + rp$healthy, to=c(0, 100), na.rm = TRUE)

# making sure variables and subsets are cleaned

rp <- rp %>% filter(!is.na(netusoft))
rp <- rp %>% filter(!is.na(happy))
rp <- rp %>% filter(!is.na(sclmeet))
rp <- rp %>% filter(!is.na(healthy))
rp <- rp %>% filter(!is.na(subwel))

table(rp$netusoft)

## 
##    1    2    3    4    5 
## 1529  630  510 1104 4144

table(rp$happy)

## 
##    0    1    2    3    4    5    6    7    8    9   10 
##   62   25   96  187  269  541 1074 2083 2305  761  514

table(rp$sclmeet)

## 
##    1    2    3    4    5    6    7 
##  235  712  617 1793 1515 2170  875

table(rp$healthy)

## 
##    1    2    3    4    5 
##   72  363 1979 3617 1886

table(rp$subwel)

## 
##    0    5   10   15   20   25   30   35   40   45   50   55   60   65   70   75 
##    6    8   13   19   28   56   67  128  192  260  346  541  746  986 1090 1083 
##   80   85   90   95  100 
##  963  748  407  179   51

# re-coding the frequency of internet use into five categories

rp <- rp %>%
  mutate(
    freqnet = case_when(
      netusoft == 1 ~ "Never",
      netusoft == 2 ~ "Only occasionally",
      netusoft == 3 ~ "A few times a week",
      netusoft == 4 ~ "Most days",
      netusoft == 5 ~ "Every day",
      TRUE ~ as.character(netusoft)
      ),
    freqnet = fct_relevel(factor(freqnet), # putting categories in an descending order
                          "Never",
                          "Only occasionally",
                          "A few times a week",
                          "Most days",
                          "Every day")
  )

rp <- rp %>% filter(!is.na(freqnet)) # making sure the variable is cleaned

table(rp$freqnet)

## 
##              Never  Only occasionally A few times a week          Most days 
##               1529                630                510               1104 
##          Every day 
##               4144

# re-coding year of birth into four generations

rp <- rp %>%
  mutate(
    gen = case_when(
      yrbrn %in% 1900:1945 ~ "1",
      yrbrn %in% 1946:1964 ~ "2",
      yrbrn %in% 1965:1979 ~ "3",
      yrbrn %in% 1980:1996 ~ "4",
      TRUE ~ as.character(yrbrn)  
    ),
  gen = factor(gen,
    levels = c("1", "2", "3", "4"),
    labels = c("Interwar", "Baby Boomers", "Gen X", "Millennials")), # renaming 1, 2, 3, 4 to correlated generations
  )

rp <- rp %>% filter(!is.na(yrbrn)) # making sure the variable is cleaned
rp <- rp %>% filter(!is.na(gen)) # making sure the variable is cleaned

table(rp$gen)

## 
##     Interwar Baby Boomers        Gen X  Millennials 
##          984         2390         2070         1659

Step 1: Calculating the test statistic of my sample

test_stat <- rp %>%
  specify(explanatory = freqnet, # independent
          response = subwel) %>% # dependent
  hypothesize(null = "independence") %>%
  calculate(stat = "F") # using ANOVA because I have the numeric variable subwel as response (dependent) and the categorical variable freqnet as explanatory (independent) with more than three categories (five)

print(test_stat$stat)

## [1] 324.024

A test statistic of 324 is very, very, very large and thus indicates a significant difference in subjective well-being scale between individuals in different groups of frequency of internet use. It suggests that the mean subjective well-being scale is different across the five groups.

Step 2: Simulating the null distribution

null_dist <- rp %>%
  specify(explanatory = freqnet,
          response = subwel) %>%
  hypothesize(null = "independence") %>%
  generate(reps = 1000, type = "permute") %>%
  calculate(stat = "F")

Step 3: Calculating the p-value of my sample

p_val <- null_dist %>%
  get_pvalue(obs_stat = test_stat, direction = "two-sided")

## Warning: Please be cautious in reporting a p-value of 0. This result is an approximation
## based on the number of `reps` chosen in the `generate()` step.
## ℹ See `get_p_value()` (`?infer::get_p_value()`) for more information.

p_val

## # A tibble: 1 × 1
##   p_value
##     <dbl>
## 1       0

# Below is what I did to obtain the critical value

# c_val

The p-value of 0 (not essentially 0, but very close to) indicates that there is a statistically significant association between frequency of internet use and subjective well-being in my dataset.

Also, the test statistic of 324 overly exceeds the critical value of 7.815, which means it is a strong evidence to reject the null hypothesis of this study

Step 4: Visualizing

null_dist %>%
  visualize() +
  shade_p_value(obs_stat = test_stat, direction = "two-sided")

## Warning: F usually corresponds to right-tailed tests. Proceed with caution.

Note: the test statistics is so extreme that the default visualization does not display well.

null_dist

## Response: subwel (numeric)
## Explanatory: freqnet (factor)
## Null Hypothesis: independence
## # A tibble: 1,000 × 2
##    replicate  stat
##        <int> <dbl>
##  1         1 0.329
##  2         2 1.04 
##  3         3 0.516
##  4         4 1.17 
##  5         5 0.876
##  6         6 0.906
##  7         7 2.54 
##  8         8 0.702
##  9         9 1.64 
## 10        10 1.16 
## # ℹ 990 more rows

test_stat <- 324

conf_int <- null_dist %>%
  get_confidence_interval(level = 0.95, type = "percentile")

# visualizing the null distribution with manual x-axis limits

null_dist %>%
  visualize(bins = 15, method = "simulation", dens_color = "black") +
  geom_vline(aes(xintercept = test_stat), color = "red", linetype = "dashed", size = 1) +
  shade_p_value(obs_stat = test_stat, direction = "greater") +
  shade_confidence_interval(endpoints = conf_int) +
  coord_cartesian(xlim = c(0, max(null_dist$stat) * 1.1)) + # Set manual x-axis limits
  labs(title = "Figure 1: Simulation-Based Null Distribution",
       x = "ANOVA",
       y = "Count")

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

## Warning in min(diff(unique_loc)): min에 전달되는 인자들 중 누락이 있어 Inf를
## 반환합니다

p_val

## # A tibble: 1 × 1
##   p_value
##     <dbl>
## 1       0

I will use the plot above for the report and indicate that test statistic falls way outside the simulated null distribution.

Running three models at once

reg_model1 <- lm(subwel ~ freqnet, data = rp, weights = weight) # SLR
reg_model2 <- lm(subwel ~ freqnet + gen, data = rp, weights = weight) # MLR
reg_model3 <- lm(subwel ~ freqnet + gen + freqnet*gen, data = rp, weights = weight) # MLR + Interaction

modelsummary(
  list(reg_model1, reg_model2, reg_model3),
  fmt = 1,
  estimate  = c("{estimate} ({std.error}){stars}",
                "{estimate} ({std.error}){stars}",
                "{estimate} ({std.error}){stars}"),
  coef_rename = c("freqnetOnly occasionally" = "Only occasionally", "freqnetA few times a week" = "A few times a week", "freqnetMost days" = "Most days", "freqnetEvery day" = "Every day", "genBaby Boomers" = "Baby Boomers", "genGen X" = "Gen X", "genMillennials" = "Millennials"), # renaming for a clearer output
  statistic = NULL,
  title = "Table 2: Regression models predicting subjective well-being",
  output = "flextable")

Table 2: Regression models predicting subjective well-being
	(1)	(2)	(3)
(Intercept)	57.5 (0.4)***	56.0 (0.5)***	55.7 (0.5)***
Only occasionally	4.4 (0.7)***	2.6 (0.7)***	3.6 (1.7)*
A few times a week	6.8 (0.7)***	4.5 (0.8)***	7.3 (2.5)**
Most days	9.9 (0.6)***	7.0 (0.6)***	7.9 (2.3)***
Every day	14.9 (0.4)***	11.2 (0.6)***	12.2 (1.6)***
Baby Boomers		2.8 (0.6)***	3.7 (0.7)***
Gen X		3.9 (0.7)***	3.4 (1.5)*
Millennials		8.4 (0.7)***	2.4 (4.1)
Only occasionally:Baby Boomers			-1.6 (1.9)
A few times a week:Baby Boomers			-2.9 (2.7)
Most days:Baby Boomers			-1.0 (2.5)
Every day:Baby Boomers			-2.5 (1.8)
Only occasionally:Gen X			-0.1 (2.4)
A few times a week:Gen X			-1.8 (3.0)
Most days:Gen X			-0.4 (2.8)
Every day:Gen X			-0.2 (2.2)
Only occasionally:Millennials			4.0 (4.8)
A few times a week:Millennials			0.2 (5.0)
Most days:Millennials			4.7 (4.7)
Every day:Millennials			5.7 (4.4)
Num.Obs.	7103	7103	7103
R2	0.155	0.177	0.179
R2 Adj.	0.154	0.176	0.177
AIC	57730.8	57547.9	57555.3
BIC	57772.0	57609.7	57699.5
Log.Lik.	-28859.378	-28764.950	-28756.636
RMSE	14.01	13.82	13.80

# summarizing regression models to see p-values of each coefficient

summary(reg_model1)

## 
## Call:
## lm(formula = subwel ~ freqnet, data = rp, weights = weight)
## 
## Weighted Residuals:
##     Min      1Q  Median      3Q     Max 
## -97.184 -10.345   3.486  11.034  61.718 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                57.5178     0.3622 158.800  < 2e-16 ***
## freqnetOnly occasionally    4.3556     0.6714   6.487 9.32e-11 ***
## freqnetA few times a week   6.7973     0.7398   9.188  < 2e-16 ***
## freqnetMost days            9.9083     0.5667  17.486  < 2e-16 ***
## freqnetEvery day           14.8629     0.4332  34.313  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 19.58 on 7098 degrees of freedom
## Multiple R-squared:  0.1547, Adjusted R-squared:  0.1542 
## F-statistic: 324.7 on 4 and 7098 DF,  p-value: < 2.2e-16

summary(reg_model2)

## 
## Call:
## lm(formula = subwel ~ freqnet + gen, data = rp, weights = weight)
## 
## Weighted Residuals:
##     Min      1Q  Median      3Q     Max 
## -95.391  -9.254   1.542  12.757  63.917 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                56.0044     0.4564 122.704  < 2e-16 ***
## freqnetOnly occasionally    2.6031     0.7012   3.712 0.000207 ***
## freqnetA few times a week   4.4718     0.7808   5.727 1.06e-08 ***
## freqnetMost days            6.9887     0.6381  10.953  < 2e-16 ***
## freqnetEvery day           11.1611     0.5510  20.256  < 2e-16 ***
## genBaby Boomers             2.8475     0.5853   4.865 1.17e-06 ***
## genGen X                    3.8796     0.6585   5.891 4.01e-09 ***
## genMillennials              8.4272     0.6961  12.107  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 19.33 on 7095 degrees of freedom
## Multiple R-squared:  0.1768, Adjusted R-squared:  0.176 
## F-statistic: 217.8 on 7 and 7095 DF,  p-value: < 2.2e-16

summary(reg_model3)

## 
## Call:
## lm(formula = subwel ~ freqnet + gen + freqnet * gen, data = rp, 
##     weights = weight)
## 
## Weighted Residuals:
##     Min      1Q  Median      3Q     Max 
## -95.498  -9.060   1.182  12.573  64.343 
## 
## Coefficients:
##                                           Estimate Std. Error t value Pr(>|t|)
## (Intercept)                               55.71091    0.50172 111.040  < 2e-16
## freqnetOnly occasionally                   3.62070    1.67556   2.161 0.030737
## freqnetA few times a week                  7.31588    2.47422   2.957 0.003118
## freqnetMost days                           7.94082    2.29462   3.461 0.000542
## freqnetEvery day                          12.22854    1.64201   7.447 1.07e-13
## genBaby Boomers                            3.73873    0.74573   5.014 5.47e-07
## genGen X                                   3.36440    1.48635   2.264 0.023633
## genMillennials                             2.39896    4.08374   0.587 0.556927
## freqnetOnly occasionally:genBaby Boomers  -1.63145    1.92215  -0.849 0.396040
## freqnetA few times a week:genBaby Boomers -2.90843    2.71362  -1.072 0.283851
## freqnetMost days:genBaby Boomers          -0.95007    2.46214  -0.386 0.699603
## freqnetEvery day:genBaby Boomers          -2.51574    1.79695  -1.400 0.161556
## freqnetOnly occasionally:genGen X         -0.07333    2.44714  -0.030 0.976096
## freqnetA few times a week:genGen X        -1.81220    3.03869  -0.596 0.550943
## freqnetMost days:genGen X                 -0.37093    2.78425  -0.133 0.894020
## freqnetEvery day:genGen X                 -0.17861    2.19143  -0.082 0.935042
## freqnetOnly occasionally:genMillennials    4.03830    4.76305   0.848 0.396556
## freqnetA few times a week:genMillennials   0.20190    5.02833   0.040 0.967972
## freqnetMost days:genMillennials            4.65628    4.73984   0.982 0.325951
## freqnetEvery day:genMillennials            5.68315    4.38987   1.295 0.195498
##                                              
## (Intercept)                               ***
## freqnetOnly occasionally                  *  
## freqnetA few times a week                 ** 
## freqnetMost days                          ***
## freqnetEvery day                          ***
## genBaby Boomers                           ***
## genGen X                                  *  
## genMillennials                               
## freqnetOnly occasionally:genBaby Boomers     
## freqnetA few times a week:genBaby Boomers    
## freqnetMost days:genBaby Boomers             
## freqnetEvery day:genBaby Boomers             
## freqnetOnly occasionally:genGen X            
## freqnetA few times a week:genGen X           
## freqnetMost days:genGen X                    
## freqnetEvery day:genGen X                    
## freqnetOnly occasionally:genMillennials      
## freqnetA few times a week:genMillennials     
## freqnetMost days:genMillennials              
## freqnetEvery day:genMillennials              
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 19.32 on 7083 degrees of freedom
## Multiple R-squared:  0.1788, Adjusted R-squared:  0.1766 
## F-statistic: 81.15 on 19 and 7083 DF,  p-value: < 2.2e-16

# Below is what I did for saving Table 2 as Word

# supermodel summary_model_sum <- modelsummary(
#  list(reg_model1, reg_model2, reg_model3),
#  fmt = 1,
#  estimate  = c("{estimate} ({std.error}){stars}",
#                "{estimate} ({std.error}){stars}",
#                "{estimate} ({std.error}){stars}"),
#   coef_rename = c("freqnetOnly occasionally" = "Only occasionally", "freqnetA few times a week" = "A few times a week", "freqnetMost days" = "Most days", "freqnetEvery day" = "Every day", "genBaby Boomers" = "Baby Boomers", "genGen X" = "Gen X", "genMillennials" = "Millennials"), # renaming for a clearer output
# statistic = NULL,
# title = "Table 2: Regression models predicting subjective well-being",
#  output = "flextable")

# flextable::save_as_docx(reg_model_sum, path = "Kim_Eric_Research_Project_Table_2.docx", width = 7.0, height = 7.0)

Presenting a third visual of my choice (interaction plot)

reg_model4 <- lm(subwel ~ freqnet + gen + freqnet*gen, data = rp, weights = weight) # MLR + Interaction

interaction_plot <- effect("freqnet*gen", reg_model4, na.rm=TRUE)

plot(interaction_plot,
     main="Figure 2: Interaction effect of freqnet and gen on subwel",
     xlab="Frequency of internet use",
     ylab="Subjective well-being scale")

I note that there is almost no, if not at all, interaction effect between the variables freqnet and gen.

# Below is what I did for checking model fit, using the performance package

# check_model(reg_model1)
# check_model(reg_model2)
# check_model(reg_model3)
# check_model(reg_model4)

Kim_Eric_Research_Project

Seokang Eric Kim

2024-03-25