Data Analysis using R

Exploratory Data Analysis and ANOVA on GSS 2018 Data

———————————————————————–

############################################################
# Clean R Environment
############################################################
# Start from a clean environment by running this script in a fresh R session
# (e.g. via `Rscript`, or "Restart R" in the IDE) rather than wiping the
# workspace from inside the script. The former rm(list = ls()) call was removed:
# it deleted the objects of whoever source()d this file, yet left attached
# packages, options() and the working directory untouched, so it never
# produced a truly clean state. Everything used below is created by this
# script itself.

############################################################
# Data File Information
############################################################
# Data file used: gss2018.rda

############################################################
# Set Working Directory
############################################################
# Print the directory the session starts in (recorded output follows).
getwd()
## [1] "D:/D Drive/Ph.D. Course Work/PhD_2025/StudyMaterial"
# Switch to the folder that holds the data file so that the relative path
# passed to load() further down resolves. setwd() stops with an error if the
# directory cannot be entered.
# NOTE(review): this is an absolute, machine-specific path; the script cannot
# run unchanged on another computer -- consider a project-relative path.
setwd("D:/D Drive/Ph.D. Course Work/PhD_2025/DataFile")

# Print the working directory again to confirm the change took effect.
getwd()
## [1] "D:/D Drive/Ph.D. Course Work/PhD_2025/DataFile"
############################################################
# Load Data into R
############################################################
# load() restores every object stored in the .rda file into the workspace and
# invisibly returns their names. The rest of the script assumes a data frame
# called `GSS`, so fail right here if the file did not provide it instead of
# continuing with a missing (or stale, previously defined) object.
loaded_objects <- load("gss2018.rda")
stopifnot("GSS" %in% loaded_objects)

############################################################
# Load Required Libraries
############################################################
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
library(ggplot2)
library(car) 
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
############################################################
# Create Working Copy of the Data
############################################################
# All recoding below is done on this copy; the loaded GSS object is not
# modified by this script.
gssdf <- GSS

# Per-column summary statistics of the data before any recoding
summary(object = gssdf)
##       YEAR          BALLOT         USETECH           HAPPY      
##  Min.   :2018   Min.   :1.000   Min.   : -1.00   Min.   :1.000  
##  1st Qu.:2018   1st Qu.:1.000   1st Qu.: -1.00   1st Qu.:1.000  
##  Median :2018   Median :2.000   Median : 10.00   Median :2.000  
##  Mean   :2018   Mean   :2.002   Mean   : 48.09   Mean   :1.855  
##  3rd Qu.:2018   3rd Qu.:3.000   3rd Qu.: 80.00   3rd Qu.:2.000  
##  Max.   :2018   Max.   :3.000   Max.   :999.00   Max.   :8.000  
##     PARTYID         RINCOME            RACE            SEX       
##  Min.   :0.000   Min.   : 0.000   Min.   :1.000   Min.   :1.000  
##  1st Qu.:1.000   1st Qu.: 0.000   1st Qu.:1.000   1st Qu.:1.000  
##  Median :3.000   Median : 9.000   Median :1.000   Median :2.000  
##  Mean   :2.968   Mean   : 7.509   Mean   :1.394   Mean   :1.552  
##  3rd Qu.:5.000   3rd Qu.:12.000   3rd Qu.:2.000   3rd Qu.:2.000  
##  Max.   :9.000   Max.   :98.000   Max.   :3.000   Max.   :2.000  
##      DEGREE           EDUC            AGE           MARITAL    
##  Min.   :0.000   Min.   : 0.00   Min.   :18.00   Min.   :1.00  
##  1st Qu.:1.000   1st Qu.:12.00   1st Qu.:34.00   1st Qu.:1.00  
##  Median :1.000   Median :14.00   Median :48.00   Median :2.00  
##  Mean   :1.684   Mean   :13.84   Mean   :49.13   Mean   :2.67  
##  3rd Qu.:3.000   3rd Qu.:16.00   3rd Qu.:63.00   3rd Qu.:5.00  
##  Max.   :4.000   Max.   :99.00   Max.   :99.00   Max.   :9.00  
##       HRS2               HRS1          WRKSTAT           ID_      
##  Min.   :-1.00000   Min.   :-1.00   Min.   :1.000   Min.   :   1  
##  1st Qu.:-1.00000   1st Qu.:-1.00   1st Qu.:1.000   1st Qu.: 588  
##  Median :-1.00000   Median :30.00   Median :2.000   Median :1176  
##  Mean   : 0.08017   Mean   :24.47   Mean   :2.963   Mean   :1175  
##  3rd Qu.:-1.00000   3rd Qu.:40.00   3rd Qu.:5.000   3rd Qu.:1762  
##  Max.   :99.00000   Max.   :99.00   Max.   :9.000   Max.   :2348  
##     UNHAPPY     
##  Min.   :0.000  
##  1st Qu.:0.000  
##  Median :0.000  
##  Mean   :1.039  
##  3rd Qu.:2.000  
##  Max.   :9.000
############################################################
# Data Pre-processing and Cleaning
############################################################
# Recode numeric survey codes into analysis variables (added as new columns):
#   HAPPYC  - factor label for HAPPY; codes other than 1-3 become NA
#   GENDERC - factor label for SEX; codes other than 1/2 become NA
#   DEGC    - factor label for DEGREE; codes other than 0-4 become NA
#   USGC    - USETECH with the codes -1, 998 and 999 replaced by NA
#   AGEC    - AGE with the codes 98 and 99 replaced by NA
# as.factor() sorts the levels by label, not by the original numeric code
# (see the group order in the summary table further down).
# NOTE(review): DEGREE == 2 is labelled "HS.", which is easy to confuse with
# "High Sc." (DEGREE == 1); presumably it is the junior-college category --
# confirm against the GSS codebook before renaming the label.
gssdf <- gssdf %>%
  mutate(
    HAPPYC = as.factor(case_when(
      HAPPY == 1 ~ "Very Happy",
      HAPPY == 2 ~ "Pretty Happy",
      HAPPY == 3 ~ "Not too happy",
      TRUE ~ NA
    )),
    GENDERC = as.factor(case_when(
      SEX == 1 ~ "Male",
      SEX == 2 ~ "Female",
      # Any other or missing code is unknown; the previous catch-all
      # silently counted such rows as "Female".
      TRUE ~ NA
    )),
    DEGC = as.factor(case_when(
      DEGREE == 0 ~ "<High Sc.",
      DEGREE == 1 ~ "High Sc.",
      DEGREE == 2 ~ "HS.",
      DEGREE == 3 ~ "Bachelor",
      DEGREE == 4 ~ "Graduate",
      TRUE ~ NA
    )),
    # %in% never returns NA, so an already-missing value simply falls through
    # to the default branch and stays missing.
    USGC = case_when(
      USETECH %in% c(-1, 998, 999) ~ NA,
      TRUE ~ USETECH
    ),
    AGEC = case_when(
      AGE %in% c(98, 99) ~ NA,
      TRUE ~ AGE
    )
  )

# Keep only the three analysis columns, then drop every row in which any of
# them is NA. All other columns are discarded here.
gssdf <- gssdf %>%
  select(USGC, DEGC, GENDERC) %>%
  drop_na()



############################################################
# Exploratory Data Analysis (EDA)
############################################################
# Mean (aveT) and standard deviation (sdT) of USGC within each DEGC level.
# No na.rm is needed: rows with missing values were dropped beforehand.
usedeg <- summarize(
  group_by(gssdf, DEGC),
  aveT = mean(USGC),
  sdT = sd(USGC)
)

# Display the summary table
usedeg
## # A tibble: 5 × 3
##   DEGC       aveT   sdT
##   <fct>     <dbl> <dbl>
## 1 <High Sc.  24.8  36.2
## 2 Bachelor   67.9  32.1
## 3 Graduate   68.7  30.2
## 4 High Sc.   49.6  38.6
## 5 HS.        62.4  35.2
############################################################
# Boxplots for USETECH across DEGREE
############################################################
# Boxplot of USGC per education level with the raw observations overlaid.
ggplot(gssdf, aes(x = DEGC, y = USGC, fill = DEGC)) +
  # The jitter layer already plots every observation, so hide the boxplot's
  # own outlier points; otherwise each outlier is drawn twice.
  geom_boxplot(outlier.shape = NA) +
  # height = 0: jitter horizontally only. By default geom_jitter() also adds
  # vertical noise, which would move points away from their true USGC value.
  geom_jitter(width = 0.2, height = 0, alpha = 0.4) +
  labs(title = "USETECH by Education Level",
       x = "Education Level",
       y = "Use of Technology")

############################################################
# Density Plots for USETECH by DEGREE
############################################################
# One semi-transparent density curve of USGC per education level, each drawn
# in its own panel.
ggplot(data = gssdf, mapping = aes(x = USGC, fill = DEGC)) +
  geom_density(alpha = 0.4) +
  facet_wrap(~ DEGC) +
  ggtitle("Density of USETECH by Education Level")

############################################################
# One-Way ANOVA (Assuming Equal Variance)
############################################################
# Classical one-way ANOVA of USGC on education level; `model` is reused by the
# Tukey HSD post hoc test below.
model <- aov(formula = USGC ~ DEGC, data = gssdf)

# F test for the DEGC effect
summary(model)
##               Df  Sum Sq Mean Sq F value Pr(>F)    
## DEGC           4  221301   55325    43.3 <2e-16 ***
## Residuals   1404 1793757    1278                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
############################################################
# Post Hoc Tests for One-Way ANOVA
############################################################
# Pairwise t-tests (pooled SD) between all education levels with
# Bonferroni-adjusted p-values. The argument name and the method are spelled
# out in full instead of relying on partial matching ("p.adj", "bonf").
pairwise.t.test(gssdf$USGC,
                gssdf$DEGC,
                p.adjust.method = "bonferroni")
## 
##  Pairwise comparisons using t tests with pooled SD 
## 
## data:  gssdf$USGC and gssdf$DEGC 
## 
##          <High Sc. Bachelor Graduate High Sc.
## Bachelor < 2e-16   -        -        -       
## Graduate < 2e-16   1.0000   -        -       
## High Sc. 3.8e-11   8.0e-13  7.3e-09  -       
## HS.      2.8e-15   1.0000   1.0000   0.0022  
## 
## P value adjustment method: bonferroni
# Tukey honest significant differences for every pair of DEGC levels, based on
# the one-way fit. The defaults (unordered levels, 95% family-wise confidence)
# are written out explicitly.
TukeyHSD(model, ordered = FALSE, conf.level = 0.95)
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = USGC ~ DEGC, data = gssdf)
## 
## $DEGC
##                           diff        lwr        upr     p adj
## Bachelor-<High Sc.  43.0859568  32.653180  53.518734 0.0000000
## Graduate-<High Sc.  43.9107249  32.256416  55.565034 0.0000000
## High Sc.-<High Sc.  24.8247754  15.145211  34.504340 0.0000000
## HS.-<High Sc.       37.6070313  25.201887  50.012175 0.0000000
## Graduate-Bachelor    0.8247681  -8.438819  10.088355 0.9992282
## High Sc.-Bachelor  -18.2611813 -24.870652 -11.651711 0.0000000
## HS.-Bachelor        -5.4789255 -15.671017   4.713166 0.5833665
## High Sc.-Graduate  -19.0859494 -27.492207 -10.679691 0.0000000
## HS.-Graduate        -6.3036936 -17.743047   5.135659 0.5592907
## HS.-High Sc.        12.7822558   3.362603  22.201908 0.0020352
############################################################
# Checking Normality Assumption (Q-Q Plot)
############################################################
# Normal Q-Q plot of USGC, one panel per education level, each with its
# reference line.
ggplot(data = gssdf, mapping = aes(sample = USGC, colour = DEGC)) +
  stat_qq() +
  stat_qq_line() +
  facet_wrap(~ DEGC) +
  ggtitle("Q-Q Plot for USETECH by Degree")

############################################################
# Brown-Forsythe Test for Equal Variances (Robust to Non-Normality)
############################################################
# usetrn = absolute deviation of each observation from the median USGC of its
# own education level (the quantity the Brown-Forsythe test compares).
# ungroup() afterwards so that gssdf does not stay a grouped tibble and
# silently change the behaviour of later dplyr steps.
gssdf <- gssdf %>%
  group_by(DEGC) %>%
  mutate(usetrn = abs(USGC - median(USGC, na.rm = TRUE))) %>%
  ungroup()

# Equal-variance one-way test on the absolute deviations from the group
# medians: a significant F indicates that spread differs between levels.
oneway.test(formula = usetrn ~ DEGC, data = gssdf, var.equal = TRUE)
## 
##  One-way analysis of means
## 
## data:  usetrn and DEGC
## F = 18.44, num df = 4, denom df = 1404, p-value = 8.845e-15
############################################################
# Levene's Test for Homogeneity of Variance
############################################################
# car::leveneTest centred on the group medians (the function's default, made
# explicit here). The recorded output reproduces the F = 18.44 obtained from
# the manual absolute-deviation calculation above.
leveneTest(USGC ~ DEGC, data = gssdf, center = median)
## Levene's Test for Homogeneity of Variance (center = median)
##         Df F value    Pr(>F)    
## group    4   18.44 8.845e-15 ***
##       1404                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
############################################################
# Two-Way ANOVA: DEGREE and SEX (Grouped Boxplots)
############################################################
# Side-by-side boxplots of USGC for each education level, split by gender
# through the fill colour.
ggplot(data = gssdf, mapping = aes(x = DEGC, y = USGC, fill = GENDERC)) +
  geom_boxplot() +
  ggtitle("USETECH by Degree and Sex")

############################################################
# Interaction Plot (Mean USETECH)
############################################################
# Mean USGC per education level, one coloured line per gender. Non-parallel
# lines point to a DEGC x GENDERC interaction.
ggplot(data = gssdf, mapping = aes(x = DEGC, y = USGC, colour = GENDERC)) +
  stat_summary(geom = "point", fun = mean) +
  # group = GENDERC is needed so the means are connected across the discrete
  # x axis within each gender.
  stat_summary(mapping = aes(group = GENDERC), geom = "line", fun = mean) +
  ggtitle("Interaction Plot: DEGREE × SEX")

############################################################
# Two-Way ANOVA Model
############################################################
# Two-way ANOVA with both main effects and their interaction; `x` is reused by
# the post hoc test below.
x <- aov(formula = USGC ~ DEGC * GENDERC, data = gssdf)

# ANOVA table.
# NOTE(review): summary() of an aov fit reports sequential sums of squares, so
# with unequal cell sizes the rows depend on the order of the terms -- confirm
# this is intended (car::Anova() offers the alternatives).
summary(x)
##                Df  Sum Sq Mean Sq F value   Pr(>F)    
## DEGC            4  221301   55325  44.209  < 2e-16 ***
## GENDERC         1   16473   16473  13.163 0.000296 ***
## DEGC:GENDERC    4   26510    6627   5.296 0.000311 ***
## Residuals    1399 1750775    1251                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
############################################################
# Post Hoc Test for Two-Way ANOVA
############################################################
# Tukey HSD comparisons for every term of the two-way fit (DEGC, GENDERC and
# all DEGC:GENDERC cell pairs). The defaults are written out explicitly.
TukeyHSD(x, ordered = FALSE, conf.level = 0.95)
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = USGC ~ DEGC * GENDERC, data = gssdf)
## 
## $DEGC
##                           diff        lwr        upr     p adj
## Bachelor-<High Sc.  43.0859568  32.760484  53.411429 0.0000000
## Graduate-<High Sc.  43.9107249  32.376284  55.445165 0.0000000
## High Sc.-<High Sc.  24.8247754  15.244768  34.404783 0.0000000
## HS.-<High Sc.       37.6070313  25.329478  49.884584 0.0000000
## Graduate-Bachelor    0.8247681  -8.343540   9.993076 0.9991960
## High Sc.-Bachelor  -18.2611813 -24.802671 -11.719691 0.0000000
## HS.-Bachelor        -5.4789255 -15.566188   4.608337 0.5733923
## High Sc.-Graduate  -19.0859494 -27.405746 -10.766152 0.0000000
## HS.-Graduate        -6.3036936 -17.625389   5.018002 0.5490670
## HS.-High Sc.        12.7822558   3.459487  22.105024 0.0017563
## 
## $GENDERC
##                 diff       lwr       upr     p adj
## Male-Female -6.80899 -10.50928 -3.108699 0.0003174
## 
## $`DEGC:GENDERC`
##                                         diff         lwr        upr     p adj
## Bachelor:Female-<High Sc.:Female  44.0775194  26.5912218  61.563817 0.0000000
## Graduate:Female-<High Sc.:Female  42.2192029  23.0019908  61.436415 0.0000000
## High Sc.:Female-<High Sc.:Female  32.1878788  15.7321731  48.643584 0.0000000
## HS.:Female-<High Sc.:Female       46.7797619  27.2270154  66.332508 0.0000000
## <High Sc.:Male-<High Sc.:Female    2.0378788 -18.5317533  22.607511 0.9999995
## Bachelor:Male-<High Sc.:Female    44.3530702  26.5940220  62.112118 0.0000000
## Graduate:Male-<High Sc.:Female    48.3917749  28.4942914  68.289258 0.0000000
## High Sc.:Male-<High Sc.:Female    19.8510848   3.4227876  36.279382 0.0052315
## HS.:Male-<High Sc.:Female         23.3560606   0.5896498  46.122471 0.0389231
## Graduate:Female-Bachelor:Female   -1.8583165 -16.3376501  12.621017 0.9999951
## High Sc.:Female-Bachelor:Female  -11.8896406 -22.4319416  -1.347340 0.0133486
## HS.:Female-Bachelor:Female         2.7022425 -12.2195454  17.624030 0.9999069
## <High Sc.:Male-Bachelor:Female   -42.0396406 -58.2710800 -25.808201 0.0000000
## Bachelor:Male-Bachelor:Female      0.2755508 -12.2037783  12.754880 1.0000000
## Graduate:Male-Bachelor:Female      4.3142555 -11.0564866  19.684998 0.9967894
## High Sc.:Male-Bachelor:Female    -24.2264346 -34.7259018 -13.726967 0.0000000
## HS.:Male-Bachelor:Female         -20.7214588 -39.6597618  -1.783156 0.0192858
## High Sc.:Female-Graduate:Female  -10.0313241 -23.2476303   3.184982 0.3233313
## HS.:Female-Graduate:Female         4.5605590 -12.3566037  21.477722 0.9976459
## <High Sc.:Male-Graduate:Female   -40.1813241 -58.2641962 -22.098452 0.0000000
## Bachelor:Male-Graduate:Female      2.1338673 -12.6737082  16.941443 0.9999867
## Graduate:Male-Graduate:Female      6.1725720 -11.1418829  23.487027 0.9816675
## High Sc.:Male-Graduate:Female    -22.3681181 -35.5502821  -9.185954 0.0000039
## HS.:Male-Graduate:Female         -18.8631423 -39.4104039   1.684119 0.1039186
## HS.:Female-High Sc.:Female        14.5918831   0.8922699  28.291496 0.0261888
## <High Sc.:Male-High Sc.:Female   -30.1500000 -45.2655308 -15.034469 0.0000000
## Bachelor:Male-High Sc.:Female     12.1651914   1.1764108  23.153972 0.0167764
## Graduate:Male-High Sc.:Female     16.2038961   2.0166004  30.391192 0.0113631
## High Sc.:Male-High Sc.:Female    -12.3367940 -21.0119573  -3.661631 0.0003049
## HS.:Male-High Sc.:Female          -8.8318182 -26.8228985   9.159262 0.8690307
## <High Sc.:Male-HS.:Female        -44.7418831 -63.1809427 -26.302824 0.0000000
## Bachelor:Male-HS.:Female          -2.4266917 -17.6671952  12.813812 0.9999688
## Graduate:Male-HS.:Female           1.6120130 -16.0741116  19.298138 0.9999998
## High Sc.:Male-HS.:Female         -26.9286771 -40.5953557 -13.261999 0.0000000
## HS.:Male-HS.:Female              -23.4237013 -44.2851158  -2.562287 0.0141081
## Bachelor:Male-<High Sc.:Male      42.3151914  25.7902764  58.840106 0.0000000
## Graduate:Male-<High Sc.:Male      46.3538961  27.5496712  65.158121 0.0000000
## High Sc.:Male-<High Sc.:Male      17.8132060   2.7275183  32.898894 0.0072699
## HS.:Male-<High Sc.:Male           21.3181818  -0.4992077  43.135571 0.0619111
## Graduate:Male-Bachelor:Male        4.0387047 -11.6416301  19.719040 0.9983501
## High Sc.:Male-Bachelor:Male      -24.5019854 -35.4496792 -13.554292 0.0000000
## HS.:Male-Bachelor:Male           -20.9970096 -40.1874372  -1.806582 0.0192892
## High Sc.:Male-Graduate:Male      -28.5406901 -42.6961858 -14.385194 0.0000000
## HS.:Male-Graduate:Male           -25.0357143 -46.2205808  -3.850848 0.0071871
## HS.:Male-High Sc.:Male             3.5049758 -14.4610385  21.470990 0.9998264