HW 08 - Exploring the GSS

Insert your name here Insert date here

Load packages and data

library(tidyverse)
library(dsbox)

Exercises

Exercise 1

# Tabulate the answers to harass5; missing answers get their own row.
count(gss16, harass5)

## # A tibble: 4 × 2
##   harass5                                                     n
##   <chr>                                                   <int>
## 1 Does not apply (i do not have a job/superior/co-worker)    96
## 2 No                                                       1136
## 3 Yes                                                       237
## 4 <NA>                                                     1398

Exercise 2

# Percentage of "Yes" and "No" answers among respondents who gave one of
# those two answers.
# `%in%` never returns NA, so this single condition drops both the missing
# answers and the "Does not apply ..." category without having to reproduce
# that long label character for character.
gss16 %>%
  filter(harass5 %in% c("No", "Yes")) %>%
  count(harass5) %>%
  mutate(percent = n / sum(n) * 100)

## # A tibble: 2 × 3
##   harass5     n percent
##   <chr>   <int>   <dbl>
## 1 No       1136    82.7
## 2 Yes       237    17.3

Exercise 3

# Total email time in minutes: the hours part is converted to minutes and
# added to the minutes part. The result is NA whenever either part is missing.
gss16 <- gss16 %>%
  mutate(email = emailhr * 60 + emailmin)

Exercise 4

# Histogram of email minutes using 60-minute-wide bins.
gss16 %>%
  ggplot(mapping = aes(x = email)) +
  geom_histogram(binwidth = 60) +
  labs(title = "Minutes Spent on Email Weekly",
       x = "Minutes per week",
       y = "Count")

## Warning: Removed 1218 rows containing non-finite outside the scale range
## (`stat_bin()`).

# Mean and median of email minutes, computed over non-missing values only.
summarize(
  gss16,
  mean_email = mean(email, na.rm = TRUE),
  median_email = median(email, na.rm = TRUE)
)

## # A tibble: 1 × 2
##   mean_email median_email
##        <dbl>        <dbl>
## 1       417.          120

The median is a better measure of the typical amount of time spent on email because the distribution is strongly right-skewed: a small number of people report extremely large amounts of email time each week, which pulls the mean (about 417 minutes) far above the median (120 minutes).

Exercise 5

# Combine the snapchat and instagrm answers into one indicator:
#   "Yes" when either answer is "Yes",
#   "No"  when both answers are "No",
#   NA    in every remaining case.
gss16 <- mutate(
  gss16,
  snap_insta = case_when(
    snapchat == "Yes" | instagrm == "Yes" ~ "Yes",
    snapchat == "No" & instagrm == "No" ~ "No",
    TRUE ~ NA_character_
  )
)

Exercise 6

# Percentage of respondents in each snap_insta group. The NA group is dropped
# after counting, so percentages are taken over the "Yes"/"No" rows only.
gss16 %>%
  count(snap_insta) %>%
  filter(!is.na(snap_insta)) %>%
  mutate(percent = n / sum(n) * 100)

## # A tibble: 2 × 3
##   snap_insta     n percent
##   <chr>      <int>   <dbl>
## 1 No           858    62.5
## 2 Yes          514    37.5

Exercise 7

# Tabulate the wrkstat categories; missing values get their own row.
count(gss16, wrkstat)

## # A tibble: 9 × 2
##   wrkstat              n
##   <chr>            <int>
## 1 Keeping house      284
## 2 Other               89
## 3 Retired            574
## 4 School              76
## 5 Temp not working    57
## 6 Unempl, laid off   118
## 7 Working fulltime  1321
## 8 Working parttime   345
## 9 <NA>                 3

Exercise 8

# Linear regression of email minutes on educ, wrkstat and snap_insta.
# lm() drops rows with a missing value in any model variable by default.
email_model <- lm(email ~ educ + wrkstat + snap_insta, data = gss16)

# Coefficient table, residual summary and overall fit statistics.
summary(email_model)

## 
## Call:
## lm(formula = email ~ educ + wrkstat + snap_insta, data = gss16)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -760.5 -372.7 -161.2   95.4 3355.6 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             -229.736    149.837  -1.533  0.12569    
## educ                      29.632      9.601   3.087  0.00211 ** 
## wrkstatOther              33.057    209.470   0.158  0.87465    
## wrkstatRetired            68.279    111.051   0.615  0.53887    
## wrkstatSchool           -123.812    143.981  -0.860  0.39014    
## wrkstatTemp not working  -73.709    153.948  -0.479  0.63225    
## wrkstatUnempl, laid off  118.349    151.242   0.783  0.43419    
## wrkstatWorking fulltime  366.840     87.690   4.183 3.26e-05 ***
## wrkstatWorking parttime   18.900    101.632   0.186  0.85253    
## snap_instaYes            149.961     52.745   2.843  0.00460 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 642.2 on 669 degrees of freedom
##   (2188 observations deleted due to missingness)
## Multiple R-squared:  0.1043, Adjusted R-squared:  0.09227 
## F-statistic: 8.657 on 9 and 669 DF,  p-value: 2.395e-12

Exercise 9

# Keep only the model variables and only complete rows, so the fitted values
# and residuals below line up one-to-one with the rows of model_data.
model_data <- gss16 %>%
  select(email, educ, wrkstat, snap_insta) %>%
  na.omit()

# Refit on the complete-case data.
email_model <- lm(email ~ educ + wrkstat + snap_insta, data = model_data)

model_data[["predicted"]] <- predict(email_model)
model_data[["residuals"]] <- resid(email_model)

# Residuals against fitted values, with a dashed reference line at zero.
ggplot(data = model_data, mapping = aes(x = predicted, y = residuals)) +
  geom_point(alpha = 0.5) +
  geom_hline(yintercept = 0, linetype = "dashed")

Exercise 10

# Collapse advfront into a two-level indicator:
#   "Yes" for "Strongly agree" / "Agree",
#   "No"  for "Disagree" / "Strongly disagree".
# Any other answer, and missing values, match no condition and become NA.
gss16 <- gss16 %>%
  mutate(
    science_support = case_when(
      advfront %in% c("Strongly agree", "Agree") ~ "Yes",
      advfront %in% c("Disagree", "Strongly disagree") ~ "No"
    )
  )

Exercise 11

# Collapse polviews into three groups and store the result as a factor whose
# levels run Liberal -> Moderate -> Conservative, so plots and tables use
# that order rather than alphabetical order.
# NOTE(review): "Slghtly conservative" and "Extrmly conservative" presumably
# match abbreviated labels stored in the data -- confirm before "correcting"
# the spelling, since a changed string would silently stop matching.
gss16 <- gss16 %>%
  mutate(
    political_group = factor(
      case_when(
        polviews %in% c("Extremely liberal",
                        "Liberal",
                        "Slightly liberal") ~ "Liberal",
        polviews == "Moderate" ~ "Moderate",
        polviews %in% c("Slghtly conservative",
                        "Conservative",
                        "Extrmly conservative") ~ "Conservative"
      ),
      levels = c("Liberal", "Moderate", "Conservative")
    )
  )

Exercise 12

# Proportion of science_support answers within each political group.
# Rows missing either variable are removed first; otherwise NA would appear
# as its own bar and its own fill category, and the proportions would no
# longer describe respondents who actually answered both questions.
gss16 %>%
  filter(!is.na(political_group), !is.na(science_support)) %>%
  ggplot(aes(x = political_group, fill = science_support)) +
  # position = "fill" rescales every bar to height 1, giving proportions.
  geom_bar(position = "fill") +
  labs(
    title = "Political Views and Support for Science Research",
    x = "Political Group",
    y = "Proportion",
    fill = "Support Science Research"
  )