GSS Survey Assignment

library(tidyverse)

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.4.4     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(readxl)
library(plotly)


Attaching package: 'plotly'

The following object is masked from 'package:ggplot2':

    last_plot

The following object is masked from 'package:stats':

    filter

The following object is masked from 'package:graphics':

    layout

# Read the GSS extract from the Excel workbook into a tibble, then print a
# column-by-column overview (name, type, first values).
# NOTE(review): the path points into the author's home directory, so it will
# only resolve on a machine with the same folder layout.
GSS_Data <- read_excel(path = "~/Desktop/GR Stats/GSS_Data.xlsx")
GSS_Data |> glimpse()

Rows: 6,309
Columns: 16
$ year     <dbl> 2002, 2002, 2002, 2002, 2002, 2002, 2002, 2002, 2002, 2002, 2…
$ id_      <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18…
$ hrs2     <chr> ".i:  Inapplicable", ".i:  Inapplicable", ".i:  Inapplicable"…
$ childs   <chr> "0", "1", "1", "1", "2", "1", "2", "2", "2", "0", "2", "3", "…
$ age      <chr> "25", "43", "30", "55", "37", "47", "57", "71", "46", "19", "…
$ sex      <chr> "FEMALE", "MALE", "FEMALE", "FEMALE", "MALE", "MALE", "FEMALE…
$ race     <chr> "White", "White", "White", "White", "White", "White", "White"…
$ courts   <chr> "About right", "Not harshly enough", ".i:  Inapplicable", ".i…
$ relig    <chr> "Inter-nondenominational", "Protestant", "Protestant", "Prote…
$ attend   <chr> "About once or twice a year", "About once a month", "Every we…
$ hapmar   <chr> ".i:  Inapplicable", "PRETTY HAPPY", ".i:  Inapplicable", ".i…
$ class_   <chr> "Middle class", "Middle class", "Working class", "Upper class…
$ premarsx <chr> "ALWAYS WRONG", ".i:  Inapplicable", ".i:  Inapplicable", ".i…
$ xmarsex  <chr> "ALWAYS WRONG", "ALWAYS WRONG", ".i:  Inapplicable", ".i:  In…
$ spanking <chr> "STRONGLY AGREE", ".i:  Inapplicable", ".i:  Inapplicable", "…
$ ballot   <chr> "Ballot a", "Ballot c", "Ballot a", "Ballot b", "Ballot c", "…

The `glimpse()` function lets us see the name, data type, and first few values of every column in the dataset named ‘GSS_Data’, which has 6,309 rows and 16 columns. This dataset was derived from the GSS data exploring network.

# Bar chart of respondent counts by race.
# The "Inapplicable" code is mapped to NA so those rows can be dropped before
# plotting.
GSS_Data |>
  mutate(
    race = race |>
      as_factor() |>
      fct_recode(NULL = ".i:  Inapplicable")
  ) |>
  drop_na(race) |>
  plot_ly(x = ~race) |>
  add_histogram()

This histogram shows the distribution of race among the respondents in the dataset ‘GSS_Data’. We see that most of the respondents are white (4,500+ individuals), with far fewer who are African American or of ‘other races’.

# Bar chart of how many respondents chose each answer to the spanking item.
# The upper-case raw answers are relabelled in title case and the GSS
# missing-data codes are mapped to NA so those rows can be dropped.
GSS_Data |>
  mutate(
    spanking = spanking |>
      as_factor() |>
      fct_recode(
        "Strongly Agree" = "STRONGLY AGREE",
        "Agree" = "AGREE",
        "Disagree" = "DISAGREE",
        "Strongly Disagree" = "STRONGLY DISAGREE",
        NULL = ".i:  Inapplicable",
        NULL = ".n:  No answer",
        NULL = ".d:  Do not Know/Cannot Choose",
        NULL = ".s:  Skipped on Web"
      ) |>
      # as_factor() orders levels by first appearance in the data. Put the
      # answers in agree-to-disagree order so the bars read as an ordinal
      # scale, as the other spanking plots in this document already do.
      fct_relevel(
        "Strongly Agree",
        "Agree",
        "Disagree",
        "Strongly Disagree"
      )
  ) |>
  drop_na(spanking) |>
  plot_ly(x = ~spanking) |>
  add_histogram()

This bar graph displays the respondents' views on spanking as a form of disciplinary action. Based on the respondents in ‘GSS_Data’, we see that the most common response is to agree with spanking as a form of disciplinary action (1,200+ agree). However, the second most common response is ‘Disagree’ (approx. 800).

# Box plot of spanking attitude (numeric score) for each race category.
GSS_Data |>
  mutate(
    spanking = spanking |>
      as_factor() |>
      fct_recode(
        "Strongly Agree" = "STRONGLY AGREE",
        "Agree" = "AGREE",
        "Disagree" = "DISAGREE",
        "Strongly Disagree" = "STRONGLY DISAGREE",
        NULL = ".i:  Inapplicable",
        NULL = ".n:  No answer",
        NULL = ".d:  Do not Know/Cannot Choose",
        NULL = ".s:  Skipped on Web"
      ) |>
      fct_relevel(
        "Strongly Agree",
        "Agree",
        "Disagree",
        "Strongly Disagree"
      ) |>
      # Level codes of the releveled factor: 1 = Strongly Agree through
      # 4 = Strongly Disagree; missing-data codes stay NA.
      as.numeric()
  ) |>
  drop_na(spanking) |>
  mutate(
    race = race |>
      as_factor() |>
      fct_recode(NULL = ".i:  Inapplicable")
  ) |>
  drop_na(race) |>
  # Order the race categories from most to least frequent among the rows
  # that remain after both filters.
  mutate(race = fct_infreq(race)) |>
  plot_ly(x = ~race, y = ~spanking) |>
  add_boxplot()

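This boxplot displays the opinions on spanking as a form of disciplinary action for each of the three race categories (white, black, other), ordered from the most to the least common. The spanking responses are coded numerically from 1 (‘Strongly Agree’) to 4 (‘Strongly Disagree’), so a lower box indicates stronger agreement with spanking.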

# Bar chart of the number of children reported by each respondent.
# "8 or more" is top-coded as "8"; missing-data codes become NA and are
# dropped.
GSS_Data |>
  mutate(
    childs = childs |>
      as_factor() |>
      fct_recode(
        "8" = "8 or more",
        NULL = ".i:  Inapplicable",
        NULL = ".d:  Do not Know/Cannot Choose"
      ) |>
      # Put the levels in numeric order "0" through "8".
      fct_relevel(as.character(0:8))
  ) |>
  drop_na(childs) |>
  plot_ly(x = ~childs) |>
  add_histogram()

This bar graph shows the number of children each respondent in the dataset ‘GSS_Data’ has. If an individual reported eight or more children, the response was counted as eight. We can see that the majority of respondents have 0-2 children.

# Box plot of number of children (y) for each spanking answer (x).
GSS_Data |>
  mutate(
    spanking = spanking |>
      as_factor() |>
      fct_recode(
        "Strongly Agree" = "STRONGLY AGREE",
        "Agree" = "AGREE",
        "Disagree" = "DISAGREE",
        "Strongly Disagree" = "STRONGLY DISAGREE",
        NULL = ".i:  Inapplicable",
        NULL = ".n:  No answer",
        NULL = ".d:  Do not Know/Cannot Choose",
        NULL = ".s:  Skipped on Web"
      ) |>
      fct_relevel(
        "Strongly Agree",
        "Agree",
        "Disagree",
        "Strongly Disagree"
      )
  ) |>
  drop_na(spanking) |>
  mutate(
    childs = childs |>
      as_factor() |>
      fct_recode(
        "8" = "8 or more", # top-code: eight or more children counts as 8
        NULL = ".i:  Inapplicable",
        NULL = ".d:  Do not Know/Cannot Choose"
      ) |>
      # Box-plot quartiles need a numeric y. as.numeric() on the factor itself
      # would return level codes, so go through as.character() to recover the
      # actual child counts (0-8). Any other non-numeric response code becomes
      # NA here (with a coercion warning) and is removed by drop_na() below.
      as.character() |>
      as.numeric()
  ) |>
  drop_na(childs) |>
  plot_ly(x = ~spanking, y = ~childs) |>
  add_boxplot()

This boxplot shows the number of children each individual has (y axis) for each opinion on spanking as a disciplinary action (x axis). For the most part, we see that those who strongly agree, agree, or disagree with spanking tend to have 2-3 children. Meanwhile, those who strongly disagree with spanking tend to have one or two children.

# Bar chart of self-reported marital happiness.
# Raw upper-case answers are relabelled in title case, missing-data codes are
# mapped to NA and dropped, and levels run from least to most happy.
GSS_Data |>
  mutate(
    hapmar = hapmar |>
      as_factor() |>
      fct_recode(
        "Not Too Happy" = "NOT TOO HAPPY",
        "Pretty Happy" = "PRETTY HAPPY",
        "Very Happy" = "VERY HAPPY",
        NULL = ".i:  Inapplicable",
        NULL = ".n:  No answer",
        NULL = ".d:  Do not Know/Cannot Choose",
        NULL = ".s:  Skipped on Web"
      ) |>
      fct_relevel("Not Too Happy", "Pretty Happy", "Very Happy")
  ) |>
  drop_na(hapmar) |>
  plot_ly(x = ~hapmar) |>
  add_histogram()

This histogram displays the number of individuals in the dataset who describe their marriage as ‘not too happy’, ‘pretty happy’, or ‘very happy’. The most common response is that the marriage is very happy (1,200+).

# Bar chart of self-identified social class, ordered from lower to upper.
# Missing-data codes are mapped to NA and dropped.
GSS_Data |>
  mutate(
    class_ = class_ |>
      as_factor() |>
      fct_recode(
        NULL = ".d:  Do not Know/Cannot Choose",
        NULL = ".n:  No answer",
        NULL = ".s:  Skipped on Web"
      ) |>
      fct_relevel(
        c("Lower class", "Working class", "Middle class", "Upper class")
      )
  ) |>
  drop_na(class_) |>
  plot_ly(x = ~class_) |>
  add_histogram()

This histogram shows the self-reported socioeconomic class of the individuals in the dataset ‘GSS_Data’. We see that the majority of respondents are in the working and middle classes (2702 and 2749, respectively).

# Counts of respondents per social class, colored by marital happiness.
# Both variables have their missing-data codes mapped to NA and dropped, and
# both are put in their natural low-to-high order.
GSS_Data |>
  mutate(
    class_ = class_ |>
      as_factor() |>
      fct_recode(
        NULL = ".d:  Do not Know/Cannot Choose",
        NULL = ".n:  No answer",
        NULL = ".s:  Skipped on Web"
      ) |>
      fct_relevel(
        c("Lower class", "Working class", "Middle class", "Upper class")
      )
  ) |>
  drop_na(class_) |>
  mutate(
    hapmar = hapmar |>
      as_factor() |>
      fct_recode(
        "Not Too Happy" = "NOT TOO HAPPY",
        "Pretty Happy" = "PRETTY HAPPY",
        "Very Happy" = "VERY HAPPY",
        NULL = ".i:  Inapplicable",
        NULL = ".n:  No answer",
        NULL = ".d:  Do not Know/Cannot Choose",
        NULL = ".s:  Skipped on Web"
      ) |>
      fct_relevel("Not Too Happy", "Pretty Happy", "Very Happy")
  ) |>
  drop_na(hapmar) |>
  plot_ly(x = ~class_, color = ~hapmar) |>
  add_histogram()

This overlay histogram shows the relationship between the socioeconomic class of the respondents (lower, working, middle, and upper) and the perceived level of happiness in their marriages (not too happy, pretty happy, very happy). We can see that those in the working and middle classes have more reports of unhappiness. However, we also see that the working and middle class individuals have a much larger number of ‘pretty happy’ and ‘very happy’ responses about their marriages, because these two classes contain most of the respondents.

# Bar chart of respondent counts by age band.
# Single-year ages (stored as text) are collapsed into "Under 30", one band
# per decade from the 30s to the 70s, and "80 and up". "No answer" is mapped
# to NA and dropped.
GSS_Data |>
  mutate(
    age = age |>
      as_factor() |>
      fct_recode(NULL = ".n:  No answer")
  ) |>
  drop_na(age) |>
  mutate(
    age = age |>
      # Generate each band's member levels from an integer range instead of
      # typing every age by hand; the resulting level names are the same.
      fct_collapse(
        "Under 30" = as.character(18:29),
        "30s" = as.character(30:39),
        "40s" = as.character(40:49),
        "50s" = as.character(50:59),
        "60s" = as.character(60:69),
        "70s" = as.character(70:79),
        # The oldest ages are top-coded in the data as "89 or older".
        "80 and up" = c(as.character(80:88), "89 or older")
      ) |>
      # Collapsed levels keep the position of their first member, so put the
      # bands in ascending age order explicitly.
      fct_relevel(
        "Under 30",
        "30s",
        "40s",
        "50s",
        "60s",
        "70s",
        "80 and up"
      )
  ) |>
  plot_ly(x = ~age) |>
  add_histogram()

This histogram shows the distribution of age groups among the respondents in this dataset. In this histogram, ages were grouped into ten-year bands (30s, 40s, and so on), with everyone under 30 in one group and everyone aged 80 or older in another. We see that the largest share of respondents is around the age of 30.

# 2-D histogram (heatmap of counts) of marital happiness against age band.
# Single-year ages (stored as text) are collapsed into "Under 30", one band
# per decade from the 30s to the 70s, and "80 and up". Missing-data codes in
# both variables are mapped to NA and dropped.
GSS_Data |>
  mutate(
    age = age |>
      as_factor() |>
      fct_recode(NULL = ".n:  No answer")
  ) |>
  drop_na(age) |>
  mutate(
    age = age |>
      # Generate each band's member levels from an integer range instead of
      # typing every age by hand; the resulting level names are the same.
      fct_collapse(
        "Under 30" = as.character(18:29),
        "30s" = as.character(30:39),
        "40s" = as.character(40:49),
        "50s" = as.character(50:59),
        "60s" = as.character(60:69),
        "70s" = as.character(70:79),
        # The oldest ages are top-coded in the data as "89 or older".
        "80 and up" = c(as.character(80:88), "89 or older")
      ) |>
      # Collapsed levels keep the position of their first member, so put the
      # bands in ascending age order explicitly.
      fct_relevel(
        "Under 30",
        "30s",
        "40s",
        "50s",
        "60s",
        "70s",
        "80 and up"
      )
  ) |>
  mutate(
    hapmar = hapmar |>
      as_factor() |>
      fct_recode(
        "Not Too Happy" = "NOT TOO HAPPY",
        "Pretty Happy" = "PRETTY HAPPY",
        "Very Happy" = "VERY HAPPY",
        NULL = ".i:  Inapplicable",
        NULL = ".n:  No answer",
        NULL = ".d:  Do not Know/Cannot Choose",
        NULL = ".s:  Skipped on Web"
      ) |>
      fct_relevel("Not Too Happy", "Pretty Happy", "Very Happy")
  ) |>
  drop_na(hapmar) |>
  plot_ly(x = ~age, y = ~hapmar) |>
  add_histogram2d()

This heatmap shows the perceived level of happiness in marriage (not too happy, pretty happy, very happy) in comparison to the age groups of the respondents in the dataset ‘GSS_Data’. We see that the individuals in this dataset who are between the ages of thirty and forty are more commonly happy with their marriages than individuals who are older.

Quarto