library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.0     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(here)
## here() starts at /Users/caoanjie/Desktop/projects/thrive_coho
library(patchwork)
# Read the tidied survey export, located relative to the project root.
td <- here("data/tidy_data.csv") %>%
  read_csv()
## Rows: 684 Columns: 90
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (79): community_name, community_zipcode, community_stage, community_form...
## dbl  (6): Progress, community_households, individual_household_size, individ...
## lgl  (5): community_total_units_extra_1_5, community_total_units_extra_6_10,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Demographics of the participants

# Sample sizes used as denominators further down.
# total_n: every row of td.
total_n <- nrow(td)

# total_ca_n: rows where community_is_california is TRUE
# (filter() drops both FALSE and NA).
total_ca_n <- td %>%
  filter(community_is_california) %>%
  nrow()

# total_community_n: number of distinct community_name values.
total_community_n <- td %>%
  distinct(community_name) %>%
  nrow()

# Mean and standard deviation of respondent age, rounded to two decimals.
# na_* uses every row of td; ca_* uses only the California rows.
na_age_m <- td$individual_age %>%
  as.numeric() %>%
  mean(na.rm = TRUE) %>%
  round(2)
na_age_sd <- td$individual_age %>%
  as.numeric() %>%
  sd(na.rm = TRUE) %>%
  round(2)
ca_age_m <- td %>%
  filter(community_is_california) %>%
  pull(individual_age) %>%
  as.numeric() %>%
  mean(na.rm = TRUE) %>%
  round(2)
ca_age_sd <- td %>%
  filter(community_is_california) %>%
  pull(individual_age) %>%
  as.numeric() %>%
  sd(na.rm = TRUE) %>%
  round(2)
# Share of respondents whose race/ethnicity is exactly "White", rounded to
# four decimals. Any answer containing a comma (more than one option
# selected) is recoded to "Multiracial" first, so it is not counted as White.
# The denominators are the full row counts (total_n / total_ca_n), so rows
# with a missing answer stay in the denominator.
na_white <- td %>%
  mutate(
    individual_race_ethnic = case_when(
      grepl(",", individual_race_ethnic) ~ "Multiracial",
      .default = individual_race_ethnic
    )
  ) %>%
  count(individual_race_ethnic) %>%
  filter(individual_race_ethnic == "White") %>%
  mutate(share = round(n / total_n, 4)) %>%
  pull(share)

# Same computation restricted to the California rows.
ca_white <- td %>%
  filter(community_is_california) %>%
  mutate(
    individual_race_ethnic = case_when(
      grepl(",", individual_race_ethnic) ~ "Multiracial",
      .default = individual_race_ethnic
    )
  ) %>%
  count(individual_race_ethnic) %>%
  filter(individual_race_ethnic == "White") %>%
  mutate(share = round(n / total_ca_n, 4)) %>%
  pull(share)
# Pie chart of self-reported race/ethnicity across all respondents.
# Answers containing a comma are collapsed into "Multiracial"; each slice is
# the row count of one category (missing answers form their own slice).
na_race_piechart <- td %>%
  mutate(
    individual_race_ethnic = case_when(
      grepl(",", individual_race_ethnic) ~ "Multiracial",
      .default = individual_race_ethnic
    )
  ) %>%
  count(individual_race_ethnic) %>%
  # percent is carried along in the data but not mapped to any aesthetic.
  mutate(percent = n / total_n) %>%
  ggplot(aes(x = "", y = n, fill = individual_race_ethnic)) +
  geom_col(color = "white") +
  # A single stacked bar wrapped around the y axis becomes a pie.
  coord_polar(theta = "y", start = 0) +
  theme_void() +
  guides(fill = guide_legend(title = "Self-reported race and ethnicity")) +
  labs(title = "National respondents") +
  theme(axis.text.x = element_blank())

# Pie chart of self-reported race/ethnicity for California respondents only.
# Answers containing a comma are collapsed into "Multiracial"; each slice is
# the row count of one category (missing answers form their own slice).
ca_race_piechart <- td %>%
  filter(community_is_california) %>%
  mutate(
    individual_race_ethnic = case_when(
      grepl(",", individual_race_ethnic) ~ "Multiracial",
      TRUE ~ individual_race_ethnic
    )
  ) %>%
  group_by(individual_race_ethnic) %>%
  count() %>%
  # Shares are relative to the California subsample, so the denominator is
  # total_ca_n (previously total_n, which understated every California share).
  mutate(percent = n / total_ca_n) %>%
  ggplot(aes(x = "", y = n, fill = individual_race_ethnic)) +
  geom_bar(stat = "identity", color = "white") +
  # A single stacked bar wrapped around the y axis becomes a pie.
  coord_polar("y", start = 0) +
  theme(axis.text.x = element_blank()) +
  theme_void() +
  guides(fill = guide_legend(title = "Self-reported race and ethnicity")) +
  labs(title = "California respondents")

na_race_piechart

ca_race_piechart

- The majority of the respondents received graduate degrees (National: XXXX; California: XXXX)

# Pie charts of the highest degree received: all respondents vs. California
# respondents. Each slice is the row count of one degree category.
na_education <- td %>%
  count(individual_highest_degree) %>%
  ggplot(aes(x = "", y = n, fill = individual_highest_degree)) +
  geom_col(color = "white") +
  coord_polar(theta = "y", start = 0) +
  theme(axis.text.x = element_blank()) +
  theme_void() +
  guides(fill = guide_legend(title = "Highest degree received")) +
  labs(title = "National respondents")

ca_education <- td %>%
  filter(community_is_california) %>%
  count(individual_highest_degree) %>%
  ggplot(aes(x = "", y = n, fill = individual_highest_degree)) +
  geom_col(color = "white") +
  coord_polar(theta = "y", start = 0) +
  theme(axis.text.x = element_blank()) +
  theme_void() +
  guides(fill = guide_legend(title = "Highest degree received")) +
  labs(title = "California respondents")

# patchwork: place the two pies side by side.
na_education + ca_education

# Mean and SD of household size, rounded to two decimals, for all respondents
# (na_*) and for California respondents (ca_*).
# Note: each result is a one-row, one-column tibble (column `mean` or `sd`),
# not a bare number, because round() is applied to the summarise() output.
na_household_size_m <- td %>%
  summarise(mean = mean(individual_household_size, na.rm = TRUE)) %>%
  round(2)
na_household_size_sd <- td %>%
  summarise(sd = sd(individual_household_size, na.rm = TRUE)) %>%
  round(2)

ca_household_size_m <- td %>%
  filter(community_is_california) %>%
  summarise(mean = mean(individual_household_size, na.rm = TRUE)) %>%
  round(2)
ca_household_size_sd <- td %>%
  filter(community_is_california) %>%
  summarise(sd = sd(individual_household_size, na.rm = TRUE)) %>%
  round(2)
# Pie charts of the number of children younger than 17 per respondent:
# all respondents vs. California respondents.
# The count is coerced to numeric for grouping, then turned into a factor so
# ggplot gives each value its own discrete fill colour.
na_children <- td %>%
  mutate(
    individual_children_younger_than_17 =
      as.numeric(individual_children_younger_than_17)
  ) %>%
  group_by(individual_children_younger_than_17) %>%
  count() %>%
  mutate(
    individual_children_younger_than_17 =
      as.factor(individual_children_younger_than_17)
  ) %>%
  ggplot(aes(x = "", y = n, fill = individual_children_younger_than_17)) +
  geom_col(color = "white") +
  coord_polar(theta = "y", start = 0) +
  theme(axis.text.x = element_blank()) +
  theme_void() +
  guides(fill = guide_legend(title = "Number of Children Younger than 17")) +
  labs(title = "National respondents")

ca_children <- td %>%
  filter(community_is_california) %>%
  mutate(
    individual_children_younger_than_17 =
      as.numeric(individual_children_younger_than_17)
  ) %>%
  group_by(individual_children_younger_than_17) %>%
  count() %>%
  mutate(
    individual_children_younger_than_17 =
      as.factor(individual_children_younger_than_17)
  ) %>%
  ggplot(aes(x = "", y = n, fill = individual_children_younger_than_17)) +
  geom_col(color = "white") +
  coord_polar(theta = "y", start = 0) +
  theme(axis.text.x = element_blank()) +
  theme_void() +
  guides(fill = guide_legend(title = "Number of Children Younger than 17")) +
  labs(title = "California respondents")

# patchwork: place the two pies side by side.
na_children + ca_children

# Turn the 2021 household income bracket into a factor ordered from the
# highest bracket down, with "Prefer not to answer" last. Values not listed
# here become NA; the trailing NA in `levels` is dropped by factor()'s
# default `exclude = NA`, so missing answers stay NA rather than a level.
td$individual_household_income_2021 <- factor(
  td$individual_household_income_2021,
  levels = c(
    "$350,000 or more",
    "$250,000 to $349,999",
    "$150,000 to $249,999",
    "$100,000 to $149,999",
    "$75,000 to $99,999",
    "$50,000 to $74,999",
    "$35,000 to $49,999",
    "$20,000 to $34,999",
    "Less than $20,000",
    "Prefer not to answer",
    NA
  )
)

# Pie charts of the 2021 household income bracket: all respondents vs.
# California respondents. Each slice is the row count of one bracket.
na_income <- td %>%
  count(individual_household_income_2021) %>%
  ggplot(aes(x = "", y = n, fill = individual_household_income_2021)) +
  geom_col(color = "white") +
  coord_polar(theta = "y", start = 0) +
  theme(axis.text.x = element_blank()) +
  theme_void() +
  guides(fill = guide_legend(title = "Household incomes in 2021")) +
  labs(title = "National respondents")

ca_income <- td %>%
  filter(community_is_california) %>%
  count(individual_household_income_2021) %>%
  ggplot(aes(x = "", y = n, fill = individual_household_income_2021)) +
  geom_col(color = "white") +
  coord_polar(theta = "y", start = 0) +
  theme(axis.text.x = element_blank()) +
  theme_void() +
  guides(fill = guide_legend(title = "Household incomes in 2021")) +
  labs(title = "California respondents")

# patchwork: place the two pies side by side.
na_income + ca_income

# Turn the total household asset bracket into a factor ordered from the
# highest bracket down, with "Prefer not to answer" last. Values not listed
# in `levels` become NA.
#
# The "$100,000 to $249,999" bracket was previously spelled "$100,000 to
# $249,99" (missing the last digit). Depending on which spelling the raw data
# uses, that either turned the whole bracket into NA or showed a malformed
# label. The anchored sub() below normalises the truncated spelling, so the
# bracket is kept under the correct label either way.
# NOTE(review): confirm which spelling the raw survey export actually uses.
td$individual_household_asset <- factor(
  sub(
    "^\\$100,000 to \\$249,99$",
    "$100,000 to $249,999",
    as.character(td$individual_household_asset)
  ),
  levels = c(
    "$2,500,000 or more",
    "$1,000,000 to $2,499,999",
    "$500,000 to $999,999",
    "$250,000 to $499,999",
    "$100,000 to $249,999",
    "$50,000 to $99,999",
    "$25,000 to $49,999",
    "$10,000 to $24,999",
    "$0 to $9,999",
    "Less than $0",
    "Prefer not to answer"
  )
)

# Pie charts of the total household asset bracket: all respondents vs.
# California respondents. Each slice is the row count of one bracket.
na_assets <- td %>%
  count(individual_household_asset) %>%
  ggplot(aes(x = "", y = n, fill = individual_household_asset)) +
  geom_col(color = "white") +
  coord_polar(theta = "y", start = 0) +
  theme(axis.text.x = element_blank()) +
  theme_void() +
  guides(fill = guide_legend(title = "Total household asset")) +
  labs(title = "National respondents")

ca_assets <- td %>%
  filter(community_is_california) %>%
  count(individual_household_asset) %>%
  ggplot(aes(x = "", y = n, fill = individual_household_asset)) +
  geom_col(color = "white") +
  coord_polar(theta = "y", start = 0) +
  theme(axis.text.x = element_blank()) +
  theme_void() +
  guides(fill = guide_legend(title = "Total household asset")) +
  labs(title = "California respondents")

# patchwork: place the two pies side by side.
na_assets + ca_assets

Respondent statistics

Most participants did not respond to this question

# Tabulate answers to the involvement-time question; NA rows (no answer) are
# deliberately kept so the amount of non-response is visible.
td %>%
  group_by(individual_involved_time) %>%
  summarise(n = n(), .groups = "keep")
## # A tibble: 9 × 2
## # Groups:   individual_involved_time [9]
##   individual_involved_time     n
##   <chr>                    <int>
## 1 1-3                         27
## 2 13-15                        1
## 3 19-21                        1
## 4 22-24                        2
## 5 31-33                        1
## 6 4-6                          8
## 7 7-9                          2
## 8 Less than one year          12
## 9 <NA>                       630

How long have they been living in the co-ho?

# Histogram of individual_years, one panel per value of
# community_is_california. The column is read as character, so it is coerced
# to numeric first; entries that are not plain numbers become NA (with a
# coercion warning) and are dropped by the histogram.
td %>%
  mutate(individual_years = as.numeric(individual_years)) %>%
  ggplot(mapping = aes(x = individual_years)) +
  geom_histogram() +
  facet_wrap(vars(community_is_california))
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `individual_years = as.numeric(individual_years)`.
## Caused by warning:
## ! NAs introduced by coercion
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 252 rows containing non-finite outside the scale range
## (`stat_bin()`).

# Peek at the respondent-level columns (names starting with "individual").
select(td, starts_with("individual"))
## # A tibble: 684 × 15
##    individual_involved_time individual_years individual_ownership
##    <chr>                    <chr>            <chr>               
##  1 <NA>                     6                Own                 
##  2 <NA>                     16               Own                 
##  3 <NA>                     22               Own                 
##  4 <NA>                     21               Own                 
##  5 <NA>                     <NA>             <NA>                
##  6 <NA>                     <NA>             <NA>                
##  7 <NA>                     21               Own                 
##  8 <NA>                     14               Own                 
##  9 <NA>                     2                Own                 
## 10 <NA>                     23               Own                 
## # ℹ 674 more rows
## # ℹ 12 more variables: individual_household_size <dbl>, individual_age <dbl>,
## #   individual_children_younger_than_17 <dbl>, individual_highest_degree <chr>,
## #   individual_household_income_2021 <fct>, individual_household_asset <fct>,
## #   individual_relationship_status <chr>, individual_race_ethnic <chr>,
## #   individual_gender <chr>, individual_religious <chr>,
## #   individual_political_party <chr>, individual_political_orientation <chr>

The data are messy and need further cleaning, but here is relationship status:

# Peek at the respondent-level columns (names starting with "individual").
select(td, starts_with("individual"))
## # A tibble: 684 × 15
##    individual_involved_time individual_years individual_ownership
##    <chr>                    <chr>            <chr>               
##  1 <NA>                     6                Own                 
##  2 <NA>                     16               Own                 
##  3 <NA>                     22               Own                 
##  4 <NA>                     21               Own                 
##  5 <NA>                     <NA>             <NA>                
##  6 <NA>                     <NA>             <NA>                
##  7 <NA>                     21               Own                 
##  8 <NA>                     14               Own                 
##  9 <NA>                     2                Own                 
## 10 <NA>                     23               Own                 
## # ℹ 674 more rows
## # ℹ 12 more variables: individual_household_size <dbl>, individual_age <dbl>,
## #   individual_children_younger_than_17 <dbl>, individual_highest_degree <chr>,
## #   individual_household_income_2021 <fct>, individual_household_asset <fct>,
## #   individual_relationship_status <chr>, individual_race_ethnic <chr>,
## #   individual_gender <chr>, individual_religious <chr>,
## #   individual_political_party <chr>, individual_political_orientation <chr>
# Dot plot of respondent counts per relationship status, one panel per value
# of community_is_california. The x labels are rotated so the long status
# strings stay readable.
td %>%
  count(individual_relationship_status, community_is_california) %>%
  ggplot(mapping = aes(x = individual_relationship_status, y = n)) +
  geom_point() +
  facet_wrap(vars(community_is_california)) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))

Many women?

# Peek at the respondent-level columns (names starting with "individual").
select(td, starts_with("individual"))
## # A tibble: 684 × 15
##    individual_involved_time individual_years individual_ownership
##    <chr>                    <chr>            <chr>               
##  1 <NA>                     6                Own                 
##  2 <NA>                     16               Own                 
##  3 <NA>                     22               Own                 
##  4 <NA>                     21               Own                 
##  5 <NA>                     <NA>             <NA>                
##  6 <NA>                     <NA>             <NA>                
##  7 <NA>                     21               Own                 
##  8 <NA>                     14               Own                 
##  9 <NA>                     2                Own                 
## 10 <NA>                     23               Own                 
## # ℹ 674 more rows
## # ℹ 12 more variables: individual_household_size <dbl>, individual_age <dbl>,
## #   individual_children_younger_than_17 <dbl>, individual_highest_degree <chr>,
## #   individual_household_income_2021 <fct>, individual_household_asset <fct>,
## #   individual_relationship_status <chr>, individual_race_ethnic <chr>,
## #   individual_gender <chr>, individual_religious <chr>,
## #   individual_political_party <chr>, individual_political_orientation <chr>
# Respondent counts for each gender answer x California flag combination.
# The result stays grouped by both columns.
td %>%
  group_by(individual_gender, community_is_california) %>%
  summarise(n = n(), .groups = "keep")
## # A tibble: 21 × 3
## # Groups:   individual_gender, community_is_california [21]
##    individual_gender                                community_is_califor…¹     n
##    <chr>                                            <lgl>                  <int>
##  1 Cisgender (gender identity corresponds with bir… FALSE                     25
##  2 Cisgender (gender identity corresponds with bir… TRUE                       1
##  3 Cisgender (gender identity corresponds with bir… NA                         1
##  4 Genderfluid                                      FALSE                      1
##  5 I want to name my own custom gender Female       FALSE                      1
##  6 I want to name my own custom gender Queer        FALSE                      1
##  7 I want to name my own custom gender Trans and c… FALSE                      1
##  8 I want to name my own custom gender androgynous  FALSE                      1
##  9 I want to name my own custom gender femail       FALSE                      1
## 10 I want to name my own custom gender irrelevant   FALSE                      1
## # ℹ 11 more rows
## # ℹ abbreviated name: ¹​community_is_california

A huge variety of religions?

# Respondent counts for each religion answer x California flag combination.
# The result stays grouped by both columns.
td %>%
  group_by(individual_religious, community_is_california) %>%
  summarise(n = n(), .groups = "keep")
## # A tibble: 122 × 3
## # Groups:   individual_religious, community_is_california [122]
##    individual_religious community_is_california     n
##    <chr>                <lgl>                   <int>
##  1 Agnosticism NA       FALSE                      55
##  2 Agnosticism NA       TRUE                       15
##  3 Atheism NA           FALSE                      55
##  4 Atheism NA           TRUE                        7
##  5 Atheism NA           NA                          2
##  6 Buddhism NA          FALSE                      31
##  7 Buddhism NA          TRUE                        5
##  8 Buddhism NA          NA                          3
##  9 Catholicism NA       FALSE                      12
## 10 Catholicism NA       TRUE                        1
## # ℹ 112 more rows

Very left-leaning?

# Peek at the respondent-level columns (names starting with "individual").
select(td, starts_with("individual"))
## # A tibble: 684 × 15
##    individual_involved_time individual_years individual_ownership
##    <chr>                    <chr>            <chr>               
##  1 <NA>                     6                Own                 
##  2 <NA>                     16               Own                 
##  3 <NA>                     22               Own                 
##  4 <NA>                     21               Own                 
##  5 <NA>                     <NA>             <NA>                
##  6 <NA>                     <NA>             <NA>                
##  7 <NA>                     21               Own                 
##  8 <NA>                     14               Own                 
##  9 <NA>                     2                Own                 
## 10 <NA>                     23               Own                 
## # ℹ 674 more rows
## # ℹ 12 more variables: individual_household_size <dbl>, individual_age <dbl>,
## #   individual_children_younger_than_17 <dbl>, individual_highest_degree <chr>,
## #   individual_household_income_2021 <fct>, individual_household_asset <fct>,
## #   individual_relationship_status <chr>, individual_race_ethnic <chr>,
## #   individual_gender <chr>, individual_religious <chr>,
## #   individual_political_party <chr>, individual_political_orientation <chr>
# Respondent counts for each political party x California flag combination.
# The result stays grouped by both columns.
td %>%
  group_by(individual_political_party, community_is_california) %>%
  summarise(n = n(), .groups = "keep")
## # A tibble: 13 × 3
## # Groups:   individual_political_party, community_is_california [13]
##    individual_political_party community_is_california     n
##    <chr>                      <lgl>                   <int>
##  1 Democrat                   FALSE                     319
##  2 Democrat                   TRUE                       49
##  3 Democrat                   NA                          6
##  4 Independent                FALSE                      57
##  5 Independent                TRUE                        5
##  6 Independent                NA                          4
##  7 Other (please specify)     FALSE                      33
##  8 Other (please specify)     TRUE                        6
##  9 Other (please specify)     NA                          1
## 10 Republican                 FALSE                       2
## 11 <NA>                       FALSE                     170
## 12 <NA>                       TRUE                       19
## 13 <NA>                       NA                         13

very liberal

# Respondent counts for each political orientation x California flag
# combination. The result stays grouped by both columns.
td %>%
  group_by(individual_political_orientation, community_is_california) %>%
  summarise(n = n(), .groups = "keep")
## # A tibble: 15 × 3
## # Groups:   individual_political_orientation, community_is_california [15]
##    individual_political_orientation                 community_is_califor…¹     n
##    <chr>                                            <lgl>                  <int>
##  1 Conservative on all or most issues               FALSE                      3
##  2 Conservative on social issues, liberal on econo… FALSE                      2
##  3 Conservative on social issues, liberal on econo… TRUE                       1
##  4 Liberal on all or most issues                    FALSE                    320
##  5 Liberal on all or most issues                    TRUE                      51
##  6 Liberal on all or most issues                    NA                         6
##  7 Liberal on social issues, conservative on econo… FALSE                     57
##  8 Liberal on social issues, conservative on econo… TRUE                       6
##  9 Liberal on social issues, conservative on econo… NA                         2
## 10 Other (please specify)                           FALSE                     26
## 11 Other (please specify)                           TRUE                       2
## 12 Other (please specify)                           NA                         2
## 13 <NA>                                             FALSE                    173
## 14 <NA>                                             TRUE                      19
## 15 <NA>                                             NA                        14
## # ℹ abbreviated name: ¹​community_is_california