group_by&summarise

## Loading required package: pacman

# Read the survey data (CSV served from a docs.google.com download link)
# and have the importer return it as a tibble.
# NOTE(review): import() is presumably rio::import -- the package-loading
# chunk is not visible here; confirm.
covid_well_being <- import(
  "https://docs.google.com/uc?id=1QIzVc9xOwKbPJVZKZwNzWAAUe6MmOfBu&export=download",
  setclass = "tibble",
  format = "csv"
)

# Preview the identifier, survey timing and sleep timing columns for the
# first rows of the data (head() defaults to six rows).
covid_well_being %>%
  select(
    respondent_id, survey_number, survey_time,
    fell_asleep_time, woke_up_time
  ) %>%
  head()

## # A tibble: 6 x 5
##   respondent_id survey_number survey_time         fell_asleep_time woke_up_time
##           <int>         <int> <dttm>                         <int>        <int>
## 1             1             1 2020-03-23 16:11:36            82800        16200
## 2             1             2 2020-03-24 10:38:09            83700        25200
## 3             1             3 2020-03-25 17:33:05            81000        16200
## 4             1             4 2020-03-26 20:18:40            82800        24300
## 5             1             5 2020-03-27 09:16:29            84600        20700
## 6             1             6 2020-03-28 17:54:44            80100        26100

# Build the demographics table: keep only rows from survey number 1, then
# keep the contiguous run of columns respondent_id through household_income.
# The filter comes first because survey_number may not be inside that range.
covid_demog <- covid_well_being %>%
  filter(survey_number == 1) %>%
  select(respondent_id:household_income)

# For each gender: how many rows of covid_demog fall in the group, and the
# median of age1 with missing ages ignored.
covid_demog %>%
  group_by(gender) %>%
  summarise(
    num_of_people = n(),
    median_age = median(age1, na.rm = TRUE)
  )

## # A tibble: 3 x 3
##   gender     num_of_people median_age
##   <chr>              <int>      <dbl>
## 1 Female               193         31
## 2 Male                  56         33
## 3 Non-binary             1         21

# The three states with the most rows in covid_demog, together with the
# median of age1 in each (missing ages ignored).
covid_demog %>%
  group_by(state) %>%
  summarise(
    num_of_respondents = n(),
    median_age = median(age1, na.rm = TRUE)
  ) %>%
  # Largest groups first, then keep only the top three rows.
  arrange(desc(num_of_respondents)) %>%
  head(n = 3)

## # A tibble: 3 x 3
##   state num_of_respondents median_age
##   <chr>              <int>      <dbl>
## 1 MA                   104         33
## 2 NY                    19         28
## 3 IL                    13         30

# One row per respondent: the average of num_people_contacted and of
# stress_1_low_7_high over all of that respondent's rows. Missing answers
# are dropped before averaging.
social_contact_and_stress <- covid_well_being %>%
  group_by(respondent_id) %>%
  summarise(
    mean_daily_contacts = mean(num_people_contacted, na.rm = TRUE),
    mean_stress_levels = mean(stress_1_low_7_high, na.rm = TRUE)
  )

# Show the first two respondents with their averages rounded to two
# significant digits.
#
# Only the two mean columns are rounded. Applying signif() to the whole
# tibble would also round respondent_id (e.g. 123 -> 120) and coerce it
# from integer to double, corrupting the identifier. Taking head() first
# also avoids rounding rows that are never displayed.
social_contact_and_stress %>%
  head(2) %>%
  mutate(
    across(
      c(mean_daily_contacts, mean_stress_levels),
      ~ signif(.x, digits = 2)
    )
  )

## # A tibble: 2 x 3
##   respondent_id mean_daily_contacts mean_stress_levels
##           <dbl>               <dbl>              <dbl>
## 1             1                 1.8                3.9
## 2             4                 1.4                4.2

# Scatter plot with one point per row of social_contact_and_stress:
# mean daily contacts on the x axis, mean stress level on the y axis.
ggplot(
  social_contact_and_stress,
  aes(x = mean_daily_contacts, y = mean_stress_levels)
) +
  geom_point()

# Correlation between mean daily contacts and mean stress level, reported
# to two significant digits.
#
# The per-respondent means are computed with na.rm = TRUE, so a respondent
# with no valid answers for a variable gets NaN. By default a single
# NaN/NA makes cor() return NA for the whole result, so only pairs where
# both values are present are used. With complete data this gives the same
# value as the default.
cor(
  x = social_contact_and_stress$mean_daily_contacts,
  y = social_contact_and_stress$mean_stress_levels,
  use = "pairwise.complete.obs"
) %>%
  signif(digits = 2)

## [1] -0.028

# One row per respondent: average minutes actually asleep and average
# stress level, each rounded to three significant digits.
result_table <- covid_well_being %>%
  # Actual sleep = time between falling asleep and waking, minus the
  # minutes spent awake during the night.
  mutate(
    actual_minutes_asleep =
      minutes_from_sleep_to_wake - minutes_awake_at_night
  ) %>%
  group_by(respondent_id) %>%
  summarise(
    # Missing values are dropped before averaging; rounding is applied to
    # the finished mean.
    mean_daily_minutes_asleep = signif(
      mean(actual_minutes_asleep, na.rm = TRUE),
      digits = 3
    ),
    mean_stress_levels = signif(
      mean(stress_1_low_7_high, na.rm = TRUE),
      digits = 3
    )
  )

# Print the table.
result_table

## # A tibble: 250 x 3
##    respondent_id mean_daily_minutes_asleep mean_stress_levels
##            <int>                     <dbl>              <dbl>
##  1             1                       407               3.89
##  2             4                       496               4.18
##  3            11                       688               4.8 
##  4            14                       503               6.33
##  5            15                       397               4.5 
##  6            16                       529               3   
##  7            19                       364               6.12
##  8            20                       508               3.29
##  9            22                       534               4.88
## 10            23                       432               5.11
## # i 240 more rows

# Scatter plot with one point per row of result_table: mean daily minutes
# asleep on the x axis, mean stress level on the y axis, with readable
# axis titles.
ggplot(
  result_table,
  aes(x = mean_daily_minutes_asleep, y = mean_stress_levels)
) +
  geom_point() +
  labs(
    x = "Mean Daily Minutes Asleep",
    y = "Mean Stress Levels"
  )

# Correlation between mean daily minutes asleep and mean stress level,
# reported to two significant digits.
#
# The per-respondent means are computed with na.rm = TRUE, so a respondent
# with no valid sleep or stress answers gets NaN. By default a single
# NaN/NA makes cor() return NA for the whole result, so only pairs where
# both values are present are used. With complete data this gives the same
# value as the default.
# NOTE(review): result_table stores values already rounded to three
# significant digits, so this correlation is computed on rounded means.
cor(
  x = result_table$mean_daily_minutes_asleep,
  y = result_table$mean_stress_levels,
  use = "pairwise.complete.obs"
) %>%
  signif(digits = 2)

## [1] 0.039

group_by&summarise

John Oyan Naivest

2023-08-16