Workbook

Author

Weronika Staniak

Published

October 10, 2024

Week 3 Session:

Problem A:

Summarising Population Data

# Load the tidyverse package 
library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Make the midwest data available in the workspace
data("midwest")

# Build one summary row per state from the county-level total populations.
# `.groups = "drop"` hands back an ungrouped tibble, so no separate
# ungroup() step is needed afterwards.
population_summary <- midwest %>%
  group_by(state) %>%
  summarise(
    # Typical county size: mean and median
    poptotalmean = mean(poptotal),
    poptotalmed = median(poptotal),
    # Largest and smallest county
    popmax = max(poptotal),
    popmin = min(poptotal),
    # Number of different population values that occur in the state
    popdistinct = n_distinct(poptotal),
    # Population of the first row listed for the state
    popfirst = first(poptotal),
    # TRUE when at least one county is under 5,000 / over 2,000,000
    popany = any(poptotal < 5000),
    popany2 = any(poptotal > 2000000),
    .groups = "drop"
  )

# Show the per-state summary
print(population_summary)
# A tibble: 5 × 9
  state poptotalmean poptotalmed  popmax popmin popdistinct popfirst popany
  <chr>        <dbl>       <dbl>   <int>  <int>       <int>    <int> <lgl> 
1 IL         112065.      24486. 5105067   4373         101    66090 TRUE  
2 IN          60263.      30362.  797159   5315          92    31095 FALSE 
3 MI         111992.      37308  2111687   1701          83    10145 TRUE  
4 OH         123263.      54930. 1412140  11098          88    25371 FALSE 
5 WI          67941.      33528   959275   3890          72    15682 TRUE  
# ℹ 1 more variable: popany2 <lgl>

Problem B

Counting Population Categories

# Attach the tidyverse and make the midwest data available
library(tidyverse)
data("midwest")

# Per state: how many counties fall below / above the population thresholds.
# Summing a logical vector counts its TRUE values.
population_count_summary <- midwest %>%
  group_by(state) %>%
  summarise(
    # Counties with fewer than 5,000 people
    num5k = sum(poptotal < 5000),
    # Counties with more than 2,000,000 people
    num2mil = sum(poptotal > 2000000),
    # Total number of rows (counties) in the state
    numrows = n(),
    # Return an ungrouped tibble
    .groups = "drop"
  )

# Show the per-state counts
print(population_count_summary)
# A tibble: 5 × 4
  state num5k num2mil numrows
  <chr> <int>   <int>   <int>
1 IL        1       1     102
2 IN        0       0      92
3 MI        1       1      83
4 OH        0       0      88
5 WI        2       0      72

Problem C

# Part I: for every county name, in how many different states does it occur?
distinct_states_count <- midwest %>%
  group_by(county) %>%
  # n_distinct() counts the unique state values within each county name;
  # the grouping is dropped as part of the summary
  summarise(x = n_distinct(state), .groups = "drop") %>%
  # Most widely shared county names first
  arrange(desc(x))

# Show the Part I result
print(distinct_states_count)
# A tibble: 320 × 2
   county         x
   <chr>      <int>
 1 CRAWFORD       5
 2 JACKSON        5
 3 MONROE         5
 4 ADAMS          4
 5 BROWN          4
 6 CLARK          4
 7 CLINTON        4
 8 JEFFERSON      4
 9 LAKE           4
10 WASHINGTON     4
# ℹ 310 more rows
# Part II: number of rows per county name.
# count() is shorthand for group_by() + summarise(n()) + ungroup(); `name`
# sets the name of the resulting count column.
total_count_per_county <- midwest %>%
  count(county, name = "x")

# Show the Part II result
print(total_count_per_county)
# A tibble: 320 × 2
   county        x
   <chr>     <int>
 1 ADAMS         4
 2 ALCONA        1
 3 ALEXANDER     1
 4 ALGER         1
 5 ALLEGAN       1
 6 ALLEN         2
 7 ALPENA        1
 8 ANTRIM        1
 9 ARENAC        1
10 ASHLAND       2
# ℹ 310 more rows
# Part III: distinct county names within each county group.
# Each group holds a single county name by construction, so x is always 1.
distinct_counties_count <- midwest %>%
  group_by(county) %>%
  summarise(x = n_distinct(county), .groups = "drop")

# Show the Part III result
print(distinct_counties_count)
# A tibble: 320 × 2
   county        x
   <chr>     <int>
 1 ADAMS         1
 2 ALCONA        1
 3 ALEXANDER     1
 4 ALGER         1
 5 ALLEGAN       1
 6 ALLEN         1
 7 ALPENA        1
 8 ANTRIM        1
 9 ARENAC        1
10 ASHLAND       1
# ℹ 310 more rows

Notes: I am doing this, but I still don’t really understand it. Am I doing this right? Am I missing something? I don’t know, so help would be welcome! I am going to finish these questions here: I know what I’m doing in terms of writing the code, but I don’t understand what the results mean, so I will go back, do some reading, and try to understand it. Feedback would be appreciated!

Good and Bad Questions About the Diamonds Dataset

In this section, I will explore the principles of formulating effective questions by generating one good and one bad question about the diamonds dataset.

Good Question

Question: What is the average price of diamonds for each cut, and how does this vary by clarity?

Why This is a Good Question:

Specific and Focused: It clearly defines the variables of interest (price, cut, and clarity).

Quantitative Analysis: It invites a quantitative analysis that can be explored using summary statistics, making it actionable.

Comparative Aspect: It allows for comparisons between different cuts and clarities, leading to more insightful conclusions.

# Example code to answer the good question:
# mean price for every combination of cut and clarity.
diamonds_summary <- diamonds %>%
  group_by(cut, clarity) %>%
  # `.groups = "drop"` returns an ungrouped tibble. Without it, summarise()
  # leaves the result grouped by `cut` and prints a message saying so, and
  # any later step would silently run per cut.
  summarise(average_price = mean(price, na.rm = TRUE), .groups = "drop") %>%
  # Order rows by cut, then by clarity within each cut
  arrange(cut, clarity)
print(diamonds_summary)
# A tibble: 40 × 3
   cut   clarity average_price
   <ord> <ord>           <dbl>
 1 Fair  I1              3704.
 2 Fair  SI2             5174.
 3 Fair  SI1             4208.
 4 Fair  VS2             4175.
 5 Fair  VS1             4165.
 6 Fair  VVS2            3350.
 7 Fair  VVS1            3871.
 8 Fair  IF              1912.
 9 Good  I1              3597.
10 Good  SI2             4580.
# ℹ 30 more rows

Bad Question

Question: Why are diamonds expensive?

Why This is a Bad Question:

  • Vague and Subjective: The question is too broad and lacks specificity about which factors influence price.

  • Not Quantifiable: Does not provide a clear path for analysis

  • Lacks Context: Without the scope (size, cut, colour), it can lead to confusion.

Instead of asking why diamonds are expensive, a more effective question might be:

What factors are significantly associated with the price of diamonds? - This question directs the analysis towards specific variables and allows for a more focused investigation.

Week 4 Session: GGPLOT

# Packages for this session
library(tidyverse)
library(modeldata)

# Open the help pages for the plotting function and for the crickets data
help("ggplot")
help("crickets")

# Browse the crickets data
view(crickets)

Basics

# Basic scatter plot: chirp rate against temperature, with axis labels,
# a title and a source caption
ggplot(crickets, aes(x = temp, y = rate)) +
  geom_point() +
  labs(
    x = "Temperature",
    y = "Chirp rate",
    title = "Cricket chirps",
    # The closing parenthesis was missing from this caption
    caption = "Source: McDonald (2009)"
  )

# Scatter plot with the points coloured by species
ggplot(
  data = crickets,
  mapping = aes(x = temp, y = rate, colour = species)
) +
  geom_point() +
  # `colour` sets the legend title for the colour aesthetic
  labs(
    x = "Temperature",
    y = "Chirp rate",
    colour = "Species",
    title = "Cricket chirps",
    caption = "Source: McDonald (2009)"
  ) +
  # Use the ColorBrewer "Dark2" palette for the species colours
  scale_colour_brewer(palette = "Dark2")

Modifying the basic properties of the plot

# Scatter plot with fixed point properties. These are set outside aes(), so
# they apply to every point rather than being mapped from a column.
ggplot(data = crickets, mapping = aes(x = temp, y = rate)) +
  geom_point(
    colour = "blue",
    size = 2,
    alpha = 0.3,
    shape = "square"
  ) +
  labs(
    x = "Temperature",
    y = "Chirp rate",
    title = "Cricket chirps",
    caption = "Source: McDonald(2009)"
  )

Adding more layers: geom_smooth() on top of geom_point() (see also ?geom_abline and ?geom_point)

# Two layers: the raw points, then a fitted straight line on top
ggplot(data = crickets, mapping = aes(x = temp, y = rate)) +
  geom_point() +
  # method = "lm" fits a linear model; se = FALSE hides the confidence band
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    x = "Temperature",
    y = "Chirp rate",
    title = "Cricket chirps",
    caption = "Source: McDonald (2009)"
  )
`geom_smooth()` using formula = 'y ~ x'

# Mapping colour to species in ggplot() applies it to both layers, so each
# species gets its own points and its own fitted line
ggplot(
  data = crickets,
  mapping = aes(x = temp, y = rate, colour = species)
) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    x = "Temperature",
    y = "Chirp rate",
    colour = "Species",
    title = "Cricket chirps",
    caption = "Source: McDonald (2009)"
  ) +
  scale_colour_brewer(palette = "Dark2")
`geom_smooth()` using formula = 'y ~ x'

Other Plots :)

# One quantitative variable: histogram of chirp rate in 15 bins
ggplot(data = crickets, mapping = aes(x = rate)) +
  geom_histogram(bins = 15)

# The same binned counts drawn as a frequency polygon (a line, not bars)
ggplot(data = crickets, mapping = aes(x = rate)) +
  geom_freqpoly(bins = 15)

# One categorical variable: bar chart of row counts per species,
# with a fixed outline and fill colour
ggplot(data = crickets, mapping = aes(x = species)) +
  geom_bar(colour = "black", fill = "lightblue")

# Bar chart filled by species; the legend is hidden because the x axis
# already names each species
ggplot(data = crickets, mapping = aes(x = species, fill = species)) +
  geom_bar(show.legend = FALSE) +
  scale_fill_brewer(palette = "Dark2")

# One categorical and one quantitative variable: box plot of chirp rate
# per species, outlined in the species colour
ggplot(
  data = crickets,
  mapping = aes(x = species, y = rate, colour = species)
) +
  geom_boxplot(show.legend = FALSE) +
  scale_colour_brewer(palette = "Dark2") +
  # Switch to the minimal built-in theme
  theme_minimal()

# Help page for the theme used in the box plot
?theme_minimal()

# Histogram of chirp rate with the bars filled by species
ggplot(data = crickets, mapping = aes(x = rate, fill = species)) +
  geom_histogram(bins = 15) +
  scale_fill_brewer(palette = "Dark2")

# One histogram panel per species; the legend is hidden because the panel
# titles already identify the species
ggplot(data = crickets, mapping = aes(x = rate, fill = species)) +
  geom_histogram(bins = 15, show.legend = FALSE) +
  # vars(species) is the equivalent of the formula form ~species
  facet_wrap(vars(species)) +
  scale_fill_brewer(palette = "Dark2")

# Help page for faceting
help("facet_wrap")

# Same faceted histogram, with the panels stacked in a single column so the
# species share one x axis and are easy to compare
ggplot(data = crickets, mapping = aes(x = rate, fill = species)) +
  geom_histogram(bins = 15, show.legend = FALSE) +
  facet_wrap(vars(species), ncol = 1) +
  scale_fill_brewer(palette = "Dark2") +
  theme_minimal()