Workbook

Author

Weronika Staniak

Published

October 10, 2024

Week 3 Session:

Problem A:

Summarising Population Data

# Load the tidyverse package 
library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Load the midwest dataset
data("midwest")
# Summarizing population statistics by state
population_summary <- midwest %>%
  group_by(state) %>%  # Group the data by state
  summarise(
    poptotalmean = mean(poptotal),        # Calculate the average total population for each state
    poptotalmed = median(poptotal),       # Calculate the median total population for each state
    popmax = max(poptotal),               # Find the maximum total population for each state
    popmin = min(poptotal),               # Find the minimum total population for each state
    popdistinct = n_distinct(poptotal),   # Count the number of distinct total population values
    popfirst = first(poptotal),           # Get the first total population value for each state
    popany = any(poptotal < 5000),        # Check if any total population values are less than 5000
    popany2 = any(poptotal > 2000000)      # Check if any total population values are greater than 2,000,000
  ) %>%
  ungroup()  # Remove grouping structure

# Display the summarized population data
print(population_summary)
# A tibble: 5 × 9
  state poptotalmean poptotalmed  popmax popmin popdistinct popfirst popany
  <chr>        <dbl>       <dbl>   <int>  <int>       <int>    <int> <lgl> 
1 IL         112065.      24486. 5105067   4373         101    66090 TRUE  
2 IN          60263.      30362.  797159   5315          92    31095 FALSE 
3 MI         111992.      37308  2111687   1701          83    10145 TRUE  
4 OH         123263.      54930. 1412140  11098          88    25371 FALSE 
5 WI          67941.      33528   959275   3890          72    15682 TRUE  
# ℹ 1 more variable: popany2 <lgl>

Problem B

Counting Population Categories

# Load the tidyverse package
library(tidyverse)

# Load the midwest dataset
data("midwest")

# Counting counties based on population thresholds
population_count_summary <- midwest %>%
  group_by(state) %>%  # Group the data by state
  summarise(
    num5k = sum(poptotal < 5000),      # Count counties with a total population less than 5000
    num2mil = sum(poptotal > 2000000),  # Count counties with a total population greater than 2,000,000
    numrows = n()                       # Count the total number of counties in each state
  ) %>%
  ungroup()                             # Remove grouping structure

# Display the summarized population counts
print(population_count_summary)
# A tibble: 5 × 4
  state num5k num2mil numrows
  <chr> <int>   <int>   <int>
1 IL        1       1     102
2 IN        0       0      92
3 MI        1       1      83
4 OH        0       0      88
5 WI        2       0      72

Problem C

# Counting distinct states per county
distinct_states_count <- midwest %>%
  group_by(county) %>%                # Group by county
  summarize(x = n_distinct(state)) %>% # Count distinct states in each county
  arrange(desc(x)) %>%                # Arrange by count in descending order
  ungroup()                           # Remove grouping

# Display the results for Part I
print(distinct_states_count)
# A tibble: 320 × 2
   county         x
   <chr>      <int>
 1 CRAWFORD       5
 2 JACKSON        5
 3 MONROE         5
 4 ADAMS          4
 5 BROWN          4
 6 CLARK          4
 7 CLINTON        4
 8 JEFFERSON      4
 9 LAKE           4
10 WASHINGTON     4
# ℹ 310 more rows
# Counting total rows per county
total_count_per_county <- midwest %>%
  group_by(county) %>%                # Group by county
  summarize(x = n()) %>%              # Count total rows in each county
  ungroup()                           # Remove grouping

# Display the results for Part II
print(total_count_per_county)
# A tibble: 320 × 2
   county        x
   <chr>     <int>
 1 ADAMS         4
 2 ALCONA        1
 3 ALEXANDER     1
 4 ALGER         1
 5 ALLEGAN       1
 6 ALLEN         2
 7 ALPENA        1
 8 ANTRIM        1
 9 ARENAC        1
10 ASHLAND       2
# ℹ 310 more rows
# Counting distinct counties in each county (should always be 1)
distinct_counties_count <- midwest %>%
  group_by(county) %>%                # Group by county
  summarize(x = n_distinct(county)) %>% # Count distinct counties in each county
  ungroup()                           # Remove grouping

# Display the results for Part III
print(distinct_counties_count)
# A tibble: 320 × 2
   county        x
   <chr>     <int>
 1 ADAMS         1
 2 ALCONA        1
 3 ALEXANDER     1
 4 ALGER         1
 5 ALLEGAN       1
 6 ALLEN         1
 7 ALPENA        1
 8 ANTRIM        1
 9 ARENAC        1
10 ASHLAND       1
# ℹ 310 more rows

Notes: I am doing this but I still don’t really understand it. Am I doing this right? Am I missing something? I am going to finish these questions here, as I know what I am doing in terms of the code, but I don’t yet understand what the results mean, so I will go back, do some reading, and try to work out what they tell us. Feedback would be appreciated!

Good and Bad Questions About the Diamonds Dataset

In this section, I will explore the principles of formulating effective questions by generating one good and one bad question about the diamonds dataset.

Good Question

Question: What is the average price of diamonds for each cut, and how does this vary by clarity?

Why This is a Good Question:

Specific and Focused: It clearly defines the variables of interest (price, cut, and clarity).

Quantitative Analysis: It invites a quantitative analysis that can be explored using summary statistics, making it actionable.

Comparative Aspect: It allows for comparisons between different cuts and clarities, leading to more insightful conclusions.

# Example code to answer the good question
diamonds_summary <- diamonds %>%
  group_by(cut, clarity) %>%
  summarize(average_price = mean(price, na.rm = TRUE)) %>%
  arrange(cut, clarity)
`summarise()` has grouped output by 'cut'. You can override using the `.groups`
argument.
print(diamonds_summary)
# A tibble: 40 × 3
# Groups:   cut [5]
   cut   clarity average_price
   <ord> <ord>           <dbl>
 1 Fair  I1              3704.
 2 Fair  SI2             5174.
 3 Fair  SI1             4208.
 4 Fair  VS2             4175.
 5 Fair  VS1             4165.
 6 Fair  VVS2            3350.
 7 Fair  VVS1            3871.
 8 Fair  IF              1912.
 9 Good  I1              3597.
10 Good  SI2             4580.
# ℹ 30 more rows

Bad Question

Question: Why are diamonds expensive?

Why This is a Bad Question:

  • Vague and Subjective: The question is too broad and lacks specificity about which factors influence price.

  • Not Quantifiable: It does not provide a clear path for analysis.

  • Lacks Context: Without a defined scope (size, cut, colour), it can lead to confusion.

Instead of asking why diamonds are expensive, a more effective question might be:

What factors are significantly associated with the price of diamonds? This question directs the analysis towards specific variables and allows for a more focused investigation.
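Because this reframed question names specific variables, it can be explored directly in R. Below is a minimal sketch, assuming a simple linear model with carat, cut, colour, and clarity as predictors; the choice of predictors is my own illustration, not part of the exercise.

# A rough sketch (not part of the original exercise): explore which factors
# are associated with price using a simple linear model on the diamonds data
library(tidyverse)

price_model <- lm(price ~ carat + cut + color + clarity, data = diamonds)

# Coefficients show how price is associated with each factor,
# holding the other factors fixed
summary(price_model)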

Week 4 Session: GGPLOT

library(tidyverse)
library(modeldata)
?ggplot

?crickets
view(crickets)

Basics

ggplot(crickets, aes(x=temp,
                     y=rate)) +
  geom_point() +
  labs(x= "Temperature", 
       y= "Chirp rate",
       title = "Cricket chirps",
       caption = "Source: McDonald (2009")

ggplot(crickets, aes(x=temp,
                     y=rate,
                     colour= species)) +
  geom_point() +
  labs(x= "Temperature",
       y= "Chirp rate",
       colour= "Species",
       title="Cricket chirps",
       caption = "Source: McDonald (2009)") +
  scale_color_brewer(palette = "Dark2")

Modifying the basic properties of the plot

ggplot(crickets, aes(x=temp,
                     y=rate)) +
  geom_point(colour= "blue",
             size=2,
             alpha=.3,
             shape="square") +
  labs(x="Temperature",
       y="Chirp rate",
       title = "Cricket chirps",
       caption = "Source: McDonald(2009)")

Adding more layers: geom_smooth() on top of geom_point() (see also ?geom_abline)

ggplot(crickets, aes(x = temp, 
                     y = rate)) + 
  geom_point() +
  geom_smooth(method = "lm",
              se = FALSE) +
  labs(x = "Temperature",
       y = "Chirp rate",
       title = "Cricket chirps",
       caption = "Source: McDonald (2009)")
`geom_smooth()` using formula = 'y ~ x'

ggplot(crickets, aes(x = temp, 
                     y = rate,
                     color = species)) + 
  geom_point() +
  geom_smooth(method = "lm",
              se = FALSE) +
  labs(x = "Temperature",
       y = "Chirp rate",
       color = "Species",
       title = "Cricket chirps",
       caption = "Source: McDonald (2009)") +
  scale_color_brewer(palette = "Dark2")
`geom_smooth()` using formula = 'y ~ x'

Other Plots :)

ggplot(crickets, aes(x = rate)) + 
  geom_histogram(bins = 15) # one quantitative variable

ggplot(crickets, aes(x = rate)) + 
  geom_freqpoly(bins = 15)

ggplot(crickets, aes(x = species)) + 
  geom_bar(color = "black",
           fill = "lightblue")

ggplot(crickets, aes(x = species, 
                     fill = species)) + 
  geom_bar(show.legend = FALSE) +
  scale_fill_brewer(palette = "Dark2")

ggplot(crickets, aes(x = species, 
                     y = rate,
                     color = species)) + 
  geom_boxplot(show.legend = FALSE) +
  scale_color_brewer(palette = "Dark2") +
  theme_minimal()

?theme_minimal()
ggplot(crickets, aes(x = rate, 
                     fill = species)) + 
  geom_histogram(bins = 15) +
  scale_fill_brewer(palette = "Dark2")

ggplot(crickets, aes(x = rate,
                     fill = species)) + 
  geom_histogram(bins = 15,
                 show.legend = FALSE) + 
  facet_wrap(~species) +
  scale_fill_brewer(palette = "Dark2")

?facet_wrap

ggplot(crickets, aes(x = rate,
                     fill = species)) + 
  geom_histogram(bins = 15,
                 show.legend = FALSE) + 
  facet_wrap(~species,
             ncol = 1) +
  scale_fill_brewer(palette = "Dark2") + 
  theme_minimal()

Week 5: How to Choose the Correct Analysis and Hypothesis

Formative 5

Graphs:

Bar Chart: Bar charts typically represent categorical data, so the test that can be used depends on whether you are comparing proportions, frequencies, or means across categories; a short example is sketched after the tests below.

Chi-Square Test

  • evaluates whether there is a significant association between two categorical variables. It checks whether the distribution of one variable differs across the levels of another variable.

T-test:

  • compares the means between two independent groups to determine if the observed differences are statistically significant.

ANOVA:

  • tests whether the means of a continuous variable differ significantly across multiple categories.

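A minimal sketch of how these tests could be run in R, using the diamonds dataset from earlier in the workbook purely as an illustration (the variables chosen are my own assumptions, not taken from the original figure):

# Minimal sketch: tests for categorical / grouped data, illustrated
# with the diamonds dataset used earlier in this workbook
library(tidyverse)

# Chi-square test: is there an association between cut and colour?
chisq.test(table(diamonds$cut, diamonds$color))

# t-test: compare mean price between two cut categories only
two_cuts <- diamonds %>%
  filter(cut %in% c("Fair", "Ideal")) %>%
  droplevels()
t.test(price ~ cut, data = two_cuts)

# ANOVA: do mean prices differ across all five cut categories?
summary(aov(price ~ cut, data = diamonds))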

Box-plot: The graph appears to compare the distribution of sepal length across different species. To test whether any differences are significant, the following statistical tests can be used (a short example follows this list):

ANOVA:

  • can be used when comparing the mean sepal length across two or more species

  • ANOVA tests whether there is a significant difference in the means of sepal length between the species. Since the boxplot visualises the spread and central tendency of sepal lengths for each species, ANOVA would formally test if the observed differences in the means (which might be suggested by the boxplot) are statistically significant.

Kruskal-Wallis Test

  • non-parametric alternative to ANOVA

  • compares the medians rather than the means, making it appropriate if the distributions are skewed, which the boxplot can help to identify. This test will help to see if there is a significant difference in the sepal length distribution across species.

T-Test

  • when comparing sepal length between only two species (not three or more!)

  • the t-test compares the means of two independent groups. This test assumes that the data are normally distributed and that variances are equal between the two groups.

Mann-Whitney U Test

  • non parametric

  • compares the distribution between two independent groups without assuming normality.

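A minimal sketch of these tests on the iris data that is re-created later in this workbook (assuming the boxplot shows sepal length by species, as described above):

# Minimal sketch: formal tests for the sepal-length-by-species comparison
data(iris)

# One-way ANOVA: do mean sepal lengths differ across the three species?
summary(aov(Sepal.Length ~ Species, data = iris))

# Kruskal-Wallis: non-parametric comparison of the distributions
kruskal.test(Sepal.Length ~ Species, data = iris)

# t-test and Mann-Whitney U apply when only two species are compared
two_species <- droplevels(subset(iris, Species %in% c("setosa", "versicolor")))
t.test(Sepal.Length ~ Species, data = two_species)
wilcox.test(Sepal.Length ~ Species, data = two_species)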

Line Graph: Represents a continuous variable over time. When analyzing data visualized in line graphs, various tests can be used depending on the research question (a short example follows this list).

Linear Regression

  • Analyses relationships between a continuous dependent variable and one or more independent variables; useful for trend analysis and predictions

ANOVA

  • Compares means of a continuous variable across different groups; useful for testing differences over time

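A minimal sketch of a trend analysis with linear regression; because the original figure’s data is not available here, the economics time series bundled with ggplot2 is used purely for illustration:

# Minimal sketch: fitting a linear trend to a variable measured over time,
# using ggplot2's economics dataset purely as an illustration
library(tidyverse)

trend_fit <- lm(unemploy ~ date, data = economics)
summary(trend_fit)  # slope = estimated average change per day

# Overlay the fitted trend on the line graph
ggplot(economics, aes(x = date, y = unemploy)) +
  geom_line() +
  geom_smooth(method = "lm", se = FALSE)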

Scatter-plot: A common way to visualize the relationship between two continuous variables (a short example follows this list).

Pearson Correlation Coefficient

  • Measures the strength and direction of linear relationship between two continuous variables.

Linear Regression:

  • Models the relationship between continuous dependent variable and one or more independent variables; provides predictive equation.

Spearman’s Rank Correlation:

  • non-parametric alternative to Pearson’s; used for assessing monotonic relationships and ordinal data.

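A minimal sketch using the iris petal measurements (the same variables re-created in the scatter plot below); the choice of variables is my own assumption for illustration:

# Minimal sketch: quantifying a scatter-plot relationship with iris petal data
data(iris)

# Pearson correlation: strength and direction of the linear relationship
cor.test(iris$Petal.Length, iris$Petal.Width, method = "pearson")

# Spearman's rank correlation: non-parametric, monotonic relationships
cor.test(iris$Petal.Length, iris$Petal.Width, method = "spearman")

# Simple linear regression: a predictive equation for petal width
summary(lm(Petal.Width ~ Petal.Length, data = iris))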

Formative 5: Re-creating graphs using the iris dataset

Box-Plot

data("iris")
# Creating a boxplot of Sepal.Length by Species
boxplot(Sepal.Length ~ Species, data = iris,
        main = "Boxplot of Sepal Length by Species",
        xlab = "Species",
        ylab = "Sepal Length (cm)",
        col = c("lightblue", "lightgreen", "lightpink"))

Density Plot

library(ggplot2)
data("iris")
# Create a density plot using ggplot2
ggplot(iris, aes(x = Petal.Length, color = Species)) +
  geom_density(linewidth = 1) +  
  labs(title = "Density of Petal Length by Species",
       x = "Petal Length (cm)",
       y = "Density") +
  theme_minimal() +
  theme(legend.position = "top")  

Scatter Plot

library(ggplot2)
data(iris)

# Create a scatter plot of Petal.Length vs. Petal.Width with a regression line
ggplot(iris, aes(x = Petal.Length, y = Petal.Width, color = Species)) +
  geom_point(size = 3) +
  geom_smooth(method = "lm", se = FALSE, color = "black") +  # Add regression line
  labs(title = "Scatter Plot of Petal Length vs. Petal Width",
       x = "Petal Length (cm)",
       y = "Petal Width (cm)") +
  theme_minimal() +
  theme(legend.position = "top")  
`geom_smooth()` using formula = 'y ~ x'

Bar Chart

library(ggplot2)

# Create a new variable 'size' in the iris dataset
iris$size <- ifelse(iris$Sepal.Length < median(iris$Sepal.Length), "small", "big")

# Create a bar chart of species counts categorized by size
ggplot(data = iris, aes(x = Species, fill = size)) +
  geom_bar(position = "dodge", color = "black", width = 0.7) +  # Bars with black outline and dodged for separation
  labs(title = "Count of Iris Species by Size Based on Sepal Length",
       x = "Species",
       y = "Count") +
  scale_fill_manual(values = c("small" = "lightblue", "big" = "lightgreen")) +  # Custom colors for sizes
  theme_minimal(base_size = 15) +  # Minimal theme with larger base font size
  theme(
    plot.title = element_text(hjust = 0.5, size = 20, face = "bold"),  # Center and style title
    axis.title.x = element_text(size = 16, face = "bold"),  # X-axis title styling
    axis.title.y = element_text(size = 16, face = "bold"),  # Y-axis title styling
    axis.text.x = element_text(size = 14),    # X-axis text size
    axis.text.y = element_text(size = 14)     # Y-axis text size
  ) +
  geom_text(stat = "count", aes(label = after_stat(count)), position = position_dodge(0.7), vjust = -0.5, size = 5, color = "black")  # Add count labels above bars