# Load all packages here:
library(tidyverse)
library(openintro)
library(readr)
library(dplyr)
library(ggplot2)
library(janitor)

# Fix the seed of the random number generator so that any random draws made
# later in this script are reproducible from run to run. The value 76 itself
# is arbitrary.
set.seed(seed = 76)

Data

Load data into R

Include the code to load your data here. If your data is neither confidential nor private, consider publishing it as a .csv file on Google Sheets as in the code chunk below; instructions on how to do this are in Steps 1-6 here. If the data shouldn’t be published online, then please submit the spreadsheet file on Moodle.

# Keep an untouched copy of the built-in iris data set as the raw input.
data_raw <- datasets::iris

Clean variable names

Pipe your data frame into the clean_names() function from the janitor package. Then be sure to overwrite the original data frame with this new data frame using the <- assignment operator. This will clean your variable names, making them easier to work with.

# Run the raw column names through janitor's name cleaner and store the result
# as the working data frame; data_raw itself is left unchanged.
data <- janitor::clean_names(data_raw)

Data wrangling

Complete your data wrangling here:

# Add a row identifier and a three-level petal-length category to the data.
data <- dplyr::mutate(
  data,
  # Sequential integer id, one per row.
  id = seq_len(dplyr::n()),
  # Bin petal length at its 0, 1/3, 2/3 and 1 quantiles (tertiles);
  # include.lowest = TRUE keeps the minimum value inside the first bin.
  petal_length_group = cut(
    petal_length,
    breaks = stats::quantile(petal_length, probs = (0:3) / 3, na.rm = TRUE),
    labels = c("short", "medium", "long"),
    include.lowest = TRUE
  )
)

Preview of data

Pare down variables

select() the following variables in this order and drop all others. Eliminating all unnecessary variables will make visually exploring the raw values less taxing mentally, as we’ll have less data to look at.

First: The identification variable (if any)
Second: The outcome variable \(y\)
Third: The numerical explanatory variable
Fourth: The categorical explanatory variable
Rest: any other variable you find interesting

# Keep only the variables used in the analysis, in the requested order.
df <- dplyr::select(
  data,
  id,                  # identification variable
  petal_length,        # outcome variable
  sepal_width,         # numerical explanatory variable
  petal_length_group,  # categorical explanatory variable
  species              # additional variable of interest
)

Look at your data using glimpse

Look at your data using the glimpse() function.

# Compact overview of df: one line per column with its type and first values.
dplyr::glimpse(df)

## Rows: 150
## Columns: 5
## $ id                 <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, …
## $ petal_length       <dbl> 1.4, 1.4, 1.3, 1.5, 1.4, 1.7, 1.4, 1.5, 1.4, 1.5, 1…
## $ sepal_width        <dbl> 3.5, 3.0, 3.2, 3.1, 3.6, 3.9, 3.4, 3.4, 2.9, 3.1, 3…
## $ petal_length_group <fct> short, short, short, short, short, short, short, sh…
## $ species            <fct> setosa, setosa, setosa, setosa, setosa, setosa, set…

Show a preview of your data

Look at your data another way by displaying a random sample of 5 rows of your data frame by piping it into the sample_n(5) function from the dplyr package.

# Show 5 randomly chosen rows of df.
df %>%
  dplyr::sample_n(size = 5)

id	petal_length	sepal_width	petal_length_group	species
129	5.6	2.8	long	virginica
109	5.8	2.5	long	virginica
28	1.5	3.5	short	setosa
74	4.7	2.8	medium	versicolor
16	1.5	4.4	short	setosa

Exploratory data analysis

Let’s do a little exploratory data analysis.

Inspect for missing values

Address missing values.

# Count the missing values in each column of df.
vapply(df, function(column) sum(is.na(column)), numeric(1))

##                 id       petal_length        sepal_width petal_length_group 
##                  0                  0                  0                  0 
##            species 
##                  0

# Restrict df to rows with no missing value in any of the variables that the
# plots and summary statistics below rely on.
key_vars <- c("petal_length", "sepal_width", "petal_length_group")
df_complete <- df[stats::complete.cases(df[key_vars]), , drop = FALSE]

Summary statistics

Compute some quick summary statistics of the outcome variable and comment.

# For each petal-length group: the group size, the correlation between petal
# length and sepal width, and the mean, median and standard deviation of
# petal length.
dplyr::summarise(
  dplyr::group_by(df_complete, petal_length_group),
  n           = dplyr::n(),
  correlation = stats::cor(petal_length, sepal_width),
  mean        = mean(petal_length),
  median      = stats::median(petal_length),
  sd          = stats::sd(petal_length)
)

petal_length_group	n	correlation	mean	median	sd
short	50	0.1777000	1.462000	1.5	0.1736640
medium	54	0.5506873	4.290741	4.4	0.4634850
long	46	0.3559652	5.628261	5.6	0.5088825

Histogram of outcome variable

Visualize the distribution of the outcome variable using a histogram and comment.

# Histogram of the outcome variable (petal length) with bins 0.3 wide.
ggplot2::ggplot(
  data = df_complete,
  mapping = ggplot2::aes(x = petal_length)
) +
  ggplot2::geom_histogram(
    binwidth = 0.3,
    fill = "steelblue",
    colour = "white"  # white outlines visually separate adjacent bars
  ) +
  ggplot2::theme_minimal() +
  ggplot2::labs(x = "Petal Length (cm)", y = "Count")

Figure 1. Distribution of petal length (cm) across the 150 iris flowers

Scatterplot

Visualize the relationship of the outcome variable and the numerical explanatory variable using a scatterplot and comment.

# Scatterplot of the outcome (petal length) against the numerical explanatory
# variable (sepal width), with a fitted linear-model line and no confidence
# band.
ggplot2::ggplot(
  data = df_complete,
  mapping = ggplot2::aes(x = sepal_width, y = petal_length)
) +
  ggplot2::geom_point(alpha = 0.85) +
  ggplot2::geom_smooth(method = "lm", se = FALSE) +
  ggplot2::theme_minimal() +
  ggplot2::labs(x = "Sepal Width (cm)", y = "Petal Length (cm)")

## `geom_smooth()` using formula = 'y ~ x'

Figure 2. Petal length versus sepal width, with a fitted regression line

Boxplot

Visualize the relationship of the outcome variable and the categorical explanatory variable using a boxplot and comment.

# Boxplots of the outcome (petal length) for each level of the categorical
# explanatory variable (petal-length group).
ggplot2::ggplot(
  data = df_complete,
  mapping = ggplot2::aes(
    x = petal_length_group,
    y = petal_length,
    fill = petal_length_group
  )
) +
  ggplot2::labs(x = "Petal Length Group", y = "Petal Length (cm)") +
  ggplot2::geom_boxplot() +
  ggplot2::theme_minimal() +
  # Must come after theme_minimal(), which would otherwise reset it. The fill
  # legend only repeats the x-axis categories, so it is hidden.
  ggplot2::theme(legend.position = "none")

Figure 3. Petal length by petal-length group (short, medium, long)

Colored scatterplot

Visualize the relationship of the outcome variable and both explanatory variables using a colored scatterplot and comment.

# Scatterplot of petal length against sepal width, colored by petal-length
# group. Because color is mapped in the plot-level aesthetics, the linear-model
# line is fitted separately for each group.
ggplot2::ggplot(
  data = df_complete,
  mapping = ggplot2::aes(
    x = sepal_width,
    y = petal_length,
    color = petal_length_group
  )
) +
  ggplot2::geom_point() +
  ggplot2::geom_smooth(method = "lm", se = FALSE) +
  ggplot2::theme_minimal() +
  ggplot2::labs(
    x = "Sepal Width (cm)",
    y = "Petal Length (cm)",
    color = "Length Group"
  )

## `geom_smooth()` using formula = 'y ~ x'

Figure 4. Petal length versus sepal width, colored by petal-length group, with a fitted regression line for each group

Author Statement

Individual Roles

Describe the role (Facilitator, Recorder, Reporter, Time Keeper, Checker) and level of contribution of each group member during this project phase.

Individual Contribution

If your group were to earn 5 points on this submission, how should those points be shared across your group members?

STAT 1910 Project Proposal

WRITE YOUR GROUP NAME HERE

Last updated on 2025-10-25