# Load all packages here:
library(tidyverse)
library(openintro)
library(readr)
library(dplyr)
library(ggplot2)
library(janitor)
# Fix the state of the random number generator so that the random draws made
# later in this document (e.g. the sampled rows) are reproducible between runs.
# The value 76 carries no meaning; it was picked arbitrarily.
set.seed(seed = 76)
Include the code to load your data here. If your data is neither
confidential nor private in nature, consider publishing it as a
.csv file on Google Sheets as in the code chunk below;
instructions on how to do this are in Steps 1-6 here.
If the data shouldn’t be published online, then please submit the
spreadsheet file on Moodle.
# Use the built-in iris data set as the raw input for this analysis.
data_raw <- datasets::iris
Pipe your data frame into the clean_names() function
from the janitor package. Then be sure to overwrite the
original data frame with this new data frame using the
<- assignment operator. This will clean your variable
names, making them easier to work with.
# Standardise the column names of the raw data with janitor's clean_names()
# and store the result as the working data frame.
data <- janitor::clean_names(data_raw)
Complete your data wrangling here:
# Add two derived columns to the working data frame:
#   * id                 - sequential row number
#   * petal_length_group - petal length binned into three groups, using the
#                          minimum, 1/3 quantile, 2/3 quantile and maximum of
#                          petal_length as the cut points
data <- dplyr::mutate(
  data,
  id = seq_len(dplyr::n()),
  petal_length_group = cut(
    petal_length,
    breaks = stats::quantile(
      petal_length,
      probs = c(0, 1 / 3, 2 / 3, 1),
      na.rm = TRUE
    ),
    labels = c("short", "medium", "long"),
    # Without this the smallest observation would fall outside the first bin.
    include.lowest = TRUE
  )
)
select() the following variables in this
order and drop all others. Eliminating all unnecessary
variables will make visually exploring the raw values less taxing
mentally, as we’ll have less data to look at.
# Keep only the columns used in the rest of the analysis, in this order;
# every other column is dropped.
df <- dplyr::select(
  data,
  dplyr::all_of(c(
    "id",
    "petal_length",
    "sepal_width",
    "petal_length_group",
    "species"
  ))
)
Look at your data using the glimpse()
function.
# Print a compact, column-wise overview (name, type, first values) of df.
dplyr::glimpse(df)
## Rows: 150
## Columns: 5
## $ id <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, …
## $ petal_length <dbl> 1.4, 1.4, 1.3, 1.5, 1.4, 1.7, 1.4, 1.5, 1.4, 1.5, 1…
## $ sepal_width <dbl> 3.5, 3.0, 3.2, 3.1, 3.6, 3.9, 3.4, 3.4, 2.9, 3.1, 3…
## $ petal_length_group <fct> short, short, short, short, short, short, short, sh…
## $ species <fct> setosa, setosa, setosa, setosa, setosa, setosa, set…
Look at your data another way by displaying a random sample of 5
rows of your data frame by piping it into the sample_n(5)
function from the dplyr package.
# Show five randomly chosen rows of df.
df %>%
  dplyr::sample_n(size = 5)
| id | petal_length | sepal_width | petal_length_group | species |
|---|---|---|---|---|
| 129 | 5.6 | 2.8 | long | virginica |
| 109 | 5.8 | 2.5 | long | virginica |
| 28 | 1.5 | 3.5 | short | setosa |
| 74 | 4.7 | 2.8 | medium | versicolor |
| 16 | 1.5 | 4.4 | short | setosa |
Let’s do a little exploratory data analysis.
Address missing values.
# Count the missing values in each column of df.
df %>%
  is.na() %>%
  colSums()
## id petal_length sepal_width petal_length_group
## 0 0 0 0
## species
## 0
# Columns that the plots and summary statistics below rely on.
key_vars <- c("petal_length", "sepal_width", "petal_length_group")
# Retain only the rows of df that have no missing value in any of those columns.
df_complete <- df[stats::complete.cases(df[key_vars]), , drop = FALSE]
Compute some quick summary statistics of the outcome variable and comment.
# One summary row per petal length group: number of observations, correlation
# between petal length and sepal width within the group, and the mean, median
# and standard deviation of petal length.
dplyr::summarise(
  dplyr::group_by(df_complete, petal_length_group),
  n = dplyr::n(),
  correlation = stats::cor(x = petal_length, y = sepal_width),
  mean = mean(petal_length),
  median = stats::median(petal_length),
  sd = stats::sd(petal_length)
)
| petal_length_group | n | correlation | mean | median | sd |
|---|---|---|---|---|---|
| short | 50 | 0.1777000 | 1.462000 | 1.5 | 0.1736640 |
| medium | 54 | 0.5506873 | 4.290741 | 4.4 | 0.4634850 |
| long | 46 | 0.3559652 | 5.628261 | 5.6 | 0.5088825 |
Visualize the distribution of the outcome variable using a histogram and comment.
# Histogram of petal length: bins 0.3 wide, steel-blue bars outlined in white.
df_complete %>%
  ggplot2::ggplot(mapping = ggplot2::aes(x = petal_length)) +
  ggplot2::geom_histogram(
    binwidth = 0.3,
    fill = "steelblue",
    colour = "white"
  ) +
  ggplot2::labs(x = "Petal Length (cm)", y = "Count") +
  ggplot2::theme_minimal()
Figure 1. Distribution of petal length (cm) in the iris data.
Visualize the relationship of the outcome variable and the numerical explanatory variable using a scatterplot and comment.
# Scatterplot of petal length (y) against sepal width (x) with a single
# fitted linear-model line; the confidence band is switched off.
df_complete %>%
  ggplot2::ggplot(
    mapping = ggplot2::aes(x = sepal_width, y = petal_length)
  ) +
  ggplot2::geom_point(alpha = 0.85) +
  ggplot2::geom_smooth(se = FALSE, method = "lm") +
  ggplot2::labs(x = "Sepal Width (cm)", y = "Petal Length (cm)") +
  ggplot2::theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
Figure 2. Petal length versus sepal width, with a fitted linear regression line.
Visualize the relationship of the outcome variable and the categorical explanatory variable using a boxplot and comment.
# Side-by-side boxplots of petal length, one box per petal length group.
# Boxes are filled by group; the legend is hidden because the x axis already
# labels each group.
df_complete %>%
  ggplot2::ggplot(
    mapping = ggplot2::aes(
      x = petal_length_group,
      y = petal_length,
      fill = petal_length_group
    )
  ) +
  ggplot2::geom_boxplot() +
  ggplot2::labs(x = "Petal Length Group", y = "Petal Length (cm)") +
  ggplot2::theme_minimal() +
  # Must come after theme_minimal(), which would otherwise reset the legend.
  ggplot2::theme(legend.position = "none")
Figure 3. Petal length by petal length group (short, medium, long).
Visualize the relationship of the outcome variable and both explanatory variables using a colored scatterplot and comment.
# Scatterplot of petal length (y) against sepal width (x), coloured by petal
# length group. Because colour is mapped in the plot-level aesthetics, a
# separate linear-model line is drawn for each group; confidence bands are off.
df_complete %>%
  ggplot2::ggplot(
    mapping = ggplot2::aes(
      x = sepal_width,
      y = petal_length,
      colour = petal_length_group
    )
  ) +
  ggplot2::geom_point() +
  ggplot2::geom_smooth(se = FALSE, method = "lm") +
  ggplot2::labs(
    x = "Sepal Width (cm)",
    y = "Petal Length (cm)",
    colour = "Length Group"
  ) +
  ggplot2::theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
Figure 4. Petal length versus sepal width, colored by petal length group, with a linear regression line fitted for each group.