library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.6 v dplyr 1.0.8
## v tidyr 1.2.0 v stringr 1.4.0
## v readr 2.1.2 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(dplyr)
library(ggplot2)
Download chickens.csv to your working directory. Make sure to set your working directory appropriately! This dataset was created by modifying the R built-in dataset chickwts.
Import the chickens.csv data into R. Store it in a data.frame named ch_df and print out the entire ch_df to the screen.
library(readr)
ch_df <- read_csv("chickens.csv")
## Rows: 71 Columns: 2
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (2): weight, feed
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
print(ch_df)
## # A tibble: 71 x 2
## weight feed
## <chr> <chr>
## 1 206 meatmeal
## 2 140 horsebean
## 3 <NA> <NA>
## 4 318 sunflower
## 5 332 casein
## 6 na horsebean
## 7 216 na
## 8 143 horsebean
## 9 271 soybean
## 10 315 meatmeal
## # ... with 61 more rows
There are some missing values in this dataset. Unfortunately they are represented in a number of different ways.
sum(is.na(ch_df))
## [1] 12
replace_na(ch_df)
## # A tibble: 71 x 2
## weight feed
## <chr> <chr>
## 1 206 meatmeal
## 2 140 horsebean
## 3 <NA> <NA>
## 4 318 sunflower
## 5 332 casein
## 6 na horsebean
## 7 216 na
## 8 143 horsebean
## 9 271 soybean
## 10 315 meatmeal
## # ... with 61 more rows
Now that the dataset is clean, let’s see what percentage of our data is missing.
mean(is.na(ch_df$weight)) * 100
## [1] 9.859155
mean(is.na(ch_df$feed)) * 100
## [1] 7.042254
mean(is.na(ch_df)) * 100
## [1] 8.450704
EXTRA CREDIT (Optional): Figure out how to create these print statements so that the name and percentage number are not hard-coded into the statement. In other words, so that the name and percentage number are read in dynamically (for example, from a variable, from a function call, etc.) instead of just written in the statement. Please ask me for clarification if necessary.
# fill in your code here
ch_df$weight <- as.character(ch_df$weight)
ch_df$weight <- as.numeric(ch_df$weight)
## Warning: NAs introduced by coercion
ch_dfmedmen <- ch_df %>%
group_by(feed) %>%
summarise(mean_weight= mean(weight, na.rm = TRUE), median_weight= median(weight, na.rm = TRUE))
ch_dfmedmen
## # A tibble: 11 x 3
## feed mean_weight median_weight
## <chr> <dbl> <dbl>
## 1 ? 190. 161
## 2 casein 314. 325
## 3 horsebean 161. 160
## 4 linseed 232. 236.
## 5 meatmeal 304. 315
## 6 na 216 216
## 7 not sure 329 329
## 8 soybean 242. 249
## 9 sunflower 353. 340
## 10 unknown 263 263
## 11 <NA> 298. 286.
ch_dfmedmen[which.max(ch_dfmedmen$median_weight),]
## # A tibble: 1 x 3
## feed mean_weight median_weight
## <chr> <dbl> <dbl>
## 1 sunflower 353. 340
class(ch_df$weight)
## [1] "numeric"
weight_chdf <- as.numeric(ch_df$weight)
hist(weight_chdf)
ch_df$weight <- as.numeric(ch_df$weight)
p1 <- ch_df %>%
ggplot(aes(x= feed, y= weight, group = feed)) + geom_boxplot()
p1
## Warning: Removed 15 rows containing non-finite values (stat_boxplot).
summary(ch_df)
## weight feed
## Min. :108.0 Length:71
## 1st Qu.:211.2 Class :character
## Median :261.5 Mode :character
## Mean :264.1
## 3rd Qu.:325.5
## Max. :423.0
## NA's :15
#Group by FEED types (note to self)
ch_df %>%
group_by(feed) %>%
summarize(min = min(weight, na.rm = TRUE),
Q1 = quantile(weight, 0.25, na.rm = TRUE),
Median = median(weight, na.rm = TRUE),
Mean = mean(weight, na.rm = TRUE),
Q3 = quantile(weight, 0.75, na.rm = TRUE),
Max = max(weight, na.rm = TRUE))
## # A tibble: 11 x 7
## feed min Q1 Median Mean Q3 Max
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ? 141 150 161 190. 200. 295
## 2 casein 222 277. 325 314. 356 379
## 3 horsebean 108 142. 160 161. 174. 227
## 4 linseed 148 205 236. 232. 263. 309
## 5 meatmeal 206 280. 315 304. 334. 380
## 6 na 216 216 216 216 216 216
## 7 not sure 329 329 329 329 329 329
## 8 soybean 158 225 249 242. 268 327
## 9 sunflower 318 328 340 353. 366. 423
## 10 unknown 263 263 263 263 263 263
## 11 <NA> 217 247 286. 298. 338 404
p2 <- p1 + ggtitle("Feed and Weights - Chickens") +
xlab("Feed Type") + ylab ("Weight")
p2
## Warning: Removed 15 rows containing non-finite values (stat_boxplot).