Download chickens.csv to your working directory. Make sure to set your working directory appropriately! This dataset was created by modifying the R built-in dataset chickwts.
Import the chickens.csv data into R. Store it in a data.frame named ch_df and print out the entire ch_df to the screen.
Packages <- c("tidyverse","dplyr")
invisible(lapply(Packages, library, character.only = TRUE))
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6 ✔ purrr 0.3.4
## ✔ tibble 3.1.8 ✔ dplyr 1.0.9
## ✔ tidyr 1.2.0 ✔ stringr 1.4.1
## ✔ readr 2.1.2 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
# Point the session at the folder that holds chickens.csv, then load the file.
# read_csv() returns a tibble; both columns come in as character here.
setwd(dir = "~/Data Study/Data 101 MC/Week9")
ch_df <- readr::read_csv(file = "chickens.csv")
## Rows: 71 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): weight, feed
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Display the tibble. Only the first 10 of the 71 rows are shown by default;
# print(ch_df, n = Inf) would list every row.
print(ch_df)
## # A tibble: 71 × 2
## weight feed
## <chr> <chr>
## 1 206 meatmeal
## 2 140 horsebean
## 3 <NA> <NA>
## 4 318 sunflower
## 5 332 casein
## 6 na horsebean
## 7 216 na
## 8 143 horsebean
## 9 271 soybean
## 10 315 meatmeal
## # … with 61 more rows
There are some missing values in this dataset. Unfortunately they are represented in a number of different ways.
# Count, per column, the values that R already recognises as NA.
ch_df %>%
  is.na() %>%
  colSums()
## weight feed
## 7 5
# Several placeholder strings stand in for missing values; turn each into NA.
# NOTE(review): the later per-feed summary still lists "not sure" and
# "unknown" as feed values - confirm whether those should become NA as well.
missing_tokens <- c("-", "na", "N/A", "?")
for (token in missing_tokens) {
  ch_df[ch_df == token] <- NA
}
# Inspect the structure: weight is still stored as character at this point.
str(ch_df)
## spec_tbl_df [71 × 2] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ weight: chr [1:71] "206" "140" NA "318" ...
## $ feed : chr [1:71] "meatmeal" "horsebean" NA "sunflower" ...
## - attr(*, "spec")=
## .. cols(
## .. weight = col_character(),
## .. feed = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
# weight is already a character column, so it converts to numeric directly.
ch_df[["weight"]] <- as.numeric(ch_df[["weight"]])
Now that the dataset is clean, let’s see what percentage of our data is missing.
# Missing-value count for every column, computed once and reused below.
na_counts <- colSums(is.na(ch_df))
# Positions (named by column) of the columns holding at least one NA.
na.cols <- which(na_counts > 0)
# Columns with missing values, most affected first. Indexing the counts
# directly also works when no column has NAs, where sapply() would hand
# colSums() an empty list and fail.
sort(na_counts[na.cols], decreasing = TRUE)
## weight feed
## 15 10
# na.cols holds the columns that contain at least one missing value, so the
# message reports "missing values" rather than "no values".
paste("Number of columns with missing values:", length(na.cols))
## [1] "Number of columns with missing values: 2"
# Share of missing weight values, as a percentage rounded to two decimals.
weight_is_missing <- is.na(ch_df$weight)
weight_pct_missing <- round(
  100 * sum(weight_is_missing) / length(weight_is_missing),
  2
)
paste("Percentage of missing data in Weight:", weight_pct_missing, "%")
## [1] "Percentage of missing data in Weight: 21.13 %"
# Share of missing feed values, as a percentage rounded to two decimals.
feed_is_missing <- is.na(ch_df$feed)
feed_pct_missing <- round(
  100 * sum(feed_is_missing) / length(feed_is_missing),
  2
)
paste("Percentage of missing data in Feed:", feed_pct_missing, "%")
## [1] "Percentage of missing data in Feed: 14.08 %"
EXTRA CREDIT (Optional): Figure out how to create these print statements so that the name and percentage number are not hard-coded into the statement. In other words, so that the name and percentage number are read in dynamically (for example, from a variable, from a function call, etc.) instead of just written in the statement. Please ask me for clarification if necessary.
# Report the share of missing values for every column of ch_df. Both the
# column name and the percentage are derived from the same column, so the
# label can never drift away from the data it describes and nothing is
# hard-coded.
for (col_name in colnames(ch_df)) {
  pct_missing <- round(100 * sum(is.na(ch_df[[col_name]])) / nrow(ch_df), 2)
  print(paste("Percentage of missing data in", col_name, ":", pct_missing, "%"))
}
## [1] "Percentage of missing data in weight : 21.13 %"
## [1] "Percentage of missing data in feed : 14.08 %"
# Mean and median weight for each feed type, ignoring missing weights.
# Rows whose feed is NA form their own group.
by_feed <- group_by(ch_df, feed)
ch_stat <- summarise(
  by_feed,
  weight_mean = mean(weight, na.rm = TRUE),
  weight_median = median(weight, na.rm = TRUE)
)
print(ch_stat)
## # A tibble: 9 × 3
## feed weight_mean weight_median
## <chr> <dbl> <dbl>
## 1 casein 314. 325
## 2 horsebean 161. 160
## 3 linseed 232. 236.
## 4 meatmeal 304. 315
## 5 not sure 329 329
## 6 soybean 242. 249
## 7 sunflower 353. 340
## 8 unknown 263 263
## 9 <NA> 241. 217
# Rank the feed types from highest to lowest median weight.
arrange(ch_stat, desc(weight_median))
## # A tibble: 9 × 3
## feed weight_mean weight_median
## <chr> <dbl> <dbl>
## 1 sunflower 353. 340
## 2 not sure 329 329
## 3 casein 314. 325
## 4 meatmeal 304. 315
## 5 unknown 263 263
## 6 soybean 242. 249
## 7 linseed 232. 236.
## 8 <NA> 241. 217
## 9 horsebean 161. 160
# Sunflower has the highest median weight, at 340.
# Distribution of all chick weights (hist() drops the NA weights).
hist(ch_df$weight, main = "Histogram of chick weight", xlab = "weight")
# Weight by feed type. The formula + data form keeps the axis labels as the
# plain column names instead of the raw `ch_df$...` expressions.
boxplot(weight ~ feed, data = ch_df)
# The charts are consistent with the means and medians computed earlier. The middle line of each box in the boxplot marks the median, which is close to or equal to the median we calculated. The boxplot shows only a single outlier.
# sunflower
# Weights of the sunflower-fed chicks (rows with an NA feed are excluded).
sunflower_weight <- ch_df$weight[which(ch_df$feed == "sunflower")]
# Five-number summary: min, lower hinge, median, upper hinge, max.
fivenum(sunflower_weight)
## [1] 318.0 328.0 340.0 366.5 423.0
# Interquartile range, ignoring missing weights.
IQR(sunflower_weight, na.rm = TRUE)
## [1] 38.5
# casein
# Weights of the casein-fed chicks (rows with an NA feed are excluded).
casein_weight <- ch_df$weight[which(ch_df$feed == "casein")]
# Five-number summary: min, lower hinge, median, upper hinge, max.
fivenum(casein_weight)
## [1] 222.0 271.5 325.0 360.0 379.0
# IQR() is quantile-based, so it need not equal the hinge spread above.
IQR(casein_weight, na.rm = TRUE)
## [1] 78.75
# linseed
# Weights of the linseed-fed chicks (rows with an NA feed are excluded).
linseed_weight <- ch_df$weight[which(ch_df$feed == "linseed")]
# Five-number summary: min, lower hinge, median, upper hinge, max.
fivenum(linseed_weight)
## [1] 148.0 197.0 236.5 265.5 309.0
# IQR() is quantile-based, so it need not equal the hinge spread above.
IQR(linseed_weight, na.rm = TRUE)
## [1] 57.75
library(ggplot2)
# Build the plot in two steps: map feed to x and weight to y, then add the
# boxplot layer. Rows with a missing weight are dropped with a warning.
p <- ggplot(data = ch_df, mapping = aes(x = feed, y = weight))
p <- p + geom_boxplot()
print(p)
## Warning: Removed 15 rows containing non-finite values (stat_boxplot).
# In the ggplot version we can see an additional outlier in soybean, whereas the only outlier in the base R boxplot is in horsebean. The ggplot also draws an extra box for the NA feed group.