Download chickens.csv to your working directory. Make sure to set your working directory appropriately! This dataset was created by modifying the R built-in dataset chickwts.
Import the chickens.csv data into R. Store it in a data.frame named ch_df and print out the entire ch_df to the screen.
# Load the chickens data from the working directory into ch_df.
ch_df <- read.csv(file = "chickens.csv")
library(tidyverse)
## -- Attaching packages ------------------------------ tidyverse 1.2.1 --
## v ggplot2 3.2.1 v purrr 0.3.2
## v tibble 2.1.3 v dplyr 0.8.3
## v tidyr 0.8.3 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.4.0
## -- Conflicts --------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
There are some missing values in this dataset. Unfortunately, they are represented in several different ways.
# Count the entries R already recognises as NA, tallied column by column.
sum(vapply(ch_df, function(column) sum(is.na(column)), integer(1)))
## [1] 7
# Textual stand-ins for a missing value that appear in the raw data.
missing_codes <- c("na", "N/A", "", "?", "NA")

# Recode every stand-in to a real NA in both columns in a single pass.
# `%in%` never yields NA, so entries that are already NA are simply left
# alone instead of putting NA positions into the replacement index.
ch_df <- ch_df %>%
  mutate(
    weight = replace(weight, weight %in% missing_codes, NA),
    feed = replace(feed, feed %in% missing_codes, NA)
  )
Now that the dataset is clean, let’s see what percentage of our data is missing.
# Share of missing entries, as a percentage, for the weight column, the feed
# column, and both columns taken together.
sum(is.na(ch_df[["weight"]])) / nrow(ch_df) * 100
## [1] 18.30986
sum(is.na(ch_df[["feed"]])) / nrow(ch_df) * 100
## [1] 14.08451
sum(is.na(ch_df)) /
  (length(ch_df[["weight"]]) + length(ch_df[["feed"]])) * 100
## [1] 16.19718
# Report the figures above; the numbers in these messages are typed by hand,
# rounded to two decimals.
print("Percentage of missing data in the weight column: 18.31%")
## [1] "Percentage of missing data in the weight column: 18.31%"
print("Percentage of missing data in the feed column: 14.08%")
## [1] "Percentage of missing data in the feed column: 14.08%"
print("Percentage of missing data in the entire dataset: 16.20%")
## [1] "Percentage of missing data in the entire dataset: 16.20%"
EXTRA CREDIT (Optional): Figure out how to create these print statements so that the name and percentage number are not hard-coded into the statement. In other words, so that the name and percentage number are read in dynamically (for example, from a variable, from a function call, etc.) instead of just written in the statement. Please ask me for clarification if necessary.
# Keep each missing-data percentage in a variable so the printed number is
# read from the computation instead of being typed by hand.
x <- sum(is.na(ch_df[["weight"]])) / nrow(ch_df) * 100
y <- sum(is.na(ch_df[["feed"]])) / nrow(ch_df) * 100
z <- sum(is.na(ch_df)) /
  (length(ch_df[["weight"]]) + length(ch_df[["feed"]])) * 100
cat("weight column:", x, "\n")
## weight column: 18.30986
cat("feed column:", y, "\n")
## feed column: 14.08451
cat("entire dataset:", z, "\n")
## entire dataset: 16.19718
# Convert weight to numeric, going through character first (this matters if
# weight was read in as a factor).
# NOTE(review): the coercion warning recorded below suggests some non-NA
# entries are still non-numeric at this point; confirm against the data.
ch_df$weight <- as.numeric(as.character(ch_df$weight))
## Warning: NAs introduced by coercion
# Mean and median weight for each feed value, ignoring missing weights.
df <- ch_df %>%
  group_by(feed) %>%
  summarise(
    weight_mean = mean(weight, na.rm = TRUE),
    weight_median = median(weight, na.rm = TRUE)
  )
## Warning: Factor `feed` contains implicit NA, consider using
## `forcats::fct_explicit_na`
df
## # A tibble: 9 x 3
## feed weight_mean weight_median
## <fct> <dbl> <dbl>
## 1 casein 314. 325
## 2 horsebean 161. 160
## 3 linseed 232. 236.
## 4 meatmeal 304. 315
## 5 not sure 329 329
## 6 soybean 242. 249
## 7 sunflower 353. 340
## 8 unknown 263 263
## 9 <NA> 241. 217
# Row position of the feed group with the largest median weight.
# NOTE(review): every row of df takes part in this ranking, including the
# "not sure", "unknown" and <NA> feed groups shown in the summary above.
which.max(df$weight_median)
## [1] 7
# Index with the computed position rather than a literal 7, so the lookup
# stays correct if the rows or their order in df ever change.
df[which.max(df$weight_median), ]
## # A tibble: 1 x 3
## feed weight_mean weight_median
## <fct> <dbl> <dbl>
## 1 sunflower 353. 340