Download chickens.csv to your working directory. Make sure to set your working directory appropriately! This dataset was created by modifying the R built-in dataset chickwts.
Import the chickens.csv data into R. Store it in a data.frame named ch_df and print out the entire ch_df to the screen.
# Point R at the course folder that holds the downloaded CSV.
setwd(dir = "~/Data 101")
# Read the raw file as-is; its text codes for missing values are still present.
ch_df <- read.csv(file = "chickens.csv")
# Attach the tidyverse packages (dplyr, ggplot2, tibble, ...).
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.1.3
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.6 v dplyr 1.0.7
## v tidyr 1.1.4 v stringr 1.4.0
## v readr 2.1.1 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
# Print every row of the imported data frame to the console, as the task asks;
# view() only opens a spreadsheet-style viewer and prints nothing.
print(ch_df)
There are some missing values in this dataset. Unfortunately, they are represented in a number of different ways.
# Count the cells R already recognises as NA straight after import.
sum(is.na(ch_df))
## [1] 7
# The file also marks missing values with several ad-hoc text codes.
# Convert all of them to real NA in one pass over every column. %in% never
# returns NA, so cells that are already NA are simply left as they are.
missing_codes <- c("", "na", "N/A", "?", "NA", "-")
ch_df[] <- lapply(ch_df, function(column) {
  replace(column, column %in% missing_codes, NA)
})
# The pass above covers both weight and feed, so no per-column replacement
# is needed afterwards.
view(ch_df)
Now that the dataset is clean, let’s see what percentage of our data is missing.
# Share of missing values in the weight column, as a percentage.
mean(is.na(ch_df$weight)) * 100
## [1] 21.12676
print("Percentage of missing data in the weight column: 21.13%")
## [1] "Percentage of missing data in the weight column: 21.13%"
# Same calculation for the feed column (14.08451 rounds to 14.08).
mean(is.na(ch_df$feed)) * 100
## [1] 14.08451
print("Percentage of missing data in the feed column: 14.08%")
## [1] "Percentage of missing data in the feed column: 14.08%"
# is.na() on the whole data frame gives the share across every cell.
mean(is.na(ch_df)) * 100
## [1] 17.60563
print("Percentage of missing data in the entire dataset: 17.61%")
## [1] "Percentage of missing data in the entire dataset: 17.61%"
EXTRA CREDIT (Optional): Figure out how to create these print statements so that the name and percentage number are not hard-coded into the statement. In other words, so that the name and percentage number are read in dynamically (for example, from a variable, from a function call, etc.) instead of just written in the statement. Please ask me for clarification if necessary.
# Percentage of missing values per column; the result is a numeric vector
# named after the columns, so the labels below come from the data itself.
col_pct <- colMeans(is.na(ch_df)) * 100
x <- col_pct[["weight"]]
y <- col_pct[["feed"]]
# Percentage of missing values across every cell of the data frame.
z <- mean(is.na(ch_df)) * 100

# One message per column: both the name and the number are filled in by
# sprintf(), rounded to two decimals; "%%" prints a literal percent sign.
for (col_name in names(col_pct)) {
  cat(sprintf(
    "Percentage of missing data in the %s column: %.2f%%\n",
    col_name, col_pct[[col_name]]
  ))
}
## Percentage of missing data in the weight column: 21.13%
## Percentage of missing data in the feed column: 14.08%
cat(sprintf("Percentage of missing data in the entire dataset: %.2f%%\n", z))
## Percentage of missing data in the entire dataset: 17.61%
# Convert weight to numbers. Going through as.character() first means a
# factor column would be converted by its labels rather than its level codes.
ch_df[["weight"]] <- as.numeric(as.character(ch_df[["weight"]]))

# Mean and median weight for each feed, ignoring missing weights.
by_feed <- group_by(ch_df, feed)
Chick <- summarise(
  by_feed,
  weight_mean = mean(weight, na.rm = TRUE),
  weight_median = median(weight, na.rm = TRUE)
)
view(Chick)
# Find the feed with the highest median weight and show its summary row.
# Reusing the computed index, rather than typing the row number by hand,
# keeps the lookup correct if the data or the group order changes.
top_row <- which.max(Chick$weight_median)
top_row
## [1] 7
Chick[top_row, ]
## # A tibble: 1 x 3
## feed weight_mean weight_median
## <chr> <dbl> <dbl>
## 1 sunflower 353. 340
# Histogram of the weight column. The constant fill label inside aes() gives
# every bar the same colour and a single legend entry named "weight".
ggplot(data = ch_df, mapping = aes(x = weight, fill = "weight")) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 15 rows containing non-finite values (stat_bin).
# One box per feed showing the distribution of weight within that feed.
ggplot(data = ch_df, mapping = aes(x = feed, y = weight)) +
  geom_boxplot()
## Warning: Removed 15 rows containing non-finite values (stat_boxplot).
# Same boxplot, coloured by feed and labelled. The weights belong to the
# chicks, grouped by the feed they received, so the title says so.
ggplot(ch_df, aes(x = feed, y = weight, fill = feed)) +
  geom_boxplot() +
  labs(
    x = "Feed type",
    y = "Chick weight",
    title = "Chick Weights by Feed Type"
  )
## Warning: Removed 15 rows containing non-finite values (stat_boxplot).