Download chickens.csv to your working directory. Make sure to set your working directory appropriately! This dataset was created by modifying the R built-in dataset chickwts.
Import the chickens.csv data into R. Store it in a data.frame named ch_df and print out the entire ch_df to the screen.
library(readr)
# Load the data from the working directory (see the instructions above) rather
# than a machine-specific absolute path, so the script runs on any machine.
ch_df <- read_csv("chickens.csv")
## Rows: 71 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): weight, feed
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Print every row to the console. A tibble prints only its first 10 rows by
# default, and View() opens a viewer window instead of printing.
print(ch_df, n = Inf)
There are some missing values in this dataset. Unfortunately, they are represented in a number of different ways.
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0 ✔ dplyr 1.0.10
## ✔ tibble 3.1.8 ✔ stringr 1.4.1
## ✔ tidyr 1.2.1 ✔ forcats 0.5.2
## ✔ purrr 0.3.5
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
# Number of cells that are already NA right after import, before any clean-up.
sum(is.na(ch_df))
## [1] 12
# Placeholder strings that stand for a missing value in this dataset. The same
# list is applied to both columns so they are cleaned consistently.
missing_tokens <- c("na", "N/A", "", "?", "-", "NA")
# Replace every placeholder in weight and feed with a real NA in one pass.
# %in% never returns NA, so cells that are already NA are simply left alone.
# NOTE(review): feed also contains the labels "not sure" and "unknown" (they
# appear as their own groups in the per-feed summaries); decide whether those
# should be treated as missing as well.
ch_df <- ch_df %>%
  mutate(across(c(weight, feed), ~ replace(.x, .x %in% missing_tokens, NA)))
Now that the dataset is clean, let’s see what percentage of our data is missing.
# Percentage of missing values per column and for the whole dataset.
# The mean of a logical vector is the share of TRUE values, so
# mean(is.na(x)) * 100 is the percentage of missing entries in x.
# Each message is built from the computed value, so it cannot drift away from
# the data; digits = 7 matches R's default console precision.
pct_missing_weight <- mean(is.na(ch_df$weight)) * 100
pct_missing_weight
## [1] 21.12676
print(paste0("Percentage of missing data in the weight column: ",
             format(pct_missing_weight, digits = 7), "%"))
## [1] "Percentage of missing data in the weight column: 21.12676%"
pct_missing_feed <- mean(is.na(ch_df$feed)) * 100
pct_missing_feed
## [1] 14.08451
print(paste0("Percentage of missing data in the feed column: ",
             format(pct_missing_feed, digits = 7), "%"))
## [1] "Percentage of missing data in the feed column: 14.08451%"
# is.na() on the whole data frame yields one logical per cell, so the mean is
# the share of missing cells across all columns.
pct_missing_total <- mean(is.na(ch_df)) * 100
pct_missing_total
## [1] 17.60563
print(paste0("Percentage of missing data in the entire dataset: ",
             format(pct_missing_total, digits = 7), "%"))
## [1] "Percentage of missing data in the entire dataset: 17.60563%"
EXTRA CREDIT (Optional): Figure out how to create these print statements so that the name and percentage number are not hard-coded into the statement. In other words, so that the name and percentage number are read in dynamically (for example, from a variable, from a function call, etc.) instead of just written in the statement. Please ask me for clarification if necessary.
# Percentage of missing values in each column, as a vector named after the
# columns of ch_df.
missing_pct <- vapply(ch_df, function(col) mean(is.na(col)) * 100, numeric(1))
# Both the column name and its percentage are read from the data, so neither
# is hard-coded in the printed statement.
for (col_name in names(missing_pct)) {
  cat(col_name, "column:", missing_pct[[col_name]], "\n")
}
## weight column: 21.12676
## feed column: 14.08451
# Share of missing cells across the whole data frame.
z <- mean(is.na(ch_df)) * 100
cat("entire dataset:", z, "\n")
## entire dataset: 17.60563
# x and y keep their original meaning for any code that still refers to them.
x <- missing_pct[["weight"]]
y <- missing_pct[["feed"]]
# Turn the cleaned weight strings into numbers so they can be averaged.
ch_df[["weight"]] <- as.numeric(as.character(ch_df[["weight"]]))
# Mean and median weight for each feed label, ignoring missing weights.
ch_df2 <- summarise(
  group_by(ch_df, feed),
  weight_mean = mean(weight, na.rm = TRUE),
  weight_median = median(weight, na.rm = TRUE)
)
ch_df2
## # A tibble: 9 × 3
## feed weight_mean weight_median
## <chr> <dbl> <dbl>
## 1 casein 314. 325
## 2 horsebean 161. 160
## 3 linseed 232. 236.
## 4 meatmeal 304. 315
## 5 not sure 329 329
## 6 soybean 242. 249
## 7 sunflower 353. 340
## 8 unknown 263 263
## 9 <NA> 241. 217
# Row of ch_df2 holding the largest median weight (row 7 is sunflower in the
# table printed above).
which.max(ch_df2[["weight_median"]])
## [1] 7
# Show the cleaned data; weight is now a numeric column.
print(ch_df)
## # A tibble: 71 × 2
## weight feed
## <dbl> <chr>
## 1 206 meatmeal
## 2 140 horsebean
## 3 NA <NA>
## 4 318 sunflower
## 5 332 casein
## 6 NA horsebean
## 7 216 <NA>
## 8 143 horsebean
## 9 271 soybean
## 10 315 meatmeal
## # … with 61 more rows
# Distribution of all recorded chicken weights.
hist(ch_df$weight, col = "blue", main = "The Weight of Chickens",
     xlab = "Weight in Grams", ylab = "Frequency")
# One colour per feed label. The per-feed summary lists eight non-missing feed
# labels, so a six-colour vector would be recycled and two boxes would repeat
# colours already used.
feed_colours <- c("blue", "brown", "green", "orange", "red", "yellow",
                  "purple", "grey")
# notch = FALSE: with groups this small the notches extend past the hinges
# (R warns "maybe set notch=FALSE"), which distorts the boxes.
boxplot(weight ~ feed, data = ch_df, col = feed_colours,
        varwidth = TRUE, notch = FALSE, main = "chicken data",
        ylab = "Weight in gram")
The histogram shows that the most frequent weights fall in the 200–250 and 300–350 gram ranges, and the distribution has no significant skew. It also shows no outliers.
The box plot, together with the per-feed summary, shows that sunflower has the highest mean and median chicken weight, followed by casein (setting aside the single-observation "not sure" group), and horsebean has the lowest mean and median. We can also see an outlier in the horsebean group.
# Overall summary of both columns.
summary(ch_df)
## weight feed
## Min. :108.0 Length:71
## 1st Qu.:211.2 Class :character
## Median :261.5 Mode :character
## Mean :264.1
## 3rd Qu.:325.5
## Max. :423.0
## NA's :15
# Minimum, quartiles, median, mean and maximum of weight within each feed
# group, ignoring missing weights.
summarize(
  group_by(ch_df, feed),
  min = min(weight, na.rm = TRUE),
  q1 = quantile(weight, probs = 0.25, na.rm = TRUE),
  median = median(weight, na.rm = TRUE),
  mean = mean(weight, na.rm = TRUE),
  q3 = quantile(weight, probs = 0.75, na.rm = TRUE),
  max = max(weight, na.rm = TRUE)
)
## # A tibble: 9 × 7
## feed min q1 median mean q3 max
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 casein 222 277. 325 314. 356 379
## 2 horsebean 108 142. 160 161. 174. 227
## 3 linseed 148 205 236. 232. 263. 309
## 4 meatmeal 206 280. 315 304. 334. 380
## 5 not sure 329 329 329 329 329 329
## 6 soybean 158 225 249 242. 268 327
## 7 sunflower 318 328 340 353. 366. 423
## 8 unknown 263 263 263 263 263 263
## 9 <NA> 141 169 217 241. 295 404
library(ggplot2)
# Box plot of weight by feed, with one fill colour per feed label.
ch_dfplot <- ggplot(
  data = ch_df,
  mapping = aes(x = feed, y = weight, fill = feed)
) +
  geom_boxplot() +
  labs(
    title = "Interrelationship between Type of Feed and Weight of Chickens",
    x = "Feed",
    y = "Weight of Chickens"
  )
# Draw the plot with the Spectral fill palette and the light theme.
print(ch_dfplot + scale_fill_brewer(palette = "Spectral") + theme_light())
## Warning: Removed 15 rows containing non-finite values (`stat_boxplot()`).
The earlier base R charts show less detail. ggplot2 gives more control over the position and appearance of the graph. Also, in the earlier box plot we could see only one outlier, but in this one we can see two.