Download chickens.csv to your working directory. Make sure to set your working directory appropriately! This dataset was created by modifying the R built-in dataset chickwts.
Import the chickens.csv data into R. Store it in a data.frame named ch_df and print out the entire ch_df to the screen.
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.1.3
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.6 v dplyr 1.0.8
## v tidyr 1.2.0 v stringr 1.4.0
## v readr 2.1.2 v forcats 0.5.1
## Warning: package 'ggplot2' was built under R version 4.1.2
## Warning: package 'tibble' was built under R version 4.1.2
## Warning: package 'tidyr' was built under R version 4.1.2
## Warning: package 'readr' was built under R version 4.1.2
## Warning: package 'purrr' was built under R version 4.1.2
## Warning: package 'dplyr' was built under R version 4.1.2
## Warning: package 'stringr' was built under R version 4.1.2
## Warning: package 'forcats' was built under R version 4.1.2
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
ch_df <- read_csv('chickens.csv')
## Rows: 71 Columns: 2
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (2): weight, feed
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
ch_df
## # A tibble: 71 x 2
## weight feed
## <chr> <chr>
## 1 206 meatmeal
## 2 140 horsebean
## 3 <NA> <NA>
## 4 318 sunflower
## 5 332 casein
## 6 na horsebean
## 7 216 na
## 8 143 horsebean
## 9 271 soybean
## 10 315 meatmeal
## # ... with 61 more rows
There are some missing values in this dataset. Unfortunately they are represented in a number of different ways.
class(ch_df)
## [1] "spec_tbl_df" "tbl_df" "tbl" "data.frame"
dim(ch_df) # the dataset contains 71 observations and 2 variables.
## [1] 71 2
summary(ch_df)
## weight feed
## Length:71 Length:71
## Class :character Class :character
## Mode :character Mode :character
is.na(ch_df)
## weight feed
## [1,] FALSE FALSE
## [2,] FALSE FALSE
## [3,] TRUE TRUE
## [4,] FALSE FALSE
## [5,] FALSE FALSE
## [6,] FALSE FALSE
## [7,] FALSE FALSE
## [8,] FALSE FALSE
## [9,] FALSE FALSE
## [10,] FALSE FALSE
## [11,] FALSE FALSE
## [12,] FALSE FALSE
## [13,] FALSE FALSE
## [14,] FALSE FALSE
## [15,] FALSE FALSE
## [16,] FALSE FALSE
## [17,] FALSE FALSE
## [18,] FALSE FALSE
## [19,] FALSE FALSE
## [20,] FALSE FALSE
## [21,] FALSE TRUE
## [22,] FALSE FALSE
## [23,] FALSE FALSE
## [24,] FALSE FALSE
## [25,] FALSE FALSE
## [26,] FALSE FALSE
## [27,] FALSE FALSE
## [28,] FALSE FALSE
## [29,] FALSE FALSE
## [30,] TRUE FALSE
## [31,] TRUE FALSE
## [32,] FALSE FALSE
## [33,] FALSE FALSE
## [34,] FALSE FALSE
## [35,] FALSE FALSE
## [36,] FALSE FALSE
## [37,] FALSE FALSE
## [38,] FALSE FALSE
## [39,] FALSE FALSE
## [40,] FALSE FALSE
## [41,] TRUE FALSE
## [42,] FALSE FALSE
## [43,] FALSE TRUE
## [44,] FALSE FALSE
## [45,] FALSE FALSE
## [46,] FALSE FALSE
## [47,] FALSE FALSE
## [48,] FALSE FALSE
## [49,] FALSE FALSE
## [50,] FALSE FALSE
## [51,] FALSE FALSE
## [52,] TRUE FALSE
## [53,] FALSE FALSE
## [54,] FALSE FALSE
## [55,] FALSE FALSE
## [56,] FALSE FALSE
## [57,] FALSE FALSE
## [58,] FALSE FALSE
## [59,] FALSE FALSE
## [60,] TRUE FALSE
## [61,] FALSE FALSE
## [62,] FALSE TRUE
## [63,] FALSE FALSE
## [64,] FALSE FALSE
## [65,] FALSE FALSE
## [66,] FALSE FALSE
## [67,] FALSE FALSE
## [68,] FALSE FALSE
## [69,] FALSE TRUE
## [70,] FALSE FALSE
## [71,] TRUE FALSE
sum(is.na(ch_df$weight))
## [1] 7
sum(is.na(ch_df$feed))
## [1] 5
colSums(is.na(ch_df))
## weight feed
## 7 5
ch_df <-replace(ch_df,is.na(ch_df),"NA")
ch_df[ch_df=="na"] <- "NA" # I consider
ch_df
## # A tibble: 71 x 2
## weight feed
## <chr> <chr>
## 1 206 meatmeal
## 2 140 horsebean
## 3 NA NA
## 4 318 sunflower
## 5 332 casein
## 6 NA horsebean
## 7 216 NA
## 8 143 horsebean
## 9 271 soybean
## 10 315 meatmeal
## # ... with 61 more rows
ch_df$weight <- as.numeric(as.character(ch_df$weight))
## Warning: NAs introduced by coercion
ch_df
## # A tibble: 71 x 2
## weight feed
## <dbl> <chr>
## 1 206 meatmeal
## 2 140 horsebean
## 3 NA NA
## 4 318 sunflower
## 5 332 casein
## 6 NA horsebean
## 7 216 NA
## 8 143 horsebean
## 9 271 soybean
## 10 315 meatmeal
## # ... with 61 more rows
sum(is.na(ch_df))
## [1] 15
Now that the dataset is clean, let’s see what percentage of our data is missing.
# Percentage of missing data in the weight column : 7.042252 %
# Percentage of missing data in the feed column : 4.225352 %
# Percentage of missing data in the mod_ch_df dataframe : 11.26761 %
EXTRA CREDIT (Optional): Figure out how to create these print statements so that the name and percentage number are not hard-coded into the statement. In other words, so that the name and percentage number are read in dynamically (for example, from a variable, from a function call, etc.) instead of just written in the statement. Please ask me for clarification if necessary.
totalcells = prod(dim(ch_df)) # calculate the product of dimensions of the dataframe.
totalcells
## [1] 142
missingcells = sum(is.na(ch_df$weight)) # calculate the missing cells in the weight column.
missingcells
## [1] 15
percentage = (missingcells*100)/(totalcells) # calculate the percentage of the missing cells in the weight column.
percentage
## [1] 10.56338
missingcells = sum(is.na(ch_df$feed)) # calculate the missing cells in the weight column.
missingcells
## [1] 0
percentage = (missingcells*100)/(totalcells) # calculate the percentage of the missing cells in the weight column.
percentage
## [1] 0
missingcells = sum(is.na(ch_df)) # calculate the missing cells in the weight column.
missingcells
## [1] 15
percentage = (missingcells*100)/(totalcells) # calculate the percentage of the missing cells in the weight column.
percentage
## [1] 10.56338
ch_df$weight <- as.numeric(as.character(ch_df$weight))
sapply(ch_df, class)
## weight feed
## "numeric" "character"
suppressWarnings(weight <- as.numeric(ch_df$weight))
summary(ch_df)
## weight feed
## Min. :108.0 Length:71
## 1st Qu.:211.2 Class :character
## Median :261.5 Mode :character
## Mean :264.1
## 3rd Qu.:325.5
## Max. :423.0
## NA's :15
weight_mean1 <- lm(formula = weight ~ feed, ch_df)
summary(weight_mean1)
##
## Call:
## lm(formula = weight ~ feed, data = ch_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -98.43 -32.15 0.00 30.34 122.00
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 189.50 27.41 6.912 1.23e-08 ***
## feedcasein 124.75 33.58 3.716 0.000548 ***
## feedhorsebean -28.79 34.37 -0.838 0.406566
## feedlinseed 42.37 33.58 1.262 0.213279
## feedmeatmeal 114.93 34.37 3.344 0.001648 **
## feedNA 92.50 36.78 2.515 0.015463 *
## feednot sure 139.50 61.30 2.276 0.027568 *
## feedsoybean 52.38 33.58 1.560 0.125632
## feedsunflower 163.36 34.37 4.754 2.00e-05 ***
## feedunknown 73.50 61.30 1.199 0.236659
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 54.83 on 46 degrees of freedom
## (15 observations deleted due to missingness)
## Multiple R-squared: 0.5934, Adjusted R-squared: 0.5138
## F-statistic: 7.459 on 9 and 46 DF, p-value: 1.268e-06
Weight <-ch_df$weight
hist(Weight)
library(ggplot2)
p <- ggplot(data=ch_df, aes(x=feed,y=weight, fill = feed))+geom_boxplot()+stat_summary(fun = mean, geom = "point", shape = 20, size = 3,show.legend = FALSE)+theme(legend.position = "none")+scale_fill_brewer(palette = "Set1")
p
## Warning: Removed 15 rows containing non-finite values (stat_boxplot).
## Warning: Removed 15 rows containing non-finite values (stat_summary).
## Warning in RColorBrewer::brewer.pal(n, pal): n too large, allowed maximum for palette Set1 is 9
## Returning the palette you asked for with that many colors
# The mean is the avearage number of the data set and the median is the middle number in the data set. In the boxplot, A vertical line goes through the box at the median. The point in the box represents the mean.
# Yes. There is outliers in the chart.
# I have created a boxplot with x and y axis in the number 8.