library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)

# Switch the session to the project folder, then load the raw chicken data
# from it. read.csv() defaults are kept so only genuine NA cells are parsed as NA.
project_dir <- "C:/Users/StarKid/Desktop/Data_Science/Data_101/week_4/project_chickens"
setwd(project_dir)
chickens <- read.csv(file = "chickens.csv")

QUESTION 3

Clean up data

You do NOT have to fill in the missing values. Just leave them as NA.

# First look at the raw import: per-column summary, structure, and leading rows.
# Both columns arrive as character, which is why summary() shows no statistics.
chickens %>% summary()
##     weight              feed          
##  Length:71          Length:71         
##  Class :character   Class :character  
##  Mode  :character   Mode  :character
chickens %>% str()
## 'data.frame':    71 obs. of  2 variables:
##  $ weight: chr  "206" "140" NA "318" ...
##  $ feed  : chr  "meatmeal" "horsebean" NA "sunflower" ...
# Interactive spreadsheet-style viewer for eyeballing the raw values.
view(chickens)
chickens %>% head()
##   weight      feed
## 1    206  meatmeal
## 2    140 horsebean
## 3   <NA>      <NA>
## 4    318 sunflower
## 5    332    casein
## 6     na horsebean
chickens %>% str()
## 'data.frame':    71 obs. of  2 variables:
##  $ weight: chr  "206" "140" NA "318" ...
##  $ feed  : chr  "meatmeal" "horsebean" NA "sunflower" ...

Calculate how many elements in the original data frame (`ch_df` in the assignment; `chickens` in this script) are recognized as NA by R.

# Logical matrix with TRUE for every cell that R already parsed as NA.
chickens %>% is.na()
##       weight  feed
##  [1,]  FALSE FALSE
##  [2,]  FALSE FALSE
##  [3,]   TRUE  TRUE
##  [4,]  FALSE FALSE
##  [5,]  FALSE FALSE
##  [6,]  FALSE FALSE
##  [7,]  FALSE FALSE
##  [8,]  FALSE FALSE
##  [9,]  FALSE FALSE
## [10,]  FALSE FALSE
## [11,]  FALSE FALSE
## [12,]  FALSE FALSE
## [13,]  FALSE FALSE
## [14,]  FALSE FALSE
## [15,]  FALSE FALSE
## [16,]  FALSE FALSE
## [17,]  FALSE FALSE
## [18,]  FALSE FALSE
## [19,]  FALSE FALSE
## [20,]  FALSE FALSE
## [21,]  FALSE FALSE
## [22,]  FALSE FALSE
## [23,]  FALSE FALSE
## [24,]  FALSE FALSE
## [25,]  FALSE FALSE
## [26,]  FALSE FALSE
## [27,]  FALSE FALSE
## [28,]  FALSE FALSE
## [29,]  FALSE FALSE
## [30,]   TRUE FALSE
## [31,]  FALSE FALSE
## [32,]  FALSE FALSE
## [33,]  FALSE FALSE
## [34,]  FALSE FALSE
## [35,]  FALSE FALSE
## [36,]  FALSE FALSE
## [37,]  FALSE FALSE
## [38,]  FALSE FALSE
## [39,]  FALSE FALSE
## [40,]  FALSE FALSE
## [41,]  FALSE FALSE
## [42,]  FALSE FALSE
## [43,]  FALSE  TRUE
## [44,]  FALSE FALSE
## [45,]  FALSE FALSE
## [46,]  FALSE FALSE
## [47,]  FALSE FALSE
## [48,]  FALSE FALSE
## [49,]  FALSE FALSE
## [50,]  FALSE FALSE
## [51,]  FALSE FALSE
## [52,]   TRUE FALSE
## [53,]  FALSE FALSE
## [54,]  FALSE FALSE
## [55,]  FALSE FALSE
## [56,]  FALSE FALSE
## [57,]  FALSE FALSE
## [58,]  FALSE FALSE
## [59,]  FALSE FALSE
## [60,]   TRUE FALSE
## [61,]  FALSE FALSE
## [62,]  FALSE FALSE
## [63,]  FALSE FALSE
## [64,]  FALSE FALSE
## [65,]  FALSE FALSE
## [66,]  FALSE FALSE
## [67,]  FALSE FALSE
## [68,]  FALSE FALSE
## [69,]  FALSE  TRUE
## [70,]  FALSE FALSE
## [71,]  FALSE FALSE
# Build the NA mask once, then report where the NA cells are and how many
# there are. Positions are column-major: values above nrow(chickens) fall in
# the second column.
na_cells <- is.na(chickens)
which(na_cells)
## [1]   3  30  52  60  74 114 140
sum(na_cells)
## [1] 7

Change all of the missing elements to NA in the data frame (`ch_df` in the assignment; here a copy of `chickens` named `mis_ele_na`).

# Work on a copy so the raw import stays available for comparison.
mis_ele_na <- chickens
# Empty strings, a single space, and the literal text "na" all stand in for a
# missing value; flag them in one mask and blank them out together.
blank_or_na_text <- mis_ele_na == "" | mis_ele_na == " " | mis_ele_na == "na"
mis_ele_na[blank_or_na_text] <- NA
print(mis_ele_na)
##    weight      feed
## 1     206  meatmeal
## 2     140 horsebean
## 3    <NA>      <NA>
## 4     318 sunflower
## 5     332    casein
## 6    <NA> horsebean
## 7     216      <NA>
## 8     143 horsebean
## 9     271   soybean
## 10    315  meatmeal
## 11    227 horsebean
## 12    N/A sunflower
## 13    322 sunflower
## 14    352    casein
## 15    329  not sure
## 16    N/A   linseed
## 17    379    casein
## 18    153         ?
## 19    N/A   linseed
## 20    213   linseed
## 21    257      <NA>
## 22    179 horsebean
## 23    380  meatmeal
## 24    327   soybean
## 25    260   linseed
## 26    168 horsebean
## 27    248   soybean
## 28    181   linseed
## 29    160 horsebean
## 30   <NA> sunflower
## 31   <NA>   soybean
## 32    340 sunflower
## 33    260    casein
## 34    169         ?
## 35    171   soybean
## 36    368    casein
## 37    283    casein
## 38    334 sunflower
## 39      -   unknown
## 40    309   linseed
## 41   <NA>   soybean
## 42    295         ?
## 43    404      <NA>
## 44    392 sunflower
## 45   <NA>    casein
## 46    267   soybean
## 47    303  meatmeal
## 48    250   soybean
## 49    243   soybean
## 50    108 horsebean
## 51    229   linseed
## 52   <NA> horsebean
## 53    222    casein
## 54    344  meatmeal
## 55    263   unknown
## 56    148   linseed
## 57    318    casein
## 58      -  meatmeal
## 59    258  meatmeal
## 60   <NA> sunflower
## 61    325  meatmeal
## 62    217      <NA>
## 63    271   linseed
## 64    244   linseed
## 65    341 sunflower
## 66    141         ?
## 67    158   soybean
## 68    423 sunflower
## 69    316      <NA>
## 70   <NA>   soybean
## 71   <NA>    casein
view(mis_ele_na)

# Every text placeholder that stands for "no value" in either column.
# Matching is done on trimmed, lower-cased text so padded or differently-cased
# variants (" N/A", "Unknown") are caught as well.
missing_tokens <- c("", "na", "n/a", "-", "?", "unknown", "not sure")

# Replace placeholders with NA in both columns in one pass. %in% never returns
# NA, so cells that are already NA are simply left untouched.
mis_ele_na <- mis_ele_na %>%
  mutate(across(
    c(weight, feed),
    function(column) {
      replace(column, tolower(trimws(column)) %in% missing_tokens, NA)
    }
  ))
view(mis_ele_na)

QUESTION 4

Calculate the percentage of missing data from the weight column, the feed column, and the entire dataset. Print out each result in the following format: “Percentage of missing data in [fill in the column or dataset name]: [fill in percentage]%.”

df <- mis_ele_na

# Total number of cells currently stored as NA across both columns.
sum(is.na(df))
## [1] 22
# mean() of a logical is the share of TRUE values, so this is the percentage
# of NA cells in the whole data frame.
percentage <- mean(is.na(df)) * 100
percentage
## [1] 15.49296

# Same calculation per column, plus the whole-data-set figure from above.
# Only cells that are NA at this point are counted; placeholder text that has
# not been converted to NA is not.
missing_pct <- c(
  "weight" = mean(is.na(df$weight)) * 100,
  "feed" = mean(is.na(df$feed)) * 100,
  "chicken data set" = percentage
)
# One line per result, rounded to two decimals for readability.
writeLines(paste0(
  "Percentage of missing data in ", names(missing_pct), ": ",
  round(missing_pct, 2), "%"
))

QUESTION 5

Group the data by feed and find the mean and median weight for each group. Your result should be a new data frame with the group means in a column named weight_mean and the group medians in a column named weight_median. Save this new data frame; you can name the data frame as you wish. (Remember that variable names should be somewhat descriptive of what they contain.)

class(df)
## [1] "data.frame"
class(df$weight)
## [1] "character"

# weight was read as text, so convert it before doing arithmetic. Any
# non-numeric text still present becomes NA (R warns when that happens).
df$weight <- as.numeric(df$weight)
## Warning: NAs introduced by coercion
glimpse(df$weight)
##  num [1:71] 206 140 NA 318 332 NA 216 143 271 315 ...

# Mean and median weight per feed. Missing weights are left as NA and skipped
# with na.rm = TRUE rather than filled in, so each group's statistics are based
# only on weights that were actually recorded. The result is stored under its
# own name so the cleaned row-level data in df is not overwritten.
feed_weight_summary <- df %>%
  group_by(feed) %>%
  summarise(
    weight_mean = mean(weight, na.rm = TRUE),
    weight_median = median(weight, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  as.data.frame()
feed_weight_summary

QUESTION 6

Find the feed that has the maximum median chicken weight.

# Rows with an NA feed are not a feed type, so they are excluded before
# picking the group with the largest median (ties, if any, are all returned).
max_median_feed <- feed_weight_summary %>%
  filter(!is.na(feed)) %>%
  slice_max(weight_median, n = 1)
max_median_feed

# The winning median value on its own.
median_weight_max <- max_median_feed$weight_median[1]
print(median_weight_max)