install.packages("tidyverse")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.3'
## (as 'lib' is unspecified)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
Before diving into data manipulation, it’s crucial to first conduct a thorough inspection of the dataset. This initial exploration helps us identify what needs to be cleaned or transformed, and it also informs us about the kind of output or analysis we should aim for.
Let’s import the data!
data<- read.csv("Grooming_Store.csv")
For today’s data, we have our data frame from a table of a pet grooming shop. The shop had been collecting simple information for client analysis. Your task of this week is to familiarize yourself with what is in the data frame and do some simply cleaning for further analysis:
Here is the form that was filled out during each grooming visit
head(data)
## Pet_Name Age Hair_Length Weight New_Client Pricing Gender Breed
## 1 Buddy 3 Long 20 TRUE 50 0 Labrador
## 2 Bella 4 Medium 25 FALSE 60 1 Poodle
## 3 Charlie 2 Short 15 TRUE 40 0 Beagle
## 4 Lucy 5 Long NA FALSE 70 1 Golden Retriever
## 5 Max 1 Medium 10 TRUE NA 0 Shih Tzu
## 6 Sadie 6 Short 30 FALSE 80 1 Dachshund
## Mood_Entry Mood_Showering Mood_Grooming
## 1 Happy Calm Excited
## 2 Anxious Happy Relaxed
## 3 Excited Anxious Calm
## 4 Calm Happy Excited
## 5 Happy Anxious Relaxed
## 6 Anxious Calm Happy
head(data, 3)
## Pet_Name Age Hair_Length Weight New_Client Pricing Gender Breed Mood_Entry
## 1 Buddy 3 Long 20 TRUE 50 0 Labrador Happy
## 2 Bella 4 Medium 25 FALSE 60 1 Poodle Anxious
## 3 Charlie 2 Short 15 TRUE 40 0 Beagle Excited
## Mood_Showering Mood_Grooming
## 1 Calm Excited
## 2 Happy Relaxed
## 3 Anxious Calm
tail(data)
## Pet_Name Age Hair_Length Weight New_Client Pricing Gender Breed
## 15 Judy 15 Short 75 TRUE 170 1 Boxer
## 16 Rosie 16 Long NA FALSE 180 1 Chihuahua
## 17 Rex 17 Medium 85 TRUE 190 0 Labrador
## 18 Ruby NA Short 90 FALSE 200 1 Golden Retriever
## 19 Milo 19 Long 95 TRUE 210 0 Poodle
## 20 Milo 19 Long 95 TRUE 210 0 Poodle
## Mood_Entry Mood_Showering Mood_Grooming
## 15 Excited Relaxed Calm
## 16 Calm Anxious Happy
## 17 Happy Excited Anxious
## 18 Anxious Calm Relaxed
## 19 Excited Happy Calm
## 20 Excited Happy Calm
tail(data, 3)
## Pet_Name Age Hair_Length Weight New_Client Pricing Gender Breed
## 18 Ruby NA Short 90 FALSE 200 1 Golden Retriever
## 19 Milo 19 Long 95 TRUE 210 0 Poodle
## 20 Milo 19 Long 95 TRUE 210 0 Poodle
## Mood_Entry Mood_Showering Mood_Grooming
## 18 Anxious Calm Relaxed
## 19 Excited Happy Calm
## 20 Excited Happy Calm
nrow(data)
## [1] 20
num_rows<- nrow(data)
ncol(data)
## [1] 11
num_columns<- ncol(data)
length(data) # count the number of variables
## [1] 11
length(data$Age) # count the number of responses
## [1] 20
numeric (as for age, weight, pricing)
minimum, 1st quartile, median, mean, 3rd quartile, maximum values
categorical (Pet_name, Hair_length, etc)
Length, Class, Mode
logical (New_Client)
Count TRUEs and FALSEs, and NAs if present
summary(data)
## Pet_Name Age Hair_Length Weight
## Length:20 Min. : 1.00 Length:20 Min. :10.00
## Class :character 1st Qu.: 5.00 Class :character 1st Qu.:30.00
## Mode :character Median :10.00 Mode :character Median :55.00
## Mean :10.06 Mean :53.82
## 3rd Qu.:15.00 3rd Qu.:75.00
## Max. :19.00 Max. :95.00
## NA's :3 NA's :3
## New_Client Pricing Gender Breed
## Mode :logical Min. : 40.0 Min. :0.0 Length:20
## FALSE:9 1st Qu.: 82.5 1st Qu.:0.0 Class :character
## TRUE :11 Median :125.0 Median :0.5 Mode :character
## Mean :128.9 Mean :0.5
## 3rd Qu.:177.5 3rd Qu.:1.0
## Max. :210.0 Max. :1.0
## NA's :2
## Mood_Entry Mood_Showering Mood_Grooming
## Length:20 Length:20 Length:20
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
# I can select the desired columns and create a new data frame
data_Client_Info<- data %>% select(Pet_Name, Hair_Length, Weight, New_Client, Pricing, Gender, Breed)
# or I can delete undesired columns from the data frame to create a new data frame
data_Client_Info_2<- data %>% select(-Mood_Entry, -Mood_Showering, -Mood_Grooming)
%>% is called the pipe operator. It provides a method for chaining commands so you can run multiple commands at once instead of doing them one by one. Makes your data cleaner. You could think of it as a conjunction or a comma like in grammar.
Removing missing values (known as NA in R) from your data is important for a few reasons:
Calculations: Many math functions don’t work well with missing values.
Charts: Missing values can mess up your graphs.
Sorting Data: It’s easier to organize your data without missing values.
Learning from Data: If you’re using tools to learn from your data, they usually need complete data to work well.
Sharing Data: When you share your data with others or use it in different software, missing values can cause problems.
colSums(is.na(data))
## Pet_Name Age Hair_Length Weight New_Client
## 0 3 0 3 0
## Pricing Gender Breed Mood_Entry Mood_Showering
## 2 0 0 0 0
## Mood_Grooming
## 0
How did this came about? Lets break it apart: ##### is.na(): check if a value in the vector is NA
#Let's use variable Age as a example
data$Age
## [1] 3 4 2 5 1 6 7 NA 9 10 11 NA 13 14 15 16 17 NA 19 19
is.na (data$Age)
## [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE TRUE
## [13] FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE
#The function returns a logical vector, indicating 'TRUE' where the data is 'NA' and 'FALSE' otherwise.
# This function is used to calculate the sum of each column in a matrix or data frame, in this case, we want the number of NAs from the data frame
colSums(is.na(data))
## Pet_Name Age Hair_Length Weight New_Client
## 0 3 0 3 0
## Pricing Gender Breed Mood_Entry Mood_Showering
## 2 0 0 0 0
## Mood_Grooming
## 0
# If we are only working with column Age, lets count the number of NA in the column
sum(is.na(data$Age))
## [1] 3
Now here is a task your boss gave you: I want the average age of the dogs
data$Age
## [1] 3 4 2 5 1 6 7 NA 9 10 11 NA 13 14 15 16 17 NA 19 19
mean(data$Age)
## [1] NA
But if we just go in to calculate the average using mean(), it didn’t work. Because there are presence of NA in our data!
Now, what if you want to get rid of these pesky ‘NA’ values? Simple, you can use the na.omit() function.
#Remove NA from the column age
na_removal_Age <- na.omit(data$Age)
na_removal_Age
## [1] 3 4 2 5 1 6 7 9 10 11 13 14 15 16 17 19 19
## attr(,"na.action")
## [1] 8 12 18
## attr(,"class")
## [1] "omit"
#Calculate the average once again
mean_Age_1<-mean(na_removal_Age)
mean_Age_1
## [1] 10.05882
Alternatively, you might want to replace ‘NA’ with a specific value. The boss told you, I want to replace the incomplete NA with 10
# Setting my vector to column Age from data
na_replace_Age <- data$Age
# If is.na is True, for the value is NA, it will be replaced with 10
na_replace_Age [is.na(data$Age)] <- 10
na_replace_Age
## [1] 3 4 2 5 1 6 7 10 9 10 11 10 13 14 15 16 17 10 19 19
#Once again,calculate the average
mean_Age_2<-mean(na_replace_Age)
mean_Age_2
## [1] 10.05
Let’s say you want to calculate the average of these dogs, ignoring the ‘NA’ values. You can do this by adding a parameter na.rm=TRUE to the mean() function
#use na.rm=TRUE to ignore na is it exist
mean_Age_3 <-mean(data$Age,na.rm=TRUE)
mean_Age_3
## [1] 10.05882
# Use complete.cases to identify rows with NA values
# complete.cases returns a logical vector where TRUE indicates rows without NA
rows_without_na<-complete.cases(data)
# To get rows with NA, you can negate the logical vector. The ! symbol reverses the logical condition, turning TRUE into FALSE
rows_with_na <- !rows_without_na
data[rows_with_na, ]
## Pet_Name Age Hair_Length Weight New_Client Pricing Gender Breed
## 4 Lucy 5 Long NA FALSE 70 1 Golden Retriever
## 5 Max 1 Medium 10 TRUE NA 0 Shih Tzu
## 8 Molly NA Medium 40 FALSE 100 1 Boxer
## 9 Rocky 9 Short NA TRUE 110 0 Chihuahua
## 12 Lola NA Short 60 FALSE NA 1 Beagle
## 16 Rosie 16 Long NA FALSE 180 1 Chihuahua
## 18 Ruby NA Short 90 FALSE 200 1 Golden Retriever
## Mood_Entry Mood_Showering Mood_Grooming
## 4 Calm Happy Excited
## 5 Happy Anxious Relaxed
## 8 Calm Anxious Excited
## 9 Happy Excited Anxious
## 12 Calm Excited Relaxed
## 16 Calm Anxious Happy
## 18 Anxious Calm Relaxed
# Now you boss want datasets for age under 10 and age including and above 10 because older dogs needs more service time
Age_above_include10<-data %>%
filter(Age>=10)%>%
# When you created these two data frame, the boss said: "well, I want them to be sorted in ascending order so it is easier for me to find the client's info"
arrange(Age)
Age_above_include10
## Pet_Name Age Hair_Length Weight New_Client Pricing Gender Breed
## 1 Daisy 10 Long 50 FALSE 120 1 Golden Retriever
## 2 Zeus 11 Medium 55 TRUE 130 0 Poodle
## 3 Coco 13 Long 65 TRUE 150 1 Shih Tzu
## 4 Simba 14 Medium 70 FALSE 160 0 Dachshund
## 5 Judy 15 Short 75 TRUE 170 1 Boxer
## 6 Rosie 16 Long NA FALSE 180 1 Chihuahua
## 7 Rex 17 Medium 85 TRUE 190 0 Labrador
## 8 Milo 19 Long 95 TRUE 210 0 Poodle
## 9 Milo 19 Long 95 TRUE 210 0 Poodle
## Mood_Entry Mood_Showering Mood_Grooming
## 1 Anxious Relaxed Happy
## 2 Excited Calm Anxious
## 3 Happy Anxious Calm
## 4 Anxious Happy Excited
## 5 Excited Relaxed Calm
## 6 Calm Anxious Happy
## 7 Happy Excited Anxious
## 8 Excited Happy Calm
## 9 Excited Happy Calm
# The next task the boss gave you is to create another column Years_Above_10 that tells her the dogs actual age - 10,
Age_above_include10 %>% mutate(Years_Above_10= (Age-10))
## Pet_Name Age Hair_Length Weight New_Client Pricing Gender Breed
## 1 Daisy 10 Long 50 FALSE 120 1 Golden Retriever
## 2 Zeus 11 Medium 55 TRUE 130 0 Poodle
## 3 Coco 13 Long 65 TRUE 150 1 Shih Tzu
## 4 Simba 14 Medium 70 FALSE 160 0 Dachshund
## 5 Judy 15 Short 75 TRUE 170 1 Boxer
## 6 Rosie 16 Long NA FALSE 180 1 Chihuahua
## 7 Rex 17 Medium 85 TRUE 190 0 Labrador
## 8 Milo 19 Long 95 TRUE 210 0 Poodle
## 9 Milo 19 Long 95 TRUE 210 0 Poodle
## Mood_Entry Mood_Showering Mood_Grooming Years_Above_10
## 1 Anxious Relaxed Happy 0
## 2 Excited Calm Anxious 1
## 3 Happy Anxious Calm 3
## 4 Anxious Happy Excited 4
## 5 Excited Relaxed Calm 5
## 6 Calm Anxious Happy 6
## 7 Happy Excited Anxious 7
## 8 Excited Happy Calm 9
## 9 Excited Happy Calm 9
Age_above_include10
## Pet_Name Age Hair_Length Weight New_Client Pricing Gender Breed
## 1 Daisy 10 Long 50 FALSE 120 1 Golden Retriever
## 2 Zeus 11 Medium 55 TRUE 130 0 Poodle
## 3 Coco 13 Long 65 TRUE 150 1 Shih Tzu
## 4 Simba 14 Medium 70 FALSE 160 0 Dachshund
## 5 Judy 15 Short 75 TRUE 170 1 Boxer
## 6 Rosie 16 Long NA FALSE 180 1 Chihuahua
## 7 Rex 17 Medium 85 TRUE 190 0 Labrador
## 8 Milo 19 Long 95 TRUE 210 0 Poodle
## 9 Milo 19 Long 95 TRUE 210 0 Poodle
## Mood_Entry Mood_Showering Mood_Grooming
## 1 Anxious Relaxed Happy
## 2 Excited Calm Anxious
## 3 Happy Anxious Calm
## 4 Anxious Happy Excited
## 5 Excited Relaxed Calm
## 6 Calm Anxious Happy
## 7 Happy Excited Anxious
## 8 Excited Happy Calm
## 9 Excited Happy Calm
# Now that the age column in this dataset is fairly clean, we can start to draw some statistic conclusion using summarise()
# Mean
Age_above_include10%>% summarise(mean(Age))
## mean(Age)
## 1 14.88889
# Unique values
Age_above_include10%>% summarise(n_distinct(Age))
## n_distinct(Age)
## 1 8
# Median
Age_above_include10%>% summarise(median(Age))
## median(Age)
## 1 15
# there are much more than what we did, here is link to [summarise()](https://docs.google.com/document/d/1jvgy0FhBve4CP1QN798KMblIecxZqRDjOLTtiZxmrQ8/edit?usp=sharing)
# Now for the dogs that either is 10 or above, your boss want you to group the data set by gender and count
# Let's print the column Gender
Age_above_include10$Gender
## [1] 1 0 1 0 1 1 0 0 0
Age_above_include10 %>%
group_by(Gender) %>%
summarise(Count = n())
## # A tibble: 2 × 2
## Gender Count
## <int> <int>
## 1 0 5
## 2 1 4
# Boss want you to give her the average weight of female and male dog to see if males are heavier than females
# Now that we want to involve other columns, lets clean the NAs in Weight and gender first
cleaned_Gender_Weight<-Age_above_include10 %>%
na.omit(Weight)%>%
na.omit(Gender)
cleaned_Gender_Weight
## Pet_Name Age Hair_Length Weight New_Client Pricing Gender Breed
## 1 Daisy 10 Long 50 FALSE 120 1 Golden Retriever
## 2 Zeus 11 Medium 55 TRUE 130 0 Poodle
## 3 Coco 13 Long 65 TRUE 150 1 Shih Tzu
## 4 Simba 14 Medium 70 FALSE 160 0 Dachshund
## 5 Judy 15 Short 75 TRUE 170 1 Boxer
## 7 Rex 17 Medium 85 TRUE 190 0 Labrador
## 8 Milo 19 Long 95 TRUE 210 0 Poodle
## 9 Milo 19 Long 95 TRUE 210 0 Poodle
## Mood_Entry Mood_Showering Mood_Grooming
## 1 Anxious Relaxed Happy
## 2 Excited Calm Anxious
## 3 Happy Anxious Calm
## 4 Anxious Happy Excited
## 5 Excited Relaxed Calm
## 7 Happy Excited Anxious
## 8 Excited Happy Calm
## 9 Excited Happy Calm
average_by_Gender <- cleaned_Gender_Weight %>%
group_by(Gender) %>%
summarise(Avg_weight = mean(Weight))
average_by_Gender
## # A tibble: 2 × 2
## Gender Avg_weight
## <int> <dbl>
## 1 0 80
## 2 1 63.3
# Before you go, boss want you to create another data set that contains the information about the dogs mood
# grep(): can use the pattern you entered to find matching items
# Find columns that start with "Mood_"
mood_columns <- grep("^Mood_", names(data), value = TRUE)
mood_columns
## [1] "Mood_Entry" "Mood_Showering" "Mood_Grooming"
mood_data<-data[, mood_columns]
mood_data
## Mood_Entry Mood_Showering Mood_Grooming
## 1 Happy Calm Excited
## 2 Anxious Happy Relaxed
## 3 Excited Anxious Calm
## 4 Calm Happy Excited
## 5 Happy Anxious Relaxed
## 6 Anxious Calm Happy
## 7 Excited Happy Calm
## 8 Calm Anxious Excited
## 9 Happy Excited Anxious
## 10 Anxious Relaxed Happy
## 11 Excited Calm Anxious
## 12 Calm Excited Relaxed
## 13 Happy Anxious Calm
## 14 Anxious Happy Excited
## 15 Excited Relaxed Calm
## 16 Calm Anxious Happy
## 17 Happy Excited Anxious
## 18 Anxious Calm Relaxed
## 19 Excited Happy Calm
## 20 Excited Happy Calm
#If we got through most or all of the content, applause for yourself!