Data cleaning and manipulation

Load & remove

Load necessary packages

install.packages("tidyverse")

## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.1'
## (as 'lib' is unspecified)

install.packages("skimr")

## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.1'
## (as 'lib' is unspecified)

library(tidyverse)

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──

## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.4     ✓ dplyr   1.0.7
## ✓ tidyr   1.1.3     ✓ stringr 1.4.0
## ✓ readr   2.0.1     ✓ forcats 0.5.1

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

Load dataset

# Read the raw data from avocado.csv and print the structure of the result
df <- read.csv(file = "avocado.csv")
str(object = df)

## 'data.frame':    18249 obs. of  14 variables:
##  $ X           : int  0 1 2 3 4 5 6 7 8 9 ...
##  $ Date        : chr  "2015-12-27" "2015-12-20" "2015-12-13" "2015-12-06" ...
##  $ AveragePrice: num  1.33 1.35 0.93 1.08 1.28 1.26 0.99 0.98 1.02 1.07 ...
##  $ Total.Volume: num  64237 54877 118220 78992 51040 ...
##  $ X4046       : num  1037 674 795 1132 941 ...
##  $ X4225       : num  54455 44639 109150 71976 43838 ...
##  $ X4770       : num  48.2 58.3 130.5 72.6 75.8 ...
##  $ Total.Bags  : num  8697 9506 8145 5811 6184 ...
##  $ Small.Bags  : num  8604 9408 8042 5677 5986 ...
##  $ Large.Bags  : num  93.2 97.5 103.1 133.8 197.7 ...
##  $ XLarge.Bags : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ type        : chr  "conventional" "conventional" "conventional" "conventional" ...
##  $ year        : int  2015 2015 2015 2015 2015 2015 2015 2015 2015 2015 ...
##  $ region      : chr  "Albany" "Albany" "Albany" "Albany" ...

Remove unnecessary columns

# Drop the X column and the four bag-count columns by name rather than by
# position. Name-based removal does not depend on the column order of the
# CSV, and re-running this step leaves the data frame unchanged (repeating
# positional drops would remove further columns each time).
unneeded_cols <- c(
  "X", "Total.Bags", "Small.Bags", "Large.Bags", "XLarge.Bags"
)
df <- df[, !(names(df) %in% unneeded_cols), drop = FALSE]

Remove unreasonable value

# Some values in the region column are unreasonable for this analysis:
# drop every row whose region is Northeast, TotalUS, West or Southeast.
df <- df[
  !(df$region %in% c("Northeast", "TotalUS", "West", "Southeast")),
  ,
  drop = FALSE
]

Check value

Date Format

# The Date column was read in as character; inspect its class before
# converting it to a proper date type
class(df[["Date"]])

## [1] "character"

# Convert the character column to Date using ymd() from the lubridate package
library(lubridate)

## 
## Attaching package: 'lubridate'

## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union

# Parse the year-month-day strings in Date with lubridate's ymd()
df[["Date"]] <- ymd(df[["Date"]])

# Confirm the class of the converted column
class(df[["Date"]])

## [1] "Date"

Check duplicate values

# Show rows that duplicate an earlier row; an empty result means there are
# no duplicated rows
subset(df, duplicated(df))

## [1] Date         AveragePrice Total.Volume X4046        X4225       
## [6] X4770        type         year         region      
## <0 rows> (or 0-length row.names)

Add column

Day, Month, Day_of_Week

# Derive the day of month, month number and weekday name from Date in a
# single mutate() call. Columns are referenced by bare name (data masking)
# instead of df$Date, so the expressions are always evaluated against the
# data frame that is passed to mutate().
df <- mutate(
  df,
  day_col = day(Date),
  month_col = month(Date),
  day_of_week = weekdays(Date)
)

# Overview of the data set with the new columns
str(df)

## 'data.frame':    16897 obs. of  12 variables:
##  $ Date        : Date, format: "2015-12-27" "2015-12-20" ...
##  $ AveragePrice: num  1.33 1.35 0.93 1.08 1.28 1.26 0.99 0.98 1.02 1.07 ...
##  $ Total.Volume: num  64237 54877 118220 78992 51040 ...
##  $ X4046       : num  1037 674 795 1132 941 ...
##  $ X4225       : num  54455 44639 109150 71976 43838 ...
##  $ X4770       : num  48.2 58.3 130.5 72.6 75.8 ...
##  $ type        : chr  "conventional" "conventional" "conventional" "conventional" ...
##  $ year        : int  2015 2015 2015 2015 2015 2015 2015 2015 2015 2015 ...
##  $ region      : chr  "Albany" "Albany" "Albany" "Albany" ...
##  $ day_col     : int  27 20 13 6 29 22 15 8 1 25 ...
##  $ month_col   : num  12 12 12 12 11 11 11 11 11 10 ...
##  $ day_of_week : chr  "Sunday" "Sunday" "Sunday" "Sunday" ...

# Save the cleaned data frame as Final.csv, omitting the row-name column
write.csv(x = df, file = "Final.csv", row.names = FALSE)

Data cleaning and manipulation

Nguyễn Gia Huy

Load & remove

Load necessary packages

Load dataset

Remove unnecessary columns

Remove unreasonable value

Check value

Date Format

Check duplicate values

Add column

Day, Month, Day_of_Week