Load necessary packages
install.packages("tidyverse")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.1'
## (as 'lib' is unspecified)
install.packages("skimr")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.1'
## (as 'lib' is unspecified)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5 ✓ purrr 0.3.4
## ✓ tibble 3.1.4 ✓ dplyr 1.0.7
## ✓ tidyr 1.1.3 ✓ stringr 1.4.0
## ✓ readr 2.0.1 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
Load dataset
df <- read.csv("avocado.csv")
str(df)
## 'data.frame': 18249 obs. of 14 variables:
## $ X : int 0 1 2 3 4 5 6 7 8 9 ...
## $ Date : chr "2015-12-27" "2015-12-20" "2015-12-13" "2015-12-06" ...
## $ AveragePrice: num 1.33 1.35 0.93 1.08 1.28 1.26 0.99 0.98 1.02 1.07 ...
## $ Total.Volume: num 64237 54877 118220 78992 51040 ...
## $ X4046 : num 1037 674 795 1132 941 ...
## $ X4225 : num 54455 44639 109150 71976 43838 ...
## $ X4770 : num 48.2 58.3 130.5 72.6 75.8 ...
## $ Total.Bags : num 8697 9506 8145 5811 6184 ...
## $ Small.Bags : num 8604 9408 8042 5677 5986 ...
## $ Large.Bags : num 93.2 97.5 103.1 133.8 197.7 ...
## $ XLarge.Bags : num 0 0 0 0 0 0 0 0 0 0 ...
## $ type : chr "conventional" "conventional" "conventional" "conventional" ...
## $ year : int 2015 2015 2015 2015 2015 2015 2015 2015 2015 2015 ...
## $ region : chr "Albany" "Albany" "Albany" "Albany" ...
Remove unreasonable value
# We have some unreasonable values in the region column. Accordingly, the value of West, totalUS, Northeast and Southeast
df <- subset(df,subset= !(df$region %in% c("Northeast","TotalUS","West","Southeast")))