This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
Note: this analysis was performed using the open-source software R and RStudio.
# Import the local copy of the avocado data set with readr.
library(readr)
data <- read_csv(file = "avocado.csv")
## Rows: 12628 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): date, type, geography
## dbl (4): average_price, total_volume, year, Mileage
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Per-column overview of the data that was just imported.
summary(object = data)
## date average_price total_volume type
## Length:12628 Min. :0.500 Min. : 253 Length:12628
## Class :character 1st Qu.:1.100 1st Qu.: 15733 Class :character
## Mode :character Median :1.320 Median : 94806 Mode :character
## Mean :1.359 Mean : 325259
## 3rd Qu.:1.570 3rd Qu.: 430222
## Max. :2.780 Max. :5660216
## year geography Mileage
## Min. :2017 Length:12628 Min. : 111
## 1st Qu.:2018 Class :character 1st Qu.:1097
## Median :2019 Mode :character Median :2193
## Mean :2019 Mean :1911
## 3rd Qu.:2020 3rd Qu.:2632
## Max. :2020 Max. :2998
# Re-load the data set from GitHub, replacing the earlier `data` object.
# Reading with the "UTF-8-BOM" encoding drops a leading byte-order mark, if
# the file has one, so it does not end up in the first column name.
urlfile <- "https://raw.github.com/utjimmyx/resources/master/avocado_HAA.csv"
data <- read.csv(file = urlfile, fileEncoding = "UTF-8-BOM")
summary(object = data)
## date average_price total_volume type
## Length:12628 Min. :0.500 Min. : 253 Length:12628
## Class :character 1st Qu.:1.100 1st Qu.: 15733 Class :character
## Mode :character Median :1.320 Median : 94806 Mode :character
## Mean :1.359 Mean : 325259
## 3rd Qu.:1.570 3rd Qu.: 430222
## Max. :2.780 Max. :5660216
## year geography
## Min. :2017 Length:12628
## 1st Qu.:2018 Class :character
## Median :2019 Mode :character
## Mean :2019
## 3rd Qu.:2020
## Max. :2020
library(plyr)
# Show each column's type together with its first few values.
str(object = data)
## 'data.frame': 12628 obs. of 6 variables:
## $ date : chr "2017/12/3" "2017/12/3" "2017/12/3" "2017/12/3" ...
## $ average_price: num 1.39 1.44 1.07 1.62 1.43 1.58 1.14 1.77 1.4 1.88 ...
## $ total_volume : int 139970 3577 504933 10609 658939 38754 86646 1829 488588 21338 ...
## $ type : chr "conventional" "organic" "conventional" "organic" ...
## $ year : int 2017 2017 2017 2017 2017 2017 2017 2017 2017 2017 ...
## $ geography : chr "Albany" "Albany" "Atlanta" "Atlanta" ...
# Base-graphics histogram of the average_price column.
hist(
  x = data[["average_price"]],
  xlab = "Price in USD (US Dollar)",
  main = "Histogram of average_price"
)
library(ggplot2)
# Histogram of average_price in 30 bins; bars are filled by avocado type and
# outlined in red. The two manual fill colours are assigned to the levels of
# `type` in their default (alphabetical) order.
ggplot(data, aes(x = average_price, fill = type)) +
  geom_histogram(bins = 30, colour = "red") +
  scale_fill_manual(values = c("purple", "pink")) +
  ggtitle("Frequency of Average Price - Organic vs. Conventional")
# Simple exploratory data analysis (EDA) with ggplot: total sales volume per
# geography, stacked and coloured by year.
ggplot(
  data,
  aes(
    # Order the bars by summed volume so the ordering matches the stacked bar
    # heights; without FUN, reorder() ranks by the per-group mean instead.
    x = reorder(geography, total_volume, FUN = sum),
    y = total_volume,
    # year is stored as an integer; converting it to a factor gives every year
    # its own colour and legend key instead of a continuous gradient.
    fill = factor(year)
  )
) +
  geom_col() +
  xlab("geography") +
  ylab("total_volume") +
  labs(fill = "year") +
  theme(axis.text.x = element_text(angle = 90, size = 7))
# Sample response for year 2017 - The plot shows that Los Angeles has the highest sales volume in 2017.