library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✔ ggplot2 3.3.6 ✔ purrr 0.3.4
## ✔ tibble 3.1.7 ✔ dplyr 1.0.9
## ✔ tidyr 1.2.0 ✔ stringr 1.4.0
## ✔ readr 2.1.2 ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(dplyr)
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
#read in data
df <- read_csv("C:/Users/kiram/Dropbox/My PC (DESKTOP-1BIMTFK)/Downloads/mod_4_peer_review_data.csv")
## Rows: 34432 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): Date, Department, Category, CustomerCode
## dbl (2): Price, Quantity
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#summarize data
summary(df)
## Date Department Category CustomerCode
## Length:34432 Length:34432 Length:34432 Length:34432
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## Price Quantity
## Min. : 3.00 Min. : 1.00
## 1st Qu.:12.00 1st Qu.: 8.00
## Median :25.00 Median :11.00
## Mean :22.81 Mean :11.31
## 3rd Qu.:33.00 3rd Qu.:15.00
## Max. :50.00 Max. :24.00
## NA's :10 NA's :7
str(df)
## spec_tbl_df [34,432 × 6] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ Date : chr [1:34432] "1/14/2016" "7/2/2016" "7/9/2014" "9/10/2014" ...
## $ Department : chr [1:34432] "Kabobs" "Sides" "Sides" "Sides" ...
## $ Category : chr [1:34432] "Pork" "Rice" "Rice" "Rice" ...
## $ CustomerCode: chr [1:34432] "CWM11331L8O" "CWM11331L8O" "CXP4593H7E" "CWM11331L8O" ...
## $ Price : num [1:34432] 28 9 9 9 25 18 26 12 9 12 ...
## $ Quantity : num [1:34432] 11 5 14 6 7 13 9 6 11 22 ...
## - attr(*, "spec")=
## .. cols(
## .. Date = col_character(),
## .. Department = col_character(),
## .. Category = col_character(),
## .. CustomerCode = col_character(),
## .. Price = col_double(),
## .. Quantity = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
head(df, n=10)
tail(df, n=10)
slice_sample(df, n=10)
#chaange date from character to date
df$Date<-mdy(df$Date)
#change department from character to factor
df$Department<-as.factor(df$Department)
#change Category from character to factor
df$Category<-as.factor(df$Category)
#3. Display the count of NA values in each column.
df %>% map(is.na) %>% map(sum)
## $Date
## [1] 0
##
## $Department
## [1] 0
##
## $Category
## [1] 0
##
## $CustomerCode
## [1] 0
##
## $Price
## [1] 10
##
## $Quantity
## [1] 7
#4. Display a bar chart for Category column
category_df <- df %>% group_by(Category) %>% summarise(Total_Quantity = sum(Quantity, na.rm = T)) %>% arrange(desc(Total_Quantity))
ggplot(data = category_df, aes(x = reorder(Category, - Total_Quantity), y = Total_Quantity, fill = Total_Quantity)) + geom_col() + labs(x = 'Category', y = 'Quantity', title = 'Bar Chart of Quantity by Category')+ theme(legend.position = 'none')
#5. Display the Departments and their revenue using a bar chart. Order the bars in a meaningful way.
df$Revenue<- with(df, Price * Quantity)
ggplot(df, aes(x=factor(Department), y=Revenue)) +
geom_col() +
labs(title = 'Bar Plot-Revenue by Department')
## Warning: Removed 17 rows containing missing values (position_stack).
#6. Create a histogram and box and whisker plot of the Price and
Quantity columns
ggplot(df, aes(x=Price))+ geom_histogram() +labs(title='Histogram of Price')
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 10 rows containing non-finite values (stat_bin).
ggplot(df, aes(x=Quantity))+ geom_histogram() +labs(title='Histogram of Quantity')
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 7 rows containing non-finite values (stat_bin).
ggplot(df, aes(x=Price, y=Quantity)) +
geom_boxplot() +
labs(title = 'Box Plot-Quantity Sold by Price')
## Warning: Continuous x aesthetic -- did you forget aes(group=...)?
## Warning: Removed 10 rows containing missing values (stat_boxplot).
## Warning: Removed 7 rows containing non-finite values (stat_boxplot).
#7 Write a short essay (150-200 words) to compare the strengths and weaknesses of (1) Power BI and (2) Alteryx with that of R, for this kind of analysis. You may discuss how each of these fare in terms of replicability, ease of use, cost, ability to share results with others, scalability, etc.
#Power BI’ strengths include ease of use, accessibilty to those who don’t know programming languages, and offers many functions in its free version such as custom visualizations and custom Excel integrations. It’s also easy to share and replicate. Some weaknesses it has are formulas and visualizations are rigid, has limitations in volumes of data it can handle, and doesn’t handle complex data relationships.
#R’s strengths include it’s open source (free to use), has a wide variety of packages to use, and does better with complex statistical relationships. Weaknesses include that it has a steep learning curve, is fussy when it comes to the language, and security will be questionable since it is open source. Despite being open source, it’s not very scalable, because it’s harder to duplicate results with those unfamiliar with programming languages. For this reason, it would be much more difficult to scale.
#Alteryx’s strengths includes easy to use and replicate, lots of functionality, easy to scale, and share results. Weaknesses include cost and rigidity of visualizations. ###