###Ready to share to students
library(readr)
## Warning: replacing previous import 'lifecycle::last_warnings' by
## 'rlang::last_warnings' when loading 'tibble'
## Warning: replacing previous import 'ellipsis::check_dots_unnamed' by
## 'rlang::check_dots_unnamed' when loading 'tibble'
## Warning: replacing previous import 'ellipsis::check_dots_used' by
## 'rlang::check_dots_used' when loading 'tibble'
## Warning: replacing previous import 'ellipsis::check_dots_empty' by
## 'rlang::check_dots_empty' when loading 'tibble'
## Warning: replacing previous import 'lifecycle::last_warnings' by
## 'rlang::last_warnings' when loading 'pillar'
## Warning: replacing previous import 'ellipsis::check_dots_unnamed' by
## 'rlang::check_dots_unnamed' when loading 'pillar'
## Warning: replacing previous import 'ellipsis::check_dots_used' by
## 'rlang::check_dots_used' when loading 'pillar'
## Warning: replacing previous import 'ellipsis::check_dots_empty' by
## 'rlang::check_dots_empty' when loading 'pillar'
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2 ✓ dplyr 1.0.2
## ✓ tibble 3.0.4 ✓ stringr 1.4.0
## ✓ tidyr 1.1.2 ✓ forcats 0.5.0
## ✓ purrr 0.3.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
Cleaned_Trees <- read_csv("Cleaned Trees.csv")
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## City_or_customer = col_character(),
## Type = col_character(),
## Date = col_character(),
## Num = col_character(),
## Ship_To_Address2 = col_character(),
## Ship_Zip = col_character(),
## Item = col_character(),
## Qty = col_double(),
## Sales_Price = col_character(),
## Amount = col_double()
## )
#View(Cleaned_Trees)
df <- Cleaned_Trees %>%
filter(!is.na(Sales_Price)) %>%
select(!City_or_customer)
#View(df)
df2 <- df %>%
mutate(Ship_To_Address2 = gsub("^[0-9]+", "", Ship_To_Address2))
#View(df2)
#df2 is the best
###Evan Practice
items <- df2 %>%
count(Item)
items
## # A tibble: 1,309 x 2
## Item n
## <chr> <int>
## 1 "#7 Sugar Maple" 1
## 2 "1.5\" White Fringe Tree" 1
## 3 "2.5\" Winter King Hawthorn" 1
## 4 "2\" Shagbark Hickory" 1
## 5 "3\" Black Walnut" 1
## 6 "Anchoring" 1
## 7 "Bundle of Firewood" 5
## 8 "Cedar:6' Alaskan Weeping Cedar" 13
## 9 "Cedar:6' Red Cedar" 3
## 10 "Cedar:7' Alaskan Weeping Cedar" 9
## # … with 1,299 more rows
items %>% arrange(desc(n))
## # A tibble: 1,309 x 2
## Item n
## <chr> <int>
## 1 Installation 2952
## 2 Extended Warranty 1382
## 3 Tree Staking 1299
## 4 Replacement Discount 1052
## 5 Trip Charge 684
## 6 Fertilization 482
## 7 Trees:Arborvitae:6' Emerald Green Arborvitae 415
## 8 Delivery Fee 391
## 9 Trees:Spruce:6' Norway Spruce 384
## 10 Invoice Balancer 288
## # … with 1,299 more rows
Install <- df2 %>%
filter(Item == "Installation")
Install
## # A tibble: 2,952 x 9
## Type Date Num Ship_To_Address2 Ship_Zip Item Qty Sales_Price Amount
## <chr> <chr> <chr> <chr> <chr> <chr> <dbl> <chr> <dbl>
## 1 Invoi… 5/31/… 9065R " Mill Pond Rd." 46711 Inst… 1 231 231
## 2 Invoi… 9/15/… SI-61… " Stony Creek C… 46714 Inst… 1 533 533
## 3 Invoi… 5/22/… SI-16… " North Etna Rd" 46725 Inst… 1 158.33 158.
## 4 Invoi… 8/17/… SI-15… " E 200 S" 46725 Inst… 1 437.25 437.
## 5 Invoi… 6/8/2… 7434R " W. Business 3… 46725 Inst… 1 66.67 66.7
## 6 Invoi… 5/10/… 7975 " Ashford Drive" 46725 Inst… 1 425 425
## 7 Credi… 5/14/… 7976 " Ashford Drive" 46725 Inst… -1 425 -425
## 8 Invoi… 8/29/… 8941R " North Conrad … 46725 Inst… 1 183.33 183.
## 9 Invoi… 5/17/… SI-15… " Tula Trail" 46814 Inst… 1 66.83 66.8
## 10 Invoi… 5/17/… SI-13… " Coronado Ct." 46845 Inst… 1 75 75
## # … with 2,942 more rows
sum(Install$Amount)
## [1] 2064647
#Installation by Zip Codes
#need to filter out no address data
#why only 63 rows?? should have 754 rows
Install_Zips <- Install %>%
#Yvonne try
#filter(!is.na(Ship_Zip))
#Evan Try
filter(Ship_Zip != "NA")
Install_Zips
## # A tibble: 63 x 9
## Type Date Num Ship_To_Address2 Ship_Zip Item Qty Sales_Price Amount
## <chr> <chr> <chr> <chr> <chr> <chr> <dbl> <chr> <dbl>
## 1 Invoi… 5/31/… 9065R " Mill Pond Rd." 46711 Inst… 1 231 231
## 2 Invoi… 9/15/… SI-61… " Stony Creek C… 46714 Inst… 1 533 533
## 3 Invoi… 5/22/… SI-16… " North Etna Rd" 46725 Inst… 1 158.33 158.
## 4 Invoi… 8/17/… SI-15… " E 200 S" 46725 Inst… 1 437.25 437.
## 5 Invoi… 6/8/2… 7434R " W. Business 3… 46725 Inst… 1 66.67 66.7
## 6 Invoi… 5/10/… 7975 " Ashford Drive" 46725 Inst… 1 425 425
## 7 Credi… 5/14/… 7976 " Ashford Drive" 46725 Inst… -1 425 -425
## 8 Invoi… 8/29/… 8941R " North Conrad … 46725 Inst… 1 183.33 183.
## 9 Invoi… 5/17/… SI-15… " Tula Trail" 46814 Inst… 1 66.83 66.8
## 10 Invoi… 5/17/… SI-13… " Coronado Ct." 46845 Inst… 1 75 75
## # … with 53 more rows
#View(Install_Zips)
ggplot(Install_Zips, aes(x = Amount,
fill = Ship_Zip)) +
geom_histogram(binwidth = 100,
alpha = 0.5)
ggplot(Install, aes(x = Amount), fill = Ship_Zip) +
geom_histogram(binwidth = 1000)
ggplot(Install, aes(x = Amount,
fill = Ship_Zip)) +
geom_histogram(binwidth = 5000,
alpha = 0.5)
##write out cleaned code
getwd()
## [1] "/cloud/project"
write.csv(df2, "df2.csv")
#trying to change date to date and sales_price to numeric
str(df2)
## tibble [22,102 × 9] (S3: tbl_df/tbl/data.frame)
## $ Type : chr [1:22102] "Invoice" "Invoice" "Invoice" "Invoice" ...
## $ Date : chr [1:22102] "7/29/2023" "7/29/2023" "5/11/2023" "5/11/2023" ...
## $ Num : chr [1:22102] "9845" "9845" "SI-1560R" "SI-1560R" ...
## $ Ship_To_Address2: chr [1:22102] " MacTavish Court" " MacTavish Court" " McDarmid Ave" " McDarmid Ave" ...
## $ Ship_Zip : chr [1:22102] "46703" "46703" "46703" "46703" ...
## $ Item : chr [1:22102] "Trees:Spruce:6' Fat Albert Blue Spruce" "Trip Charge" "Trees:Arborvitae:8' Emerald Green Arborvitae" "Replacement Discount" ...
## $ Qty : num [1:22102] 2 1 4 NA 1 NA 1 NA 1 NA ...
## $ Sales_Price : chr [1:22102] "525" "160" "450" "-1800" ...
## $ Amount : num [1:22102] 1050 160 1800 -1800 450 -450 924 -924 975 -975 ...
install.packages("lubridate")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.0'
## (as 'lib' is unspecified)
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(glue)
##
## Attaching package: 'glue'
## The following object is masked from 'package:dplyr':
##
## collapse
df2
## # A tibble: 22,102 x 9
## Type Date Num Ship_To_Address2 Ship_Zip Item Qty Sales_Price Amount
## <chr> <chr> <chr> <chr> <chr> <chr> <dbl> <chr> <dbl>
## 1 Invoi… 7/29… 9845 " MacTavish Cou… 46703 "Trees… 2 525 1050
## 2 Invoi… 7/29… 9845 " MacTavish Cou… 46703 "Trip … 1 160 160
## 3 Invoi… 5/11… SI-1… " McDarmid Ave" 46703 "Trees… 4 450 1800
## 4 Invoi… 5/11… SI-1… " McDarmid Ave" 46703 "Repla… NA -1800 -1800
## 5 Invoi… 7/11… SI-1… " McDarmid Ave" 46703 "Trees… 1 450 450
## 6 Invoi… 7/11… SI-1… " McDarmid Ave" 46703 "Repla… NA -450 -450
## 7 Invoi… 4/12… 7662R " Menza Drive" 46706 "Trees… 1 924 924
## 8 Invoi… 4/12… 7662R " Menza Drive" 46706 "Repla… NA -924 -924
## 9 Invoi… 5/19… 8378R " Cascina Lane" 46706 "Trees… 1 975 975
## 10 Invoi… 5/19… 8378R " Cascina Lane" 46706 "Repla… NA -975 -975
## # … with 22,092 more rows
fixed_date <- df2 %>%
mutate(Date = mdy(Date)) %>%
mutate(Sales_Price = as.numeric(Sales_Price))
## Warning: Problem with `mutate()` input `Sales_Price`.
## i NAs introduced by coercion
## i Input `Sales_Price` is `as.numeric(Sales_Price)`.
## Warning in mask$eval_all_mutate(dots[[i]]): NAs introduced by coercion
fixed_date
## # A tibble: 22,102 x 9
## Type Date Num Ship_To_Address2 Ship_Zip Item Qty Sales_Price
## <chr> <date> <chr> <chr> <chr> <chr> <dbl> <dbl>
## 1 Invo… 2023-07-29 9845 " MacTavish Cou… 46703 "Tre… 2 525
## 2 Invo… 2023-07-29 9845 " MacTavish Cou… 46703 "Tri… 1 160
## 3 Invo… 2023-05-11 SI-1… " McDarmid Ave" 46703 "Tre… 4 450
## 4 Invo… 2023-05-11 SI-1… " McDarmid Ave" 46703 "Rep… NA -1800
## 5 Invo… 2023-07-11 SI-1… " McDarmid Ave" 46703 "Tre… 1 450
## 6 Invo… 2023-07-11 SI-1… " McDarmid Ave" 46703 "Rep… NA -450
## 7 Invo… 2023-04-12 7662R " Menza Drive" 46706 "Tre… 1 924
## 8 Invo… 2023-04-12 7662R " Menza Drive" 46706 "Rep… NA -924
## 9 Invo… 2023-05-19 8378R " Cascina Lane" 46706 "Tre… 1 975
## 10 Invo… 2023-05-19 8378R " Cascina Lane" 46706 "Rep… NA -975
## # … with 22,092 more rows, and 1 more variable: Amount <dbl>
fixed_date %>%
select(Sales_Price) %>%
count(Sales_Price) %>%
arrange(desc(n))
## # A tibble: 2,788 x 2
## Sales_Price n
## <dbl> <int>
## 1 35 2807
## 2 85 496
## 3 375 457
## 4 175 375
## 5 325 349
## 6 0 325
## 7 395 317
## 8 475 292
## 9 425 289
## 10 495 276
## # … with 2,778 more rows
#when do they make their sales
count_dates <- fixed_date %>%
count(Date) %>%
arrange(desc(n))
count_dates
## # A tibble: 1,948 x 2
## Date n
## <date> <int>
## 1 2021-04-17 79
## 2 2020-05-16 73
## 3 2023-04-29 72
## 4 2019-05-11 71
## 5 2020-05-27 70
## 6 2020-09-19 69
## 7 2020-05-02 63
## 8 2020-06-13 61
## 9 2021-04-24 60
## 10 2019-10-09 57
## # … with 1,938 more rows
count_dates %>%
ggplot(aes(x = Date, y = n, group = 1, colour = n )) +
geom_line() +
labs(title = "Number of Sales Each Day",
subtitle = "IDK YET",
x = "Date Sold", y = "Number of Sales",
colour = "Date")
count_dates %>%
ggplot(aes(x = Date, y = n, group = 1 )) +
geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
#trying but QTY is too high
fixed_date %>%
count(Date) %>%
arrange(desc(n))
## # A tibble: 1,948 x 2
## Date n
## <date> <int>
## 1 2021-04-17 79
## 2 2020-05-16 73
## 3 2023-04-29 72
## 4 2019-05-11 71
## 5 2020-05-27 70
## 6 2020-09-19 69
## 7 2020-05-02 63
## 8 2020-06-13 61
## 9 2021-04-24 60
## 10 2019-10-09 57
## # … with 1,938 more rows
#most qty
fixed_date %>%
count(Qty) %>%
arrange(desc(Qty))
## # A tibble: 70 x 2
## Qty n
## <dbl> <int>
## 1 199 1
## 2 125 1
## 3 121 1
## 4 116 1
## 5 105 1
## 6 100 1
## 7 90 1
## 8 80 1
## 9 78 1
## 10 77 1
## # … with 60 more rows
#most ammount
fixed_date %>%
count(Amount) %>%
arrange(desc(Amount))
## # A tibble: 3,048 x 2
## Amount n
## <dbl> <int>
## 1 45000 1
## 2 19552. 1
## 3 19355 1
## 4 18125 1
## 5 18000 1
## 6 17760 1
## 7 16334. 1
## 8 15750 1
## 9 15225 1
## 10 14700 1
## # … with 3,038 more rows
fixed_date
## # A tibble: 22,102 x 9
## Type Date Num Ship_To_Address2 Ship_Zip Item Qty Sales_Price
## <chr> <date> <chr> <chr> <chr> <chr> <dbl> <dbl>
## 1 Invo… 2023-07-29 9845 " MacTavish Cou… 46703 "Tre… 2 525
## 2 Invo… 2023-07-29 9845 " MacTavish Cou… 46703 "Tri… 1 160
## 3 Invo… 2023-05-11 SI-1… " McDarmid Ave" 46703 "Tre… 4 450
## 4 Invo… 2023-05-11 SI-1… " McDarmid Ave" 46703 "Rep… NA -1800
## 5 Invo… 2023-07-11 SI-1… " McDarmid Ave" 46703 "Tre… 1 450
## 6 Invo… 2023-07-11 SI-1… " McDarmid Ave" 46703 "Rep… NA -450
## 7 Invo… 2023-04-12 7662R " Menza Drive" 46706 "Tre… 1 924
## 8 Invo… 2023-04-12 7662R " Menza Drive" 46706 "Rep… NA -924
## 9 Invo… 2023-05-19 8378R " Cascina Lane" 46706 "Tre… 1 975
## 10 Invo… 2023-05-19 8378R " Cascina Lane" 46706 "Rep… NA -975
## # … with 22,092 more rows, and 1 more variable: Amount <dbl>
fixed_date %>%
ggplot(aes(x = Date, y = Qty, group = 1, colour = Amount )) +
geom_line() +
labs(title = "Number of Sales Each Day",
subtitle = "Date and Quantity",
x = "Date Sold", y = "Number of Items Sold",
colour = "Date")