###Ready to share to students
library(readr)
## Warning: replacing previous import 'lifecycle::last_warnings' by
## 'rlang::last_warnings' when loading 'tibble'
## Warning: replacing previous import 'ellipsis::check_dots_unnamed' by
## 'rlang::check_dots_unnamed' when loading 'tibble'
## Warning: replacing previous import 'ellipsis::check_dots_used' by
## 'rlang::check_dots_used' when loading 'tibble'
## Warning: replacing previous import 'ellipsis::check_dots_empty' by
## 'rlang::check_dots_empty' when loading 'tibble'
## Warning: replacing previous import 'lifecycle::last_warnings' by
## 'rlang::last_warnings' when loading 'pillar'
## Warning: replacing previous import 'ellipsis::check_dots_unnamed' by
## 'rlang::check_dots_unnamed' when loading 'pillar'
## Warning: replacing previous import 'ellipsis::check_dots_used' by
## 'rlang::check_dots_used' when loading 'pillar'
## Warning: replacing previous import 'ellipsis::check_dots_empty' by
## 'rlang::check_dots_empty' when loading 'pillar'
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2     ✓ dplyr   1.0.2
## ✓ tibble  3.0.4     ✓ stringr 1.4.0
## ✓ tidyr   1.1.2     ✓ forcats 0.5.0
## ✓ purrr   0.3.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
Cleaned_Trees <- read_csv("Cleaned Trees.csv")
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   City_or_customer = col_character(),
##   Type = col_character(),
##   Date = col_character(),
##   Num = col_character(),
##   Ship_To_Address2 = col_character(),
##   Ship_Zip = col_character(),
##   Item = col_character(),
##   Qty = col_double(),
##   Sales_Price = col_character(),
##   Amount = col_double()
## )
#View(Cleaned_Trees)

df <- Cleaned_Trees %>%
  filter(!is.na(Sales_Price)) %>%
  select(!City_or_customer)
#View(df)
df2 <- df %>%
  mutate(Ship_To_Address2 = gsub("^[0-9]+", "", Ship_To_Address2))
#View(df2)
#df2 is the best

###Evan Practice

items <- df2 %>% 
  count(Item) 
items
## # A tibble: 1,309 x 2
##    Item                                 n
##    <chr>                            <int>
##  1 "#7 Sugar Maple"                     1
##  2 "1.5\" White Fringe Tree"            1
##  3 "2.5\" Winter King Hawthorn"         1
##  4 "2\" Shagbark Hickory"               1
##  5 "3\" Black Walnut"                   1
##  6 "Anchoring"                          1
##  7 "Bundle of Firewood"                 5
##  8 "Cedar:6' Alaskan Weeping Cedar"    13
##  9 "Cedar:6' Red Cedar"                 3
## 10 "Cedar:7' Alaskan Weeping Cedar"     9
## # … with 1,299 more rows
items %>% arrange(desc(n))
## # A tibble: 1,309 x 2
##    Item                                             n
##    <chr>                                        <int>
##  1 Installation                                  2952
##  2 Extended Warranty                             1382
##  3 Tree Staking                                  1299
##  4 Replacement Discount                          1052
##  5 Trip Charge                                    684
##  6 Fertilization                                  482
##  7 Trees:Arborvitae:6' Emerald Green Arborvitae   415
##  8 Delivery Fee                                   391
##  9 Trees:Spruce:6' Norway Spruce                  384
## 10 Invoice Balancer                               288
## # … with 1,299 more rows
Install <- df2 %>%
  filter(Item == "Installation")

Install
## # A tibble: 2,952 x 9
##    Type   Date   Num    Ship_To_Address2 Ship_Zip Item    Qty Sales_Price Amount
##    <chr>  <chr>  <chr>  <chr>            <chr>    <chr> <dbl> <chr>        <dbl>
##  1 Invoi… 5/31/… 9065R  " Mill Pond Rd." 46711    Inst…     1 231          231  
##  2 Invoi… 9/15/… SI-61… " Stony Creek C… 46714    Inst…     1 533          533  
##  3 Invoi… 5/22/… SI-16… " North Etna Rd" 46725    Inst…     1 158.33       158. 
##  4 Invoi… 8/17/… SI-15… " E 200 S"       46725    Inst…     1 437.25       437. 
##  5 Invoi… 6/8/2… 7434R  " W. Business 3… 46725    Inst…     1 66.67         66.7
##  6 Invoi… 5/10/… 7975   " Ashford Drive" 46725    Inst…     1 425          425  
##  7 Credi… 5/14/… 7976   " Ashford Drive" 46725    Inst…    -1 425         -425  
##  8 Invoi… 8/29/… 8941R  " North Conrad … 46725    Inst…     1 183.33       183. 
##  9 Invoi… 5/17/… SI-15… " Tula Trail"    46814    Inst…     1 66.83         66.8
## 10 Invoi… 5/17/… SI-13… " Coronado Ct."  46845    Inst…     1 75            75  
## # … with 2,942 more rows
sum(Install$Amount)
## [1] 2064647
#Installation by Zip Codes
#need to filter out no address data 

#why only 63 rows?? should have 754 rows
Install_Zips <- Install %>%
  #Yvonne try 
  #filter(!is.na(Ship_Zip)) 
  #Evan Try 
  filter(Ship_Zip != "NA")
Install_Zips
## # A tibble: 63 x 9
##    Type   Date   Num    Ship_To_Address2 Ship_Zip Item    Qty Sales_Price Amount
##    <chr>  <chr>  <chr>  <chr>            <chr>    <chr> <dbl> <chr>        <dbl>
##  1 Invoi… 5/31/… 9065R  " Mill Pond Rd." 46711    Inst…     1 231          231  
##  2 Invoi… 9/15/… SI-61… " Stony Creek C… 46714    Inst…     1 533          533  
##  3 Invoi… 5/22/… SI-16… " North Etna Rd" 46725    Inst…     1 158.33       158. 
##  4 Invoi… 8/17/… SI-15… " E 200 S"       46725    Inst…     1 437.25       437. 
##  5 Invoi… 6/8/2… 7434R  " W. Business 3… 46725    Inst…     1 66.67         66.7
##  6 Invoi… 5/10/… 7975   " Ashford Drive" 46725    Inst…     1 425          425  
##  7 Credi… 5/14/… 7976   " Ashford Drive" 46725    Inst…    -1 425         -425  
##  8 Invoi… 8/29/… 8941R  " North Conrad … 46725    Inst…     1 183.33       183. 
##  9 Invoi… 5/17/… SI-15… " Tula Trail"    46814    Inst…     1 66.83         66.8
## 10 Invoi… 5/17/… SI-13… " Coronado Ct."  46845    Inst…     1 75            75  
## # … with 53 more rows
#View(Install_Zips)

ggplot(Install_Zips, aes(x = Amount, 
                  fill = Ship_Zip)) +
  geom_histogram(binwidth = 100,
                 alpha = 0.5)

ggplot(Install, aes(x = Amount), fill = Ship_Zip) +
  geom_histogram(binwidth = 1000)

ggplot(Install, aes(x = Amount, 
                  fill = Ship_Zip)) +
  geom_histogram(binwidth = 5000,
                 alpha = 0.5)

##write out cleaned code
getwd()
## [1] "/cloud/project"
write.csv(df2, "df2.csv")
#trying to change date to date and sales_price to numeric
str(df2)
## tibble [22,102 × 9] (S3: tbl_df/tbl/data.frame)
##  $ Type            : chr [1:22102] "Invoice" "Invoice" "Invoice" "Invoice" ...
##  $ Date            : chr [1:22102] "7/29/2023" "7/29/2023" "5/11/2023" "5/11/2023" ...
##  $ Num             : chr [1:22102] "9845" "9845" "SI-1560R" "SI-1560R" ...
##  $ Ship_To_Address2: chr [1:22102] " MacTavish Court" " MacTavish Court" " McDarmid Ave" " McDarmid Ave" ...
##  $ Ship_Zip        : chr [1:22102] "46703" "46703" "46703" "46703" ...
##  $ Item            : chr [1:22102] "Trees:Spruce:6' Fat Albert Blue Spruce" "Trip Charge" "Trees:Arborvitae:8' Emerald Green Arborvitae" "Replacement Discount" ...
##  $ Qty             : num [1:22102] 2 1 4 NA 1 NA 1 NA 1 NA ...
##  $ Sales_Price     : chr [1:22102] "525" "160" "450" "-1800" ...
##  $ Amount          : num [1:22102] 1050 160 1800 -1800 450 -450 924 -924 975 -975 ...
install.packages("lubridate")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.0'
## (as 'lib' is unspecified)
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(glue)
## 
## Attaching package: 'glue'
## The following object is masked from 'package:dplyr':
## 
##     collapse
df2
## # A tibble: 22,102 x 9
##    Type   Date  Num   Ship_To_Address2 Ship_Zip Item      Qty Sales_Price Amount
##    <chr>  <chr> <chr> <chr>            <chr>    <chr>   <dbl> <chr>        <dbl>
##  1 Invoi… 7/29… 9845  " MacTavish Cou… 46703    "Trees…     2 525           1050
##  2 Invoi… 7/29… 9845  " MacTavish Cou… 46703    "Trip …     1 160            160
##  3 Invoi… 5/11… SI-1… " McDarmid Ave"  46703    "Trees…     4 450           1800
##  4 Invoi… 5/11… SI-1… " McDarmid Ave"  46703    "Repla…    NA -1800        -1800
##  5 Invoi… 7/11… SI-1… " McDarmid Ave"  46703    "Trees…     1 450            450
##  6 Invoi… 7/11… SI-1… " McDarmid Ave"  46703    "Repla…    NA -450          -450
##  7 Invoi… 4/12… 7662R " Menza Drive"   46706    "Trees…     1 924            924
##  8 Invoi… 4/12… 7662R " Menza Drive"   46706    "Repla…    NA -924          -924
##  9 Invoi… 5/19… 8378R " Cascina Lane"  46706    "Trees…     1 975            975
## 10 Invoi… 5/19… 8378R " Cascina Lane"  46706    "Repla…    NA -975          -975
## # … with 22,092 more rows
fixed_date <- df2 %>%
  mutate(Date = mdy(Date)) %>%
  mutate(Sales_Price = as.numeric(Sales_Price))
## Warning: Problem with `mutate()` input `Sales_Price`.
## i NAs introduced by coercion
## i Input `Sales_Price` is `as.numeric(Sales_Price)`.
## Warning in mask$eval_all_mutate(dots[[i]]): NAs introduced by coercion
fixed_date
## # A tibble: 22,102 x 9
##    Type  Date       Num   Ship_To_Address2 Ship_Zip Item    Qty Sales_Price
##    <chr> <date>     <chr> <chr>            <chr>    <chr> <dbl>       <dbl>
##  1 Invo… 2023-07-29 9845  " MacTavish Cou… 46703    "Tre…     2         525
##  2 Invo… 2023-07-29 9845  " MacTavish Cou… 46703    "Tri…     1         160
##  3 Invo… 2023-05-11 SI-1… " McDarmid Ave"  46703    "Tre…     4         450
##  4 Invo… 2023-05-11 SI-1… " McDarmid Ave"  46703    "Rep…    NA       -1800
##  5 Invo… 2023-07-11 SI-1… " McDarmid Ave"  46703    "Tre…     1         450
##  6 Invo… 2023-07-11 SI-1… " McDarmid Ave"  46703    "Rep…    NA        -450
##  7 Invo… 2023-04-12 7662R " Menza Drive"   46706    "Tre…     1         924
##  8 Invo… 2023-04-12 7662R " Menza Drive"   46706    "Rep…    NA        -924
##  9 Invo… 2023-05-19 8378R " Cascina Lane"  46706    "Tre…     1         975
## 10 Invo… 2023-05-19 8378R " Cascina Lane"  46706    "Rep…    NA        -975
## # … with 22,092 more rows, and 1 more variable: Amount <dbl>
fixed_date %>%
  select(Sales_Price) %>%
  count(Sales_Price) %>%
  arrange(desc(n))
## # A tibble: 2,788 x 2
##    Sales_Price     n
##          <dbl> <int>
##  1          35  2807
##  2          85   496
##  3         375   457
##  4         175   375
##  5         325   349
##  6           0   325
##  7         395   317
##  8         475   292
##  9         425   289
## 10         495   276
## # … with 2,778 more rows
#when do they make their sales

count_dates <- fixed_date %>%
  count(Date) %>%
  arrange(desc(n))
count_dates
## # A tibble: 1,948 x 2
##    Date           n
##    <date>     <int>
##  1 2021-04-17    79
##  2 2020-05-16    73
##  3 2023-04-29    72
##  4 2019-05-11    71
##  5 2020-05-27    70
##  6 2020-09-19    69
##  7 2020-05-02    63
##  8 2020-06-13    61
##  9 2021-04-24    60
## 10 2019-10-09    57
## # … with 1,938 more rows
count_dates %>%
  ggplot(aes(x = Date, y = n, group = 1, colour = n )) +
  geom_line() +
  labs(title = "Number of Sales Each Day",
       subtitle = "IDK YET",
       x = "Date Sold", y = "Number of Sales",
       colour = "Date")

count_dates %>%
  ggplot(aes(x = Date, y = n, group = 1 )) +
  geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

#trying but QTY is too high
fixed_date %>%
  count(Date) %>%
    arrange(desc(n))
## # A tibble: 1,948 x 2
##    Date           n
##    <date>     <int>
##  1 2021-04-17    79
##  2 2020-05-16    73
##  3 2023-04-29    72
##  4 2019-05-11    71
##  5 2020-05-27    70
##  6 2020-09-19    69
##  7 2020-05-02    63
##  8 2020-06-13    61
##  9 2021-04-24    60
## 10 2019-10-09    57
## # … with 1,938 more rows
#most qty
fixed_date %>%
  count(Qty) %>%
  arrange(desc(Qty))
## # A tibble: 70 x 2
##      Qty     n
##    <dbl> <int>
##  1   199     1
##  2   125     1
##  3   121     1
##  4   116     1
##  5   105     1
##  6   100     1
##  7    90     1
##  8    80     1
##  9    78     1
## 10    77     1
## # … with 60 more rows
#most ammount
fixed_date %>%
  count(Amount) %>%
  arrange(desc(Amount))
## # A tibble: 3,048 x 2
##    Amount     n
##     <dbl> <int>
##  1 45000      1
##  2 19552.     1
##  3 19355      1
##  4 18125      1
##  5 18000      1
##  6 17760      1
##  7 16334.     1
##  8 15750      1
##  9 15225      1
## 10 14700      1
## # … with 3,038 more rows
fixed_date
## # A tibble: 22,102 x 9
##    Type  Date       Num   Ship_To_Address2 Ship_Zip Item    Qty Sales_Price
##    <chr> <date>     <chr> <chr>            <chr>    <chr> <dbl>       <dbl>
##  1 Invo… 2023-07-29 9845  " MacTavish Cou… 46703    "Tre…     2         525
##  2 Invo… 2023-07-29 9845  " MacTavish Cou… 46703    "Tri…     1         160
##  3 Invo… 2023-05-11 SI-1… " McDarmid Ave"  46703    "Tre…     4         450
##  4 Invo… 2023-05-11 SI-1… " McDarmid Ave"  46703    "Rep…    NA       -1800
##  5 Invo… 2023-07-11 SI-1… " McDarmid Ave"  46703    "Tre…     1         450
##  6 Invo… 2023-07-11 SI-1… " McDarmid Ave"  46703    "Rep…    NA        -450
##  7 Invo… 2023-04-12 7662R " Menza Drive"   46706    "Tre…     1         924
##  8 Invo… 2023-04-12 7662R " Menza Drive"   46706    "Rep…    NA        -924
##  9 Invo… 2023-05-19 8378R " Cascina Lane"  46706    "Tre…     1         975
## 10 Invo… 2023-05-19 8378R " Cascina Lane"  46706    "Rep…    NA        -975
## # … with 22,092 more rows, and 1 more variable: Amount <dbl>
fixed_date %>%
  ggplot(aes(x = Date, y = Qty, group = 1, colour = Amount )) +
  geom_line() +
  labs(title = "Number of Sales Each Day",
       subtitle = "Date and Quantity",
       x = "Date Sold", y = "Number of Items Sold",
       colour = "Date")