library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✔ ggplot2 3.3.6 ✔ purrr 0.3.4
## ✔ tibble 3.1.7 ✔ dplyr 1.0.9
## ✔ tidyr 1.2.0 ✔ stringr 1.4.0
## ✔ readr 2.1.2 ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(tinytex)
theme_set(theme_bw(16))
getwd()
## [1] "C:/Users/MCuser/Desktop/DATA110_CourseMaterials"
setwd("C:/Users/MCuser/Desktop/DATA110_CourseMaterials")
list.files()
## [1] "Data 110 syllabus sum22(2).pdf"
## [2] "eCig.csv"
## [3] "eCigHW---Submitted.docx"
## [4] "eCigHW---Submitted.html"
## [5] "eCigHW---Submitted.Rmd"
## [6] "eCigHW - Submitted.Rmd"
## [7] "eCigHW.html"
## [8] "eCigHW.Rmd"
## [9] "HW 1 - Reputable Sources"
## [10] "HW 2 - Elevator Pitch"
## [11] "HW 3 - Five Plots"
## [12] "HW 4 - Hate Crimes Data"
## [13] "HW 5 - Flights"
## [14] "HW 6 - eCigarette Project"
## [15] "rmarkdown-reference.pdf"
## [16] "rsconnect"
## [17] "Week 1 Notes - Intro and Markdown and Rpubs sum.pdf"
## [18] "Week 2 Notes - Data 110 unit 2 Using GitHub and handling data sum21.docx"
## [19] "Week 2 Notes - Data 110 unit 2 Using GitHub and handling data sum21.pdf"
## [20] "Week 2 Screenshots.docx"
## [21] "Week 3 Notes - Data 110 unit 3 ethics p-hacking reproducibility heatmap treemap streamgraph sum21(1).docx"
## [22] "Week 3 Notes - Data 110 unit 3 ethics p-hacking reproducibility heatmap treemap streamgraph sum21.pdf"
## [23] "Week 3 Screenshots.docx"
## [24] "Week 4 Screenshots.docx"
eCig <- read.csv("eCig.csv")
dim(eCig)
## [1] 2080 11
str(eCig)
## 'data.frame': 2080 obs. of 11 variables:
## $ Filing.Period : chr "3/18/2022" "3/18/2022" "3/18/2022" "3/18/2022" ...
## $ Taxable.Amount : num 13659 238 166 46264 0 ...
## $ Total.AmountDue: num 4097.6 71.2 49.9 13879.3 0 ...
## $ Date.Paid : chr "4/16/2018" "4/18/2018" "4/4/2018" "4/20/2018" ...
## $ Name : chr "MJ Vape LLC" "C&S Wholesale Grocers, Inc. (AWI)" "Delhaize America Distribution, LLC" "AVAIL Vapor, LLC" ...
## $ Street.Address : chr "19328 Montgomery Village Ave" "10 Optical Avenue" "1703 East D Street" "820 Southlake Blvd" ...
## $ Street.Address2: chr "" "" "" "" ...
## $ City : chr "Gaithersburg" "Keene" "Butner" "Richmond" ...
## $ State : chr "Maryland" "New Hampshire" "NC" "Virginia" ...
## $ Zip : chr "20886" "3431" "27509" "23236" ...
## $ Location1 : chr "19328 Montgomery Village Ave\nGaithersburg, Maryland 20886\n(39.173257, -77.203734)" "10 Optical Avenue\nKeene, New Hampshire 03431\n(42.922316, -72.263149)" "1703 East D Street\nButner, NC 27509\n(36.157157, -78.730843)" "820 Southlake Blvd\nRichmond, Virginia 23236\n(37.503787, -77.59775)" ...
strDates <- c("01/05/1965", "08/16/1975")
dates <- as.Date(strDates, "%m/%d/%Y")
mean(eCig$Total.AmountDue)
## [1] 3089.262
median(eCig$Total.AmountDue)
## [1] 371.21
sd(eCig$Total.AmountDue)
## [1] 9503.848
eCig2 <- eCig %>%
select(Taxable.Amount, Total.AmountDue, City, State) %>%
group_by(City, State)
head(eCig2)
## # A tibble: 6 × 4
## # Groups: City, State [6]
## Taxable.Amount Total.AmountDue City State
## <dbl> <dbl> <chr> <chr>
## 1 13659. 4098. Gaithersburg Maryland
## 2 238. 71.2 Keene New Hampshire
## 3 166. 49.9 Butner NC
## 4 46264. 13879. Richmond Virginia
## 5 0 0 Madison Heights VA
## 6 0 0 Rockville MD
dim(eCig2)
## [1] 2080 4
view(eCig2)
summary(eCig2)
## Taxable.Amount Total.AmountDue City State
## Min. : 0 Min. : 0.0 Length:2080 Length:2080
## 1st Qu.: 0 1st Qu.: 0.0 Class :character Class :character
## Median : 1237 Median : 371.2 Mode :character Mode :character
## Mean : 10298 Mean : 3089.3
## 3rd Qu.: 5677 3rd Qu.: 1703.2
## Max. :287233 Max. :86169.8
eCig2 %>%
group_by(City) %>%
summarize(Total.AmountDue = n()) %>%
arrange(desc(Total.AmountDue)) %>%
head(10)
## # A tibble: 10 × 2
## City Total.AmountDue
## <chr> <int>
## 1 Gaithersburg 338
## 2 Rockville 245
## 3 Baltimore 162
## 4 Keene 158
## 5 silver spring 87
## 6 Fredericksburg 81
## 7 Halethorpe 81
## 8 Mountville 81
## 9 Westlake 81
## 10 Madison Heights 80
eCig2 %>%
group_by(City) %>%
summarize(Taxable.AmountDue = n()) %>%
arrange(desc(Taxable.AmountDue)) %>%
head(10)
## # A tibble: 10 × 2
## City Taxable.AmountDue
## <chr> <int>
## 1 Gaithersburg 338
## 2 Rockville 245
## 3 Baltimore 162
## 4 Keene 158
## 5 silver spring 87
## 6 Fredericksburg 81
## 7 Halethorpe 81
## 8 Mountville 81
## 9 Westlake 81
## 10 Madison Heights 80
ggplot(data = eCig2) +
stat_summary(
mapping = aes(x = City, y = Total.AmountDue/1000),
fun.min = min,
fun.max = max,
fun = median) +
theme(axis.text.x = element_text(angle = 90)) +
ggtitle("Total Excise Tax Due by City")
ggplot(data = eCig2) +
stat_summary(
mapping = aes(x = City, y = Taxable.Amount/1000),
fun.min = min,
fun.max = max,
fun = median) +
theme(axis.text.x = element_text(angle = 90)) +
ggtitle("Total Taxable Amount by City")
data <- tibble::as_tibble(eCig2)
data %>%
group_by(City) %>%
summarize(sum_Taxable.Amount = sum(Taxable.Amount))
## # A tibble: 32 × 2
## City sum_Taxable.Amount
## <chr> <dbl>
## 1 Addison 0
## 2 Asheville 17512.
## 3 Baltimore 249492.
## 4 Beloit 320.
## 5 Bensenville 86332.
## 6 Bentonville 31283.
## 7 Bethesda 83670.
## 8 Butner 9440.
## 9 Costa Mesa 20755.
## 10 Damascus 223094.
## # … with 22 more rows
ggplot(data = eCig2) +
geom_col(aes(x = factor(City), y = Taxable.Amount/1000),
fill = "blue", width = 0.3) +
geom_col(aes(x = City, y = Total.AmountDue/1000),
alpha = 0.3, fill = "red", width = 0.6) +
labs(title = "Total Excise Tax Due and Taxable Amount By City", x = "City", y = "Dollars per 1,000") +
theme(axis.text.x = element_text(angle = 90))
## ECig Excise Tax Essay