# Load libraries
library(ggplot2)
library(datasets)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ lubridate 1.9.3     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(skimr)
library(dplyr)
library(knitr)
library(directlabels)
library(cowplot)
## 
## Attaching package: 'cowplot'
## 
## The following object is masked from 'package:lubridate':
## 
##     stamp
library(ggrepel)
library(dplyr)

# Read the fast-food sales table from a CSV file in the working directory
d <- read.csv(file = "data_fastfood_sales.csv")

# Quick look at the loaded table: first six rows, dimensions, column names
head(x = d, n = 6L)
##      restaurant average_sales us_sales num_company_stores num_franchised_stores
## 1        Subway        416.86 10800.00                  0                 25908
## 2     Mcdonalds       2670.32 37480.67                842                 13194
## 3     Starbucks        945.27 13167.61               8222                  5708
## 4 Dunkin Donuts        733.13  9192.00                  0                 12538
## 5     Pizza Hut        900.00  5510.84                 96                  7426
## 6   Burger King       1387.81 10028.32                 50                  7196
##   unit_count
## 1      25908
## 2      14036
## 3      13930
## 4      12538
## 5       7522
## 6       7266
dim(d)
## [1] 19  6
names(d)
## [1] "restaurant"            "average_sales"         "us_sales"             
## [4] "num_company_stores"    "num_franchised_stores" "unit_count"
# Pull each column of `d` out into its own vector for use further down.
# `[[` always returns the bare column, whereas `d[, "col"]` relies on
# single-column dropping, which does not happen if `d` is ever a tibble.
restaurant <- d[["restaurant"]]
avg_sales <- d[["average_sales"]]
us_sales <- d[["us_sales"]]
num_comp_stores <- d[["num_company_stores"]]
num_franch_stores <- d[["num_franchised_stores"]]
unit_count <- d[["unit_count"]]

# Proportion of each chain's stores that are franchised
# (franchised stores divided by total store count, element-wise per row)
pro <- with(d, num_franchised_stores / unit_count)

# Scatter plot of U.S. sales (x) against total store count (y), both on
# log10 axes. Each point is one row of `d`, coloured by the proportion of
# its stores that are franchised and labelled with the restaurant name.
ggplot(d, aes(x = us_sales, y = unit_count)) +
  # The proportion is mapped to `color` inside aes(): default point shapes
  # do not draw `fill`, and a constant colour on the layer would override
  # any mapping, leaving the legend unrelated to the points. The mapping is
  # local to this layer so the text labels keep their default colour.
  geom_point(aes(color = num_franchised_stores / unit_count)) +
  # Labels come from the data column, so they cannot fall out of step with
  # the rows being plotted.
  geom_text_repel(aes(label = restaurant)) +
  scale_x_continuous(trans = "log10") +
  scale_y_continuous(trans = "log10") +
  labs(
    x = "U.S. sales in million (log10 scale)",
    y = "Total number of stores (log10 scale)",
    color = "Proportion of stores franchised"
  ) +
  theme_minimal() +
  theme(
    title = element_text(size = 10),
    axis.text = element_text(size = 8),
    legend.text = element_text(size = 8),
    # `linewidth` replaces the deprecated `size` argument for line widths
    panel.border = element_rect(colour = "black", fill = NA, linewidth = 0.5)
  )