#Importing data
library(readr)
## Warning: package 'readr' was built under R version 4.3.3
Airbnb <- read.csv('Airbnb.csv')

#Making a copy of the data 
airbnb_nyc <- Airbnb

#Getting rid of the datasets that I don’t need
rm(Airbnb)
#Loading dplyr and lubridate packages
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.3.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(lubridate)
## Warning: package 'lubridate' was built under R version 4.3.3
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.3.3
library(scales)
## 
## Attaching package: 'scales'
## The following object is masked from 'package:readr':
## 
##     col_factor
library(extrafont)
## Registering fonts with R
#Checking the structure of each of columns of the data
str(airbnb_nyc) 
## 'data.frame':    30478 obs. of  13 variables:
##  $ Host.Id                   : int  5162530 33134899 39608626 500 500 1039 1783 2078 2339 2339 ...
##  $ Host.Since                : chr  "" "" "" "6/26/2008" ...
##  $ Name                      : chr  "1 Bedroom in Prime Williamsburg" "Sunny, Private room in Bushwick" "Sunny Room in Harlem" "Gorgeous 1 BR with Private Balcony" ...
##  $ Neighbourhood.            : chr  "Brooklyn" "Brooklyn" "Manhattan" "Manhattan" ...
##  $ Property.Type             : chr  "Apartment" "Apartment" "Apartment" "Apartment" ...
##  $ Review.Scores.Rating..bin.: int  NA NA NA NA 95 100 100 90 90 95 ...
##  $ Room.Type                 : chr  "Entire home/apt" "Private room" "Private room" "Entire home/apt" ...
##  $ Zipcode                   : int  11249 11206 10032 10024 10036 11222 10004 11201 10009 10009 ...
##  $ Beds                      : int  1 1 1 3 3 1 1 1 2 2 ...
##  $ Number.of.Records         : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Number.Of.Reviews         : int  0 1 1 0 39 4 9 80 95 23 ...
##  $ Price                     : chr  "145" "37" "28" "199" ...
##  $ Review.Scores.Rating      : int  NA NA NA NA 96 100 100 94 90 96 ...
#Converting Zipcode from integer to character 
airbnb_nyc$Zipcode <- as.character(airbnb_nyc$Zipcode) 

#Converting Price from character to integer
airbnb_nyc$Price <- as.integer(airbnb_nyc$Price)   
## Warning: NAs introduced by coercion
#Rechecking the structure of converted columns of the data
str(airbnb_nyc) 
## 'data.frame':    30478 obs. of  13 variables:
##  $ Host.Id                   : int  5162530 33134899 39608626 500 500 1039 1783 2078 2339 2339 ...
##  $ Host.Since                : chr  "" "" "" "6/26/2008" ...
##  $ Name                      : chr  "1 Bedroom in Prime Williamsburg" "Sunny, Private room in Bushwick" "Sunny Room in Harlem" "Gorgeous 1 BR with Private Balcony" ...
##  $ Neighbourhood.            : chr  "Brooklyn" "Brooklyn" "Manhattan" "Manhattan" ...
##  $ Property.Type             : chr  "Apartment" "Apartment" "Apartment" "Apartment" ...
##  $ Review.Scores.Rating..bin.: int  NA NA NA NA 95 100 100 90 90 95 ...
##  $ Room.Type                 : chr  "Entire home/apt" "Private room" "Private room" "Entire home/apt" ...
##  $ Zipcode                   : chr  "11249" "11206" "10032" "10024" ...
##  $ Beds                      : int  1 1 1 3 3 1 1 1 2 2 ...
##  $ Number.of.Records         : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Number.Of.Reviews         : int  0 1 1 0 39 4 9 80 95 23 ...
##  $ Price                     : int  145 37 28 199 549 149 250 90 270 290 ...
##  $ Review.Scores.Rating      : int  NA NA NA NA 96 100 100 94 90 96 ...
#Cleaning messy data
airbnb_nyc$Neighbourhood <- ifelse((airbnb_nyc$Neighbourhood == "Brooklyn" & airbnb_nyc$Zipcode == 10009) | (airbnb_nyc$Neighbourhood == "Brooklyn" & airbnb_nyc$Zipcode == 10022) | (airbnb_nyc$Neighbourhood == "Bronx" & airbnb_nyc$Zipcode == 10032), "Manhattan", airbnb_nyc$Neighbourhood)
airbnb_nyc$Neighbourhood <- ifelse((airbnb_nyc$Neighbourhood == "Brooklyn" & airbnb_nyc$Zipcode == 11385) | (airbnb_nyc$Neighbourhood == "Brooklyn" & airbnb_nyc$Zipcode == 11416) | (airbnb_nyc$Neighbourhood == "Manhattan" & airbnb_nyc$Zipcode == 11419), "Queens", airbnb_nyc$Neighbourhood)
airbnb_nyc$Neighbourhood <- ifelse(airbnb_nyc$Neighbourhood == "Brooklyn" & airbnb_nyc$Zipcode == 7030, "Northeast New Jersey", airbnb_nyc$Neighbourhood)
airbnb_nyc$Neighbourhood <- ifelse(airbnb_nyc$Neighbourhood == "Manhattan" & airbnb_nyc$Zipcode == 7150, "North Inbound", airbnb_nyc$Neighbourhood)
#Renaming the columns for convenience  
airbnb_nyc <- airbnb_nyc %>% rename(listing_name=Name, borough=Neighbourhood, property_type=Property.Type, room_type=Room.Type, zipcode=Zipcode, beds=Beds, no_of_reviews=Number.Of.Reviews, price=Price, rating=Review.Scores.Rating) 
#Getting rid of the rows that I don’t need
airbnb_nyc <- airbnb_nyc %>% filter(borough=='Bronx' | borough=='Brooklyn' | borough=='Manhattan' | borough=='Queens' | borough=='Staten Island')
#Creating a new date column, a year column, and factor type borough and zipcode columns
airbnb_nyc <- airbnb_nyc %>% mutate(date=mdy(Host.Since), year=substr(date,1,4))
airbnb_nyc$borough_fac <- as.factor(airbnb_nyc$borough)
airbnb_nyc$zipcode_fac <- as.factor(airbnb_nyc$zipcode)
#Getting rid of the columns that I don’t need
airbnb_nyc <- airbnb_nyc %>% select(-Host.Id, -Host.Since, -Review.Scores.Rating..bin., -Number.of.Records)
#Renaming one of the categories of room type 
airbnb_nyc$room_type[airbnb_nyc$room_type=="Entire home/apt"] <- "Entire home/apt."
#Analysis 1: A comparative look at the number of total rented rooms, total price, average price, and median price in the five boroughs of NYC from June 2008 to August 2015 and their changes over the years under this timeframe
#Comparing the number of total rented rooms, total price, average price, and median price in five boroughs of NYC (2008-15)
price_by_boro2 <- airbnb_nyc %>% group_by(borough) %>% summarize(total_price=sum(price, na.rm=TRUE), average_price=mean(price, na.rm=TRUE), median_price=median(price, na.rm=TRUE), total_rented_rooms=n())    

#Comparing the Change in Number of total rented rooms, total price, average price, and median price in five boroughs of NYC (2008-15)
#Creating a new dataset for the convenience of the comparison 
price_by_boro <- airbnb_nyc %>% group_by(borough, year) %>% summarize(total_price=sum(price, na.rm=TRUE), average_price=mean(price, na.rm=TRUE), median_price=median(price, na.rm=TRUE), total_rented_rooms=n())        
## `summarise()` has grouped output by 'borough'. You can override using the
## `.groups` argument.
price_by_boro <- price_by_boro[complete.cases(price_by_boro$year), ]

options(scipen=99)
# Comparing Total rented rooms 
price_by_boro %>% ggplot(aes(year, total_rented_rooms, group= borough, color= borough))+geom_line()+labs(title="Change in Total Rented Rooms in five Boroughs of NYC", subtitle="June 2008 - August 2015", x="Year", y="Total Rented Rooms")+theme_classic()

#Comparing total price 
price_by_boro %>% ggplot(aes(year, total_price, group= borough, color= borough))+geom_line()+labs(title="Change in Total Price in five Boroughs of NYC", subtitle="June 2008 - August 2015", x="Year", y="Total Price")+theme_classic()

#Comparing average price 
price_by_boro %>% ggplot(aes(year, average_price, group= borough, color= borough))+geom_line()+labs(title="Change in Average Price in five Boroughs of NYC", subtitle="June 2008 - August 2015", x="Year", y="Average Price")+theme_classic()

#Comparing median price 
price_by_boro %>% ggplot(aes(year, median_price, group= borough, color= borough))+geom_line()+labs(title="Change in Median Price in five Boroughs of NYC", subtitle="June 2008 - August 2015", x="Year", y="Median Price")+theme_classic()

#Analysis 2: Comparing the total rented rooms, total price, average price, and median price based on room types in five boroughs of NYC (2008-15)
price_by_boro_rt <- airbnb_nyc %>% group_by(borough, room_type) %>% summarize(total_price=sum(price, na.rm=TRUE), average_price=mean(price, na.rm=TRUE), median_price=median(price, na.rm=TRUE), total_rented_rooms=n())         
## `summarise()` has grouped output by 'borough'. You can override using the
## `.groups` argument.
#Bar Graphs of total rented rooms based on room types in five boroughs 
airbnb_nyc %>% group_by(borough, room_type) %>% summarise(Count=n()) %>% ggplot(aes(reorder(borough, -Count), Count, fill=borough))+geom_col(show.legend=FALSE, color="black")+geom_text(aes(label=comma(Count)))+labs(title="Number of Rented Rooms Based on Room Type in each Borough", subtitle="June 2008 - August 2015", x="Borough Name", y="Total Number")+facet_wrap(~room_type)+theme_classic()  
## `summarise()` has grouped output by 'borough'. You can override using the
## `.groups` argument.

#Analysis 3: Is room price affecting the Airbnb rating in NYC?
#Filter to low-rated rooms in a new dataset
airbnb_nyc_bad <- airbnb_nyc %>% filter(rating<60)
airbnb_nyc_bad %>% group_by(borough) %>% summarize(Count=n())
## # A tibble: 4 × 2
##   borough       Count
##   <chr>         <int>
## 1 Brooklyn         44
## 2 Manhattan        69
## 3 Queens           14
## 4 Staten Island     1
airbnb_nyc_bad1 <- airbnb_nyc_bad %>% group_by(borough) %>% summarize(Count=n())
sum(airbnb_nyc_bad1$Count)
## [1] 128
#Filter to Bronx, Brooklyn, Manhattan, Queens, and Staten Island in new datasets
airbnb_bx <- airbnb_nyc %>% filter(borough=='Bronx')
airbnb_bk <- airbnb_nyc %>% filter(borough=='Brooklyn')
airbnb_mn <- airbnb_nyc %>% filter(borough=='Manhattan')
airbnb_qn <- airbnb_nyc %>% filter(borough=='Queens')
airbnb_si <- airbnb_nyc %>% filter(borough=='Staten Island')

#Correlation between Room Price and Rating in NYC and its five boroughs
cor(airbnb_nyc$price, airbnb_nyc$rating, use='complete.obs')
## [1] 0.0875274
cor(airbnb_bx$price, airbnb_bx$rating, use='complete.obs')
## [1] 0.1176378
cor(airbnb_bk$price, airbnb_bk$rating, use='complete.obs')
## [1] 0.1252373
cor(airbnb_mn$price, airbnb_mn$rating, use='complete.obs')
## [1] 0.08765012
cor(airbnb_qn$price, airbnb_qn$rating, use='complete.obs')
## [1] 0.08084138
cor(airbnb_si$price, airbnb_si$rating, use='complete.obs')
## [1] 0.02259818
#Regression analysis between Room Price and Rating in five boroughs of NYC
airbnb_nyc %>% ggplot(aes(price, rating))+geom_point(color="blue")+geom_smooth(method="lm", color="red")+labs(x="Room Price", y="Rating", title="Room Price Vs Rating in five Boroughs of NYC", subtitle="Data from June 2008 - August 2015")+facet_wrap(~borough)+theme_test() 
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 8351 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 8351 rows containing missing values or values outside the scale range
## (`geom_point()`).

lm(rating~price, data=airbnb_nyc)
## 
## Call:
## lm(formula = rating ~ price, data = airbnb_nyc)
## 
## Coefficients:
## (Intercept)        price  
##   90.819486     0.007742
reg_nyc <- lm(rating~price, data=airbnb_nyc)
summary(reg_nyc)
## 
## Call:
## lm(formula = rating ~ price, data = airbnb_nyc)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -73.142  -3.245   1.665   6.703   9.103 
## 
## Coefficients:
##               Estimate Std. Error t value            Pr(>|t|)    
## (Intercept) 90.8194861  0.1074618  845.13 <0.0000000000000002 ***
## price        0.0077416  0.0005938   13.04 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.82 on 22014 degrees of freedom
##   (8351 observations deleted due to missingness)
## Multiple R-squared:  0.007661,   Adjusted R-squared:  0.007616 
## F-statistic:   170 on 1 and 22014 DF,  p-value: < 0.00000000000000022
lm(rating~price, data=airbnb_bx)
## 
## Call:
## lm(formula = rating ~ price, data = airbnb_bx)
## 
## Coefficients:
## (Intercept)        price  
##    89.84439      0.02439
reg_bx <- lm(rating~price, data=airbnb_bx)
summary(reg_bx)
## 
## Call:
## lm(formula = rating ~ price, data = airbnb_bx)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -32.405  -3.412   1.314   6.729   9.912 
## 
## Coefficients:
##             Estimate Std. Error t value            Pr(>|t|)    
## (Intercept) 89.84439    1.19457  75.211 <0.0000000000000002 ***
## price        0.02439    0.01414   1.725               0.086 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.13 on 212 degrees of freedom
##   (128 observations deleted due to missingness)
## Multiple R-squared:  0.01384,    Adjusted R-squared:  0.009187 
## F-statistic: 2.975 on 1 and 212 DF,  p-value: 0.08602
lm(rating~price, data=airbnb_bk)
## 
## Call:
## lm(formula = rating ~ price, data = airbnb_bk)
## 
## Coefficients:
## (Intercept)        price  
##    90.60748      0.01387
reg_bk <- lm(rating~price, data=airbnb_bk)
summary(reg_bk)
## 
## Call:
## lm(formula = rating ~ price, data = airbnb_bk)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -72.133  -3.035   1.632   6.341   9.018 
## 
## Coefficients:
##              Estimate Std. Error t value            Pr(>|t|)    
## (Intercept) 90.607484   0.176910  512.17 <0.0000000000000002 ***
## price        0.013872   0.001198   11.58 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.606 on 8422 degrees of freedom
##   (3185 observations deleted due to missingness)
## Multiple R-squared:  0.01568,    Adjusted R-squared:  0.01557 
## F-statistic: 134.2 on 1 and 8422 DF,  p-value: < 0.00000000000000022
lm(rating~price, data=airbnb_mn)
## 
## Call:
## lm(formula = rating ~ price, data = airbnb_mn)
## 
## Coefficients:
## (Intercept)        price  
##   90.539047     0.007053
reg_mn <- lm(rating~price, data=airbnb_mn)
summary(reg_mn)
## 
## Call:
## lm(formula = rating ~ price, data = airbnb_mn)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -72.655  -3.367   1.685   6.640   9.285 
## 
## Coefficients:
##               Estimate Std. Error t value            Pr(>|t|)    
## (Intercept) 90.5390468  0.1553710 582.728 <0.0000000000000002 ***
## price        0.0070527  0.0007416   9.511 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.815 on 11683 degrees of freedom
##   (4292 observations deleted due to missingness)
## Multiple R-squared:  0.007683,   Adjusted R-squared:  0.007598 
## F-statistic: 90.45 on 1 and 11683 DF,  p-value: < 0.00000000000000022
lm(rating~price, data=airbnb_qn)
## 
## Call:
## lm(formula = rating ~ price, data = airbnb_qn)
## 
## Coefficients:
## (Intercept)        price  
##    90.30490      0.01297
reg_qn <- lm(rating~price, data=airbnb_qn)
summary(reg_qn)
## 
## Call:
## lm(formula = rating ~ price, data = airbnb_qn)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -72.251  -3.278   2.099   7.479   9.371 
## 
## Coefficients:
##              Estimate Std. Error t value             Pr(>|t|)    
## (Intercept) 90.304903   0.459463  196.54 < 0.0000000000000002 ***
## price        0.012975   0.004004    3.24              0.00122 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9.853 on 1596 degrees of freedom
##   (694 observations deleted due to missingness)
## Multiple R-squared:  0.006535,   Adjusted R-squared:  0.005913 
## F-statistic:  10.5 on 1 and 1596 DF,  p-value: 0.001219
lm(rating~price, data=airbnb_si)
## 
## Call:
## lm(formula = rating ~ price, data = airbnb_si)
## 
## Coefficients:
## (Intercept)        price  
##     91.1984       0.0019
reg_si <- lm(rating~price, data=airbnb_si)
summary(reg_si)
## 
## Call:
## lm(formula = rating ~ price, data = airbnb_si)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -31.3484  -3.8114   0.6117   5.7086   8.7352 
## 
## Coefficients:
##              Estimate Std. Error t value            Pr(>|t|)    
## (Intercept) 91.198352   1.135015  80.350 <0.0000000000000002 ***
## price        0.001900   0.008714   0.218               0.828    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.562 on 93 degrees of freedom
##   (52 observations deleted due to missingness)
## Multiple R-squared:  0.0005107,  Adjusted R-squared:  -0.01024 
## F-statistic: 0.04752 on 1 and 93 DF,  p-value: 0.8279
#Analysis 4: What price categories people are more prone to?
#Number of rented rooms based on price categories  
#Making a summary of price to see how it can be broken down into few categories 
summary(airbnb_nyc$price)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    10.0    80.0   125.0   153.3   192.0   999.0     181
#breaking down price into three categories: low, medium low and medium 
breaks <- c(-Inf, 50, 100, 200, 300, 400, Inf) 
labels <- c("Low", "Medium-low", "Reasonable", "Medium", "Medium-high", "High")
airbnb_nyc <- airbnb_nyc %>% mutate(price_cat=cut(price, breaks=breaks, labels=labels)) 
#Checking the datatype of the new column ‘price_cat’ to see if it is factor-type or not
str(airbnb_nyc)
## 'data.frame':    30367 obs. of  15 variables:
##  $ listing_name  : chr  "1 Bedroom in Prime Williamsburg" "Sunny, Private room in Bushwick" "Sunny Room in Harlem" "Gorgeous 1 BR with Private Balcony" ...
##  $ Neighbourhood.: chr  "Brooklyn" "Brooklyn" "Manhattan" "Manhattan" ...
##  $ property_type : chr  "Apartment" "Apartment" "Apartment" "Apartment" ...
##  $ room_type     : chr  "Entire home/apt." "Private room" "Private room" "Entire home/apt." ...
##  $ zipcode       : chr  "11249" "11206" "10032" "10024" ...
##  $ beds          : int  1 1 1 3 3 1 1 1 2 2 ...
##  $ no_of_reviews : int  0 1 1 0 39 4 9 80 95 23 ...
##  $ price         : int  145 37 28 199 549 149 250 90 270 290 ...
##  $ rating        : int  NA NA NA NA 96 100 100 94 90 96 ...
##  $ borough       : chr  "Brooklyn" "Brooklyn" "Manhattan" "Manhattan" ...
##  $ date          : Date, format: NA NA ...
##  $ year          : chr  NA NA NA "2008" ...
##  $ borough_fac   : Factor w/ 5 levels "Bronx","Brooklyn",..: 2 2 3 3 3 2 3 2 3 3 ...
##  $ zipcode_fac   : Factor w/ 186 levels "10001","10002",..: 135 102 31 22 35 118 4 98 8 8 ...
##  $ price_cat     : Factor w/ 6 levels "Low","Medium-low",..: 3 1 1 3 6 3 4 2 4 4 ...
#Number of hired rooms in each price category
summary(airbnb_nyc$price_cat)
##         Low  Medium-low  Reasonable      Medium Medium-high        High 
##        1941       10136       12349        3837        1012         911 
##        NA's 
##         181
Price_category <- c("Low", "Medium-low", "Reasonable", "Medium", "Medium-high", "High")
Range <- c("0-50","51-100", "101-200", "201-300", "301-400", "500-999")
Total_hired_rooms <- c(1942, 10136, 12349, 3837, 1012, 911)
price_hired_rooms <- data.frame(Price_category, Range, Total_hired_rooms)

#Filter to reasonably priced ‘very good-rated’ rooms in a new dataset
airbnb_nyc_good_price <- airbnb_nyc %>% filter(price<200, rating>80)
#Analysis 5: What is the percentage of reasonably priced very good-rated rooms in five boroughs of NYC? 
summary(airbnb_nyc_good_price$borough_fac)
##         Bronx      Brooklyn     Manhattan        Queens Staten Island 
##           186          6517          7122          1303            77
summary(airbnb_nyc$borough_fac)
##         Bronx      Brooklyn     Manhattan        Queens Staten Island 
##           342         11609         15977          2292           147
Borough <- c("Bronx", "Brooklyn", "Manhattan", "Queens", "Staten Island")
Total_hired_rooms <- c(342, 11609, 15978, 2292, 147)
Low_to_reasonably_priced_good_rated_hired_rooms <- c(186, 6517, 7122, 1303, 77)
good_rated_percentage <- data.frame(Borough, Total_hired_rooms, Low_to_reasonably_priced_good_rated_hired_rooms)
good_rated_percentage %>% mutate(Percentage=Low_to_reasonably_priced_good_rated_hired_rooms*100/Total_hired_rooms)
##         Borough Total_hired_rooms
## 1         Bronx               342
## 2      Brooklyn             11609
## 3     Manhattan             15978
## 4        Queens              2292
## 5 Staten Island               147
##   Low_to_reasonably_priced_good_rated_hired_rooms Percentage
## 1                                             186   54.38596
## 2                                            6517   56.13748
## 3                                            7122   44.57379
## 4                                            1303   56.84991
## 5                                              77   52.38095
good_rated_percentage <- good_rated_percentage %>% mutate(Percentage=Low_to_reasonably_priced_good_rated_hired_rooms*100/Total_hired_rooms)

Reference:

  1. Airbnb Listings in New York City [Data Set]. Tableau. https://public.tableau.com/app/learn/sample-data
  2. Inside Airbnb Data Dictionary. https://docs.google.com/spreadsheets/d/1iWCNJcSutYqpULSQHlNyGInUvHg2BoUGoNRIGa6Szc4/edit#gid=1322284596
  3. Harrison, E., & Pius, R. (2021). R for Health Data Science. CRC Press. https://argoshare.is.ed.ac.uk/healthyr_book/fitting-simple-models.html
  4. Group variables in ggplot2 (geom_col). Stack Overflow. https://stackoverflow.com/questions/63551040/group-variables-in-ggplot2-geom-col
  5. Wickham, H., Çetinkaya-Rundel, M., & Grolemund, G. (2023). R for Data Science (2nd ed.). O’Reilly Media, Inc. https://r4ds.hadley.nz/data-visualize
  6. Freed, N., Jones, S., & Bergquist, T. (2014). Understanding Business Statistics.