Tips Data Analysis


Import Libraries

The tidyverse library, along with its packages (dplyr, ggplot2 and readr), was used for the analysis.

Note:

Ignore GWalkR for now - it will be discussed later.

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(GWalkR)
# Load the raw tips data; readr guesses the column types from the file.
tips <- read_csv(file = "tips.csv")
## Rows: 744 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): gender, smoker, day, time
## dbl (2): tip, size
## num (1): total_bill
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Categorical Variable Count and Visualization.

Gender Distribution

# Number of rows per gender, most frequent gender first.
gender_count <- arrange(count(tips, gender), desc(n))

# Bar chart of the same tally; geom_bar() counts the rows per gender itself.
gender_count_plot <- ggplot(
  data = tips,
  mapping = aes(x = gender, fill = gender)
) +
  geom_bar() +
  theme_dark() +
  labs(title = "Gender Distribution", x = "Gender")

gender_count_plot

Smoker Distribution

# Number of rows per smoker status, most frequent status first.
smoker_count <- tips %>%
  count(smoker) %>%
  arrange(desc(n))

# Bar chart of smoker vs non-smoker counts.
# Fill and axis label refer to `smoker`, the variable on the x-axis.
smoker_count_plot <- ggplot(tips) +
  geom_bar(mapping = aes(x = smoker, fill = smoker)) +
  theme_dark() +
  labs(
    title = "Smoker Distribution",
    x = "Smoker"
  )

smoker_count_plot

# Calendar order of the day labels, shared by the table and the chart.
day_order <- c("Sun", "Mon", "Tues", "Wed", "Thur", "Fri", "Sat")

# Number of visits per day, listed in calendar order.
# match() turns each day label into its position in `day_order`; passing the
# bare vector to arrange() would sort by that constant instead of by `day`.
day_count <- tips %>%
  count(day) %>%
  arrange(match(day, day_order))

# Bar chart of visits per day; the factor levels put the bars and the legend
# in the same calendar order as the table.
day_count_plot <- tips %>%
  mutate(day = factor(day, levels = day_order)) %>%
  ggplot() +
  geom_bar(mapping = aes(x = day, fill = day)) +
  theme_dark() +
  labs(
    title = "Visits Per Day",
    x = "Days"
  )

day_count_plot

# Number of visits per meal time, most frequent first.
time_count <- tips %>%
  count(time) %>%
  arrange(desc(n))

# Bar chart of visits per meal time.
# Stored under its own name so it does not overwrite `gender_count_plot`.
time_count_plot <- ggplot(tips) +
  geom_bar(mapping = aes(x = time, fill = time)) +
  theme_dark() +
  labs(
    title = "Time Frequency",
    x = "Time (Lunch Or Dinner)"
  )

time_count_plot

Total Bill and Tips Distribution.

Bill Distribution by Gender [Sum & Average]

# Total bill and mean bill (2 d.p.) per gender, largest total first.
bill_by_gender <- arrange(
  summarise(
    group_by(tips, gender),
    bill = sum(total_bill),
    bill_avg = round(mean(total_bill), 2)
  ),
  desc(bill)
)

bill_by_gender
## # A tibble: 2 × 3
##   gender    bill bill_avg
##   <chr>    <dbl>    <dbl>
## 1 Male   892557.    2182.
## 2 Female 718208.    2144.

Bill Distribution by Smoker [Sum & Average]

# Total bill and mean bill per smoker status.
# The mean is rounded to 2 d.p. like every other summary in this analysis.
bill_by_smoker <- tips %>%
  group_by(smoker) %>%
  summarise(
    bill = sum(total_bill),
    bill_avg = round(mean(total_bill), 2)
  )

bill_by_smoker
## # A tibble: 2 × 3
##   smoker    bill bill_avg
##   <chr>    <dbl>    <dbl>
## 1 No     850031.    2136.
## 2 Yes    760734.    2199.

Bill Distribution by Day [Sum & Average]

# Total bill and mean bill (2 d.p.) per day, highest-grossing day first.
bill_by_day <- arrange(
  summarise(
    group_by(tips, day),
    bill = sum(total_bill),
    bill_avg = round(mean(total_bill), 2)
  ),
  desc(bill)
)

bill_by_day
## # A tibble: 7 × 3
##   day      bill bill_avg
##   <chr>   <dbl>    <dbl>
## 1 Sat   360134.    2183.
## 2 Sun   320493.    2289.
## 3 Thur  272477.    2033.
## 4 Mon   166548.    2221.
## 5 Tues  164399.    2108.
## 6 Wed   164323.    2251 
## 7 Fri   162391.    2056.

Bill Distribution by Time [Sum & Average]

# Total bill and mean bill (2 d.p.) per meal time.
bill_by_time <- summarise(
  group_by(tips, time),
  bill = sum(total_bill),
  bill_avg = round(mean(total_bill), 2)
)

bill_by_time
## # A tibble: 2 × 3
##   time      bill bill_avg
##   <chr>    <dbl>    <dbl>
## 1 Dinner 950331.    2226.
## 2 Lunch  660434.    2083.

Tips Distribution by Gender [Sum & Average]

# Total tip and mean tip (2 d.p.) per gender.
tips_by_gender <- summarise(
  group_by(tips, gender),
  tip_total = sum(tip),
  tip_avg = round(mean(tip), 2)
)

tips_by_gender
## # A tibble: 2 × 3
##   gender tip_total tip_avg
##   <chr>      <dbl>   <dbl>
## 1 Female   106928.    319.
## 2 Male     135577.    331.

Tips Distribution by Smoker [Sum & Average]

# Total tip and mean tip (2 d.p.) per smoker status, converted to a plain
# data.frame before printing.
tips_by_smoker <- as.data.frame(
  summarise(
    group_by(tips, smoker),
    tip_total = sum(tip),
    tip_avg = round(mean(tip), 2)
  )
)
tips_by_smoker
##   smoker tip_total tip_avg
## 1     No  129688.2  325.85
## 2    Yes  112817.2  326.06

Tips Distribution by Day [Sum & Average]

# Total tip and mean tip (2 d.p.) per day, largest total first.
tip_by_day <- arrange(
  summarise(
    group_by(tips, day),
    tip_total = sum(tip),
    tip_avg = round(mean(tip), 2)
  ),
  desc(tip_total)
)

tip_by_day
## # A tibble: 7 × 3
##   day   tip_total tip_avg
##   <chr>     <dbl>   <dbl>
## 1 Sat      54115.    328.
## 2 Sun      45474.    325.
## 3 Thur     44552.    332.
## 4 Wed      25205.    345.
## 5 Tues     24547.    315.
## 6 Mon      24504.    327.
## 7 Fri      24109.    305.

Tip Distribution by Time [Sum & Average]

# Total tip and mean tip (2 d.p.) per meal time, largest total first.
tip_by_time <- arrange(
  summarise(
    group_by(tips, time),
    tip_total = sum(tip),
    tip_avg = round(mean(tip), 2)
  ),
  desc(tip_total)
)

tip_by_time
## # A tibble: 2 × 3
##   time   tip_total tip_avg
##   <chr>      <dbl>   <dbl>
## 1 Dinner   139994.    328.
## 2 Lunch    102511.    323.

Relationships Between Numerical Variables: Total Bills, Tips and Size.

Total bill vs Size

In Correlation..

# Correlation between the two numerical variables total_bill and size
# (cor() computes the Pearson coefficient by default).
bill_size_corr <- cor(tips$total_bill, tips$size)
bill_size_corr
## [1] 0.09694219

Findings

Using cor() to compute the correlation between total_bill and size gives a coefficient of about 0.10, from which it can be deduced that there is virtually no linear relationship between the size of the table and the bill paid by customers. In simpler terms, the table size does not appear to influence the bills paid by customers.


In Chart

# Scatter plot of total bill against table size with a fitted linear trend.
# geom_smooth(method = "lm") draws the least-squares line for the data; a bare
# geom_abline() would only draw the fixed line y = x, unrelated to the points.
bill_size <- ggplot(
  data = tips,
  mapping = aes(x = total_bill, y = size)
) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ x, se = FALSE) +
  labs(
    title = "Bill-Table Size Relationship",
    subtitle = "Conclusion: No Relationship",
    x = "Total Bill",
    y = "Size"
  )

bill_size

Visualization Template: GGplot

ggplot(data = data to visualize) +

geom_chart_name(mapping = aes(columns you want to plot: x = first column, y=second column))+ #

geom_chart_name() + # Optional/additional chart

labs(This includes the labels for the chart, the most common being: title, subtitle, x (x-axis label), y (y-axis label))

Tips vs Size

In Correlation..

# Correlation coefficient between the tip amount and the table size.
tip_size_corr <- cor(x = tips[["tip"]], y = tips[["size"]])
tip_size_corr
## [1] 0.09076619

Findings

With almost the same correlation coefficient as total_bill vs size, it can also be deduced that there is virtually no linear relationship between the size of the table and the tips given. In simpler terms, the table size does not appear to influence the tips the waiter receives.


Chart

# Scatter plot of tip against table size with a fitted linear trend.
# geom_smooth(method = "lm") draws the least-squares line for the data; a bare
# geom_abline() would only draw the fixed line y = x, unrelated to the points.
tip_size <- ggplot(
  data = tips,
  mapping = aes(x = tip, y = size)
) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ x, se = FALSE) +
  labs(
    title = "Tip-Table Size Relationship",
    subtitle = "Conclusion: No Relationship",
    x = "Tip",
    y = "Size"
  )

tip_size

NOT THE END OF ANALYSIS!

Assignment

Find the relationship between the total bill and the tip (correlation coefficient and chart).