The tidyverse library, along with its packages (dplyr, ggplot2 and readr), was used for the analysis.
Ignore GWalkR for now; it will be discussed later.
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(GWalkR)
# Load the tips data set from the CSV file `tips.csv` into a tibble.
tips <- read_csv(file = "tips.csv")
## Rows: 744 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): gender, smoker, day, time
## dbl (2): tip, size
## num (1): total_bill
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Number of records per gender, most frequent gender first.
gender_count <- arrange(count(tips, gender), desc(n))
# Bar chart of record counts per gender, one coloured bar per gender.
gender_count_plot <- ggplot(data = tips, mapping = aes(x = gender, fill = gender)) +
  geom_bar() +
  theme_dark() +
  labs(title = "Gender Distribution", x = "Gender")
gender_count_plot
# Number of records per smoker status, most frequent status first.
smoker_count <- arrange(count(tips, smoker), desc(n))
# Bar chart of record counts per smoker status.
# The fill and the x-axis label both refer to `smoker`, so the colours,
# legend and axis all describe the variable that is actually plotted
# (the earlier version still carried `gender` over from the gender chart).
smoker_count_plot <- ggplot(tips) +
  geom_bar(mapping = aes(x = smoker, fill = smoker)) +
  theme_dark() +
  labs(
    title = 'Smoker Distribution',
    x = 'Smoker'
  )
smoker_count_plot
# Number of records per day, listed in calendar order (Sunday first).
# `match()` turns each day label into its position in the weekday vector,
# so `arrange()` sorts by weekday rather than by an unrelated constant
# vector, and keeps working if a day is missing from the data.
day_count <- tips %>%
  count(day) %>%
  arrange(match(day, c('Sun', 'Mon', 'Tues', 'Wed', 'Thur', 'Fri', 'Sat')))
# Bar chart of record counts per day, one coloured bar per day.
day_count_plot <- ggplot(data = tips, mapping = aes(x = day, fill = day)) +
  geom_bar() +
  theme_dark() +
  labs(title = "Visits Per Day", x = "Days")
day_count_plot
# Number of records per meal time, most frequent time first.
time_count <- arrange(count(tips, time), desc(n))
# Bar chart of record counts per meal time (lunch vs. dinner).
# Stored under its own name so it no longer overwrites `gender_count_plot`.
time_count_plot <- ggplot(tips) +
  geom_bar(mapping = aes(x = time, fill = time)) +
  theme_dark() +
  labs(
    title = 'Time Frequency',
    x = 'Time (Lunch Or Dinner)'
  )
time_count_plot
# Total and mean (rounded to 2 d.p.) bill per gender, highest total first.
bill_by_gender <- summarise(
  group_by(tips, gender),
  bill = sum(total_bill),
  bill_avg = round(mean(total_bill), 2)
)
bill_by_gender <- arrange(bill_by_gender, desc(bill))
bill_by_gender
## # A tibble: 2 × 3
## gender bill bill_avg
## <chr> <dbl> <dbl>
## 1 Male 892557. 2182.
## 2 Female 718208. 2144.
# Total and mean bill per smoker status.
# The mean is rounded to 2 decimal places to match the other summaries.
bill_by_smoker <- tips %>%
  group_by(smoker) %>%
  summarise(
    bill = sum(total_bill),
    bill_avg = round(mean(total_bill), 2)
  )
bill_by_smoker
## # A tibble: 2 × 3
## smoker bill bill_avg
## <chr> <dbl> <dbl>
## 1 No 850031. 2136.
## 2 Yes 760734. 2199.
# Total and mean (rounded to 2 d.p.) bill per day, highest total first.
bill_by_day <- summarise(
  group_by(tips, day),
  bill = sum(total_bill),
  bill_avg = round(mean(total_bill), 2)
)
bill_by_day <- arrange(bill_by_day, desc(bill))
bill_by_day
## # A tibble: 7 × 3
## day bill bill_avg
## <chr> <dbl> <dbl>
## 1 Sat 360134. 2183.
## 2 Sun 320493. 2289.
## 3 Thur 272477. 2033.
## 4 Mon 166548. 2221.
## 5 Tues 164399. 2108.
## 6 Wed 164323. 2251
## 7 Fri 162391. 2056.
# Total and mean (rounded to 2 d.p.) bill per meal time.
bill_by_time <- summarise(
  group_by(tips, time),
  bill = sum(total_bill),
  bill_avg = round(mean(total_bill), 2)
)
bill_by_time
## # A tibble: 2 × 3
## time bill bill_avg
## <chr> <dbl> <dbl>
## 1 Dinner 950331. 2226.
## 2 Lunch 660434. 2083.
# Total and mean (rounded to 2 d.p.) tip per gender.
tips_by_gender <- summarise(
  group_by(tips, gender),
  tip_total = sum(tip),
  tip_avg = round(mean(tip), 2)
)
tips_by_gender
## # A tibble: 2 × 3
## gender tip_total tip_avg
## <chr> <dbl> <dbl>
## 1 Female 106928. 319.
## 2 Male 135577. 331.
# Total and mean (rounded to 2 d.p.) tip per smoker status,
# converted to a plain data frame at the end of the pipeline.
tips_by_smoker <- tips %>%
  group_by(smoker) %>%
  summarise(tip_total = sum(tip), tip_avg = round(mean(tip), 2)) %>%
  as.data.frame()
tips_by_smoker
## smoker tip_total tip_avg
## 1 No 129688.2 325.85
## 2 Yes 112817.2 326.06
# Total and mean (rounded to 2 d.p.) tip per day, highest total first.
tip_by_day <- summarise(
  group_by(tips, day),
  tip_total = sum(tip),
  tip_avg = round(mean(tip), 2)
)
tip_by_day <- arrange(tip_by_day, desc(tip_total))
tip_by_day
## # A tibble: 7 × 3
## day tip_total tip_avg
## <chr> <dbl> <dbl>
## 1 Sat 54115. 328.
## 2 Sun 45474. 325.
## 3 Thur 44552. 332.
## 4 Wed 25205. 345.
## 5 Tues 24547. 315.
## 6 Mon 24504. 327.
## 7 Fri 24109. 305.
# Total and mean (rounded to 2 d.p.) tip per meal time, highest total first.
tip_by_time <- summarise(
  group_by(tips, time),
  tip_total = sum(tip),
  tip_avg = round(mean(tip), 2)
)
tip_by_time <- arrange(tip_by_time, desc(tip_total))
tip_by_time
## # A tibble: 2 × 3
## time tip_total tip_avg
## <chr> <dbl> <dbl>
## 1 Dinner 139994. 328.
## 2 Lunch 102511. 323.
# Correlation coefficient (cor() default method) between the two
# numeric variables total_bill and size.
bill_size_corr <- with(tips, cor(total_bill, size))
bill_size_corr
## [1] 0.09694219
Using cor() to find the correlation between
total_bill and size, it can be
deduced that there is no meaningful linear relationship between the size of the table and
the bills paid by customers. In simpler terms, the table size does not
influence the bills being paid by customers.
# Scatter plot of total bill against table size with a fitted trend line.
# geom_smooth(method = "lm") draws the least-squares line for the plotted
# data; a bare geom_abline() would only draw the fixed line y = x, which
# says nothing about how the two variables relate.
bill_size <- ggplot(data = tips, mapping = aes(x = total_bill, y = size)) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ x, se = FALSE) +
  labs(title = "Bill-Table Size Relationship",
       subtitle = 'Conclusion: No Relationship',
       x = "Total Bill",
       y = "Size")
bill_size
Visualization Template: GGplot
ggplot(data = data to visualize) +
geom_chart_name(mapping = aes(columns you want to plot: x = first column, y=second column))+ #
geom_chart_name() + # Optional/additional chart
labs(This includes the labels for the chart, the most common being: title,
subtitle, x (x-axis label), y (y-axis label))
# Correlation coefficient (cor() default method) between tip and table size.
tip_size_corr <- with(tips, cor(tip, size))
tip_size_corr
## [1] 0.09076619
With almost the same correlation coefficient value as total_bill vs. size, it can also be deduced that there is no meaningful linear relationship between the size of the table and the tips given out. In simpler terms, the table size does not influence the tips waiters received.
# Scatter plot of tip against table size with a fitted trend line.
# geom_smooth(method = "lm") draws the least-squares line for the plotted
# data; a bare geom_abline() would only draw the fixed line y = x, which
# says nothing about how the two variables relate.
tip_size <- ggplot(data = tips, mapping = aes(x = tip, y = size)) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ x, se = FALSE) +
  labs(title = "Tip-Table Size Relationship",
       subtitle = 'Conclusion: No Relationship',
       x = "Tip",
       y = "Size")
tip_size
Plot to find the relationship between the total bill and tip (Correlation Coefficient and Chart).