LOAD Packages

knitr, stringr, ggplot2

library("knitr")
## Warning: package 'knitr' was built under R version 3.5.3
library("stringr")
## Warning: package 'stringr' was built under R version 3.5.3
library("ggplot2")
## Warning: package 'ggplot2' was built under R version 3.5.3

GitHub Raw Data load

Tips Data Set

# Load the tips data set straight from GitHub.
# stringsAsFactors is set explicitly so that the text columns (sex, smoker,
# day, time) are read as factors on every R version: the default changed from
# TRUE to FALSE in R 4.0.0, and the per-level counts printed by summary()
# depend on these columns being factors.
tips <- read.csv(
  file = "https://raw.githubusercontent.com/josephsimone/tips/master/tips.csv",
  header = TRUE,
  sep = ",",
  stringsAsFactors = TRUE
)
summary(tips)
##        X            total_bill         tip             sex      smoker   
##  Min.   :  1.00   Min.   : 3.07   Min.   : 1.000   Female: 87   No :151  
##  1st Qu.: 61.75   1st Qu.:13.35   1st Qu.: 2.000   Male  :157   Yes: 93  
##  Median :122.50   Median :17.80   Median : 2.900                         
##  Mean   :122.50   Mean   :19.79   Mean   : 2.998                         
##  3rd Qu.:183.25   3rd Qu.:24.13   3rd Qu.: 3.562                         
##  Max.   :244.00   Max.   :50.81   Max.   :10.000                         
##    day         time          size     
##  Fri :19   Dinner:176   Min.   :1.00  
##  Sat :87   Lunch : 68   1st Qu.:2.00  
##  Sun :76                Median :2.00  
##  Thur:62                Mean   :2.57  
##                         3rd Qu.:3.00  
##                         Max.   :6.00

Thesis for analysis

Do the total bill amount and the time of day of the meal correlate with the tip rate (the tip as a percentage of the bill) that the server receives?

Data Exploration

# Column-wise mean and median of the bill and tip amounts.
# vapply() replaces sapply() so that each summary is guaranteed to be a single
# numeric value; sapply() silently changes its return type on unexpected input.
# The names tipmeans / tipmedians are kept because rbind() uses them as the
# row names of the resulting table.
tipmeans <- vapply(tips[, c("total_bill", "tip")], mean, numeric(1))
tipmedians <- vapply(tips[, c("total_bill", "tip")], median, numeric(1))
m_m_DF <- data.frame(rbind(tipmeans, tipmedians))

Means and Medians of the total bill and the tip of an order

# Print the summary table: one row each for tipmeans and tipmedians, one
# column each for total_bill and tip.
print(m_m_DF) 
##            total_bill      tip
## tipmeans     19.78594 2.998279
## tipmedians   17.79500 2.900000

The correlation between the total bill and the tip of an order

# Pearson correlation (cor() default) between bill size and tip amount.
cl <- with(tips, cor(total_bill, tip))
print(cl)
## [1] 0.6757341

Data Wrangling

NEW table of specific columns

# Keep only the columns needed for the analysis and give them presentation
# names in one step (names are matched to the selected columns by position).
tipsData <- setNames(
  tips[, c("X", "total_bill", "tip", "time")],
  c("Id", "Total_Bill", "Tip_Amount", "Time")
)
head(tipsData)
##   Id Total_Bill Tip_Amount   Time
## 1  1      16.99       1.01 Dinner
## 2  2      10.34       1.66 Dinner
## 3  3      21.01       3.50 Dinner
## 4  4      23.68       3.31 Dinner
## 5  5      24.59       3.61 Dinner
## 6  6      25.29       4.71 Dinner

Creation of NEW column

# Tip expressed as a percentage of the total bill.
tipsData[["Tip_Percent"]] <- with(tipsData, Tip_Amount / Total_Bill * 100)
head(tipsData)
##   Id Total_Bill Tip_Amount   Time Tip_Percent
## 1  1      16.99       1.01 Dinner    5.944673
## 2  2      10.34       1.66 Dinner   16.054159
## 3  3      21.01       3.50 Dinner   16.658734
## 4  4      23.68       3.31 Dinner   13.978041
## 5  5      24.59       3.61 Dinner   14.680765
## 6  6      25.29       4.71 Dinner   18.623962

Creation of ADDITIONAL subsets based on time of day and CORRELATIONS based on Time of Day

# Rows for lunch-time meals only. %in% never yields NA, so rows with a missing
# Time are excluded rather than returned as all-NA rows.
Lunch <- tipsData[tipsData$Time %in% "Lunch", ]
head(Lunch)
##    Id Total_Bill Tip_Amount  Time Tip_Percent
## 78 78      27.20       4.00 Lunch    14.70588
## 79 79      22.76       3.00 Lunch    13.18102
## 80 80      17.29       2.71 Lunch    15.67380
## 81 81      19.44       3.00 Lunch    15.43210
## 82 82      16.66       3.40 Lunch    20.40816
## 83 83      10.07       1.83 Lunch    18.17279
# Rows for dinner-time meals only. %in% never yields NA, so rows with a missing
# Time are excluded rather than returned as all-NA rows.
Dinner <- tipsData[tipsData$Time %in% "Dinner", ]
head(Dinner)
##   Id Total_Bill Tip_Amount   Time Tip_Percent
## 1  1      16.99       1.01 Dinner    5.944673
## 2  2      10.34       1.66 Dinner   16.054159
## 3  3      21.01       3.50 Dinner   16.658734
## 4  4      23.68       3.31 Dinner   13.978041
## 5  5      24.59       3.61 Dinner   14.680765
## 6  6      25.29       4.71 Dinner   18.623962
# List the distinct meal times present in the data.
# as.character() is already vectorised, so it is applied to the whole column
# at once instead of element by element through sapply().
x <- as.character(tipsData[, "Time"])
unique(x)
## [1] "Dinner" "Lunch"
# Correlation between bill size and tip percentage, overall and per meal time.
# The variable names double as the row labels of the rbind() result.
Allday <- with(tipsData, cor(Total_Bill, Tip_Percent))
Lunchtime <- with(Lunch, cor(Total_Bill, Tip_Percent))
Dinnertime <- with(Dinner, cor(Total_Bill, Tip_Percent))
rbind(Allday, Lunchtime, Dinnertime)
##                  [,1]
## Allday     -0.3386241
## Lunchtime  -0.2935099
## Dinnertime -0.3502704

Graphics

Boxplot:

# Horizontal boxplot of tip percentage for each meal time, built as a single
# layered expression.
tipsBoxplot <- ggplot(tipsData, aes(x = Time, y = Tip_Percent)) +
  geom_boxplot() +
  ylab("Tip_Percent") +
  xlab("Time") +
  coord_flip() +
  ggtitle("ggplot2 Boxplot \nTip_Percent by Time")
tipsBoxplot

Histogram:

# Histogram of tip percentage (2.5-point bins), one panel per meal time.
tipsHistogram <- ggplot(tipsData, aes(x = Tip_Percent)) +
  geom_histogram(binwidth = 2.5) +
  facet_grid(~ Time) +
  ggtitle("ggplot2 Histogram \nTip_Percent by Time")
tipsHistogram

Scatterplot:

# Scatterplot of tip percentage against total bill, one panel per meal time,
# with a linear fit in each panel.
# Points are drawn once, coloured by Time. The original also added an
# uncoloured geom_point() layer first, which the coloured layer then drew over
# at identical positions, so that redundant layer has been removed.
tipsScatterplot <- ggplot(tipsData, aes(x = Total_Bill, y = Tip_Percent)) +
  geom_point(aes(color = Time)) +
  facet_grid(~ Time) +
  geom_smooth(method = "lm", formula = y ~ x) +
  ggtitle("ggplot2 Scatterplot \nTip_Percent versus Total_Bill by Time")
tipsScatterplot

Conclusions

The total bill is negatively correlated with the tip rate (the tip as a percentage of the bill): as the total amount of the bill increases, the tip rate decreases. This negative correlation holds for both lunch and dinner meals, and it is stronger at dinner (about -0.35) than at lunch (about -0.29).