This is the final project for the Summer 2018 R Bridge course to demonstrate the skills learned through the course. It is laid out to follow the general flow of systematic review:
Does the tipping rate vary in correlation with the total bill? Also, does the time of day have any impact on this relationship?
# Load the restaurant tipping data set (one row per bill) directly from GitHub.
# stringsAsFactors = TRUE is set explicitly: since R 4.0.0 text columns are
# read as character by default, but the per-level counts that summary() reports
# for sex, smoker, day and time require those columns to be factors.
theURL <- "https://raw.githubusercontent.com/ChadRyanBailey/BridgeRHomeworkData/master/tips.csv"
tippingData <- read.table(
  file = theURL,
  header = TRUE,
  sep = ",",
  stringsAsFactors = TRUE
)
# Preview the first six rows to confirm the file was parsed as expected.
head(tippingData)
## X total_bill tip sex smoker day time size
## 1 1 16.99 1.01 Female No Sun Dinner 2
## 2 2 10.34 1.66 Male No Sun Dinner 3
## 3 3 21.01 3.50 Male No Sun Dinner 3
## 4 4 23.68 3.31 Male No Sun Dinner 2
## 5 5 24.59 3.61 Female No Sun Dinner 4
## 6 6 25.29 4.71 Male No Sun Dinner 4
# Record how many observations (rows) and variables (columns) were loaded,
# then show both counts side by side as a one-row data frame.
nrows <- dim(tippingData)[1]
ncolumns <- dim(tippingData)[2]
dfDimensions <- data.frame(nrows = nrows, ncolumns = ncolumns)
dfDimensions
## nrows ncolumns
## 1 244 8
# Per-column overview: quartiles and mean for numeric columns, level counts
# for factor columns.
summary(tippingData)
## X total_bill tip sex smoker
## Min. : 1.00 Min. : 3.07 Min. : 1.000 Female: 87 No :151
## 1st Qu.: 61.75 1st Qu.:13.35 1st Qu.: 2.000 Male :157 Yes: 93
## Median :122.50 Median :17.80 Median : 2.900
## Mean :122.50 Mean :19.79 Mean : 2.998
## 3rd Qu.:183.25 3rd Qu.:24.13 3rd Qu.: 3.562
## Max. :244.00 Max. :50.81 Max. :10.000
## day time size
## Fri :19 Dinner:176 Min. :1.00
## Sat :87 Lunch : 68 1st Qu.:2.00
## Sun :76 Median :2.00
## Thur:62 Mean :2.57
## 3rd Qu.:3.00
## Max. :6.00
# Compare the mean and the median of the two dollar-valued columns.
# vapply() replaces sapply() so that each statistic is guaranteed to be exactly
# one numeric value per column; anything else raises an error instead of
# silently changing the shape of the result.
amountColumns <- tippingData[, c("total_bill", "tip")]
means <- vapply(amountColumns, mean, numeric(1))
medians <- vapply(amountColumns, median, numeric(1))
# Stack the two named vectors so rows are the statistics and columns the fields.
means_mediansDF <- data.frame(rbind(means, medians))
print(means_mediansDF)
## total_bill tip
## means 19.78594 2.998279
## medians 17.79500 2.900000
# Correlation (cor() default method) between the bill total and the raw tip.
with(tippingData, cor(total_bill, tip))
## [1] 0.6757341
# Keep only the columns needed for the research question: the row id, the bill,
# the tip and the meal time.
tippingDataSlim <- subset(tippingData, select = c(X, total_bill, tip, time))
head(tippingDataSlim)
## X total_bill tip time
## 1 1 16.99 1.01 Dinner
## 2 2 10.34 1.66 Dinner
## 3 3 21.01 3.50 Dinner
## 4 4 23.68 3.31 Dinner
## 5 5 24.59 3.61 Dinner
## 6 6 25.29 4.71 Dinner
# Give the retained columns presentation-friendly names using a lookup table
# (old name -> new name). Columns not listed in the table are left untouched.
newNames <- c(
  X = "Id",
  total_bill = "Total_Bill",
  tip = "Tip_Amount",
  time = "Time"
)
toRename <- names(tippingDataSlim) %in% names(newNames)
names(tippingDataSlim)[toRename] <-
  unname(newNames[names(tippingDataSlim)[toRename]])
head(tippingDataSlim)
## Id Total_Bill Tip_Amount Time
## 1 1 16.99 1.01 Dinner
## 2 2 10.34 1.66 Dinner
## 3 3 21.01 3.50 Dinner
## 4 4 23.68 3.31 Dinner
## 5 5 24.59 3.61 Dinner
## 6 6 25.29 4.71 Dinner
# Derive the tip as a percentage of the bill it was left on.
tippingDataSlim[["Tip_Percent"]] <-
  with(tippingDataSlim, Tip_Amount / Total_Bill * 100)
head(tippingDataSlim)
## Id Total_Bill Tip_Amount Time Tip_Percent
## 1 1 16.99 1.01 Dinner 5.944673
## 2 2 10.34 1.66 Dinner 16.054159
## 3 3 21.01 3.50 Dinner 16.658734
## 4 4 23.68 3.31 Dinner 13.978041
## 5 5 24.59 3.61 Dinner 14.680765
## 6 6 25.29 4.71 Dinner 18.623962
# Lunch-only rows; the original row names are carried over.
tippingDataSlimLunch <- subset(tippingDataSlim, Time == "Lunch")
head(tippingDataSlimLunch)
## Id Total_Bill Tip_Amount Time Tip_Percent
## 78 78 27.20 4.00 Lunch 14.70588
## 79 79 22.76 3.00 Lunch 13.18102
## 80 80 17.29 2.71 Lunch 15.67380
## 81 81 19.44 3.00 Lunch 15.43210
## 82 82 16.66 3.40 Lunch 20.40816
## 83 83 10.07 1.83 Lunch 18.17279
# Dinner-only rows; the original row names are carried over.
tippingDataSlimDinner <- subset(tippingDataSlim, Time == "Dinner")
head(tippingDataSlimDinner)
## Id Total_Bill Tip_Amount Time Tip_Percent
## 1 1 16.99 1.01 Dinner 5.944673
## 2 2 10.34 1.66 Dinner 16.054159
## 3 3 21.01 3.50 Dinner 16.658734
## 4 4 23.68 3.31 Dinner 13.978041
## 5 5 24.59 3.61 Dinner 14.680765
## 6 6 25.29 4.71 Dinner 18.623962
The current dataset is relatively clean and does not need any values to be adjusted. However, the following R code shows how to find and replace text within a column, had that been needed. This example would have been applicable had the field tippingDataSlim$Time been values {“L”, “D”} rather than {“Lunch”, “Dinner”}.
# library() stops with an error if stringr is missing, whereas require() only
# returns FALSE and lets the script fail later at the first str_replace() call.
library(stringr)
# Force the text from factor to character before doing any replacement.
vectorx <- as.character(tippingDataSlim[["Time"]])
unique(vectorx)
# Anchor each pattern with ^...$ so that only a value that is exactly the
# one-letter code gets expanded. The unanchored patterns "L" and "D" also match
# the first letter of already-expanded values, turning "Lunch" into
# "Lunchunch" and "Dinner" into "Dinnerinner".
vectorx <- str_replace(string = vectorx, pattern = "^L$", replacement = "Lunch")
vectorx <- str_replace(string = vectorx, pattern = "^D$", replacement = "Dinner")
unique(vectorx)
# Correlation of bill size with tip percentage: first across all meals, then
# for lunch and dinner separately. The three values are stacked into a
# one-column matrix so they can be compared at a glance.
corAll <- with(tippingDataSlim, cor(Total_Bill, Tip_Percent))
corLunch <- with(tippingDataSlimLunch, cor(Total_Bill, Tip_Percent))
corDinner <- with(tippingDataSlimDinner, cor(Total_Bill, Tip_Percent))
rbind(corAll, corLunch, corDinner)
## [,1]
## corAll -0.3386241
## corLunch -0.2935099
## corDinner -0.3502704
There does appear to be a negative correlation between Total_Bill and Tip_Percent (i.e., as Total_Bill increases, Tip_Percent decreases). Next we will need to consider which graphics best display this information.
# Boxplot: distribution of Tip_Percent for each Time, boxes drawn horizontally
boxplot(
  Tip_Percent ~ Time,
  data = tippingDataSlim,
  horizontal = TRUE,
  xlab = "Tip_Percent",
  ylab = "Time",
  main = "Base R Boxplot \nTime versus Tip_Percent"
)

# Histogram: overall distribution of Tip_Percent with fixed axis limits
hist(
  tippingDataSlim$Tip_Percent,
  breaks = 10,
  xlim = c(0, 80),
  ylim = c(0, 100),
  main = "Base R Histogram \nTip_Percent",
  xlab = "Tip_Percent"
)

# Scatterplot: Tip_Percent (y) against Total_Bill (x)
plot(
  Tip_Percent ~ Total_Bill,
  data = tippingDataSlim,
  main = "Base R Scatterplot \nTip_Percent versus Total_Bill"
)
require("ggplot2")
## Loading required package: ggplot2
# ggplot2 boxplot: Tip_Percent by Time, built as a single chained expression.
# coord_flip() turns the boxes sideways so they run horizontally.
ggBoxplot <- ggplot(tippingDataSlim, aes(x = Time, y = Tip_Percent)) +
  geom_boxplot() +
  ylab("Tip_Percent") +
  xlab("Time") +
  coord_flip() +
  ggtitle("ggplot2 Boxplot \nTip_Percent by Time")
ggBoxplot # render the graphic
# ggplot2 histogram: Tip_Percent in bins 2.5 percentage points wide, with one
# panel per Time value placed side by side.
ggHist <- ggplot(tippingDataSlim, aes(x = Tip_Percent)) +
  geom_histogram(binwidth = 2.5) +
  facet_grid(~ Time) +
  ggtitle("ggplot2 Histogram \nTip_Percent by Time")
ggHist # render the graphic
# ggplot2 scatterplot: Tip_Percent against Total_Bill, one panel per Time.
# The points are drawn once, coloured by Time. The earlier version added an
# uncoloured geom_point() layer first and then the coloured layer on top of it,
# so every observation was plotted twice.
ggScatPlot <- ggplot(tippingDataSlim, aes(x = Total_Bill, y = Tip_Percent)) +
  geom_point(aes(color = Time)) +
  facet_grid(~ Time) + # split into side-by-side panels by Time
  geom_smooth(method = "lm", formula = y ~ x) + # linear trend line per panel
  ggtitle("ggplot2 Scatterplot \nTip_Percent versus Total_Bill by Time")
ggScatPlot # render the graphic
Overall, the tip rate has a weak negative correlation with the total bill. That is, as the total bill gets larger, the percent tipped gets smaller. This is true for both lunch and dinner, although the negative correlation is stronger for dinner than for lunch. This phenomenon was readily detected through the use of the base R function cor() and was best displayed visually through the use of ggplot2 scatterplots.