Introduction

This is the final project for the Summer 2018 R Bridge course, demonstrating the skills learned throughout the course. It is laid out to follow the general flow of a systematic review:

  1. Pose a question
  2. Gather/load some data related to the question
  3. Explore the data
  4. Visualize the data
  5. Draw conclusions about the data related to the question


Question:

Does the tipping rate vary in correlation with the total bill? Also, does the time of day have any impact on this relationship?


Load the data

# Download the tips dataset (comma-separated, with a header row) from GitHub.
theURL <- "https://raw.githubusercontent.com/ChadRyanBailey/BridgeRHomeworkData/master/tips.csv"

# stringsAsFactors is set explicitly: since R 4.0.0 read.table() keeps text
# columns as character by default, but the level counts reported by summary()
# further down rely on sex/smoker/day/time being factors.
tippingData <- read.table(
  file = theURL,
  header = TRUE,
  sep = ",",
  stringsAsFactors = TRUE
)
head(tippingData)
##   X total_bill  tip    sex smoker day   time size
## 1 1      16.99 1.01 Female     No Sun Dinner    2
## 2 2      10.34 1.66   Male     No Sun Dinner    3
## 3 3      21.01 3.50   Male     No Sun Dinner    3
## 4 4      23.68 3.31   Male     No Sun Dinner    2
## 5 5      24.59 3.61 Female     No Sun Dinner    4
## 6 6      25.29 4.71   Male     No Sun Dinner    4


Explore the data

Check the dimensions of the data

# Record the row and column counts, then show them side by side.
nrows <- nrow(tippingData)
ncolumns <- ncol(tippingData)
dfDimensions <- data.frame(nrows = nrows, ncolumns = ncolumns)
dfDimensions
##   nrows ncolumns
## 1   244        8

Get a general summary of each column in the dataset

# Per-column overview: quartiles and mean for numeric columns, level counts for factor columns.
summary(tippingData)
##        X            total_bill         tip             sex      smoker   
##  Min.   :  1.00   Min.   : 3.07   Min.   : 1.000   Female: 87   No :151  
##  1st Qu.: 61.75   1st Qu.:13.35   1st Qu.: 2.000   Male  :157   Yes: 93  
##  Median :122.50   Median :17.80   Median : 2.900                         
##  Mean   :122.50   Mean   :19.79   Mean   : 2.998                         
##  3rd Qu.:183.25   3rd Qu.:24.13   3rd Qu.: 3.562                         
##  Max.   :244.00   Max.   :50.81   Max.   :10.000                         
##    day         time          size     
##  Fri :19   Dinner:176   Min.   :1.00  
##  Sat :87   Lunch : 68   1st Qu.:2.00  
##  Sun :76                Median :2.00  
##  Thur:62                Mean   :2.57  
##                         3rd Qu.:3.00  
##                         Max.   :6.00

Review just the means and medians of the primary variables

# Compare the mean and median of the two variables of interest.
primary_cols <- tippingData[, c("total_bill", "tip")]
means <- vapply(primary_cols, mean, numeric(1))
medians <- vapply(primary_cols, median, numeric(1))

# rbind() takes its row labels from the variable names above.
means_mediansDF <- data.frame(rbind(means, medians))
print(means_mediansDF)
##         total_bill      tip
## means     19.78594 2.998279
## medians   17.79500 2.900000

Review the correlation of the primary variables

# Correlation between the bill total and the raw tip amount.
with(tippingData, cor(total_bill, tip))
## [1] 0.6757341


Wrangle the data

Limit to the desired columns

# Keep only the row identifier, the two money columns, and the meal time.
keep_cols <- c("X", "total_bill", "tip", "time")
tippingDataSlim <- tippingData[keep_cols]
head(tippingDataSlim)
##   X total_bill  tip   time
## 1 1      16.99 1.01 Dinner
## 2 2      10.34 1.66 Dinner
## 3 3      21.01 3.50 Dinner
## 4 4      23.68 3.31 Dinner
## 5 5      24.59 3.61 Dinner
## 6 6      25.29 4.71 Dinner

Rename columns as needed

# Map each original column name to its presentation name and rename in one pass.
rename_map <- c(
  X = "Id",
  total_bill = "Total_Bill",
  tip = "Tip_Amount",
  time = "Time"
)

# Only columns that actually appear in the map are touched.
to_rename <- names(tippingDataSlim) %in% names(rename_map)
names(tippingDataSlim)[to_rename] <-
  unname(rename_map[names(tippingDataSlim)[to_rename]])
head(tippingDataSlim)
##   Id Total_Bill Tip_Amount   Time
## 1  1      16.99       1.01 Dinner
## 2  2      10.34       1.66 Dinner
## 3  3      21.01       3.50 Dinner
## 4  4      23.68       3.31 Dinner
## 5  5      24.59       3.61 Dinner
## 6  6      25.29       4.71 Dinner

Add computed columns as needed

# Tip expressed as a percentage of the bill.
tippingDataSlim[["Tip_Percent"]] <- with(
  tippingDataSlim,
  Tip_Amount / Total_Bill * 100
)
head(tippingDataSlim)
##   Id Total_Bill Tip_Amount   Time Tip_Percent
## 1  1      16.99       1.01 Dinner    5.944673
## 2  2      10.34       1.66 Dinner   16.054159
## 3  3      21.01       3.50 Dinner   16.658734
## 4  4      23.68       3.31 Dinner   13.978041
## 5  5      24.59       3.61 Dinner   14.680765
## 6  6      25.29       4.71 Dinner   18.623962

Limit to desired rows

# Rows for lunch service only; %in% never yields NA, so no which() wrapper is needed.
is_lunch <- tippingDataSlim$Time %in% "Lunch"
tippingDataSlimLunch <- tippingDataSlim[is_lunch, ]
head(tippingDataSlimLunch)
##    Id Total_Bill Tip_Amount  Time Tip_Percent
## 78 78      27.20       4.00 Lunch    14.70588
## 79 79      22.76       3.00 Lunch    13.18102
## 80 80      17.29       2.71 Lunch    15.67380
## 81 81      19.44       3.00 Lunch    15.43210
## 82 82      16.66       3.40 Lunch    20.40816
## 83 83      10.07       1.83 Lunch    18.17279
# Rows for dinner service only; %in% never yields NA, so no which() wrapper is needed.
is_dinner <- tippingDataSlim$Time %in% "Dinner"
tippingDataSlimDinner <- tippingDataSlim[is_dinner, ]
head(tippingDataSlimDinner)
##   Id Total_Bill Tip_Amount   Time Tip_Percent
## 1  1      16.99       1.01 Dinner    5.944673
## 2  2      10.34       1.66 Dinner   16.054159
## 3  3      21.01       3.50 Dinner   16.658734
## 4  4      23.68       3.31 Dinner   13.978041
## 5  5      24.59       3.61 Dinner   14.680765
## 6  6      25.29       4.71 Dinner   18.623962

Adjust values for clarity or consistency

The current dataset is relatively clean and does not need any values to be adjusted. However, the following R code shows how to find and replace text within a column, had that been needed. This example would have been applicable had the field tippingDataSlim$Time been values {“L”, “D”} rather than {“Lunch”, “Dinner”}.

library(stringr)

# Work on a plain character copy of the Time column (as.character() handles
# both factor and character input).
vectorx <- as.character(tippingDataSlim[["Time"]])
unique(vectorx)

# Anchor the patterns so only the bare codes "L" and "D" are expanded. An
# unanchored "L" also matches inside "Lunch" and would turn it into
# "Lunchunch" (likewise "Dinner" -> "Dinnerinner").
vectorx <- str_replace(string = vectorx, pattern = "^L$", replacement = "Lunch")
vectorx <- str_replace(string = vectorx, pattern = "^D$", replacement = "Dinner")
unique(vectorx)

Re-review correlations

# Correlation of bill size with tip percentage: overall, then per meal time.
corAll <- with(tippingDataSlim, cor(Total_Bill, Tip_Percent))
corLunch <- with(tippingDataSlimLunch, cor(Total_Bill, Tip_Percent))
corDinner <- with(tippingDataSlimDinner, cor(Total_Bill, Tip_Percent))

# rbind() labels each row with the name of the variable passed in.
rbind(corAll, corLunch, corDinner)
##                 [,1]
## corAll    -0.3386241
## corLunch  -0.2935099
## corDinner -0.3502704

Exploration notes

There does appear to be a negative correlation between Total_Bill and Tip_Percent (i.e., as Total_Bill increases, Tip_Percent decreases). Next we will need to consider which graphics best display this information.


Visualize the data

Base R Graphics

# Boxplot: distribution of Tip_Percent for each meal time, drawn horizontally.
boxplot(
  Tip_Percent ~ Time,
  data = tippingDataSlim,
  horizontal = TRUE,
  main = "Base R Boxplot \nTime versus Tip_Percent",
  xlab = "Tip_Percent",
  ylab = "Time"
)

# Histogram: overall distribution of Tip_Percent on fixed axis ranges.
tip_percent <- tippingDataSlim$Tip_Percent
hist(
  tip_percent,
  breaks = 10,
  xlim = c(0, 80),
  ylim = c(0, 100),
  main = "Base R Histogram \nTip_Percent",
  xlab = "Tip_Percent"
)

# Scatterplot: Tip_Percent against Total_Bill.
plot(
  Tip_Percent ~ Total_Bill,
  data = tippingDataSlim,
  main = "Base R Scatterplot \nTip_Percent versus Total_Bill"
)

ggplot2 Graphics

# library() stops with an error if ggplot2 is missing, whereas require() only
# returns FALSE and lets the script fail later at the first ggplot() call.
library(ggplot2)

# ggplot2 boxplot: Tip_Percent by Time, flipped so the boxes run horizontally.
ggBoxplot <- ggplot(tippingDataSlim, aes(x = Time, y = Tip_Percent)) +
  geom_boxplot() +
  ylab("Tip_Percent") +
  xlab("Time") +
  coord_flip() +
  ggtitle("ggplot2 Boxplot \nTip_Percent by Time")
ggBoxplot # output graphic

# ggplot2 histogram: Tip_Percent in 2.5-point bins, one panel per meal time.
ggHist <- ggplot(tippingDataSlim, aes(x = Tip_Percent)) +
  geom_histogram(binwidth = 2.5) +
  facet_grid(~ Time) +
  ggtitle("ggplot2 Histogram \nTip_Percent by Time")
ggHist # output graphic

# ggplot2 scatterplot: Tip_Percent against Total_Bill, one panel per meal time.
# A single point layer is used; the earlier version added geom_point() twice,
# drawing every observation once uncoloured and again coloured on top.
ggScatPlot <- ggplot(tippingDataSlim, aes(x = Total_Bill, y = Tip_Percent)) +
  geom_point(aes(color = Time)) +
  facet_grid(~ Time) + # one panel per Time value
  geom_smooth(method = "lm", formula = y ~ x) + # linear trend line per panel
  ggtitle("ggplot2 Scatterplot \nTip_Percent versus Total_Bill by Time")
ggScatPlot # output graphic


Conclusions

Overall, the tip rate has a weak negative correlation with the total bill. That is, as the total bill gets larger, the percent tipped gets smaller. This is true for both lunch and dinner, although the negative correlation is stronger for dinner than for lunch. This phenomenon was readily detected through the use of the base R function cor() and was best displayed visually through the use of ggplot2 scatterplots.