Packages: knitr, stringr, ggplot2
library("knitr")
## Warning: package 'knitr' was built under R version 3.5.3
library("stringr")
## Warning: package 'stringr' was built under R version 3.5.3
library("ggplot2")
## Warning: package 'ggplot2' was built under R version 3.5.3
Tips Data Set
# Load the tips data set from GitHub and print a per-column summary.
# stringsAsFactors is set explicitly: since R 4.0.0 read.csv() keeps strings
# as character by default, in which case summary() would no longer tabulate
# the levels of the text columns (sex, smoker, day, time) as shown below.
tips <- read.csv(
  file = "https://raw.githubusercontent.com/josephsimone/tips/master/tips.csv",
  header = TRUE,
  sep = ",",
  stringsAsFactors = TRUE
)
summary(tips)
## X total_bill tip sex smoker
## Min. : 1.00 Min. : 3.07 Min. : 1.000 Female: 87 No :151
## 1st Qu.: 61.75 1st Qu.:13.35 1st Qu.: 2.000 Male :157 Yes: 93
## Median :122.50 Median :17.80 Median : 2.900
## Mean :122.50 Mean :19.79 Mean : 2.998
## 3rd Qu.:183.25 3rd Qu.:24.13 3rd Qu.: 3.562
## Max. :244.00 Max. :50.81 Max. :10.000
## day time size
## Fri :19 Dinner:176 Min. :1.00
## Sat :87 Lunch : 68 1st Qu.:2.00
## Sun :76 Median :2.00
## Thur:62 Mean :2.57
## 3rd Qu.:3.00
## Max. :6.00
Do the total bill amount and the time of day of the meal correlate with the tip rate the server receives?
# Mean and median of the bill and tip columns, stacked into one data frame
# (one row per statistic, one column per variable).
# vapply() is used instead of sapply() so the result is always a named
# numeric vector, and an unexpected column type fails loudly.
bill_tip_cols <- tips[, c("total_bill", "tip")]
tipmeans <- vapply(bill_tip_cols, mean, numeric(1))
tipmedians <- vapply(bill_tip_cols, median, numeric(1))
m_m_DF <- data.frame(rbind(tipmeans, tipmedians))
print(m_m_DF)
## total_bill tip
## tipmeans 19.78594 2.998279
## tipmedians 17.79500 2.900000
# Correlation (cor() default method) between the bill total and the tip.
cl <- with(tips, cor(total_bill, tip))
print(cl)
## [1] 0.6757341
NEW table of specific columns
# Keep only the columns needed for the analysis and give them display names.
# The new names are listed in the same order as the selected columns.
tipsData <- tips[, c("X", "total_bill", "tip", "time")]
names(tipsData) <- c("Id", "Total_Bill", "Tip_Amount", "Time")
head(tipsData)
## Id Total_Bill Tip_Amount Time
## 1 1 16.99 1.01 Dinner
## 2 2 10.34 1.66 Dinner
## 3 3 21.01 3.50 Dinner
## 4 4 23.68 3.31 Dinner
## 5 5 24.59 3.61 Dinner
## 6 6 25.29 4.71 Dinner
Creation of NEW column
# Tip as a percentage of the total bill, stored as a new column.
tipsData[["Tip_Percent"]] <- with(tipsData, (Tip_Amount / Total_Bill) * 100)
head(tipsData)
## Id Total_Bill Tip_Amount Time Tip_Percent
## 1 1 16.99 1.01 Dinner 5.944673
## 2 2 10.34 1.66 Dinner 16.054159
## 3 3 21.01 3.50 Dinner 16.658734
## 4 4 23.68 3.31 Dinner 13.978041
## 5 5 24.59 3.61 Dinner 14.680765
## 6 6 25.29 4.71 Dinner 18.623962
Creation of ADDITIONAL subsets based on time of day and CORRELATIONS based on Time of Day
# Rows for lunch-time meals only; original row names are kept.
Lunch <- subset(tipsData, Time == "Lunch")
head(Lunch)
## Id Total_Bill Tip_Amount Time Tip_Percent
## 78 78 27.20 4.00 Lunch 14.70588
## 79 79 22.76 3.00 Lunch 13.18102
## 80 80 17.29 2.71 Lunch 15.67380
## 81 81 19.44 3.00 Lunch 15.43210
## 82 82 16.66 3.40 Lunch 20.40816
## 83 83 10.07 1.83 Lunch 18.17279
# Rows for dinner-time meals only; original row names are kept.
Dinner <- subset(tipsData, Time == "Dinner")
head(Dinner)
## Id Total_Bill Tip_Amount Time Tip_Percent
## 1 1 16.99 1.01 Dinner 5.944673
## 2 2 10.34 1.66 Dinner 16.054159
## 3 3 21.01 3.50 Dinner 16.658734
## 4 4 23.68 3.31 Dinner 13.978041
## 5 5 24.59 3.61 Dinner 14.680765
## 6 6 25.29 4.71 Dinner 18.623962
# List the distinct meal times present in the data.
# as.character() is already vectorized, so no element-wise sapply() is needed;
# it also returns character(0) rather than list() when there are no rows.
x <- as.character(tipsData[["Time"]])
unique(x)
## [1] "Dinner" "Lunch"
# Correlation between bill total and tip percentage: over all meals, then
# separately for the lunch and dinner subsets.
Allday <- with(tipsData, cor(Total_Bill, Tip_Percent))
Lunchtime <- with(Lunch, cor(Total_Bill, Tip_Percent))
Dinnertime <- with(Dinner, cor(Total_Bill, Tip_Percent))
# Stack the three coefficients into a one-column matrix labelled by row.
rbind(Allday, Lunchtime, Dinnertime)
## [,1]
## Allday -0.3386241
## Lunchtime -0.2935099
## Dinnertime -0.3502704
Boxplot:
# Boxplot of tip percentage per meal time, flipped so the boxes run
# horizontally.
tipsBoxplot <- ggplot(tipsData, aes(x = Time, y = Tip_Percent)) +
  geom_boxplot() +
  labs(x = "Time", y = "Tip_Percent") +
  coord_flip() +
  ggtitle("ggplot2 Boxplot \nTip_Percent by Time")
tipsBoxplot
Histogram:
# Histogram of tip percentage (bins 2.5 percentage points wide), with one
# panel per meal time.
tipsHistogram <- ggplot(tipsData, aes(x = Tip_Percent)) +
  geom_histogram(binwidth = 2.5) +
  facet_grid(. ~ Time) +
  ggtitle("ggplot2 Histogram \nTip_Percent by Time")
tipsHistogram
Scatterplot:
# Scatterplot of tip percentage against bill total, one panel per meal time,
# with a fitted linear trend line in each panel.
# A single point layer colored by Time is used; an extra uncolored
# geom_point() layer would only draw every point a second time underneath.
tipsScatterplot <- ggplot(tipsData, aes(x = Total_Bill, y = Tip_Percent)) +
  geom_point(aes(color = Time)) +
  facet_grid(~ Time) +
  geom_smooth(method = "lm", formula = y ~ x) +
  ggtitle("ggplot2 Scatterplot \nTip_Percent versus Total_Bill by Time")
tipsScatterplot
The total bill is negatively correlated with the tip rate: as the total amount of the bill increases, the tip rate decreases. This correlation holds for both lunch and dinner meals, although the negative correlation is stronger at dinner (-0.35) than at lunch (-0.29).