library(readxl)
# Read the Toyota price workbook from its fixed location on disk.
ToyotaPrices <- read_excel(
  path = "C:/RBS/Data Analysis/Homework/Week 4/ToyotaPrices.xlsx"
)
## Convert the indicator columns to factors with No/Yes levels
# 2.2 Convert categorical variables to factors
# Explicit `levels`/`labels` pin 0 -> "No" and 1 -> "Yes". Relabelling by
# position with `levels(x) <- c("No", "Yes")` silently mislabels a column that
# happens to contain only 1s, and errors if a column holds a third value.
# NOTE(review): assumes every column below is coded 0 = absent, 1 = present --
# confirm against the workbook.
indicator_cols <- c(
  "Mfr_Guarantee", "BOVAG_Guarantee", "ABS", "Airbag_1", "Airbag_2",
  "Airco", "Automatic_airco", "Boardcomputer", "CD_Player", "Central_Lock",
  "Powered_Windows", "Power_Steering", "Radio", "Mistlamps", "Tow_Bar",
  "Sport_Model", "Backseat_Divider", "Metallic_Rim", "Radio_cassette",
  "Automatic"
)
# One pass over the column list replaces the per-column copy-paste (which
# converted Powered_Windows twice).
ToyotaPrices[indicator_cols] <- lapply(
  ToyotaPrices[indicator_cols],
  function(x) factor(x, levels = c(0, 1), labels = c("No", "Yes"))
)
# Mark selected values as missing so the affected rows are dropped below.
# Odometer readings under 500 km:
ToyotaPrices$KM[ToyotaPrices$KM < 500] <- NA
# Engine size recorded as 16000 cc (presumably a data-entry error -- confirm):
ToyotaPrices$cc[ToyotaPrices$cc == 16000] <- NA
# The former `Automatic == 0` recode was removed: Automatic is already a
# No/Yes factor at this point, so comparing it with 0 never matched any row.

# Drop every row that contains at least one NA, then summarise what is left.
ToyotaPrices <- na.omit(ToyotaPrices)
summary(ToyotaPrices)
## Id Price Age_08_04 Mfg_Month
## Min. : 1.0 Min. : 4350 Min. : 4.00 Min. : 1.000
## 1st Qu.: 368.8 1st Qu.: 8450 1st Qu.:44.00 1st Qu.: 3.000
## Median : 726.5 Median : 9900 Median :61.00 Median : 5.000
## Mean : 725.1 Mean :10669 Mean :56.28 Mean : 5.537
## 3rd Qu.:1083.2 3rd Qu.:11900 3rd Qu.:70.00 3rd Qu.: 8.000
## Max. :1441.0 Max. :31275 Max. :80.00 Max. :12.000
## Mfg_Year KM HP Automatic cc
## Min. :1998 Min. : 1500 Min. : 69.0 No :1345 Min. :1300
## 1st Qu.:1998 1st Qu.: 43427 1st Qu.: 86.0 Yes: 79 1st Qu.:1400
## Median :1999 Median : 63831 Median :110.0 Median :1600
## Mean :2000 Mean : 69096 Mean :101.5 Mean :1567
## 3rd Qu.:2001 3rd Qu.: 87422 3rd Qu.:110.0 3rd Qu.:1600
## Max. :2004 Max. :243000 Max. :192.0 Max. :2000
## Doors Cylinders Gears Quarterly_Tax Weight
## Min. :2.000 Min. :4 Min. :3.000 Min. : 19.00 Min. :1000
## 1st Qu.:3.000 1st Qu.:4 1st Qu.:5.000 1st Qu.: 69.00 1st Qu.:1040
## Median :4.000 Median :4 Median :5.000 Median : 85.00 Median :1065
## Mean :4.029 Mean :4 Mean :5.027 Mean : 87.24 Mean :1072
## 3rd Qu.:5.000 3rd Qu.:4 3rd Qu.:5.000 3rd Qu.: 85.00 3rd Qu.:1085
## Max. :5.000 Max. :4 Max. :6.000 Max. :283.00 Max. :1615
## Mfr_Guarantee BOVAG_Guarantee Guarantee_Period ABS Airbag_1
## No :839 No : 143 Min. : 3.00 No : 267 No : 42
## Yes:585 Yes:1281 1st Qu.: 3.00 Yes:1157 Yes:1382
## Median : 3.00
## Mean : 3.73
## 3rd Qu.: 3.00
## Max. :28.00
## Airbag_2 Airco Automatic_airco Boardcomputer CD_Player Central_Lock
## No : 396 No :705 No :1346 No :1010 No :1118 No :602
## Yes:1028 Yes:719 Yes: 78 Yes: 414 Yes: 306 Yes:822
##
##
##
##
## Powered_Windows Power_Steering Radio Mistlamps Sport_Model
## No :628 No : 32 No :1215 No :1057 No :998
## Yes:796 Yes:1392 Yes: 209 Yes: 367 Yes:426
##
##
##
##
## Backseat_Divider Metallic_Rim Radio_cassette Tow_Bar
## No : 323 No :1132 No :1216 No :1026
## Yes:1101 Yes: 292 Yes: 208 Yes: 398
##
##
##
##
# Load library
library(ggplot2)
# 1. Side-by-side boxplots of the standardised numeric columns
# Columns are picked by position (2-7, 9-10, 13-14) and centred/scaled with
# scale() so that variables on different units share one axis.
boxplot(
  scale(
    ToyotaPrices[, c(2:7, 9:10, 13:14)]
  )
)

## Price, KM, Mfg_Month, Mfg_Year, HP, cc, Quarterly_Tax, and Weight are skewed to varying degrees. KM has the most outliers, while Weight has the largest scaled outliers.
# 2. Scatterplot Price vs. KM
library(ggplot2)
# scatterplot
# Price against mileage: semi-transparent points to reveal overplotting,
# with a loess trend line drawn without its confidence band.
PriceKM <- ggplot(data = ToyotaPrices, mapping = aes(x = KM, y = Price)) +
  geom_point(alpha = 0.25) +
  geom_smooth(method = "loess", se = FALSE, color = "blue")
PriceKM
## `geom_smooth()` using formula 'y ~ x'

## The relationship looks curved: the impact of mileage on price diminishes as the mileage increases.
# 3. Box-and-whisker plot of Price by ABS
# A formula with a factor on the right-hand side produces one box per level.
plot(Price ~ ABS, data = ToyotaPrices)

# 4. Correlation between Price and KM
cor(ToyotaPrices$Price, ToyotaPrices$KM)
## [1] -0.5639865
# It is a moderately strong negative correlation. Generally, the higher the kilometers, the lower the price.
# Working subsets: Price, KM and Weight with the Tow_Bar factor ...
myData_PKWT <- subset(
  ToyotaPrices,
  select = c(Price, KM, Weight, Tow_Bar)
)
# ... and the same three numeric columns without the factor.
myData2 <- subset(myData_PKWT, select = c(Price, KM, Weight))
#5. Price vs. Tow Bar
# Plot Price (not KM) against Tow_Bar so the figure matches this section's
# heading and the price interpretation that follows.
plot(Price ~ Tow_Bar, data = ToyotaPrices)
# Generally, Toyotas without a tow bar cost slightly less than Toyotas with a tow bar. The two box plots have similar median prices, and both skew right with numerous outliers at the high end of the data. The presence of a tow bar does not appear to be a strong driver of price.
# NOTE(review): re-check the interpretation above against the Price plot; the
# script previously drew KM ~ Tow_Bar here.
#6. KM vs. Tow Bar
plot(KM ~ Tow_Bar, data = ToyotaPrices)

# The two boxplots are very similar. Vehicles with a tow bar seem to have higher mileage, though the outliers with the highest mileage are in vehicles without tow bars. I don't believe the presence of a tow bar is a strong predictor for high mileage.
# 7. The deepened mystery
# Price against mileage again, now coloured by Tow_Bar so that each group
# gets its own point cloud and its own default smoother.
PriceKMTowBar <- ggplot(
  data = ToyotaPrices,
  mapping = aes(x = KM, y = Price, color = Tow_Bar)
) +
  geom_point(alpha = 1) +
  geom_smooth()
PriceKMTowBar
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

# The plot shows that vehicles with the Tow Bar are consistently cheaper than vehicles without the tow bar. The impact of a tow bar on price is most significant at under 50,000 km.