getwd()
## [1] "E:/My_R_Work"
# Load the Toyota price data directly from its folder. Building the path with
# file.path() replaces the earlier setwd() call, so the session's working
# directory is no longer changed as a side effect of reading one file.
data_dir <- "E:/DADM/Lecture notes/# data"
ToyotaPrices <- read.csv(file.path(data_dir, "ToyotaPrices.csv"), header = TRUE)
## Summary statistics (min, quartiles, median, mean, max) of all the variables:
summary(ToyotaPrices)
## Id Price Age_08_04 Mfg_Month
## Min. : 1.0 Min. : 4350 Min. : 1.00 Min. : 1.000
## 1st Qu.: 361.8 1st Qu.: 8450 1st Qu.:44.00 1st Qu.: 3.000
## Median : 721.5 Median : 9900 Median :61.00 Median : 5.000
## Mean : 721.6 Mean :10731 Mean :55.95 Mean : 5.549
## 3rd Qu.:1081.2 3rd Qu.:11950 3rd Qu.:70.00 3rd Qu.: 8.000
## Max. :1442.0 Max. :32500 Max. :80.00 Max. :12.000
## Mfg_Year KM HP Automatic
## Min. :1998 Min. : 1 Min. : 69.0 Min. :0.00000
## 1st Qu.:1998 1st Qu.: 43000 1st Qu.: 90.0 1st Qu.:0.00000
## Median :1999 Median : 63390 Median :110.0 Median :0.00000
## Mean :2000 Mean : 68533 Mean :101.5 Mean :0.05571
## 3rd Qu.:2001 3rd Qu.: 87021 3rd Qu.:110.0 3rd Qu.:0.00000
## Max. :2004 Max. :243000 Max. :192.0 Max. :1.00000
## cc Doors Cylinders Gears
## Min. : 1300 Min. :2.000 Min. :4 Min. :3.000
## 1st Qu.: 1400 1st Qu.:3.000 1st Qu.:4 1st Qu.:5.000
## Median : 1600 Median :4.000 Median :4 Median :5.000
## Mean : 1577 Mean :4.033 Mean :4 Mean :5.026
## 3rd Qu.: 1600 3rd Qu.:5.000 3rd Qu.:4 3rd Qu.:5.000
## Max. :16000 Max. :5.000 Max. :4 Max. :6.000
## Quarterly_Tax Weight Mfr_Guarantee BOVAG_Guarantee
## Min. : 19.00 Min. :1000 Min. :0.0000 Min. :0.0000
## 1st Qu.: 69.00 1st Qu.:1040 1st Qu.:0.0000 1st Qu.:1.0000
## Median : 85.00 Median :1070 Median :0.0000 Median :1.0000
## Mean : 87.12 Mean :1072 Mean :0.4095 Mean :0.8955
## 3rd Qu.: 85.00 3rd Qu.:1085 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :283.00 Max. :1615 Max. :1.0000 Max. :1.0000
## Guarantee_Period ABS Airbag_1 Airbag_2
## Min. : 3.000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.: 3.000 1st Qu.:1.0000 1st Qu.:1.0000 1st Qu.:0.0000
## Median : 3.000 Median :1.0000 Median :1.0000 Median :1.0000
## Mean : 3.815 Mean :0.8134 Mean :0.9708 Mean :0.7228
## 3rd Qu.: 3.000 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :36.000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## Airco Automatic_airco Boardcomputer CD_Player
## Min. :0.0000 Min. :0.00000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :1.0000 Median :0.00000 Median :0.0000 Median :0.0000
## Mean :0.5084 Mean :0.05641 Mean :0.2946 Mean :0.2187
## 3rd Qu.:1.0000 3rd Qu.:0.00000 3rd Qu.:1.0000 3rd Qu.:0.0000
## Max. :1.0000 Max. :1.00000 Max. :1.0000 Max. :1.0000
## Central_Lock Powered_Windows Power_Steering Radio
## Min. :0.0000 Min. :0.000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.000 1st Qu.:1.0000 1st Qu.:0.0000
## Median :1.0000 Median :1.000 Median :1.0000 Median :0.0000
## Mean :0.5801 Mean :0.562 Mean :0.9777 Mean :0.1462
## 3rd Qu.:1.0000 3rd Qu.:1.000 3rd Qu.:1.0000 3rd Qu.:0.0000
## Max. :1.0000 Max. :1.000 Max. :1.0000 Max. :1.0000
## Mistlamps Sport_Model Backseat_Divider Metallic_Rim
## Min. :0.000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.000 1st Qu.:0.0000 1st Qu.:1.0000 1st Qu.:0.0000
## Median :0.000 Median :0.0000 Median :1.0000 Median :0.0000
## Mean :0.257 Mean :0.3001 Mean :0.7702 Mean :0.2047
## 3rd Qu.:1.000 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:0.0000
## Max. :1.000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## Radio_cassette Tow_Bar
## Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.0000 Median :0.0000
## Mean :0.1455 Mean :0.2779
## 3rd Qu.:0.0000 3rd Qu.:1.0000
## Max. :1.0000 Max. :1.0000
# Keep only the four variables used in the exercises: Price, KM, Weight and
# Tow_Bar (same column order as listed), then show the first six rows.
myData_PKWT <- ToyotaPrices[, c("Price", "KM", "Weight", "Tow_Bar")]
head(myData_PKWT, n = 6L)
## Price KM Weight Tow_Bar
## 1 13500 46986 1165 0
## 2 13750 72937 1165 0
## 3 13950 41711 1165 0
## 4 14950 48000 1165 0
## 5 13750 38500 1170 0
## 6 12950 61000 1170 0
# Same data without the Tow_Bar column; drop = FALSE keeps a data frame.
myData_PKW <- myData_PKWT[, names(myData_PKWT) != "Tow_Bar", drop = FALSE]
head(myData_PKW, n = 6L)
## Price KM Weight
## 1 13500 46986 1165
## 2 13750 72937 1165
## 3 13950 41711 1165
## 4 14950 48000 1165
## 5 13750 38500 1170
## 6 12950 61000 1170
# Exercise 1: Exploratory Data Analysis
# a - Data Summary
summary(myData_PKWT)
## Price KM Weight Tow_Bar
## Min. : 4350 Min. : 1 Min. :1000 Min. :0.0000
## 1st Qu.: 8450 1st Qu.: 43000 1st Qu.:1040 1st Qu.:0.0000
## Median : 9900 Median : 63390 Median :1070 Median :0.0000
## Mean :10731 Mean : 68533 Mean :1072 Mean :0.2779
## 3rd Qu.:11950 3rd Qu.: 87021 3rd Qu.:1085 3rd Qu.:0.0000
## Max. :32500 Max. :243000 Max. :1615 Max. :1.0000
## Yes, all of the variables exhibit skewness.
## Negative skewness indicates that the mean of the data values
## is less than the median, and the data distribution is left-skewed.
## Positive skewness indicates that the mean of the data values
## is larger than the median, and the data distribution is right-skewed.
## We apply the function skewness() from the e1071 package to compute the
## skewness coefficient of all the variables.
# library() stops with an error if e1071 is not installed; require() would
# only return FALSE and let the skewness() calls below fail later.
library(e1071)
skewness(myData_PKWT$Price)
## [1] 1.700327
## The skewness of Price is 1.700327.
## It indicates that its distribution is skewed towards the right.
skewness(myData_PKWT$KM)
## [1] 1.013791
## right-skewed.
skewness(myData_PKWT$Weight)
## [1] 3.102148
## right-skewed.
skewness(myData_PKWT$Tow_Bar)
## [1] 0.9908115
## right-skewed.
# Exercise 1: Exploratory Data Analysis
# b - Density-Plot and Normal QQ-Plot
# qplot() is deprecated in current ggplot2, so each plot is built with
# ggplot() plus an explicit layer; library() errors early if ggplot2 is missing.
library(ggplot2)
# Kernel Density Plot of Price
plot1 <- ggplot(myData_PKWT, aes(x = Price)) +
  geom_density() +
  ggtitle("Kernel Density Plot of Price")
plot1

# Normal QQ-Plot of Price
plot2 <- ggplot(myData_PKWT, aes(sample = Price)) +
  stat_qq() +
  ggtitle("QQ-Plot (Price)")
plot2

## From the patterns in these graphs, we see that the variable Price is right-skewed.
# Kernel Density Plot of KM
plot3 <- ggplot(myData_PKWT, aes(x = KM)) +
  geom_density() +
  ggtitle("Kernel Density Plot of KM")
plot3

# Normal QQ-Plot of KM
plot4 <- ggplot(myData_PKWT, aes(sample = KM)) +
  stat_qq() +
  ggtitle("QQ-Plot (KM)")
plot4

# From the patterns in these graphs, we see that the variable KM is not normally distributed: like Price, it is right-skewed (skewness of about 1.01).
# Exercise 1: Exploratory Data Analysis
# c - Recode a categorical variable as a factor
# Convert Tow_Bar to a factor and name its two levels in one call; the labels
# are applied to the levels in their sorted order.
myData_PKWT$Tow_Bar <- factor(myData_PKWT$Tow_Bar, labels = c("no", "yes"))
summary(myData_PKWT$Tow_Bar)
## no yes
## 1037 399
# Exercise 1: Exploratory Data Analysis
# d - Boxplot of Price vs Tow_Bar
# Built with ggplot() + geom_boxplot() because qplot() is deprecated.
plot5 <- ggplot(myData_PKWT, aes(x = Tow_Bar, y = Price)) +
  geom_boxplot()
plot5

## The two boxplots are different.
## Still, Tow_Bar doesn't seem to predict Price; rather, they seem to be independent attributes.
# Exercise 1: Exploratory Data Analysis
# e - KM vs Tow_Bar
# Built with ggplot() + geom_boxplot() because qplot() is deprecated.
plot6 <- ggplot(myData_PKWT, aes(x = Tow_Bar, y = KM)) +
  geom_boxplot()
plot6

## These boxplots differ in terms of their outliers.
## Tow_Bar does not appear to predict KM.
# Exercise 1: Exploratory Data Analysis
# f - Explain the pattern
# Price and KM against Tow_Bar in a 1 x 2 layout. par() returns the previous
# settings, which are restored once both panels have been drawn.
allt <- par(mfrow = c(1, 2))
plot(Price ~ Tow_Bar, data = myData_PKWT)
plot(KM ~ Tow_Bar, data = myData_PKWT)

par(allt)
# Same 1 x 2 layout again, this time as vertical jittered strip charts.
allt <- par(mfrow = c(1, 2))
stripchart(
  Price ~ Tow_Bar,
  data = myData_PKWT,
  method = "jitter",
  vertical = TRUE,
  xlab = "Tow Bar"
)
stripchart(
  KM ~ Tow_Bar,
  data = myData_PKWT,
  method = "jitter",
  vertical = TRUE,
  xlab = "Tow Bar"
)

par(allt)
## If the car has a tow bar, its price tends to be lower.
## For cars without a tow bar, many outliers are seen as the value of KM increases.
# Exercise 2: Scatterplot Matrix
# a - Basic scatterplot matrix
pairs(~ Price + KM + Weight, data = myData_PKWT)

# Linear model of Price on Weight and KM (no Tow_Bar term), assigned with `<-`.
fit <- lm(Price ~ Weight + KM, data = myData_PKWT)
fit
##
## Call:
## lm(formula = Price ~ Weight + KM, data = myData_PKWT)
##
## Coefficients:
## (Intercept) Weight KM
## -2.737e+04 3.895e+01 -5.355e-02
coef(fit)
## (Intercept) Weight KM
## -27374.759440 38.953211 -0.053553
summary(fit)
##
## Call:
## lm(formula = Price ~ Weight + KM, data = myData_PKWT)
##
## Residuals:
## Min 1Q Median 3Q Max
## -19112.5 -1294.3 -46.6 1248.9 8928.0
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.737e+04 1.174e+03 -23.32 <2e-16 ***
## Weight 3.895e+01 1.086e+00 35.87 <2e-16 ***
## KM -5.355e-02 1.524e-03 -35.13 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2165 on 1433 degrees of freedom
## Multiple R-squared: 0.6442, Adjusted R-squared: 0.6437
## F-statistic: 1297 on 2 and 1433 DF, p-value: < 2.2e-16
## We can see that Price decreases as the value of KM increases.
## Clearly, there are many outliers.
## KM and Weight appear to be redundant.
# Exercise 2: Scatterplot Matrix
# b - Scatterplot matrix with factor information
# Points are coloured by the integer code of each car's Tow_Bar level.
pairs(
  ~ Price + KM + Weight,
  data = myData_PKWT,
  col = as.integer(myData_PKWT$Tow_Bar)
)

## We can say that the relation between Price and KM is the same for cars with
## and without a tow bar.
## There are no clear relationships visible that appear to differ between the groups of cars
## with and without a tow bar.
# Exercise 3: Multiple regression model
# a - Fit the model
## $Price = \beta_0 + \beta_1 KM + \beta_2 Weight + \beta_3 TowBar + \varepsilon$
fit <- lm(Price ~ KM + Weight + Tow_Bar, data = myData_PKWT)
# Print coefficient tables without significance stars from here on.
options(show.signif.stars = FALSE)
summary(fit)
##
## Call:
## lm(formula = Price ~ KM + Weight + Tow_Bar, data = myData_PKWT)
##
## Residuals:
## Min 1Q Median 3Q Max
## -19077.1 -1248.8 -38.2 1230.7 8795.0
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.678e+04 1.168e+03 -22.930 < 2e-16
## KM -5.288e-02 1.515e-03 -34.910 < 2e-16
## Weight 3.853e+01 1.079e+00 35.726 < 2e-16
## Tow_Baryes -6.835e+02 1.271e+02 -5.378 8.8e-08
##
## Residual standard error: 2144 on 1432 degrees of freedom
## Multiple R-squared: 0.6513, Adjusted R-squared: 0.6505
## F-statistic: 891.4 on 3 and 1432 DF, p-value: < 2.2e-16
# Exercise 3: Multiple regression model
# b - Residual five-number summary
# The "Residuals" section of the model summary lists min, quartiles and max.
summary(fit)
##
## Call:
## lm(formula = Price ~ KM + Weight + Tow_Bar, data = myData_PKWT)
##
## Residuals:
## Min 1Q Median 3Q Max
## -19077.1 -1248.8 -38.2 1230.7 8795.0
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.678e+04 1.168e+03 -22.930 < 2e-16
## KM -5.288e-02 1.515e-03 -34.910 < 2e-16
## Weight 3.853e+01 1.079e+00 35.726 < 2e-16
## Tow_Baryes -6.835e+02 1.271e+02 -5.378 8.8e-08
##
## Residual standard error: 2144 on 1432 degrees of freedom
## Multiple R-squared: 0.6513, Adjusted R-squared: 0.6505
## F-statistic: 891.4 on 3 and 1432 DF, p-value: < 2.2e-16
# The same statistics (plus the mean) computed from the residual vector itself.
summary(residuals(fit))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -19080.00 -1249.00 -38.23 0.00 1231.00 8795.00
## The five-number summary of the residuals is a non-parametric summary of their distribution.
## Also, the residuals appear to be slightly left-skewed: the minimum (-19077.1) lies much further from the median than the maximum (8795.0).
# Exercise 3: Multiple regression model
# c - Intercept
# Print the estimated intercept and slope coefficients of the fitted model.
coefficients(fit)
## (Intercept) KM Weight Tow_Baryes
## -2.677787e+04 -5.288276e-02 3.853090e+01 -6.835050e+02
# A negative coefficient indicates that the predictor and Price are negatively
# related, i.e. they move in opposite directions.
# Thus, KM is negatively related to Price: with an increase in KM there is a
# decrease in Price (holding the other predictors fixed).
# The intercept is the expected mean value of Y, i.e. Price, when all
# predictors are zero (KM = 0, Weight = 0, no tow bar).
# Exercise 3: Multiple regression model
# d - Signs of the slope coefficients
## A positive sign of the slope coefficient indicates that as the value of one variable increases, the value of the other variable increases;
## as one decreases the other decreases.
## A negative slope coefficient indicates that as one variable increases, the other decreases, i.e. an inverse relation.
## Here the slope of Weight is positive, while the slopes of KM and Tow_Baryes are negative.
# Exercise 3: Multiple regression model
# e - Price vs KM
# Scatterplot of Price (y-axis) against KM (x-axis).
plot(Price ~ KM, data = myData_PKWT)

## The two variables are inversely related,
## i.e. the price of the car decreases as its mileage (KM) increases.
## According to the fitted slope (about -0.0529 per KM), the price goes down by
## roughly 5,288 for every additional 100k kilometres, and this makes sense
## because mileage reflects the age of the car.
# Exercise 3: Multiple regression model
# f - Price vs Weight
# Scatterplot of Price (y-axis) against Weight (x-axis).
plot(Price ~ Weight, data = myData_PKWT)

## The price of the car is higher if it weighs more:
## the fitted slope is about 38.53 per unit of Weight, and this makes sense as the parts of a heavier car can be of better quality.
# Exercise 3: Multiple regression model
# g - Price of a Tow_Bar
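## There is not much price difference between automobiles
## with and without a tow bar at the same KM and weight
## (the Tow_Baryes coefficient is about -683.5).
## A lower price for a car with a tow bar is not what one would expect, so this value does not make much sense.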
# Exercise 3: Multiple regression model
# h - The Coefficient of Determination, R^2
# Use the component's full name; `$r.square` only resolved through partial
# matching of list names.
summary(fit)$r.squared
## [1] 0.6512679
library(ggplot2)
# Observed Price against the fitted values with the y = x reference line.
# Built with ggplot() + geom_point() because qplot() is deprecated.
ggplot(myData_PKWT, aes(x = fitted.values(fit), y = Price)) +
  geom_point() +
  geom_abline(intercept = 0, slope = 1)

## This value (R^2 of about 0.65) indicates a good-fitting model.
## The points lie close to the reference line, so the model will be able to predict actual values.
## Thus the value indicates a good-fitting model.
# Exercise 4: Goodness-of-Fit
# a - Get R^2 using fitted values
# Store the model deviance as `rss` so the name of the deviance() function is
# not reused for a variable.
rss <- deviance(fit)
y <- myData_PKWT$Price
# Total sum of squares of Price around its mean.
TotalSS <- sum((y - mean(y))^2)
TotalSS
## [1] 18877241464
1 - rss / TotalSS
## [1] 0.6512679
# Full component name; `$r.square` only resolved through partial matching.
summary(fit)$r.squared
## [1] 0.6512679
# Squared Pearson correlation between fitted values and observed Price.
r <- cor(fitted.values(fit), myData_PKWT$Price)^2
r
## [1] 0.6512679
## [1] 0.6512679
## Thus, the Pearson correlation between Price and the Fitted Values, after squaring
## is equal to the R-square value found in the model summary.
# Exercise 4: Goodness-of-Fit
# b - Fit Plot
# fitted() replaces `fit$fitted.value`, which only resolved through partial
# matching of `$`; ggplot() + geom_point() replaces the deprecated qplot().
ggplot(myData_PKWT, aes(x = fitted(fit), y = Price)) +
  geom_point() +
  geom_abline(intercept = 0, slope = 1, color = "hot pink") +
  xlab("Fitted values") +
  ggtitle("Fit Plot")

## The above plot appears close to the diagonal line.
## So this model is predicting the prices well