# Show the directory the session starts in.
getwd()
## [1] "E:/My_R_Work"

# Switch to the data folder, then read the CSV file; its first row holds the
# column names.
setwd("E:/DADM/Lecture notes/# data")
ToyotaPrices <- read.csv(file = "ToyotaPrices.csv", header = TRUE)

## Summary statistics (minimum, quartiles, median, mean, maximum) of all the variables:

summary(ToyotaPrices)
##        Id             Price         Age_08_04       Mfg_Month     
##  Min.   :   1.0   Min.   : 4350   Min.   : 1.00   Min.   : 1.000  
##  1st Qu.: 361.8   1st Qu.: 8450   1st Qu.:44.00   1st Qu.: 3.000  
##  Median : 721.5   Median : 9900   Median :61.00   Median : 5.000  
##  Mean   : 721.6   Mean   :10731   Mean   :55.95   Mean   : 5.549  
##  3rd Qu.:1081.2   3rd Qu.:11950   3rd Qu.:70.00   3rd Qu.: 8.000  
##  Max.   :1442.0   Max.   :32500   Max.   :80.00   Max.   :12.000  
##     Mfg_Year          KM               HP          Automatic      
##  Min.   :1998   Min.   :     1   Min.   : 69.0   Min.   :0.00000  
##  1st Qu.:1998   1st Qu.: 43000   1st Qu.: 90.0   1st Qu.:0.00000  
##  Median :1999   Median : 63390   Median :110.0   Median :0.00000  
##  Mean   :2000   Mean   : 68533   Mean   :101.5   Mean   :0.05571  
##  3rd Qu.:2001   3rd Qu.: 87021   3rd Qu.:110.0   3rd Qu.:0.00000  
##  Max.   :2004   Max.   :243000   Max.   :192.0   Max.   :1.00000  
##        cc            Doors         Cylinders     Gears      
##  Min.   : 1300   Min.   :2.000   Min.   :4   Min.   :3.000  
##  1st Qu.: 1400   1st Qu.:3.000   1st Qu.:4   1st Qu.:5.000  
##  Median : 1600   Median :4.000   Median :4   Median :5.000  
##  Mean   : 1577   Mean   :4.033   Mean   :4   Mean   :5.026  
##  3rd Qu.: 1600   3rd Qu.:5.000   3rd Qu.:4   3rd Qu.:5.000  
##  Max.   :16000   Max.   :5.000   Max.   :4   Max.   :6.000  
##  Quarterly_Tax        Weight     Mfr_Guarantee    BOVAG_Guarantee 
##  Min.   : 19.00   Min.   :1000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.: 69.00   1st Qu.:1040   1st Qu.:0.0000   1st Qu.:1.0000  
##  Median : 85.00   Median :1070   Median :0.0000   Median :1.0000  
##  Mean   : 87.12   Mean   :1072   Mean   :0.4095   Mean   :0.8955  
##  3rd Qu.: 85.00   3rd Qu.:1085   3rd Qu.:1.0000   3rd Qu.:1.0000  
##  Max.   :283.00   Max.   :1615   Max.   :1.0000   Max.   :1.0000  
##  Guarantee_Period      ABS            Airbag_1         Airbag_2     
##  Min.   : 3.000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.: 3.000   1st Qu.:1.0000   1st Qu.:1.0000   1st Qu.:0.0000  
##  Median : 3.000   Median :1.0000   Median :1.0000   Median :1.0000  
##  Mean   : 3.815   Mean   :0.8134   Mean   :0.9708   Mean   :0.7228  
##  3rd Qu.: 3.000   3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:1.0000  
##  Max.   :36.000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##      Airco        Automatic_airco   Boardcomputer      CD_Player     
##  Min.   :0.0000   Min.   :0.00000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:0.00000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :1.0000   Median :0.00000   Median :0.0000   Median :0.0000  
##  Mean   :0.5084   Mean   :0.05641   Mean   :0.2946   Mean   :0.2187  
##  3rd Qu.:1.0000   3rd Qu.:0.00000   3rd Qu.:1.0000   3rd Qu.:0.0000  
##  Max.   :1.0000   Max.   :1.00000   Max.   :1.0000   Max.   :1.0000  
##   Central_Lock    Powered_Windows Power_Steering       Radio       
##  Min.   :0.0000   Min.   :0.000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:0.000   1st Qu.:1.0000   1st Qu.:0.0000  
##  Median :1.0000   Median :1.000   Median :1.0000   Median :0.0000  
##  Mean   :0.5801   Mean   :0.562   Mean   :0.9777   Mean   :0.1462  
##  3rd Qu.:1.0000   3rd Qu.:1.000   3rd Qu.:1.0000   3rd Qu.:0.0000  
##  Max.   :1.0000   Max.   :1.000   Max.   :1.0000   Max.   :1.0000  
##    Mistlamps      Sport_Model     Backseat_Divider  Metallic_Rim   
##  Min.   :0.000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.000   1st Qu.:0.0000   1st Qu.:1.0000   1st Qu.:0.0000  
##  Median :0.000   Median :0.0000   Median :1.0000   Median :0.0000  
##  Mean   :0.257   Mean   :0.3001   Mean   :0.7702   Mean   :0.2047  
##  3rd Qu.:1.000   3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:0.0000  
##  Max.   :1.000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##  Radio_cassette      Tow_Bar      
##  Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :0.0000   Median :0.0000  
##  Mean   :0.1455   Mean   :0.2779  
##  3rd Qu.:0.0000   3rd Qu.:1.0000  
##  Max.   :1.0000   Max.   :1.0000
# Keep only the four columns used in the exercises; drop = FALSE guarantees a
# data frame is returned, as subset() does.
myData_PKWT <- ToyotaPrices[, c("Price", "KM", "Weight", "Tow_Bar"), drop = FALSE]
head(myData_PKWT, n = 6L)
##   Price    KM Weight Tow_Bar
## 1 13500 46986   1165       0
## 2 13750 72937   1165       0
## 3 13950 41711   1165       0
## 4 14950 48000   1165       0
## 5 13750 38500   1170       0
## 6 12950 61000   1170       0
# Same data without the Tow_Bar column.
myData_PKW <- myData_PKWT[, names(myData_PKWT) != "Tow_Bar", drop = FALSE]
head(myData_PKW, n = 6L)
##   Price    KM Weight
## 1 13500 46986   1165
## 2 13750 72937   1165
## 3 13950 41711   1165
## 4 14950 48000   1165
## 5 13750 38500   1170
## 6 12950 61000   1170
# Exercise 1: Exploratory Data Analysis
# a - Data Summary

# Summary statistics of the four selected variables (Price, KM, Weight, Tow_Bar).
summary(myData_PKWT)
##      Price             KM             Weight        Tow_Bar      
##  Min.   : 4350   Min.   :     1   Min.   :1000   Min.   :0.0000  
##  1st Qu.: 8450   1st Qu.: 43000   1st Qu.:1040   1st Qu.:0.0000  
##  Median : 9900   Median : 63390   Median :1070   Median :0.0000  
##  Mean   :10731   Mean   : 68533   Mean   :1072   Mean   :0.2779  
##  3rd Qu.:11950   3rd Qu.: 87021   3rd Qu.:1085   3rd Qu.:1.0000  
##  Max.   :32500   Max.   :243000   Max.   :1615   Max.   :1.0000
## Yes, all of the variables exhibit skewness

## Negative skewness indicates that the mean of the data values 
## is less than the median, and the data distribution is left-skewed. 
## Positive skewness indicates that the mean of the data values 
## is larger than the median, and the data distribution is right-skewed.

## We apply the function skewness from the e1071 package to compute the 
## skewness coefficient of all the variables.

# Load e1071, which provides skewness(). library() stops with an error when the
# package is not installed, whereas require() would only return FALSE and let
# the script fail later at the first skewness() call.
library(e1071)

skewness(myData_PKWT$Price)
## [1] 1.700327
## The skewness of Price is 1.700327.
## It indicates that its distribution is skewed towards the right.

skewness(myData_PKWT$KM)
## [1] 1.013791
## right-skewed.

skewness(myData_PKWT$Weight)
## [1] 3.102148
## right-skewed.

skewness(myData_PKWT$Tow_Bar)
## [1] 0.9908115
## right-skewed.


# Exercise 1: Exploratory Data Analysis
# b - Density-Plot and Normal QQ-Plot

# library() fails immediately when ggplot2 is not installed; require() would
# only return FALSE.
library(ggplot2)

# The plots are built with ggplot() and explicit geoms because qplot() is
# deprecated since ggplot2 3.4.0.

# Kernel Density Plot of Price
plot1 <- ggplot(myData_PKWT, aes(x = Price)) +
  geom_density() +
  ggtitle("Kernel Density Plot of Price")
plot1

# Normal QQ-Plot of Price
plot2 <- ggplot(myData_PKWT, aes(sample = Price)) +
  geom_qq() +
  ggtitle("QQ-Plot (Price)")
plot2

## From the patterns in these graphs, we see that the variable Price is right-skewed.

# Kernel Density Plot of KM
plot3 <- ggplot(myData_PKWT, aes(x = KM)) +
  geom_density() +
  ggtitle("Kernel Density Plot of KM")
plot3

# Normal QQ-Plot of KM
plot4 <- ggplot(myData_PKWT, aes(sample = KM)) +
  geom_qq() +
  ggtitle("QQ-Plot (KM)")
plot4

## From the patterns in these graphs, we see that the variable KM is right-skewed rather than normally distributed (its skewness coefficient is 1.013791).


# Exercise 1: Exploratory Data Analysis
# c - Recode a categorical variable as a factor

# Turn the 0/1 indicator into a factor, then relabel its two levels
# (levels are in sorted order, so 0 becomes "no" and 1 becomes "yes").
myData_PKWT[["Tow_Bar"]] <- factor(myData_PKWT[["Tow_Bar"]])
levels(myData_PKWT[["Tow_Bar"]]) <- c("no", "yes")
summary(myData_PKWT[["Tow_Bar"]])
##   no  yes 
## 1037  399
# Exercise 1: Exploratory Data Analysis
# d - Boxplot of Price vs Tow_Bar

# Boxplots of Price for each level of Tow_Bar.
# Built with ggplot() because qplot() is deprecated since ggplot2 3.4.0.
plot5 <- ggplot(myData_PKWT, aes(x = Tow_Bar, y = Price)) +
  geom_boxplot()
plot5

## From the two boxplots, we see that the two groups are different.
## Tow_Bar doesn't seem to predict Price; rather, they seem to be independent attributes.


# Exercise 1: Exploratory Data Analysis
# e - KM vs Tow_Bar

# Boxplots of KM for each level of Tow_Bar.
# Built with ggplot() because qplot() is deprecated since ggplot2 3.4.0.
plot6 <- ggplot(myData_PKWT, aes(x = Tow_Bar, y = KM)) +
  geom_boxplot()
plot6

## These boxplots differ mainly in terms of outliers.
## Tow_Bar does not appear to predict KM.


# Exercise 1: Exploratory Data Analysis
# f - Explain the pattern

# Price and KM against Tow_Bar, drawn side by side with base graphics.
# par() returns the previous settings, which are restored after each pair.
old_par <- par(mfrow = c(1, 2))
plot(Price ~ Tow_Bar, data = myData_PKWT)
plot(KM ~ Tow_Bar, data = myData_PKWT)
par(old_par)

# The same two comparisons as jittered, vertical strip charts.
old_par <- par(mfrow = c(1, 2))
stripchart(
  Price ~ Tow_Bar,
  data = myData_PKWT, method = "jitter", vertical = TRUE, xlab = "Tow Bar"
)
stripchart(
  KM ~ Tow_Bar,
  data = myData_PKWT, method = "jitter", vertical = TRUE, xlab = "Tow Bar"
)
par(old_par)

## If there is a tow bar on the car, the price of the car tends to be lower.
 
## When the car does not have a tow bar, many outliers appear as the KM values increase.



# Exercise 2: Scatterplot Matrix
# a - Basic scatterplot matrix

# Scatterplot matrix of the three numeric variables.
pairs(~ Price + KM + Weight, data = myData_PKWT)

# Linear model of Price on Weight and KM; print the fit, then its
# coefficients, then the full summary.
fit <- lm(Price ~ Weight + KM, data = myData_PKWT)
fit
## 
## Call:
## lm(formula = Price ~ Weight + KM, data = myData_PKWT)
## 
## Coefficients:
## (Intercept)       Weight           KM  
##  -2.737e+04    3.895e+01   -5.355e-02
coef(fit)
##   (Intercept)        Weight            KM 
## -27374.759440     38.953211     -0.053553
summary(fit)
## 
## Call:
## lm(formula = Price ~ Weight + KM, data = myData_PKWT)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -19112.5  -1294.3    -46.6   1248.9   8928.0 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -2.737e+04  1.174e+03  -23.32   <2e-16 ***
## Weight       3.895e+01  1.086e+00   35.87   <2e-16 ***
## KM          -5.355e-02  1.524e-03  -35.13   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2165 on 1433 degrees of freedom
## Multiple R-squared:  0.6442, Adjusted R-squared:  0.6437 
## F-statistic:  1297 on 2 and 1433 DF,  p-value: < 2.2e-16
## We can see that the Price decreases as the value for KM tends to increase.
## Clearly, there are many outliers.
## KM and Weight appear to be redundant.


# Exercise 2: Scatterplot Matrix
# b - Scatterplot matrix with factor information

# Same scatterplot matrix, with the points coloured by Tow_Bar.
# NOTE(review): presumably the factor's integer codes select the colours
# (1 = "no", 2 = "yes") — confirm against the rendered plot.
pairs(~ Price + KM + Weight, data = myData_PKWT, col = myData_PKWT$Tow_Bar)

## We can say that relation between Price and KM is the same for cars with 
## and without a tow bar

## There are no clear relationships visible that appear to be different for the groups of cars
## with or without a tow bar.


# Exercise 3: Multiple regression model
# a - Fit the model

## Price = \beta_0 + \beta_1 KM + \beta_2 Weight + \beta_3 TowBar + \varepsilon

# Regress Price on KM, Weight and the Tow_Bar factor.
fit <- lm(Price ~ KM + Weight + Tow_Bar, data = myData_PKWT)

# Do not print significance stars in coefficient tables from here on.
options(show.signif.stars = FALSE)

summary(fit)
## 
## Call:
## lm(formula = Price ~ KM + Weight + Tow_Bar, data = myData_PKWT)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -19077.1  -1248.8    -38.2   1230.7   8795.0 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.678e+04  1.168e+03 -22.930  < 2e-16
## KM          -5.288e-02  1.515e-03 -34.910  < 2e-16
## Weight       3.853e+01  1.079e+00  35.726  < 2e-16
## Tow_Baryes  -6.835e+02  1.271e+02  -5.378  8.8e-08
## 
## Residual standard error: 2144 on 1432 degrees of freedom
## Multiple R-squared:  0.6513, Adjusted R-squared:  0.6505 
## F-statistic: 891.4 on 3 and 1432 DF,  p-value: < 2.2e-16
# Exercise 3: Multiple regression model
# b - Residual five-number summary

# The "Residuals" section of this summary is the five-number summary of the residuals.
summary(fit)
## 
## Call:
## lm(formula = Price ~ KM + Weight + Tow_Bar, data = myData_PKWT)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -19077.1  -1248.8    -38.2   1230.7   8795.0 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.678e+04  1.168e+03 -22.930  < 2e-16
## KM          -5.288e-02  1.515e-03 -34.910  < 2e-16
## Weight       3.853e+01  1.079e+00  35.726  < 2e-16
## Tow_Baryes  -6.835e+02  1.271e+02  -5.378  8.8e-08
## 
## Residual standard error: 2144 on 1432 degrees of freedom
## Multiple R-squared:  0.6513, Adjusted R-squared:  0.6505 
## F-statistic: 891.4 on 3 and 1432 DF,  p-value: < 2.2e-16
# Summary statistics of the model residuals, obtained through the accessor.
summary(residuals(fit))
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## -19080.00  -1249.00    -38.23      0.00   1231.00   8795.00
## The five-number summary is a non-parametric summary of the residual distribution.
## Also, the residuals appear to be slightly skewed to the left.

# Exercise 3: Multiple regression model
# c - Intercept

# Extract the estimated coefficients (intercept and slopes) of the fitted model.
coef(fit)
##   (Intercept)            KM        Weight    Tow_Baryes 
## -2.677787e+04 -5.288276e-02  3.853090e+01 -6.835050e+02
# A negative coefficient indicates that the predictor and Price are negatively
# related, i.e. they move in opposite directions.
# Thus, the negative KM coefficient indicates that
# with an increase in KM there is a decrease in Price.
# The intercept is the expected mean value of Y, i.e. Price, when KM = 0,
# Weight = 0 and the car has no tow bar.

# Exercise 3: Multiple regression model
# d - Signs of the slope coefficients

## A positive sign of the correlation coefficient indicates that as the value of one variable increases, the value of the other variable increases; 
## as one decreases the other decreases.
## A negative correlation coefficient indicates that as one variable increases, the other decreases, i.e. an inverse relation.



# Exercise 3: Multiple regression model
# e - Price vs KM

# Scatterplot of Price against KM.
plot(Price ~ KM, data = myData_PKWT)

## The two variables are negatively related,
## i.e. the price of the car decreases if it has more mileage (KM).
## By the fitted KM coefficient (about -0.0529 per km), the expected price goes down
## by about 5,288 for every additional 100,000 km, holding Weight and Tow_Bar fixed.
## This makes sense because mileage reflects the age and wear of the car.



# Exercise 3: Multiple regression model
# f - Price vs Weight

# Scatterplot of Price against Weight.
plot(Price ~ Weight, data = myData_PKWT)

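## The price of the car is higher if it weighs more:
## by the fitted Weight coefficient, the expected price goes up by about 38.5 per
## additional unit of Weight (holding KM and Tow_Bar fixed), which makes sense as
## the car parts can be of a better quality.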



# Exercise 3: Multiple regression model
# g - Price of a Tow_Bar

## There is not much price difference between automobiles 
## with and without a tow bar that have the same KM and weight. 
## So this value does not make much sense.



# Exercise 3: Multiple regression model
# h - The Coefficient of Determination, R^2

# R^2 of the fitted model. The summary component is named "r.squared";
# "$r.square" only resolved through partial matching of list names.
summary(fit)$r.squared
## [1] 0.6512679
library(ggplot2)
# Observed Price against the fitted values, with the line y = x for reference.
# ggplot() replaces qplot(), which is deprecated since ggplot2 3.4.0.
ggplot(myData_PKWT, aes(x = fitted.values(fit), y = Price)) +
  geom_point() +
  geom_abline(intercept = 0, slope = 1)

## This value indicates a good fitting model.
## The points lie close to the reference line, so the model will be able to predict actual values.
## Thus the value indicates a good fitting model.



# Exercise 4: Goodness-of-Fit
# a - Get R^2 using fitted values

# Deviance of the fitted model. It is stored under its own name so that the
# deviance() function is not masked by a variable called "deviance".
resid_ss <- deviance(fit)

y <- myData_PKWT$Price

# Total sum of squares of Price around its mean.
TotalSS <- sum((y - mean(y))^2)

TotalSS
## [1] 18877241464
# R^2 computed by hand as 1 - deviance / total sum of squares.
1 - resid_ss / TotalSS
## [1] 0.6512679
# The value reported by the model summary; "r.squared" is spelled out in full
# instead of relying on partial matching of "$r.square".
summary(fit)$r.squared
## [1] 0.6512679
# Squared Pearson correlation between the fitted values and the observed Price.
r <- cor(fitted.values(fit), myData_PKWT$Price)^2

r
## [1] 0.6512679
## Thus, the Pearson correlation between Price and the Fitted Values, after squaring
## is equal to the R-square value found in the model summary.



# Exercise 4: Goodness-of-Fit
# b - Fit Plot

# Fit plot: observed Price against the fitted values, with the line y = x.
# fitted(fit) replaces "fit$fitted.value", which only resolved to the
# "fitted.values" component through partial matching; ggplot() replaces
# qplot(), which is deprecated since ggplot2 3.4.0.
ggplot(myData_PKWT, aes(x = fitted(fit), y = Price)) +
  geom_point() +
  geom_abline(intercept = 0, slope = 1, color = "hot pink") +
  ggtitle("Fit Plot")

## The above plot appears close to the diagonal line. 
## So this model is predicting the prices well