#Assignment 3
# Read the Toyota price listings (first row holds the column names) and
# list the columns that are available.
ToyotaPrices <- read.csv(
  file = "C:/Users/aksha/Downloads/ToyotaPrices.csv",
  header = TRUE
)
names(ToyotaPrices)
## [1] "Id" "Price" "Age_08_04"
## [4] "Mfg_Month" "Mfg_Year" "KM"
## [7] "HP" "Automatic" "cc"
## [10] "Doors" "Cylinders" "Gears"
## [13] "Quarterly_Tax" "Weight" "Mfr_Guarantee"
## [16] "BOVAG_Guarantee" "Guarantee_Period" "ABS"
## [19] "Airbag_1" "Airbag_2" "Airco"
## [22] "Automatic_airco" "Boardcomputer" "CD_Player"
## [25] "Central_Lock" "Powered_Windows" "Power_Steering"
## [28] "Radio" "Mistlamps" "Sport_Model"
## [31] "Backseat_Divider" "Metallic_Rim" "Radio_cassette"
## [34] "Tow_Bar"
# Keep only the four columns this assignment works with, then preview the
# first rows.
myData_PKWT <- ToyotaPrices[c("Price", "KM", "Weight", "Tow_Bar")]
head(myData_PKWT)
## Price KM Weight Tow_Bar
## 1 13500 46986 1165 0
## 2 13750 72937 1165 0
## 3 13950 41711 1165 0
## 4 14950 48000 1165 0
## 5 13750 38500 1170 0
## 6 12950 61000 1170 0
#Exercise 1(a)
# Per-column summary statistics (min, quartiles, median, mean, max) of the
# selected data; comparing mean and median gives a first hint of skewness.
summary(myData_PKWT)
## Price KM Weight Tow_Bar
## Min. : 4350 Min. : 1 Min. :1000 Min. :0.0000
## 1st Qu.: 8450 1st Qu.: 43000 1st Qu.:1040 1st Qu.:0.0000
## Median : 9900 Median : 63390 Median :1070 Median :0.0000
## Mean :10731 Mean : 68533 Mean :1072 Mean :0.2779
## 3rd Qu.:11950 3rd Qu.: 87021 3rd Qu.:1085 3rd Qu.:1.0000
## Max. :32500 Max. :243000 Max. :1615 Max. :1.0000
#from inspection of the median and mean we see that Price and KM are skewed.
#Exercise 1(b)
# Kernel density estimates show the overall shape of each distribution.
plot(density(myData_PKWT$Price), xlab = "Price",
     main = "Density plot for Price")

plot(density(myData_PKWT$KM), xlab = "KM", main = "Density plot for KM")

# Normal Q-Q plots: qqnorm() draws the sample quantiles on the y-axis and the
# theoretical normal quantiles on the x-axis, so the variable name belongs on
# the y-axis. Each plot gets its own title so the two can be told apart.
qqnorm(myData_PKWT$Price, main = "Normal Q-Q plot for Price", ylab = "Price")
qqline(myData_PKWT$Price)

qqnorm(myData_PKWT$KM, main = "Normal Q-Q plot for KM", ylab = "KM")
qqline(myData_PKWT$KM)

#Price and KM are both right-skewed (for each, the mean exceeds the median).
#Neither KM nor Price is normally distributed.
#Exercise 1(c)
# Turn the numeric tow-bar indicator into a factor and give its two levels
# readable names, then count the cars in each group.
myData_PKWT <- within(myData_PKWT, {
  Tow_Bar <- factor(Tow_Bar)
  levels(Tow_Bar) <- c("no", "Yes")
})
summary(myData_PKWT$Tow_Bar)
## no Yes
## 1037 399
#Exercise 1(d)
# Box plot of Price for each tow-bar group. The formula uses bare column
# names with `data =` so the axes are labelled "Price" / "Tow_Bar" rather
# than with the full `myData_PKWT$...` expressions.
boxplot(Price ~ Tow_Bar, data = myData_PKWT)

#The boxplots are different in the terms compared to Price.
#The tow_bar does not appear to predict Price
#Exercise 1(e)
# Box plot of KM for each tow-bar group. The formula uses bare column names
# with `data =` so the axes are labelled "KM" / "Tow_Bar" rather than with
# the full `myData_PKWT$...` expressions.
boxplot(KM ~ Tow_Bar, data = myData_PKWT)

#The boxplots are different in terms of outliers.
#There is no prediction that can be made between the two.
#Exercise 1(f)
# First figure: Price and KM against tow-bar status, side by side. par()
# returns the previous settings so they can be restored afterwards.
allt <- par(mfrow = c(1, 2))
for (panel in list(Price ~ Tow_Bar, KM ~ Tow_Bar)) {
  plot(panel, data = myData_PKWT)
}

# Restore the previous layout, then repeat the comparison as jittered strip
# charts so the individual observations are visible.
par(allt)
allt <- par(mfrow = c(1, 2))
for (panel in list(Price ~ Tow_Bar, KM ~ Tow_Bar)) {
  stripchart(panel, data = myData_PKWT, method = "jitter", vertical = TRUE,
             xlab = "Tow Bar")
}

par(allt)
#If there is a tow_bar in the car the price of the car is less.
#When the car does not have a tow_bar there are many outliers as the car has been driven for many KMs.
#Exercise 2(a)
# Scatter-plot matrix of the three numeric variables.
pairs(~ Price + KM + Weight, data = myData_PKWT)

# Linear model of Price on Weight and KM; print the fit, then its
# coefficients, then the full summary table.
fit <- lm(Price ~ Weight + KM, data = myData_PKWT)
fit
##
## Call:
## lm(formula = Price ~ Weight + KM, data = myData_PKWT)
##
## Coefficients:
## (Intercept) Weight KM
## -2.737e+04 3.895e+01 -5.355e-02
coef(fit)
## (Intercept) Weight KM
## -27374.759440 38.953211 -0.053553
summary(fit)
##
## Call:
## lm(formula = Price ~ Weight + KM, data = myData_PKWT)
##
## Residuals:
## Min 1Q Median 3Q Max
## -19112.5 -1294.3 -46.6 1248.9 8928.0
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.737e+04 1.174e+03 -23.32 <2e-16 ***
## Weight 3.895e+01 1.086e+00 35.87 <2e-16 ***
## KM -5.355e-02 1.524e-03 -35.13 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2165 on 1433 degrees of freedom
## Multiple R-squared: 0.6442, Adjusted R-squared: 0.6437
## F-statistic: 1297 on 2 and 1433 DF, p-value: < 2.2e-16
#With increase in KM the price of the car decreases
#There is a positive relation between Price and Weight (the Weight coefficient is positive and highly significant)
#Clearly there are several outliers
#KM and Weight appear to be redundant
#Exercise 2(b)
# Scatter-plot matrix of Price, KM and Weight, with the Tow_Bar column passed
# as the colour vector so the two tow-bar groups can be compared.
pairs(
  ~ Price + KM + Weight,
  data = myData_PKWT,
  col = myData_PKWT$Tow_Bar
)

#It appears the relation between Price and KM is the same for cars with
#& without a tow bar
#there is no clear relationship visible that appear to be different for group of cars
#with or without a tow bar
#Exercise 3
#Question 3(a)
# Turn off significance stars in printed coefficient tables, then refit the
# model with Tow_Bar added as a third predictor and print its summary.
options(show.signif.stars = FALSE)
fit <- lm(Price ~ KM + Weight + Tow_Bar, data = myData_PKWT)
summary(fit)
##
## Call:
## lm(formula = Price ~ KM + Weight + Tow_Bar, data = myData_PKWT)
##
## Residuals:
## Min 1Q Median 3Q Max
## -19077.1 -1248.8 -38.2 1230.7 8795.0
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.678e+04 1.168e+03 -22.930 < 2e-16
## KM -5.288e-02 1.515e-03 -34.910 < 2e-16
## Weight 3.853e+01 1.079e+00 35.726 < 2e-16
## Tow_BarYes -6.835e+02 1.271e+02 -5.378 8.8e-08
##
## Residual standard error: 2144 on 1432 degrees of freedom
## Multiple R-squared: 0.6513, Adjusted R-squared: 0.6505
## F-statistic: 891.4 on 3 and 1432 DF, p-value: < 2.2e-16
#Question 3(b)
# Reprint the model summary; its "Residuals" block (min, quartiles, median,
# max) is what the residual-distribution answer below is based on.
summary(fit)
##
## Call:
## lm(formula = Price ~ KM + Weight + Tow_Bar, data = myData_PKWT)
##
## Residuals:
## Min 1Q Median 3Q Max
## -19077.1 -1248.8 -38.2 1230.7 8795.0
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.678e+04 1.168e+03 -22.930 < 2e-16
## KM -5.288e-02 1.515e-03 -34.910 < 2e-16
## Weight 3.853e+01 1.079e+00 35.726 < 2e-16
## Tow_BarYes -6.835e+02 1.271e+02 -5.378 8.8e-08
##
## Residual standard error: 2144 on 1432 degrees of freedom
## Multiple R-squared: 0.6513, Adjusted R-squared: 0.6505
## F-statistic: 891.4 on 3 and 1432 DF, p-value: < 2.2e-16
#The Residuals block is a non-parametric (five-number) summary of the residual distribution.
#The residuals appear to be slightly skewed to the left.
#Question 3(c)
# Extract the estimated intercept and slope coefficients of the fitted model.
coef(fit)
## (Intercept) KM Weight Tow_BarYes
## -2.677787e+04 -5.288276e-02 3.853090e+01 -6.835050e+02
#A negative coefficient indicates that the predictor is negatively correlated with Price, i.e. inversely related to it,
#which indicates that KM is negatively correlated with Price: as KM increases, Price decreases
#question 3(d)
#The signs of slope indicates that if it is negative it is inversely proportional
#and with positive signs it indicates that they are directly proportional.
#question 3(e)
# Price is the response in the model, so it is drawn on the y-axis with KM on
# the x-axis; the formula interface also gives clean "KM" / "Price" labels.
plot(Price ~ KM, data = myData_PKWT)

#As the Km increases the Price decreases.
#The Prices go down.
#question 3(f)
# Price is the response in the model, so it is drawn on the y-axis with
# Weight on the x-axis; the formula interface also gives clean axis labels.
plot(Price ~ Weight, data = myData_PKWT)
#As the weight increases there is no sudden change in the price
#Question 3(g)
#There is not much price difference between automobiles with and without a Tow_Bar at the same KM and Weight, so this value does not make sense.
#Question 3(h)
# Use the component's full name: `$r.square` only works through partial
# matching of `r.squared`.
summary(fit)$r.squared
## [1] 0.6512679
#This value indicates a good fitting model.
#Exercise 4
#Exercise 4(a)
# Reproduce R^2 by hand as 1 - (residual sum of squares) / (total sum of
# squares). The result of deviance(fit) is stored under its own name so the
# deviance() function is not shadowed by a variable.
resid_ss <- deviance(fit)
y <- myData_PKWT$Price
TotalSS <- sum((y - mean(y))^2)
TotalSS
## [1] 18877241464
1 - resid_ss / TotalSS
## [1] 0.6512679
# Full component name; `$r.square` relies on partial matching.
summary(fit)$r.squared
## [1] 0.6512679
#This value is equal to the R^2 value found in the model summary
#Exercise 4(b)
# library() stops with an error if ggplot2 is not installed, whereas
# require() only returns FALSE and lets the script fail later.
library(ggplot2)

# Observed Price against the model's fitted values, with the y = x reference
# line. Built with ggplot() because qplot() is deprecated in current ggplot2.
ggplot(myData_PKWT, aes(x = fitted.values(fit), y = Price)) +
  geom_point() +
  geom_abline(intercept = 0, slope = 1)
