#Assignment 3

# Load the Toyota price data set.
# NOTE(review): the path below is absolute and machine-specific; the script
# only runs as-is on a machine where this file exists at this location.
ToyotaPrices <- read.csv(
  file = "C:/Users/aksha/Downloads/ToyotaPrices.csv"
)

# List the available columns.
colnames(ToyotaPrices)
##  [1] "Id"               "Price"            "Age_08_04"       
##  [4] "Mfg_Month"        "Mfg_Year"         "KM"              
##  [7] "HP"               "Automatic"        "cc"              
## [10] "Doors"            "Cylinders"        "Gears"           
## [13] "Quarterly_Tax"    "Weight"           "Mfr_Guarantee"   
## [16] "BOVAG_Guarantee"  "Guarantee_Period" "ABS"             
## [19] "Airbag_1"         "Airbag_2"         "Airco"           
## [22] "Automatic_airco"  "Boardcomputer"    "CD_Player"       
## [25] "Central_Lock"     "Powered_Windows"  "Power_Steering"  
## [28] "Radio"            "Mistlamps"        "Sport_Model"     
## [31] "Backseat_Divider" "Metallic_Rim"     "Radio_cassette"  
## [34] "Tow_Bar"
# Keep only the four columns used in this assignment.
myData_PKWT <- ToyotaPrices[
  ,
  c("Price", "KM", "Weight", "Tow_Bar"),
  drop = FALSE
]

# Preview the first six rows.
head(myData_PKWT, n = 6L)
##   Price    KM Weight Tow_Bar
## 1 13500 46986   1165       0
## 2 13750 72937   1165       0
## 3 13950 41711   1165       0
## 4 14950 48000   1165       0
## 5 13750 38500   1170       0
## 6 12950 61000   1170       0
#Exercise 1(a)

# Per-column summary (min, quartiles, median, mean, max) of the selected data.
summary(myData_PKWT)
##      Price             KM             Weight        Tow_Bar      
##  Min.   : 4350   Min.   :     1   Min.   :1000   Min.   :0.0000  
##  1st Qu.: 8450   1st Qu.: 43000   1st Qu.:1040   1st Qu.:0.0000  
##  Median : 9900   Median : 63390   Median :1070   Median :0.0000  
##  Mean   :10731   Mean   : 68533   Mean   :1072   Mean   :0.2779  
##  3rd Qu.:11950   3rd Qu.: 87021   3rd Qu.:1085   3rd Qu.:1.0000  
##  Max.   :32500   Max.   :243000   Max.   :1615   Max.   :1.0000
#From the summary, the mean exceeds the median for both Price and KM, so both are right-skewed.

#Exercise 1(b)
# Kernel density estimates to inspect the shape of each distribution.
plot(
  density(myData_PKWT$Price),
  xlab = "Price",
  main = "Density plot for Price"
)

plot(
  density(myData_PKWT$KM),
  xlab = "KM",
  main = "Density plot for KM"
)

# Normal Q-Q plots. qqnorm() draws the theoretical normal quantiles on the
# x-axis and the sample on the y-axis, so the variable name belongs on `ylab`
# (labelling the x-axis "Price" mislabels the theoretical-quantile axis).
qqnorm(myData_PKWT$Price, ylab = "Price", main = "Normal Q-Q plot for Price")
qqline(myData_PKWT$Price)

qqnorm(myData_PKWT$KM, ylab = "KM", main = "Normal Q-Q plot for KM")
qqline(myData_PKWT$KM)

#Both Price and KM are right-skewed.
#KM is closer to a normal distribution than Price, but neither is exactly normal.


#Exercise 1(c)
# Recode Tow_Bar as a two-level factor. The levels produced by factor() are
# relabelled in order, so the first level becomes "no" and the second "Yes".
myData_PKWT[["Tow_Bar"]] <- local({
  tow_bar <- factor(myData_PKWT[["Tow_Bar"]])
  levels(tow_bar) <- c("no", "Yes")
  tow_bar
})

# Count of cars in each group.
summary(myData_PKWT[["Tow_Bar"]])
##   no  Yes 
## 1037  399
#Exercise 1(d)

# Boxplots of Price for cars with and without a tow bar. Using the formula
# interface with `data =` (as the later plots in this script do) avoids axis
# labels that show the raw `myData_PKWT$...` expressions.
boxplot(Price ~ Tow_Bar, data = myData_PKWT, xlab = "Tow bar", ylab = "Price")

#The boxplots for the two Tow_Bar groups differ in their Price distributions.
#The tow_bar does not appear to predict Price

#Exercise 1(e)

# Boxplots of KM for cars with and without a tow bar. Using the formula
# interface with `data =` (as the later plots in this script do) avoids axis
# labels that show the raw `myData_PKWT$...` expressions.
boxplot(KM ~ Tow_Bar, data = myData_PKWT, xlab = "Tow bar", ylab = "KM")

#The boxplots are different in terms of outliers.
#There is no prediction that can be made between the two.



#Exercise 1(f)

# First page: Price and KM against Tow_Bar, side by side.
# par() returns the previous settings, which are restored afterwards.
allt <- par(mfrow = c(1, 2))
plot(Price ~ Tow_Bar, data = myData_PKWT)
plot(KM ~ Tow_Bar, data = myData_PKWT)
par(allt)

# Second page: jittered strip charts of the same two comparisons.
allt <- par(mfrow = c(1, 2))
stripchart(
  Price ~ Tow_Bar,
  data = myData_PKWT,
  vertical = TRUE,
  method = "jitter",
  xlab = "Tow Bar"
)
stripchart(
  KM ~ Tow_Bar,
  data = myData_PKWT,
  vertical = TRUE,
  method = "jitter",
  xlab = "Tow Bar"
)
par(allt)
#If there is a tow_bar in the car the price of the car is less.
#When the car does not have a tow_bar there are many outliers as the car has been driven for many KMs.

#Exercise 2(a)

# Scatterplot matrix of the three numeric variables.
pairs(
  ~ Price + KM + Weight,
  data = myData_PKWT
)

# Linear model of Price on Weight and KM; the bare name prints the fit.
fit <- lm(
  Price ~ Weight + KM,
  data = myData_PKWT
)
fit
## 
## Call:
## lm(formula = Price ~ Weight + KM, data = myData_PKWT)
## 
## Coefficients:
## (Intercept)       Weight           KM  
##  -2.737e+04    3.895e+01   -5.355e-02
# Coefficient estimates of the Price ~ Weight + KM model.
coef(fit)
##   (Intercept)        Weight            KM 
## -27374.759440     38.953211     -0.053553
# Full model summary: residual quantiles, coefficient table, R-squared, F-test.
summary(fit)
## 
## Call:
## lm(formula = Price ~ Weight + KM, data = myData_PKWT)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -19112.5  -1294.3    -46.6   1248.9   8928.0 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -2.737e+04  1.174e+03  -23.32   <2e-16 ***
## Weight       3.895e+01  1.086e+00   35.87   <2e-16 ***
## KM          -5.355e-02  1.524e-03  -35.13   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2165 on 1433 degrees of freedom
## Multiple R-squared:  0.6442, Adjusted R-squared:  0.6437 
## F-statistic:  1297 on 2 and 1433 DF,  p-value: < 2.2e-16
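#As KM increases, the price of the car decreases (negative KM coefficient).
#Price increases with Weight (positive, highly significant Weight coefficient).
#Clearly there are several outliers.
#KM and Weight are both highly significant in the joint model, so neither is redundant.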

#Exercise 2(b)
# Same scatterplot matrix, with points coloured by tow-bar group; the integer
# codes of Tow_Bar select the colours from the current palette.
pairs(
  ~ Price + KM + Weight,
  data = myData_PKWT,
  col = as.integer(myData_PKWT$Tow_Bar)
)

#It appears the relation between Price and KM is the same for cars with 
#& without a tow bar

#there is no clear relationship visible that appear to be different for group of cars
#with or without a tow bar

#Exercise 3
#Question 3(a)
# Turn off significance stars in printed coefficient tables
# (this is a session-wide option and is not restored here).
options(show.signif.stars = FALSE)

# Refit the price model with Tow_Bar added; this overwrites `fit`.
fit <- lm(
  Price ~ KM + Weight + Tow_Bar,
  data = myData_PKWT
)
summary(fit)
## 
## Call:
## lm(formula = Price ~ KM + Weight + Tow_Bar, data = myData_PKWT)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -19077.1  -1248.8    -38.2   1230.7   8795.0 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.678e+04  1.168e+03 -22.930  < 2e-16
## KM          -5.288e-02  1.515e-03 -34.910  < 2e-16
## Weight       3.853e+01  1.079e+00  35.726  < 2e-16
## Tow_BarYes  -6.835e+02  1.271e+02  -5.378  8.8e-08
## 
## Residual standard error: 2144 on 1432 degrees of freedom
## Multiple R-squared:  0.6513, Adjusted R-squared:  0.6505 
## F-statistic: 891.4 on 3 and 1432 DF,  p-value: < 2.2e-16
#Question 3(b)
# Print the model summary again; its "Residuals" block is what 3(b) examines.
summary(fit)
## 
## Call:
## lm(formula = Price ~ KM + Weight + Tow_Bar, data = myData_PKWT)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -19077.1  -1248.8    -38.2   1230.7   8795.0 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.678e+04  1.168e+03 -22.930  < 2e-16
## KM          -5.288e-02  1.515e-03 -34.910  < 2e-16
## Weight       3.853e+01  1.079e+00  35.726  < 2e-16
## Tow_BarYes  -6.835e+02  1.271e+02  -5.378  8.8e-08
## 
## Residual standard error: 2144 on 1432 degrees of freedom
## Multiple R-squared:  0.6513, Adjusted R-squared:  0.6505 
## F-statistic: 891.4 on 3 and 1432 DF,  p-value: < 2.2e-16
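#The "Residuals" block of the summary is a five-number (non-parametric) summary of the residual distribution.
#The residuals appear slightly left-skewed: the median is close to 0, but the minimum (-19077.1) lies much further from 0 than the maximum (8795.0).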

#Question 3(c)
# Estimated intercept and slopes of the Price ~ KM + Weight + Tow_Bar model.
coef(fit)
##   (Intercept)            KM        Weight    Tow_BarYes 
## -2.677787e+04 -5.288276e-02  3.853090e+01 -6.835050e+02
#A negative coefficient indicates that the predictor is negatively associated with Price,
#which indicates that, as KM increases, Price decreases (holding the other predictors fixed).

#question 3(d)
#The sign of a slope gives the direction of the relationship: a negative sign means Price decreases as the predictor increases,
#and a positive sign means Price increases as the predictor increases.

#question 3(e)
# Price is the response, so it goes on the y-axis with KM on the x-axis;
# the formula interface also gives clean "KM" / "Price" axis labels.
plot(Price ~ KM, data = myData_PKWT)

#As the Km increases the Price decreases.
#The Prices go down.

#question 3(f)
# Price is the response, so it goes on the y-axis with Weight on the x-axis;
# the formula interface also gives clean "Weight" / "Price" axis labels.
plot(Price ~ Weight, data = myData_PKWT)
#As the weight increases there is no sudden change in the price

#Question 3(g)
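#For the same KM and Weight, a car with a tow bar is estimated to cost about 683.5 less than one without (Tow_BarYes coefficient).
#This is not a large price difference, and the negative sign is counterintuitive, so this value does not make much sense.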


#Question 3(h)
# Extract R-squared by its full component name; `$r.square` only resolved
# through partial matching of `$`.
summary(fit)$r.squared
## [1] 0.6512679
#An R^2 of about 0.65 means the model explains roughly 65% of the variation in Price, which indicates a reasonably good fit.


#Exercise 4
#Exercise 4(a)
# Reproduce R-squared by hand as 1 - RSS / TSS.
# The residual sum of squares from deviance(fit) is stored under its own name
# rather than `deviance`, which would shadow the function it came from.
rss <- deviance(fit)
y <- myData_PKWT$Price
# Total sum of squares of Price around its mean.
TotalSS <- sum((y - mean(y))^2)
TotalSS
## [1] 18877241464
1 - rss / TotalSS
## [1] 0.6512679
# Full component name instead of the partially matched `$r.square`.
summary(fit)$r.squared
## [1] 0.6512679
#This value is equal to the R^2 value found in the model summary


#Exercise 4(b)
# library() fails loudly if ggplot2 is missing, whereas require() only
# returns FALSE and lets the script continue to a less clear error.
library(ggplot2)

# Observed Price against the model's fitted values, with the line y = x as a
# reference. Built with ggplot() because qplot() is deprecated in ggplot2.
ggplot(myData_PKWT, aes(x = fitted.values(fit), y = Price)) +
  geom_point() +
  geom_abline(intercept = 0, slope = 1)