library(ggplot2)
#2. Load the Cars93 dataset from this package.
# Read the Cars93 data from the CSV file in the working directory.
# stringsAsFactors = TRUE is set explicitly: since R 4.0 the default is FALSE,
# which would leave the text columns (Manufacturer, Type, AirBags, Origin, ...)
# as character vectors, and summary() would no longer show per-level counts.
cars_data <- read.csv("Cars93 dataset-83422.csv", stringsAsFactors = TRUE)

# Per-column overview: five-number summaries for numeric columns, level counts
# for factor columns, and NA counts where values are missing.
summary(cars_data)
## Manufacturer Model Type Min.Price Price
## Chevrolet: 8 100 : 1 Compact:16 Min. : 6.70 Min. : 7.40
## Ford : 8 190E : 1 Large :11 1st Qu.:10.80 1st Qu.:12.20
## Dodge : 6 240 : 1 Midsize:22 Median :14.70 Median :17.70
## Mazda : 5 300E : 1 Small :21 Mean :17.13 Mean :19.51
## Pontiac : 5 323 : 1 Sporty :14 3rd Qu.:20.30 3rd Qu.:23.30
## Buick : 4 535i : 1 Van : 9 Max. :45.40 Max. :61.90
## (Other) :57 (Other):87
## Max.Price MPG.city MPG.highway AirBags
## Min. : 7.9 Min. :15.00 Min. :20.00 Driver & Passenger:16
## 1st Qu.:14.7 1st Qu.:18.00 1st Qu.:26.00 Driver only :43
## Median :19.6 Median :21.00 Median :28.00 None :34
## Mean :21.9 Mean :22.37 Mean :29.09
## 3rd Qu.:25.3 3rd Qu.:25.00 3rd Qu.:31.00
## Max. :80.0 Max. :46.00 Max. :50.00
##
## DriveTrain Cylinders EngineSize Horsepower RPM
## 4WD :10 3 : 3 Min. :1.000 Min. : 55.0 Min. :3800
## Front:67 4 :49 1st Qu.:1.800 1st Qu.:103.0 1st Qu.:4800
## Rear :16 5 : 2 Median :2.400 Median :140.0 Median :5200
## 6 :31 Mean :2.668 Mean :143.8 Mean :5281
## 8 : 7 3rd Qu.:3.300 3rd Qu.:170.0 3rd Qu.:5750
## rotary: 1 Max. :5.700 Max. :300.0 Max. :6500
##
## Rev.per.mile Man.trans.avail Fuel.tank.capacity Passengers
## Min. :1320 No :32 Min. : 9.20 Min. :2.000
## 1st Qu.:1985 Yes:61 1st Qu.:14.50 1st Qu.:4.000
## Median :2340 Median :16.40 Median :5.000
## Mean :2332 Mean :16.66 Mean :5.086
## 3rd Qu.:2565 3rd Qu.:18.80 3rd Qu.:6.000
## Max. :3755 Max. :27.00 Max. :8.000
##
## Length Wheelbase Width Turn.circle
## Min. :141.0 Min. : 90.0 Min. :60.00 Min. :32.00
## 1st Qu.:174.0 1st Qu.: 98.0 1st Qu.:67.00 1st Qu.:37.00
## Median :183.0 Median :103.0 Median :69.00 Median :39.00
## Mean :183.2 Mean :103.9 Mean :69.38 Mean :38.96
## 3rd Qu.:192.0 3rd Qu.:110.0 3rd Qu.:72.00 3rd Qu.:41.00
## Max. :219.0 Max. :119.0 Max. :78.00 Max. :45.00
##
## Rear.seat.room Luggage.room Weight Origin Make
## Min. :19.00 Min. : 6.00 Min. :1695 non-USA:45 Acura Integra: 1
## 1st Qu.:26.00 1st Qu.:12.00 1st Qu.:2620 USA :48 Acura Legend : 1
## Median :27.50 Median :14.00 Median :3040 Audi 100 : 1
## Mean :27.83 Mean :13.89 Mean :3073 Audi 90 : 1
## 3rd Qu.:30.00 3rd Qu.:15.00 3rd Qu.:3525 BMW 535i : 1
## Max. :36.00 Max. :22.00 Max. :4105 Buick Century: 1
## NA's :2 NA's :11 (Other) :87
#3. Use a graph to visualize the relationship between Price and Horsepower
# Scatter plot of the two variables: Price on the horizontal axis, Horsepower
# on the vertical axis, with both axis titles set explicitly.
ggplot(data = cars_data, mapping = aes(x = Price, y = Horsepower)) +
  geom_point() +
  labs(x = "Price", y = "Horsepower")

#4. What is the correlation between Price and Horsepower?
# Pearson correlation (cor()'s default method) between the two columns.
# `[[` extracts each column by its exact name.
cor(cars_data[["Price"]], cars_data[["Horsepower"]])
## [1] 0.7882176
# As price increases, horsepower increases.
# COR = 0.7882176
#5. Fit a model to predict Price from Horsepower. How does Price change with Horsepower?
# Simple linear regression with Price as the response and Horsepower as the
# only predictor; summary() reports coefficients, residuals and R-squared.
m1 <- lm(formula = Price ~ Horsepower, data = cars_data)
summary(m1)
##
## Call:
## lm(formula = Price ~ Horsepower, data = cars_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -16.413 -2.792 -0.821 1.803 31.753
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.3988 1.8200 -0.769 0.444
## Horsepower 0.1454 0.0119 12.218 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.977 on 91 degrees of freedom
## Multiple R-squared: 0.6213, Adjusted R-squared: 0.6171
## F-statistic: 149.3 on 1 and 91 DF, p-value: < 2.2e-16
# The model predicts Price from Horsepower: for each additional unit of horsepower,
# the predicted price increases by about 0.145 price units (p < 2e-16).
# 6. Add AirBags to the model. Is AirBags associated with Price? What is the relationship?
# Extend the Horsepower model with the categorical AirBags predictor.
# lm() dummy-codes AirBags, so each reported AirBags coefficient is a
# difference in Price relative to the level absorbed into the intercept.
m2 <- lm(formula = Price ~ Horsepower + AirBags, data = cars_data)
summary(m2)
##
## Call:
## lm(formula = Price ~ Horsepower + AirBags, data = cars_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.7645 -2.6767 -0.4829 1.7651 29.4552
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.59057 2.74910 2.034 0.044970 *
## Horsepower 0.12375 0.01281 9.663 1.6e-15 ***
## AirBagsDriver only -3.15178 1.70752 -1.846 0.068243 .
## AirBagsNone -6.62673 1.93085 -3.432 0.000911 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.657 on 89 degrees of freedom
## Multiple R-squared: 0.6681, Adjusted R-squared: 0.657
## F-statistic: 59.73 on 3 and 89 DF, p-value: < 2.2e-16
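# Yes, AirBags is associated with Price. Holding horsepower constant, cars with driver-only
# airbags are priced about 3.15 lower than cars with driver & passenger airbags (the reference
# level; p = 0.068, not significant at the 0.05 level), and cars with no airbags are priced
# about 6.63 lower than the reference level (p < 0.001).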
#7. Add Origin to the model. How is Origin associated with Price?
# Final model: Horsepower plus the two categorical predictors AirBags and
# Origin. The OriginUSA coefficient compares USA cars with non-USA cars.
m3 <- lm(formula = Price ~ Horsepower + AirBags + Origin, data = cars_data)
summary(m3)
##
## Call:
## lm(formula = Price ~ Horsepower + AirBags + Origin, data = cars_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -12.4955 -2.4832 -0.6324 1.9598 27.5824
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.11761 2.69844 2.638 0.009870 **
## Horsepower 0.12535 0.01233 10.162 < 2e-16 ***
## AirBagsDriver only -3.18973 1.64286 -1.942 0.055388 .
## AirBagsNone -6.81385 1.85882 -3.666 0.000421 ***
## OriginUSA -3.23611 1.13352 -2.855 0.005368 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.443 on 88 degrees of freedom
## Multiple R-squared: 0.6963, Adjusted R-squared: 0.6825
## F-statistic: 50.43 on 4 and 88 DF, p-value: < 2.2e-16
# Holding horsepower and airbags constant, USA-origin cars are priced about 3.24 lower
# than non-USA cars (p = 0.005).
#8. Does the final model (7) fit the data better than the model in (4) above? Use a suitable test to check this.
# m1 is nested in m3 (m3 adds AirBags and Origin), so the two fits can be
# compared with an F-test on the reduction in residual sum of squares.
anova(m1, m3, test = "F")
## Analysis of Variance Table
##
## Model 1: Price ~ Horsepower
## Model 2: Price ~ Horsepower + AirBags + Origin
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 91 3250.9
## 2 88 2607.2 3 643.72 7.2425 0.0002128 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# RSS of m1 is 3250.9, of m3 is 2607.2
# The F-test comparing the two nested models gives F = 7.24 on 3 and 88 df, p = 0.0002 (< 0.001)
# ==> m3 fits the data significantly better than m1
#9. Check the residuals vs fitted values plot and the normal Q-Q plot to see if the model violates assumptions of homogeneous variance and normality of residuals. Comment on this.
# Standard lm diagnostic plots, with the default selection spelled out:
#   1 = residuals vs fitted, 2 = normal Q-Q,
#   3 = scale-location,      5 = residuals vs leverage.
# Plots 1 and 2 are the ones needed for the variance and normality checks.
plot(m3, which = c(1, 2, 3, 5))




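# The residuals vs. fitted values plot suggests roughly homogeneous variance, and the residuals seem approximately normally distributed.
# Note: the residual summary for m3 (min -12.50, max 27.58) points to at least one large positive residual,
# so the upper tail of the Q-Q plot should be checked for departures from normality.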
#10. Use a suitable test to compare mean Price by Origin.
# Two-sample comparison of mean Price between the two Origin groups.
# The arguments below are t.test()'s defaults written out: a two-sided
# alternative, unequal variances (Welch test) and a 95% confidence interval.
t.test(
  Price ~ Origin,
  data = cars_data,
  alternative = "two.sided",
  var.equal = FALSE,
  conf.level = 0.95
)
##
## Welch Two Sample t-test
##
## data: Price by Origin
## t = 0.95449, df = 77.667, p-value = 0.3428
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -2.102311 5.974255
## sample estimates:
## mean in group non-USA mean in group USA
## 20.50889 18.57292
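# mean in non-USA group: 20.5; mean in USA group: 18.6
# The difference is not statistically significant (Welch t-test, p = 0.34; the 95% CI for the difference includes 0).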