library(ggplot2)

#2. Load the Cars93 dataset from this package.
# Read the Cars93 data from the local CSV export.
# stringsAsFactors = TRUE is set explicitly: since R 4.0.0 read.csv() keeps
# text columns as character by default, in which case summary() would no
# longer report the per-level counts (Manufacturer, Type, AirBags, Origin, ...)
# shown in the output below.
cars_data <- read.csv(
  "Cars93 dataset-83422.csv",
  stringsAsFactors = TRUE
)
# Per-column overview: five-number summary and mean for numeric columns,
# level counts for factor columns, plus NA counts where present.
summary(cars_data)
##     Manufacturer     Model         Type      Min.Price         Price      
##  Chevrolet: 8    100    : 1   Compact:16   Min.   : 6.70   Min.   : 7.40  
##  Ford     : 8    190E   : 1   Large  :11   1st Qu.:10.80   1st Qu.:12.20  
##  Dodge    : 6    240    : 1   Midsize:22   Median :14.70   Median :17.70  
##  Mazda    : 5    300E   : 1   Small  :21   Mean   :17.13   Mean   :19.51  
##  Pontiac  : 5    323    : 1   Sporty :14   3rd Qu.:20.30   3rd Qu.:23.30  
##  Buick    : 4    535i   : 1   Van    : 9   Max.   :45.40   Max.   :61.90  
##  (Other)  :57    (Other):87                                               
##    Max.Price       MPG.city      MPG.highway                  AirBags  
##  Min.   : 7.9   Min.   :15.00   Min.   :20.00   Driver & Passenger:16  
##  1st Qu.:14.7   1st Qu.:18.00   1st Qu.:26.00   Driver only       :43  
##  Median :19.6   Median :21.00   Median :28.00   None              :34  
##  Mean   :21.9   Mean   :22.37   Mean   :29.09                          
##  3rd Qu.:25.3   3rd Qu.:25.00   3rd Qu.:31.00                          
##  Max.   :80.0   Max.   :46.00   Max.   :50.00                          
##                                                                        
##  DriveTrain  Cylinders    EngineSize      Horsepower         RPM      
##  4WD  :10   3     : 3   Min.   :1.000   Min.   : 55.0   Min.   :3800  
##  Front:67   4     :49   1st Qu.:1.800   1st Qu.:103.0   1st Qu.:4800  
##  Rear :16   5     : 2   Median :2.400   Median :140.0   Median :5200  
##             6     :31   Mean   :2.668   Mean   :143.8   Mean   :5281  
##             8     : 7   3rd Qu.:3.300   3rd Qu.:170.0   3rd Qu.:5750  
##             rotary: 1   Max.   :5.700   Max.   :300.0   Max.   :6500  
##                                                                       
##   Rev.per.mile  Man.trans.avail Fuel.tank.capacity   Passengers   
##  Min.   :1320   No :32          Min.   : 9.20      Min.   :2.000  
##  1st Qu.:1985   Yes:61          1st Qu.:14.50      1st Qu.:4.000  
##  Median :2340                   Median :16.40      Median :5.000  
##  Mean   :2332                   Mean   :16.66      Mean   :5.086  
##  3rd Qu.:2565                   3rd Qu.:18.80      3rd Qu.:6.000  
##  Max.   :3755                   Max.   :27.00      Max.   :8.000  
##                                                                   
##      Length        Wheelbase         Width        Turn.circle   
##  Min.   :141.0   Min.   : 90.0   Min.   :60.00   Min.   :32.00  
##  1st Qu.:174.0   1st Qu.: 98.0   1st Qu.:67.00   1st Qu.:37.00  
##  Median :183.0   Median :103.0   Median :69.00   Median :39.00  
##  Mean   :183.2   Mean   :103.9   Mean   :69.38   Mean   :38.96  
##  3rd Qu.:192.0   3rd Qu.:110.0   3rd Qu.:72.00   3rd Qu.:41.00  
##  Max.   :219.0   Max.   :119.0   Max.   :78.00   Max.   :45.00  
##                                                                 
##  Rear.seat.room   Luggage.room       Weight         Origin              Make   
##  Min.   :19.00   Min.   : 6.00   Min.   :1695   non-USA:45   Acura Integra: 1  
##  1st Qu.:26.00   1st Qu.:12.00   1st Qu.:2620   USA    :48   Acura Legend : 1  
##  Median :27.50   Median :14.00   Median :3040                Audi 100     : 1  
##  Mean   :27.83   Mean   :13.89   Mean   :3073                Audi 90      : 1  
##  3rd Qu.:30.00   3rd Qu.:15.00   3rd Qu.:3525                BMW 535i     : 1  
##  Max.   :36.00   Max.   :22.00   Max.   :4105                Buick Century: 1  
##  NA's   :2       NA's   :11                                  (Other)      :87
#3. Use a graph to visualize the relationship between Price and Horsepower

# Scatter plot with Price on the x-axis and Horsepower on the y-axis,
# one point per row of cars_data.
ggplot(data = cars_data, mapping = aes(x = Price, y = Horsepower)) +
  geom_point() +
  labs(x = "Price", y = "Horsepower")

#4. What is the correlation between Price and Horsepower?
# Pearson correlation coefficient (the default method of cor()) between
# Price and Horsepower.
with(cars_data, cor(Price, Horsepower))
## [1] 0.7882176
# Price and Horsepower have a strong positive linear correlation
# (Pearson r = 0.788): cars with more horsepower tend to have higher prices.



#5. Fit a model to predict Price from Horsepower. How does Price change with Horsepower?
# Simple linear regression with Price as the response and Horsepower as the
# only predictor; summary() prints coefficients, residual spread and R-squared.
m1 <- lm(
  formula = Price ~ Horsepower,
  data = cars_data
)
summary(m1)
## 
## Call:
## lm(formula = Price ~ Horsepower, data = cars_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -16.413  -2.792  -0.821   1.803  31.753 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -1.3988     1.8200  -0.769    0.444    
## Horsepower    0.1454     0.0119  12.218   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.977 on 91 degrees of freedom
## Multiple R-squared:  0.6213, Adjusted R-squared:  0.6171 
## F-statistic: 149.3 on 1 and 91 DF,  p-value: < 2.2e-16
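# The model predicts Price from Horsepower, so the slope is the change in Price
# per unit of Horsepower: each additional horsepower is associated with an
# average increase of about 0.145 in Price (p < 2e-16). Horsepower alone
# explains about 62% of the variance in Price (R-squared = 0.62).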



# 6.    Add AirBags to the model. Is AirBags associated with Price? What is the relationship?
# Extend the Horsepower model with the categorical AirBags predictor; lm()
# dummy-codes it against its first level, so the two AirBags coefficients are
# differences from that reference level.
m2 <- lm(
  formula = Price ~ Horsepower + AirBags,
  data = cars_data
)
summary(m2)
## 
## Call:
## lm(formula = Price ~ Horsepower + AirBags, data = cars_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -13.7645  -2.6767  -0.4829   1.7651  29.4552 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         5.59057    2.74910   2.034 0.044970 *  
## Horsepower          0.12375    0.01281   9.663  1.6e-15 ***
## AirBagsDriver only -3.15178    1.70752  -1.846 0.068243 .  
## AirBagsNone        -6.62673    1.93085  -3.432 0.000911 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.657 on 89 degrees of freedom
## Multiple R-squared:  0.6681, Adjusted R-squared:  0.657 
## F-statistic: 59.73 on 3 and 89 DF,  p-value: < 2.2e-16
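# Yes, AirBags is associated with Price. The coefficients are differences in
# mean Price relative to the reference level "Driver & Passenger", holding
# Horsepower constant (they are not likelihoods):
#  - cars with no airbags are on average 6.63 cheaper (p < 0.001);
#  - cars with a driver-only airbag are on average 3.15 cheaper, but this
#    difference is not significant at the 5% level (p = 0.068).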



#7. Add Origin to the model. How is Origin associated with Price?
# Final model: Horsepower and AirBags as before, plus the two-level Origin
# factor (reported as the single coefficient OriginUSA).
m3 <- lm(
  formula = Price ~ Horsepower + AirBags + Origin,
  data = cars_data
)
summary(m3)
## 
## Call:
## lm(formula = Price ~ Horsepower + AirBags + Origin, data = cars_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -12.4955  -2.4832  -0.6324   1.9598  27.5824 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         7.11761    2.69844   2.638 0.009870 ** 
## Horsepower          0.12535    0.01233  10.162  < 2e-16 ***
## AirBagsDriver only -3.18973    1.64286  -1.942 0.055388 .  
## AirBagsNone        -6.81385    1.85882  -3.666 0.000421 ***
## OriginUSA          -3.23611    1.13352  -2.855 0.005368 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.443 on 88 degrees of freedom
## Multiple R-squared:  0.6963, Adjusted R-squared:  0.6825 
## F-statistic: 50.43 on 4 and 88 DF,  p-value: < 2.2e-16
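# Holding Horsepower and AirBags constant, cars of USA origin are on average
# 3.24 cheaper than non-USA cars (the reference level), and this difference is
# statistically significant (p = 0.005).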



#8. Does the final model (7) fit the data better than the model in (4) above? Use a suitable test to check this.
# Nested-model comparison: m1 (Horsepower only) against m3 (Horsepower +
# AirBags + Origin) using the F-test, which is anova()'s default for lm fits.
anova(m1, m3, test = "F")
## Analysis of Variance Table
## 
## Model 1: Price ~ Horsepower
## Model 2: Price ~ Horsepower + AirBags + Origin
##   Res.Df    RSS Df Sum of Sq      F    Pr(>F)    
## 1     91 3250.9                                  
## 2     88 2607.2  3    643.72 7.2425 0.0002128 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
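# The residual sum of squares drops from 3250.9 (m1) to 2607.2 (m3).
# The F-test comparing the nested models gives F = 7.24 on 3 and 88 df,
# p = 0.0002, so the reduction is statistically significant.
# ==> adding AirBags and Origin makes m3 fit the data significantly better
#     than m1.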



#9. Check the residuals vs fitted values plot and the normal Q-Q plot to see if the model violates assumptions of homogeneous variance and normality of residuals. Comment on this.
# Draw only the two diagnostics the question asks about instead of all four
# default plot.lm() panels: which = 1 is Residuals vs Fitted (homogeneity of
# variance) and which = 2 is the Normal Q-Q plot (normality of residuals).
# The panels are placed side by side; par() returns the previous settings,
# which are restored afterwards so later plots are unaffected.
old_par <- par(mfrow = c(1, 2))
plot(m3, which = 1:2)
par(old_par)

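# The Residuals vs Fitted plot suggests roughly homogeneous variance, and the
# residuals seem approximately normally distributed in the Q-Q plot.
# Caveat: the residual summary of m3 (min -12.5, median -0.6, max 27.6) is
# right-skewed, so the upper tail of the Q-Q plot should be checked for
# outlying, unusually expensive cars.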



#10.    Use a suitable test to compare mean Price by Origin.
# Two-sample comparison of mean Price between the two Origin groups.
# var.equal = FALSE is t.test()'s default and selects the Welch test, which
# does not assume equal variances in the two groups.
t.test(
  Price ~ Origin,
  data = cars_data,
  var.equal = FALSE
)
## 
##  Welch Two Sample t-test
## 
## data:  Price by Origin
## t = 0.95449, df = 77.667, p-value = 0.3428
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -2.102311  5.974255
## sample estimates:
## mean in group non-USA     mean in group USA 
##              20.50889              18.57292
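# Mean Price in the non-USA group: 20.5; mean Price in the USA group: 18.6.
# The difference (about 1.9) is not statistically significant: Welch
# t = 0.95, p = 0.34, and the 95% confidence interval (-2.10 to 5.97)
# includes 0. Unadjusted, mean Price does not differ by Origin; Origin only
# becomes significant in m3, after adjusting for Horsepower and AirBags.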