Upload the database

library(readxl)
library(MASS)
library(stats)
library(psych)
library(openxlsx)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:MASS':
## 
##     select
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Read the car sales records from the CSV file on the Desktop.
car_prices <- read.csv(file = "~/Desktop/car_prices.csv")
# Print per-column summary statistics as a first look at the data.
summary(object = car_prices)
##       year          make              model               trim          
##  Min.   :1982   Length:558837      Length:558837      Length:558837     
##  1st Qu.:2007   Class :character   Class :character   Class :character  
##  Median :2012   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :2010                                                           
##  3rd Qu.:2013                                                           
##  Max.   :2015                                                           
##                                                                         
##      body           transmission           vin               state          
##  Length:558837      Length:558837      Length:558837      Length:558837     
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##    condition        odometer         color             interior        
##  Min.   : 1.00   Min.   :     1   Length:558837      Length:558837     
##  1st Qu.:23.00   1st Qu.: 28371   Class :character   Class :character  
##  Median :35.00   Median : 52254   Mode  :character   Mode  :character  
##  Mean   :30.67   Mean   : 68320                                        
##  3rd Qu.:42.00   3rd Qu.: 99109                                        
##  Max.   :49.00   Max.   :999999                                        
##  NA's   :11820   NA's   :94                                            
##     seller               mmr          sellingprice      saledate        
##  Length:558837      Min.   :    25   Min.   :     1   Length:558837     
##  Class :character   1st Qu.:  7100   1st Qu.:  6900   Class :character  
##  Mode  :character   Median : 12250   Median : 12100   Mode  :character  
##                     Mean   : 13769   Mean   : 13611                     
##                     3rd Qu.: 18300   3rd Qu.: 18200                     
##                     Max.   :182000   Max.   :230000                     
##                     NA's   :38       NA's   :12

Simple linear regression

# Keep only the columns used by the regressions: the identifiers make and
# model, plus the numeric fields odometer, sellingprice, year and mmr.
# select() is qualified with dplyr:: because MASS, which is also attached,
# exports its own select(); the qualified call does not depend on the order
# in which the two packages were loaded.
DataRegression1 <- car_prices %>%
  dplyr::select(make, model, odometer, sellingprice, year, mmr)
# Simple linear regression model 1: selling price explained by odometer.
lm.fit <- lm(sellingprice ~ odometer, data = DataRegression1)
summary(lm.fit)
## 
## Call:
## lm(formula = sellingprice ~ odometer, data = DataRegression1)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -20876  -4960  -1771   2966 212079 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2.088e+04  1.722e+01  1212.6   <2e-16 ***
## odometer    -1.063e-01  1.986e-04  -535.5   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7925 on 558729 degrees of freedom
##   (106 observations deleted due to missingness)
## Multiple R-squared:  0.3392, Adjusted R-squared:  0.3392 
## F-statistic: 2.868e+05 on 1 and 558729 DF,  p-value: < 2.2e-16
# The p-values are very close to zero, suggesting that the relationship between selling price and odometer is significant. However, the R-squared is only about 34%, which is not ideal.
# Simple linear regression model 2: selling price explained by year.
lm.fit <- lm(formula = sellingprice ~ year, data = DataRegression1)
summary(lm.fit)
## 
## Call:
## lm(formula = sellingprice ~ year, data = DataRegression1)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -19662  -5121  -1674   3103 210679 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -2.884e+06  5.353e+03  -538.7   <2e-16 ***
## year         1.441e+03  2.663e+00   541.3   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7897 on 558823 degrees of freedom
##   (12 observations deleted due to missingness)
## Multiple R-squared:  0.344,  Adjusted R-squared:  0.344 
## F-statistic: 2.93e+05 on 1 and 558823 DF,  p-value: < 2.2e-16
# The p-values are very close to zero, suggesting that the relationship between selling price and year is significant. However the R Square is near to 34% which is not ideal
# Simple linear regression model 3: selling price explained by mmr.
lm.fit <- lm(formula = sellingprice ~ mmr, data = DataRegression1)
summary(lm.fit)
## 
## Call:
## lm(formula = sellingprice ~ mmr, data = DataRegression1)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -86699   -652     92    796 207442 
## 
## Coefficients:
##               Estimate Std. Error  t value Pr(>|t|)    
## (Intercept) -3.029e+01  4.086e+00   -7.413 1.24e-13 ***
## mmr          9.907e-01  2.428e-04 4081.020  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1757 on 558797 degrees of freedom
##   (38 observations deleted due to missingness)
## Multiple R-squared:  0.9675, Adjusted R-squared:  0.9675 
## F-statistic: 1.665e+07 on 1 and 558797 DF,  p-value: < 2.2e-16
# The p-values are very close to zero, suggesting that the relationship between selling price and mmr is significant. The R-squared is about 97%, which is ideal.
# Scatter plots with the fitted regression line for each simple model.
# The response (sellingprice) is on the y-axis and the predictor on the
# x-axis, matching the model formula, so abline() draws the line in the
# correct coordinates. Each line comes from a model fitted for that plot's
# predictor, because lm.fit only holds the most recently fitted model.
# plot 1: selling price against odometer
plot(sellingprice ~ odometer, data = DataRegression1, pch = 20, col = "blue")
abline(
  lm(sellingprice ~ odometer, data = DataRegression1),
  lwd = 3, col = "red"
)

# plot 2: selling price against year
plot(sellingprice ~ year, data = DataRegression1, pch = 20, col = "blue")
abline(
  lm(sellingprice ~ year, data = DataRegression1),
  lwd = 3, col = "red"
)

# This shows that the newer the car is, the higher the price it sells for.
# plot 3: selling price against mmr
plot(sellingprice ~ mmr, data = DataRegression1, pch = 20, col = "blue")
abline(
  lm(sellingprice ~ mmr, data = DataRegression1),
  lwd = 3, col = "red"
)

Multiple linear regression

# Multiple linear regression: selling price explained by odometer and mmr.
lm.fit2 <- lm(formula = sellingprice ~ odometer + mmr, data = car_prices)
summary(lm.fit2)
## 
## Call:
## lm(formula = sellingprice ~ odometer + mmr, data = car_prices)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -86407   -649     89    793 207429 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  9.332e+01  7.378e+00   12.65   <2e-16 ***
## odometer    -1.094e-03  5.438e-05  -20.12   <2e-16 ***
## mmr          9.872e-01  3.000e-04 3291.01   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1755 on 558702 degrees of freedom
##   (132 observations deleted due to missingness)
## Multiple R-squared:  0.9676, Adjusted R-squared:  0.9676 
## F-statistic: 8.339e+06 on 2 and 558702 DF,  p-value: < 2.2e-16
# The p-values associated with all coefficients are very close to zero, indicating that the relationships between selling price, odometer, and mmr are statistically significant. The model explains approximately 96.76% of the variance in selling price, so it fits the data well. Compared with the mmr-only model (R-squared 96.75%), adding odometer improved the fit only marginally.

Polynomial Regression

# NOTE: despite its name, this model contains no polynomial terms; it is a
# multiple linear regression of selling price on odometer, mmr and year.
polynomialmodel <- lm(
  formula = sellingprice ~ odometer + mmr + year,
  data = car_prices
)
summary(polynomialmodel)
## 
## Call:
## lm(formula = sellingprice ~ odometer + mmr + year, data = car_prices)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -86556   -651     95    796 207453 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.882e+04  1.953e+03   19.88   <2e-16 ***
## odometer    -2.016e-03  7.152e-05  -28.18   <2e-16 ***
## mmr          9.889e-01  3.121e-04 3169.02   <2e-16 ***
## year        -1.924e+01  9.704e-01  -19.83   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1755 on 558701 degrees of freedom
##   (132 observations deleted due to missingness)
## Multiple R-squared:  0.9676, Adjusted R-squared:  0.9676 
## F-statistic: 5.563e+06 on 3 and 558701 DF,  p-value: < 2.2e-16
# Element-wise square of the sellingprice column.
# NOTE(review): sellingpricesq is not referenced by either model below;
# confirm whether polymodel was meant to include it as a quadratic term.
sellingpricesq <- car_prices$sellingprice^2 # square

# Model of year on sellingprice and mmr; this formula has no polynomial term.
polymodel <- lm(year ~ sellingprice + mmr, data = car_prices) 
# Degree-2 polynomial model of year on sellingprice, built with poly().
polymodel2 <- lm(year ~ poly(sellingprice, degree = 2, raw = TRUE), data = car_prices) # same kind of quadratic term, written with poly()

# NOTE(review): this summarises polymodel, which has no polynomial term;
# the polynomial fit is stored in polymodel2.
summary(polymodel) # Stats for polymodel (year ~ sellingprice + mmr)
## 
## Call:
## lm(formula = year ~ sellingprice + mmr, data = car_prices)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -40.158  -1.933   0.483   2.529   6.574 
## 
## Coefficients:
##                Estimate Std. Error    t value Pr(>|t|)    
## (Intercept)   2.007e+03  7.405e-03 270980.965   <2e-16 ***
## sellingprice -4.587e-06  2.424e-06     -1.892   0.0585 .  
## mmr           2.490e-04  2.442e-06    101.993   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.183 on 558796 degrees of freedom
##   (38 observations deleted due to missingness)
## Multiple R-squared:  0.356,  Adjusted R-squared:  0.356 
## F-statistic: 1.544e+05 on 2 and 558796 DF,  p-value: < 2.2e-16