Load the dataset
library(readxl)
library(MASS)
library(stats)
library(psych)
library(openxlsx)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:MASS':
##
## select
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Read the car-prices CSV from the user's Desktop into a data frame, then
# print a per-column summary (ranges for numeric columns, NA counts, etc.).
car_prices <- read.csv(file = "~/Desktop/car_prices.csv")
summary(car_prices)
## year make model trim
## Min. :1982 Length:558837 Length:558837 Length:558837
## 1st Qu.:2007 Class :character Class :character Class :character
## Median :2012 Mode :character Mode :character Mode :character
## Mean :2010
## 3rd Qu.:2013
## Max. :2015
##
## body transmission vin state
## Length:558837 Length:558837 Length:558837 Length:558837
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## condition odometer color interior
## Min. : 1.00 Min. : 1 Length:558837 Length:558837
## 1st Qu.:23.00 1st Qu.: 28371 Class :character Class :character
## Median :35.00 Median : 52254 Mode :character Mode :character
## Mean :30.67 Mean : 68320
## 3rd Qu.:42.00 3rd Qu.: 99109
## Max. :49.00 Max. :999999
## NA's :11820 NA's :94
## seller mmr sellingprice saledate
## Length:558837 Min. : 25 Min. : 1 Length:558837
## Class :character 1st Qu.: 7100 1st Qu.: 6900 Class :character
## Mode :character Median : 12250 Median : 12100 Mode :character
## Mean : 13769 Mean : 13611
## 3rd Qu.: 18300 3rd Qu.: 18200
## Max. :182000 Max. :230000
## NA's :38 NA's :12
Simple linear regression
# Keep only the columns used by the regressions below: the identifiers
# (make, model) plus the numeric odometer, sellingprice, year and mmr.
DataRegression1 <- dplyr::select(
  car_prices,
  make, model, odometer, sellingprice, year, mmr
)
# Simple linear regression, model 1: selling price explained by odometer.
lm.fit <- lm(formula = sellingprice ~ odometer, data = DataRegression1)
# Coefficients, residual spread and R-squared for model 1.
summary(lm.fit)
##
## Call:
## lm(formula = sellingprice ~ odometer, data = DataRegression1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -20876 -4960 -1771 2966 212079
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.088e+04 1.722e+01 1212.6 <2e-16 ***
## odometer -1.063e-01 1.986e-04 -535.5 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7925 on 558729 degrees of freedom
## (106 observations deleted due to missingness)
## Multiple R-squared: 0.3392, Adjusted R-squared: 0.3392
## F-statistic: 2.868e+05 on 1 and 558729 DF, p-value: < 2.2e-16
# The p-values are very close to zero, suggesting that the relationship between selling price and odometer is significant. However, the R-squared is only about 34%, which is not ideal.
# Simple linear regression, model 2: selling price explained by model year.
# Note: this overwrites the `lm.fit` object from model 1.
lm.fit <- lm(formula = sellingprice ~ year, data = DataRegression1)
# Coefficients, residual spread and R-squared for model 2.
summary(lm.fit)
##
## Call:
## lm(formula = sellingprice ~ year, data = DataRegression1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -19662 -5121 -1674 3103 210679
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.884e+06 5.353e+03 -538.7 <2e-16 ***
## year 1.441e+03 2.663e+00 541.3 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7897 on 558823 degrees of freedom
## (12 observations deleted due to missingness)
## Multiple R-squared: 0.344, Adjusted R-squared: 0.344
## F-statistic: 2.93e+05 on 1 and 558823 DF, p-value: < 2.2e-16
# The p-values are very close to zero, suggesting that the relationship between selling price and year is significant. However, the R-squared is only about 34%, which is not ideal.
# Simple linear regression, model 3: selling price explained by mmr.
# Note: this overwrites the `lm.fit` object again, so from here on `lm.fit`
# refers to the mmr model only.
lm.fit <- lm(formula = sellingprice ~ mmr, data = DataRegression1)
# Coefficients, residual spread and R-squared for model 3.
summary(lm.fit)
##
## Call:
## lm(formula = sellingprice ~ mmr, data = DataRegression1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -86699 -652 92 796 207442
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.029e+01 4.086e+00 -7.413 1.24e-13 ***
## mmr 9.907e-01 2.428e-04 4081.020 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1757 on 558797 degrees of freedom
## (38 observations deleted due to missingness)
## Multiple R-squared: 0.9675, Adjusted R-squared: 0.9675
## F-statistic: 1.665e+07 on 1 and 558797 DF, p-value: < 2.2e-16
# The p-values are very close to zero, suggesting that the relationship between selling price and mmr is significant. The R-squared is about 97%, which is ideal.
# plot 1: selling price against odometer, with the matching regression line.
# The model is refit here because `lm.fit` has been overwritten by later
# models; drawing it would put the wrong line on this plot. The predictor
# goes on the x-axis and the response on the y-axis so that the intercept and
# slope used by abline() are in the plot's coordinates.
fit_odometer <- lm(sellingprice ~ odometer, data = DataRegression1)
plot(
  DataRegression1$odometer, DataRegression1$sellingprice,
  pch = 20, col = "blue",
  xlab = "odometer", ylab = "sellingprice"
)
abline(fit_odometer, lwd = 3, col = "red")

# plot 2: selling price against model year, with the matching regression line.
# The model is refit here because `lm.fit` has been overwritten by a later
# model; drawing it would put the wrong line on this plot. The predictor goes
# on the x-axis and the response on the y-axis so that the intercept and slope
# used by abline() are in the plot's coordinates.
fit_year <- lm(sellingprice ~ year, data = DataRegression1)
plot(
  DataRegression1$year, DataRegression1$sellingprice,
  pch = 20, col = "blue",
  xlab = "year", ylab = "sellingprice"
)
abline(fit_year, lwd = 3, col = "red")

# The upward-sloping line shows that newer cars tend to sell for more.
# plot 3: selling price against mmr, with the matching regression line.
# The model is fit locally so the plot does not depend on whatever `lm.fit`
# currently holds. The predictor goes on the x-axis and the response on the
# y-axis so that the intercept and slope used by abline() are in the plot's
# coordinates.
fit_mmr <- lm(sellingprice ~ mmr, data = DataRegression1)
plot(
  DataRegression1$mmr, DataRegression1$sellingprice,
  pch = 20, col = "blue",
  xlab = "mmr", ylab = "sellingprice"
)
abline(fit_mmr, lwd = 3, col = "red")

Multiple linear regression
# Multiple linear regression: selling price explained jointly by odometer
# and mmr, fit on the full car_prices data frame.
lm.fit2 <- lm(formula = sellingprice ~ odometer + mmr, data = car_prices)
# Coefficients, residual spread and R-squared for the two-predictor model.
summary(lm.fit2)
##
## Call:
## lm(formula = sellingprice ~ odometer + mmr, data = car_prices)
##
## Residuals:
## Min 1Q Median 3Q Max
## -86407 -649 89 793 207429
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.332e+01 7.378e+00 12.65 <2e-16 ***
## odometer -1.094e-03 5.438e-05 -20.12 <2e-16 ***
## mmr 9.872e-01 3.000e-04 3291.01 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1755 on 558702 degrees of freedom
## (132 observations deleted due to missingness)
## Multiple R-squared: 0.9676, Adjusted R-squared: 0.9676
## F-statistic: 8.339e+06 on 2 and 558702 DF, p-value: < 2.2e-16
# The p-values associated with all coefficients are very close to zero, indicating that the relationships between selling price and both odometer and mmr are statistically significant. The model explains approximately 96.76% of the variance in selling price, so it fits the data well. The model was improved only marginally compared with the mmr-only model (R-squared 0.9675 vs. 0.9676).
Polynomial Regression
# Regress selling price on odometer, mmr and year.
# NOTE(review): despite the object's name, this formula contains no
# polynomial (squared or higher-order) term; it is a three-predictor linear
# model. The name is kept unchanged in case later code refers to it.
polynomialmodel <- lm(
  formula = sellingprice ~ odometer + mmr + year,
  data = car_prices
)
# Coefficients, residual spread and R-squared for the three-predictor model.
summary(polynomialmodel)
##
## Call:
## lm(formula = sellingprice ~ odometer + mmr + year, data = car_prices)
##
## Residuals:
## Min 1Q Median 3Q Max
## -86556 -651 95 796 207453
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.882e+04 1.953e+03 19.88 <2e-16 ***
## odometer -2.016e-03 7.152e-05 -28.18 <2e-16 ***
## mmr 9.889e-01 3.121e-04 3169.02 <2e-16 ***
## year -1.924e+01 9.704e-01 -19.83 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1755 on 558701 degrees of freedom
## (132 observations deleted due to missingness)
## Multiple R-squared: 0.9676, Adjusted R-squared: 0.9676
## F-statistic: 5.563e+06 on 3 and 558701 DF, p-value: < 2.2e-16
# Quadratic (degree-2 polynomial) regression of year on selling price,
# written in two equivalent ways.
#
# The first fit previously used `year ~ sellingprice + mmr`, which has no
# squared term and left `sellingpricesq` unused, so the "polynomial" summary
# was really a two-predictor linear model. It now uses the squared term.
# NOTE: any printed summary captured from the earlier formula is stale and
# must be regenerated by re-running / re-knitting this chunk.
sellingpricesq <- car_prices$sellingprice^2 # squared predictor

# Explicit form: linear term plus the precomputed squared term.
polymodel <- lm(year ~ sellingprice + sellingpricesq, data = car_prices)

# Same model via poly(); raw = TRUE uses plain powers (x, x^2) rather than
# orthogonal polynomials, so its coefficients are comparable to polymodel's.
polymodel2 <- lm(
  year ~ poly(sellingprice, degree = 2, raw = TRUE),
  data = car_prices
)

summary(polymodel) # Stats for the polynomial regression
##
## Call:
## lm(formula = year ~ sellingprice + mmr, data = car_prices)
##
## Residuals:
## Min 1Q Median 3Q Max
## -40.158 -1.933 0.483 2.529 6.574
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.007e+03 7.405e-03 270980.965 <2e-16 ***
## sellingprice -4.587e-06 2.424e-06 -1.892 0.0585 .
## mmr 2.490e-04 2.442e-06 101.993 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.183 on 558796 degrees of freedom
## (38 observations deleted due to missingness)
## Multiple R-squared: 0.356, Adjusted R-squared: 0.356
## F-statistic: 1.544e+05 on 2 and 558796 DF, p-value: < 2.2e-16