#import data
getwd()
## [1] "/Users/dengd/Desktop/Research Project"
setwd("/Users/dengd/Library/Mobile Documents/com~apple~CloudDocs/Documents/Documents - Deng’s MacBook Pro - 1/Documents/STAT 840 Spring 2022/Final Project/STAT840FP/finalproject 2")
data<-read.csv("CarPrice_Assignment.csv",header = T)
#form a data frame with only the required variables
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
data2<-data %>% dplyr::select(c("price", "highwaympg", "citympg", "horsepower"
,"fuelsystem", "enginesize", "cylindernumber"
,"drivewheel", "fueltype" ))
#check for missing values
sum(is.na(data2))
## [1] 0
#convert chr to factor for categorical variables
data2$fuelsystem<-as.factor(data2$fuelsystem)
data2$cylindernumber<-as.factor(data2$cylindernumber)
data2$drivewheel<-as.factor(data2$drivewheel)
data2$fueltype<-as.factor(data2$fueltype)
#visualize data
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ readr 2.1.5
## ✔ ggplot2 3.5.1 ✔ stringr 1.5.1
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(epiDisplay)
## Loading required package: foreign
## Loading required package: survival
## Loading required package: MASS
##
## Attaching package: 'MASS'
##
## The following object is masked from 'package:dplyr':
##
## select
##
## Loading required package: nnet
##
## Attaching package: 'epiDisplay'
##
## The following object is masked from 'package:ggplot2':
##
## alpha
# fuel system
#frequency table
tab1(data2$fuelsystem, sort.group = "decreasing",
cum.percent = TRUE)

## data2$fuelsystem :
## Frequency Percent Cum. percent
## mpfi 94 45.9 45.9
## 2bbl 66 32.2 78.0
## idi 20 9.8 87.8
## 1bbl 11 5.4 93.2
## spdi 9 4.4 97.6
## 4bbl 3 1.5 99.0
## spfi 1 0.5 99.5
## mfi 1 0.5 100.0
## Total 205 100.0 100.0
ggplot(data = data2, aes(x =fuelsystem, fill =fuelsystem)) +
geom_bar()

#cylinder number
#frequency table
tab1(data2$cylindernumber, sort.group = "decreasing",
cum.percent = TRUE)

## data2$cylindernumber :
## Frequency Percent Cum. percent
## four 159 77.6 77.6
## six 24 11.7 89.3
## five 11 5.4 94.6
## eight 5 2.4 97.1
## two 4 2.0 99.0
## twelve 1 0.5 99.5
## three 1 0.5 100.0
## Total 205 100.0 100.0
ggplot(data = data2, aes(x =cylindernumber, fill =cylindernumber)) +
geom_bar()

#drive wheel
#frequency table
tab1(data2$drivewheel, sort.group = "decreasing",
cum.percent = TRUE)

## data2$drivewheel :
## Frequency Percent Cum. percent
## fwd 120 58.5 58.5
## rwd 76 37.1 95.6
## 4wd 9 4.4 100.0
## Total 205 100.0 100.0
ggplot(data = data2, aes(x =drivewheel, fill =drivewheel)) +
geom_bar()

# fuel type
#Frequency table
tab1(data2$fueltype, sort.group = "decreasing",
cum.percent = TRUE)

## data2$fueltype :
## Frequency Percent Cum. percent
## gas 185 90.2 90.2
## diesel 20 9.8 100.0
## Total 205 100.0 100.0
ggplot(data = data2, aes(x =fueltype, fill =fueltype)) +
geom_bar()

# summary descriptive statistics for continuous data
data2 %>% group_by(fuelsystem) %>%
summarise(mean_price=mean(price),
sd_price=sd(price))
## # A tibble: 8 × 3
## fuelsystem mean_price sd_price
## <fct> <dbl> <dbl>
## 1 1bbl 7556. 1390.
## 2 2bbl 7478. 1385.
## 3 4bbl 12145 1375.
## 4 idi 15838. 7760.
## 5 mfi 12964 NA
## 6 mpfi 17755. 8686.
## 7 spdi 10990. 2742.
## 8 spfi 11048 NA
data2 %>% group_by(fueltype) %>%
summarise(mean_price=mean(price),
sd_price=sd(price))
## # A tibble: 2 × 3
## fueltype mean_price sd_price
## <fct> <dbl> <dbl>
## 1 diesel 15838. 7760.
## 2 gas 13000. 7984.
data2 %>% group_by(cylindernumber) %>%
summarise(mean_price=mean(price),
sd_price=sd(price))
## # A tibble: 7 × 3
## cylindernumber mean_price sd_price
## <fct> <dbl> <dbl>
## 1 eight 37400. 5668.
## 2 five 21630. 6091.
## 3 four 10286. 3922.
## 4 six 23672. 8850.
## 5 three 5151 NA
## 6 twelve 36000 NA
## 7 two 13020 2079.
data2 %>% group_by(drivewheel) %>%
summarise(mean_price=mean(price),
sd_price=sd(price))
## # A tibble: 3 × 3
## drivewheel mean_price sd_price
## <fct> <dbl> <dbl>
## 1 4wd 11087. 3989.
## 2 fwd 9239. 3318.
## 3 rwd 19911. 9120.
#assumptions test
#normality
#q-q plot
qqnorm(data2$price, pch = 1, frame = FALSE)
qqline(data2$price, col = "steelblue", lwd = 2)

#using shapiro test
shapiro.test(data2$price)
##
## Shapiro-Wilk normality test
##
## data: data2$price
## W = 0.80066, p-value = 1.849e-15
#Linear relationship
#scatter plot matrix
plot(data2[-c(5,7:9)], col="blue")

#correlation for multicollinearity
cor(data2[-c(1,5,7:9)])
## highwaympg citympg horsepower enginesize
## highwaympg 1.0000000 0.9713370 -0.7705439 -0.6774699
## citympg 0.9713370 1.0000000 -0.8014562 -0.6536579
## horsepower -0.7705439 -0.8014562 1.0000000 0.8097687
## enginesize -0.6774699 -0.6536579 0.8097687 1.0000000
#the full model
model<-lm(log10(price)~highwaympg+citympg+horsepower+fuelsystem
+enginesize+cylindernumber+drivewheel+fueltype, data=data2)
summary(model)
##
## Call:
## lm(formula = log10(price) ~ highwaympg + citympg + horsepower +
## fuelsystem + enginesize + cylindernumber + drivewheel + fueltype,
## data = data2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.242788 -0.044721 -0.002504 0.049700 0.181253
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.9161791 0.1284815 30.480 < 2e-16 ***
## highwaympg 0.0039184 0.0037937 1.033 0.303012
## citympg -0.0124920 0.0043557 -2.868 0.004611 **
## horsepower 0.0012699 0.0003570 3.557 0.000476 ***
## fuelsystem2bbl -0.0253120 0.0256602 -0.986 0.325210
## fuelsystem4bbl -0.0042693 0.0932270 -0.046 0.963523
## fuelsystemidi 0.1417864 0.0320979 4.417 1.70e-05 ***
## fuelsystemmfi -0.0535653 0.0841034 -0.637 0.524978
## fuelsystemmpfi 0.0346036 0.0287782 1.202 0.230736
## fuelsystemspdi -0.0536225 0.0388492 -1.380 0.169168
## fuelsystemspfi -0.0081712 0.0820039 -0.100 0.920735
## enginesize 0.0017259 0.0003798 4.544 9.92e-06 ***
## cylindernumberfive 0.0512946 0.0561305 0.914 0.361988
## cylindernumberfour -0.0453595 0.0564298 -0.804 0.422532
## cylindernumbersix -0.0387203 0.0448459 -0.863 0.389031
## cylindernumberthree 0.0488243 0.1014790 0.481 0.630995
## cylindernumbertwelve -0.2411926 0.0888768 -2.714 0.007281 **
## cylindernumbertwo -0.0033038 0.1060834 -0.031 0.975189
## drivewheelfwd -0.0145856 0.0289198 -0.504 0.614617
## drivewheelrwd 0.0471363 0.0308101 1.530 0.127751
## fueltypegas NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.07672 on 185 degrees of freedom
## Multiple R-squared: 0.8885, Adjusted R-squared: 0.8771
## F-statistic: 77.6 on 19 and 185 DF, p-value: < 2.2e-16
par(mfrow = c(2, 2))
plot(model)
## Warning: not plotting observations with leverage one:
## 19, 30, 47, 50, 59

#model
library(leaps)
m1<-regsubsets(log10(price)~highwaympg+citympg+horsepower+fuelsystem
+enginesize+cylindernumber+drivewheel+fueltype, data=data2)
## Warning in leaps.setup(x, y, wt = wt, nbest = nbest, nvmax = nvmax, force.in =
## force.in, : 1 linear dependencies found
models<-summary(m1)
#see the number of recommended regression models
which.max(models$adjr2)
## [1] 8
#best models based on adj r2, cp and bic
data.frame(
Adj.R2 = which.max(models$adjr2),
CP = which.min(models$cp),
BIC = which.min(models$bic)
)
## Adj.R2 CP BIC
## 1 8 8 8
#get the corresponding adjusted r2, cp, and bic for the best model
models$adjr2[8]
## [1] 0.8789096
models$bic[8]
## [1] -393.0937
models$cp[8]
## [1] 5.016099
#now obtain the best predictors
models$which[8,]
## (Intercept) highwaympg citympg
## TRUE FALSE TRUE
## horsepower fuelsystem2bbl fuelsystem4bbl
## TRUE FALSE FALSE
## fuelsystemidi fuelsystemmfi fuelsystemmpfi
## FALSE FALSE TRUE
## fuelsystemspdi fuelsystemspfi enginesize
## FALSE FALSE TRUE
## cylindernumberfive cylindernumberfour cylindernumbersix
## TRUE FALSE FALSE
## cylindernumberthree cylindernumbertwelve cylindernumbertwo
## FALSE TRUE FALSE
## drivewheelfwd drivewheelrwd fueltypegas
## FALSE TRUE TRUE
#regression coefficients for the best model
coef(m1, 8)
## (Intercept) citympg horsepower
## 3.990491774 -0.007909900 0.001375438
## fuelsystemmpfi enginesize cylindernumberfive
## 0.059329349 0.001669728 0.097017056
## cylindernumbertwelve drivewheelrwd fueltypegas
## -0.208613350 0.068676187 -0.155448648