#import data
getwd()
## [1] "/Users/dengd/Documents/STATS PROJECT /finalproject 2"
setwd("/Users/dengd/Library/Mobile Documents/com~apple~CloudDocs/Documents/Documents - Deng’s MacBook Pro - 1/Documents/STAT 840 Spring 2022/Final Project/STAT840FP/finalproject 2")
data<-read.csv("CarPrice_Assignment.csv",header = T)
#form a data frame with only the required variables
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
data2<-data %>% dplyr::select(c("price", "highwaympg", "citympg", "horsepower"
                               ,"fuelsystem", "enginesize", "cylindernumber"
                               ,"drivewheel", "fueltype" ))
#check for missing values
sum(is.na(data2))
## [1] 0
#convert chr to factor for categorical variables
data2$fuelsystem<-as.factor(data2$fuelsystem)
data2$cylindernumber<-as.factor(data2$cylindernumber)
data2$drivewheel<-as.factor(data2$drivewheel)
data2$fueltype<-as.factor(data2$fueltype)
#visualize data 
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ readr     2.1.5
## ✔ ggplot2   3.5.1     ✔ stringr   1.5.1
## ✔ lubridate 1.9.3     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(epiDisplay)
## Loading required package: foreign
## Loading required package: survival
## Loading required package: MASS
## 
## Attaching package: 'MASS'
## 
## The following object is masked from 'package:dplyr':
## 
##     select
## 
## Loading required package: nnet
## 
## Attaching package: 'epiDisplay'
## 
## The following object is masked from 'package:ggplot2':
## 
##     alpha
# fuel system

#frequency table
tab1(data2$fuelsystem, sort.group = "decreasing",
     cum.percent = TRUE)

## data2$fuelsystem : 
##         Frequency Percent Cum. percent
## mpfi           94    45.9         45.9
## 2bbl           66    32.2         78.0
## idi            20     9.8         87.8
## 1bbl           11     5.4         93.2
## spdi            9     4.4         97.6
## 4bbl            3     1.5         99.0
## spfi            1     0.5         99.5
## mfi             1     0.5        100.0
##   Total       205   100.0        100.0
ggplot(data = data2, aes(x =fuelsystem, fill =fuelsystem)) +
  geom_bar()

#cylinder number

#frequency table
tab1(data2$cylindernumber, sort.group = "decreasing", 
     cum.percent = TRUE)

## data2$cylindernumber : 
##         Frequency Percent Cum. percent
## four          159    77.6         77.6
## six            24    11.7         89.3
## five           11     5.4         94.6
## eight           5     2.4         97.1
## two             4     2.0         99.0
## twelve          1     0.5         99.5
## three           1     0.5        100.0
##   Total       205   100.0        100.0
ggplot(data = data2, aes(x =cylindernumber, fill =cylindernumber)) +
  geom_bar()

#drive wheel
#frequency table 
tab1(data2$drivewheel, sort.group = "decreasing", 
     cum.percent = TRUE)

## data2$drivewheel : 
##         Frequency Percent Cum. percent
## fwd           120    58.5         58.5
## rwd            76    37.1         95.6
## 4wd             9     4.4        100.0
##   Total       205   100.0        100.0
ggplot(data = data2, aes(x =drivewheel, fill =drivewheel)) +
  geom_bar()

# fuel type 

#Frequency table
tab1(data2$fueltype, sort.group = "decreasing", 
     cum.percent = TRUE)

## data2$fueltype : 
##         Frequency Percent Cum. percent
## gas           185    90.2         90.2
## diesel         20     9.8        100.0
##   Total       205   100.0        100.0
ggplot(data = data2, aes(x =fueltype, fill =fueltype)) +
  geom_bar()

# summary descriptive statistics for continuous data
data2 %>% group_by(fuelsystem) %>%
  summarise(mean_price=mean(price),
            sd_price=sd(price))
## # A tibble: 8 × 3
##   fuelsystem mean_price sd_price
##   <fct>           <dbl>    <dbl>
## 1 1bbl            7556.    1390.
## 2 2bbl            7478.    1385.
## 3 4bbl           12145     1375.
## 4 idi            15838.    7760.
## 5 mfi            12964       NA 
## 6 mpfi           17755.    8686.
## 7 spdi           10990.    2742.
## 8 spfi           11048       NA
data2 %>% group_by(fueltype) %>%
  summarise(mean_price=mean(price),
            sd_price=sd(price))
## # A tibble: 2 × 3
##   fueltype mean_price sd_price
##   <fct>         <dbl>    <dbl>
## 1 diesel       15838.    7760.
## 2 gas          13000.    7984.
data2 %>% group_by(cylindernumber) %>%
  summarise(mean_price=mean(price),
            sd_price=sd(price))
## # A tibble: 7 × 3
##   cylindernumber mean_price sd_price
##   <fct>               <dbl>    <dbl>
## 1 eight              37400.    5668.
## 2 five               21630.    6091.
## 3 four               10286.    3922.
## 4 six                23672.    8850.
## 5 three               5151       NA 
## 6 twelve             36000       NA 
## 7 two                13020     2079.
data2 %>% group_by(drivewheel) %>%
  summarise(mean_price=mean(price),
            sd_price=sd(price))
## # A tibble: 3 × 3
##   drivewheel mean_price sd_price
##   <fct>           <dbl>    <dbl>
## 1 4wd            11087.    3989.
## 2 fwd             9239.    3318.
## 3 rwd            19911.    9120.
#assumptions test
#normality
#q-q plot
qqnorm(data2$price, pch = 1, frame = FALSE)
qqline(data2$price, col = "steelblue", lwd = 2)

#using shapiro test
shapiro.test(data2$price)
## 
##  Shapiro-Wilk normality test
## 
## data:  data2$price
## W = 0.80066, p-value = 1.849e-15
#Linear relationship
#scatter plot matrix
plot(data2[-c(5,7:9)], col="blue")

#correlation for multicollinearity
cor(data2[-c(1,5,7:9)])
##            highwaympg    citympg horsepower enginesize
## highwaympg  1.0000000  0.9713370 -0.7705439 -0.6774699
## citympg     0.9713370  1.0000000 -0.8014562 -0.6536579
## horsepower -0.7705439 -0.8014562  1.0000000  0.8097687
## enginesize -0.6774699 -0.6536579  0.8097687  1.0000000
#the full model
model<-lm(log10(price)~highwaympg+citympg+horsepower+fuelsystem
          +enginesize+cylindernumber+drivewheel+fueltype, data=data2)
summary(model)
## 
## Call:
## lm(formula = log10(price) ~ highwaympg + citympg + horsepower + 
##     fuelsystem + enginesize + cylindernumber + drivewheel + fueltype, 
##     data = data2)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.242788 -0.044721 -0.002504  0.049700  0.181253 
## 
## Coefficients: (1 not defined because of singularities)
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           3.9161791  0.1284815  30.480  < 2e-16 ***
## highwaympg            0.0039184  0.0037937   1.033 0.303012    
## citympg              -0.0124920  0.0043557  -2.868 0.004611 ** 
## horsepower            0.0012699  0.0003570   3.557 0.000476 ***
## fuelsystem2bbl       -0.0253120  0.0256602  -0.986 0.325210    
## fuelsystem4bbl       -0.0042693  0.0932270  -0.046 0.963523    
## fuelsystemidi         0.1417864  0.0320979   4.417 1.70e-05 ***
## fuelsystemmfi        -0.0535653  0.0841034  -0.637 0.524978    
## fuelsystemmpfi        0.0346036  0.0287782   1.202 0.230736    
## fuelsystemspdi       -0.0536225  0.0388492  -1.380 0.169168    
## fuelsystemspfi       -0.0081712  0.0820039  -0.100 0.920735    
## enginesize            0.0017259  0.0003798   4.544 9.92e-06 ***
## cylindernumberfive    0.0512946  0.0561305   0.914 0.361988    
## cylindernumberfour   -0.0453595  0.0564298  -0.804 0.422532    
## cylindernumbersix    -0.0387203  0.0448459  -0.863 0.389031    
## cylindernumberthree   0.0488243  0.1014790   0.481 0.630995    
## cylindernumbertwelve -0.2411926  0.0888768  -2.714 0.007281 ** 
## cylindernumbertwo    -0.0033038  0.1060834  -0.031 0.975189    
## drivewheelfwd        -0.0145856  0.0289198  -0.504 0.614617    
## drivewheelrwd         0.0471363  0.0308101   1.530 0.127751    
## fueltypegas                  NA         NA      NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.07672 on 185 degrees of freedom
## Multiple R-squared:  0.8885, Adjusted R-squared:  0.8771 
## F-statistic:  77.6 on 19 and 185 DF,  p-value: < 2.2e-16
par(mfrow = c(2, 2))
plot(model)
## Warning: not plotting observations with leverage one:
##   19, 30, 47, 50, 59

#model 
library(leaps)
m1<-regsubsets(log10(price)~highwaympg+citympg+horsepower+fuelsystem
               +enginesize+cylindernumber+drivewheel+fueltype, data=data2)
## Warning in leaps.setup(x, y, wt = wt, nbest = nbest, nvmax = nvmax, force.in =
## force.in, : 1 linear dependencies found
models<-summary(m1)
#see the number of recommended regression models
which.max(models$adjr2)
## [1] 8
#best models based on adj r2, cp and bic
data.frame(
  Adj.R2 = which.max(models$adjr2),
  CP = which.min(models$cp),
  BIC = which.min(models$bic)
)
##   Adj.R2 CP BIC
## 1      8  8   8
#get the corresponding adjusted r2, cp, and bic for the best model
models$adjr2[8]
## [1] 0.8789096
models$bic[8]
## [1] -393.0937
models$cp[8]
## [1] 5.016099
#now obtain the best predictors
models$which[8,]
##          (Intercept)           highwaympg              citympg 
##                 TRUE                FALSE                 TRUE 
##           horsepower       fuelsystem2bbl       fuelsystem4bbl 
##                 TRUE                FALSE                FALSE 
##        fuelsystemidi        fuelsystemmfi       fuelsystemmpfi 
##                FALSE                FALSE                 TRUE 
##       fuelsystemspdi       fuelsystemspfi           enginesize 
##                FALSE                FALSE                 TRUE 
##   cylindernumberfive   cylindernumberfour    cylindernumbersix 
##                 TRUE                FALSE                FALSE 
##  cylindernumberthree cylindernumbertwelve    cylindernumbertwo 
##                FALSE                 TRUE                FALSE 
##        drivewheelfwd        drivewheelrwd          fueltypegas 
##                FALSE                 TRUE                 TRUE
#regression coefficients for the best model
coef(m1, 8)
##          (Intercept)              citympg           horsepower 
##          3.990491774         -0.007909900          0.001375438 
##       fuelsystemmpfi           enginesize   cylindernumberfive 
##          0.059329349          0.001669728          0.097017056 
## cylindernumbertwelve        drivewheelrwd          fueltypegas 
##         -0.208613350          0.068676187         -0.155448648