** DATA_605_Discussion_11_Regression_1 **
library(RCurl)
## Loading required package: bitops
library(stringr)
** Discussion_11_Regression_Auto_MPG **
Scope: check regression on an Automobile MPG (miles per gallon) vs auto characteristics.
Fields: 1. mpg: continuous 2. cylinders: multi-valued discrete 3. displacement: continuous 4. horsepower: continuous 5. weight: continuous 6. acceleration: continuous 7. model year: multi-valued discrete 8. origin: multi-valued discrete 9. car name: string (unique for each instance)
# load file
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data'
auto_mpg <- read.csv(url, header = F, stringsAsFactors = FALSE)
head(auto_mpg)
## V1
## 1 18.0 8 307.0 130.0 3504. 12.0 70 1\tchevrolet chevelle malibu
## 2 15.0 8 350.0 165.0 3693. 11.5 70 1\tbuick skylark 320
## 3 18.0 8 318.0 150.0 3436. 11.0 70 1\tplymouth satellite
## 4 16.0 8 304.0 150.0 3433. 12.0 70 1\tamc rebel sst
## 5 17.0 8 302.0 140.0 3449. 10.5 70 1\tford torino
## 6 15.0 8 429.0 198.0 4341. 10.0 70 1\tford galaxie 500
ncol(auto_mpg)
## [1] 1
#auto_mpg
typeof(auto_mpg)
## [1] "list"
# convert data file from list to a dataframe
df2 <- as.data.frame(str_split(auto_mpg$V1,"\\s+",n=Inf, simplify = TRUE))
head(df2)
## V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13
## 1 18.0 8 307.0 130.0 3504. 12.0 70 1 chevrolet chevelle malibu
## 2 15.0 8 350.0 165.0 3693. 11.5 70 1 buick skylark 320
## 3 18.0 8 318.0 150.0 3436. 11.0 70 1 plymouth satellite
## 4 16.0 8 304.0 150.0 3433. 12.0 70 1 amc rebel sst
## 5 17.0 8 302.0 140.0 3449. 10.5 70 1 ford torino
## 6 15.0 8 429.0 198.0 4341. 10.0 70 1 ford galaxie 500
## V14
## 1
## 2
## 3
## 4
## 5
## 6
#typeof(df2)
str(df2)
## 'data.frame': 398 obs. of 14 variables:
## $ V1 : Factor w/ 129 levels "10.0","11.0",..: 17 7 17 9 13 7 5 5 5 7 ...
## $ V2 : Factor w/ 5 levels "3","4","5","6",..: 5 5 5 5 5 5 5 5 5 5 ...
## $ V3 : Factor w/ 82 levels "100.0","101.0",..: 50 53 51 48 47 59 61 60 62 57 ...
## $ V4 : Factor w/ 94 levels "?","100.0","102.0",..: 17 35 29 29 24 42 47 46 48 40 ...
## $ V5 : Factor w/ 351 levels "1613.","1649.",..: 248 266 242 241 245 319 320 316 328 279 ...
## $ V6 : Factor w/ 96 levels "10.0","10.5",..: 10 8 3 10 2 1 94 93 1 93 ...
## $ V7 : Factor w/ 13 levels "70","71","72",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ V8 : Factor w/ 3 levels "1","2","3": 1 1 1 1 1 1 1 1 1 1 ...
## $ V9 : Factor w/ 37 levels "amc","audi","bmw",..: 8 4 26 1 14 14 8 26 27 1 ...
## $ V10: Factor w/ 190 levels "","'cuda","100",..: 70 169 165 155 180 109 118 107 63 48 ...
## $ V11: Factor w/ 101 levels "","(auto)","(diesel)",..: 68 20 1 88 1 25 1 57 1 44 ...
## $ V12: Factor w/ 23 levels "","(diesel)",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ V13: Factor w/ 4 levels "","(sw)","(turbo)",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ V14: Factor w/ 2 levels "","(sw)": 1 1 1 1 1 1 1 1 1 1 ...
#df2$V1
# label the columns of the data frame for easy readability
#head(df2)
names(df2)
## [1] "V1" "V2" "V3" "V4" "V5" "V6" "V7" "V8" "V9" "V10" "V11"
## [12] "V12" "V13" "V14"
names(df2)[1]<-"MPG"
names(df2)[2]<-"CYL"
names(df2)[3]<-"CUBIC_INCH"
names(df2)[4]<-"HP"
names(df2)[5]<-"WGT"
names(df2)[6]<-"ACCEL"
names(df2)[7]<-"YEAR"
names(df2)[8]<-"ORIGIN"
names(df2)[9]<-"BRAND"
names(df2)[10]<-"MODEL"
# convert data frame factors to types usable in a model
df2$MPG <- as.numeric(as.character(df2$MPG))
df2$CYL <- as.numeric(as.character(df2$CYL))
df2$CUBIC_INCH <- as.numeric(as.character(df2$CUBIC_INCH))
df2$HP <- as.numeric(as.character(df2$HP))
## Warning: NAs introduced by coercion
df2$WGT <- as.numeric(as.character(df2$WGT))
df2$ACCEL <- as.numeric(as.character(df2$ACCEL))
df2$YEAR <- as.numeric(as.character(df2$YEAR))
df2$ORIGIN <- as.numeric(as.character(df2$ORIGIN))
df2$BRAND <- as.character(as.character(df2$BRAND))
df2$MODEL <- as.numeric(as.character(df2$MODEL))
## Warning: NAs introduced by coercion
# create a mult-factor model
attach(df2)
df2.lm <- lm(MPG ~ CYL + CUBIC_INCH + HP + WGT + YEAR + BRAND)
df2.lm
##
## Call:
## lm(formula = MPG ~ CYL + CUBIC_INCH + HP + WGT + YEAR + BRAND)
##
## Coefficients:
## (Intercept) CYL CUBIC_INCH
## -12.799114 -0.638287 0.025293
## HP WGT YEAR
## -0.028669 -0.006343 0.712411
## BRANDaudi BRANDbmw BRANDbuick
## 4.180386 2.207914 1.431720
## BRANDcadillac BRANDcapri BRANDchevroelt
## 4.224217 1.620488 0.604792
## BRANDchevrolet BRANDchevy BRANDchrysler
## 1.396817 2.027207 0.310153
## BRANDdatsun BRANDdodge BRANDfiat
## 5.883556 2.218713 4.736178
## BRANDford BRANDhi BRANDhonda
## 0.786973 4.897095 5.208871
## BRANDmaxda BRANDmazda BRANDmercedes
## 0.189448 3.898606 5.080784
## BRANDmercedes-benz BRANDmercury BRANDnissan
## 4.805567 0.544884 6.123773
## BRANDoldsmobile BRANDopel BRANDpeugeot
## 2.681174 2.205215 3.827634
## BRANDplymouth BRANDpontiac BRANDrenault
## 2.811592 2.977985 4.411274
## BRANDsaab BRANDsubaru BRANDtoyota
## 3.351356 3.509270 3.537207
## BRANDtoyouta BRANDtriumph BRANDvokswagen
## 2.700679 8.654728 -0.610845
## BRANDvolkswagen BRANDvolvo BRANDvw
## 3.125480 1.716767 10.415292
# create a subset of the model for analysis and plotting
autovars <- c('MPG', 'CYL', 'CUBIC_INCH', 'HP', 'WGT','YEAR')
Autos <- df2[autovars]
str(Autos)
## 'data.frame': 398 obs. of 6 variables:
## $ MPG : num 18 15 18 16 17 15 14 14 14 15 ...
## $ CYL : num 8 8 8 8 8 8 8 8 8 8 ...
## $ CUBIC_INCH: num 307 350 318 304 302 429 454 440 455 390 ...
## $ HP : num 130 165 150 150 140 198 220 215 225 190 ...
## $ WGT : num 3504 3693 3436 3433 3449 ...
## $ YEAR : num 70 70 70 70 70 70 70 70 70 70 ...
plot(Autos)
# note in plot that the relationships of MPG to cylinders(CYL), CUBIC_INCH, and WGT (weight) appear to be good.
summary(df2.lm)
##
## Call:
## lm(formula = MPG ~ CYL + CUBIC_INCH + HP + WGT + YEAR + BRAND)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.1889 -1.9365 0.0022 1.7178 14.1334
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.280e+01 4.300e+00 -2.976 0.003119 **
## CYL -6.383e-01 3.305e-01 -1.931 0.054267 .
## CUBIC_INCH 2.529e-02 8.152e-03 3.103 0.002073 **
## HP -2.867e-02 1.136e-02 -2.523 0.012071 *
## WGT -6.343e-03 6.297e-04 -10.073 < 2e-16 ***
## YEAR 7.124e-01 5.335e-02 13.355 < 2e-16 ***
## BRANDaudi 4.180e+00 1.410e+00 2.966 0.003226 **
## BRANDbmw 2.208e+00 2.384e+00 0.926 0.355043
## BRANDbuick 1.432e+00 1.008e+00 1.421 0.156256
## BRANDcadillac 4.224e+00 2.337e+00 1.808 0.071512 .
## BRANDcapri 1.620e+00 3.233e+00 0.501 0.616557
## BRANDchevroelt 6.048e-01 3.242e+00 0.187 0.852125
## BRANDchevrolet 1.397e+00 7.814e-01 1.788 0.074698 .
## BRANDchevy 2.027e+00 1.956e+00 1.036 0.300780
## BRANDchrysler 3.102e-01 1.479e+00 0.210 0.834056
## BRANDdatsun 5.884e+00 9.933e-01 5.923 7.56e-09 ***
## BRANDdodge 2.219e+00 8.779e-01 2.527 0.011939 *
## BRANDfiat 4.736e+00 1.337e+00 3.544 0.000448 ***
## BRANDford 7.870e-01 7.684e-01 1.024 0.306434
## BRANDhi 4.897e+00 3.345e+00 1.464 0.144078
## BRANDhonda 5.209e+00 1.131e+00 4.604 5.81e-06 ***
## BRANDmaxda 1.894e-01 2.360e+00 0.080 0.936067
## BRANDmazda 3.899e+00 1.258e+00 3.098 0.002106 **
## BRANDmercedes 5.081e+00 3.259e+00 1.559 0.119895
## BRANDmercedes-benz 4.806e+00 2.398e+00 2.004 0.045836 *
## BRANDmercury 5.449e-01 1.136e+00 0.480 0.631668
## BRANDnissan 6.124e+00 3.254e+00 1.882 0.060701 .
## BRANDoldsmobile 2.681e+00 1.189e+00 2.254 0.024791 *
## BRANDopel 2.205e+00 1.733e+00 1.272 0.204140
## BRANDpeugeot 3.828e+00 1.379e+00 2.776 0.005806 **
## BRANDplymouth 2.812e+00 8.431e-01 3.335 0.000944 ***
## BRANDpontiac 2.978e+00 1.038e+00 2.868 0.004381 **
## BRANDrenault 4.411e+00 1.958e+00 2.252 0.024910 *
## BRANDsaab 3.351e+00 1.796e+00 1.866 0.062859 .
## BRANDsubaru 3.509e+00 1.731e+00 2.027 0.043394 *
## BRANDtoyota 3.537e+00 9.564e-01 3.699 0.000252 ***
## BRANDtoyouta 2.701e+00 3.251e+00 0.831 0.406659
## BRANDtriumph 8.655e+00 3.242e+00 2.670 0.007950 **
## BRANDvokswagen -6.108e-01 3.240e+00 -0.189 0.850573
## BRANDvolkswagen 3.125e+00 1.076e+00 2.905 0.003901 **
## BRANDvolvo 1.717e+00 1.538e+00 1.116 0.265148
## BRANDvw 1.042e+01 1.472e+00 7.077 8.11e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.153 on 350 degrees of freedom
## (6 observations deleted due to missingness)
## Multiple R-squared: 0.8539, Adjusted R-squared: 0.8368
## F-statistic: 49.91 on 41 and 350 DF, p-value: < 2.2e-16
# this is confirmed by reviewing the summary statistics. The CYL, and CUBIC_INCH, and WGT have values less than .05 which is significant. The adjusted R value is .8368 which is good correlation.
# the sample model fits the theoretical between -2 and +2.
qqnorm(resid(df2.lm))
qqline(resid(df2.lm))
hist(df2.lm$residuals)
# the histogram of residuals shows even near normal distribution which shows good linearity
plot(fitted(df2.lm),resid(df2.lm))
# the residuals are randomly distributed on the right side of the residual chart which indicates good linearity
# check single variable model
# for MPG vs WGT (weight) correlation is still good at adjusted R-squared at .691, but not good as multiple factor model above.
MPG_WGT.lm <- lm(MPG ~ WGT)
MPG_WGT.lm
##
## Call:
## lm(formula = MPG ~ WGT)
##
## Coefficients:
## (Intercept) WGT
## 46.317364 -0.007677
summary(MPG_WGT.lm)
##
## Call:
## lm(formula = MPG ~ WGT)
##
## Residuals:
## Min 1Q Median 3Q Max
## -12.012 -2.801 -0.351 2.114 16.480
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 46.3173644 0.7952452 58.24 <2e-16 ***
## WGT -0.0076766 0.0002575 -29.81 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.345 on 396 degrees of freedom
## Multiple R-squared: 0.6918, Adjusted R-squared: 0.691
## F-statistic: 888.9 on 1 and 396 DF, p-value: < 2.2e-16
# correlation on the plot looks good on the plot of MPG vs WGT, especially as WGT increases
plot(WGT,MPG)
abline(MPG_WGT.lm)
# the sample model fits the theoretical between -2 and +2.
qqnorm(resid(MPG_WGT.lm))
qqline(resid(MPG_WGT.lm))
hist(MPG_WGT.lm$residuals)
# the histogram of residuals shows even near normal distribution which shows good linearity
# the residuals are randomly distributed on the right side of the residual chart which indicates good linearity
plot(fitted(MPG_WGT.lm),resid(MPG_WGT.lm))
** END **