** DATA_605_Discussion_11_Regression_1 **

library(RCurl)
## Loading required package: bitops
library(stringr)

** Discussion_11_Regression_Auto_MPG **

Scope: check regression on an Automobile MPG (miles per gallon) vs auto characteristics.

Fields: 1. mpg: continuous 2. cylinders: multi-valued discrete 3. displacement: continuous 4. horsepower: continuous 5. weight: continuous 6. acceleration: continuous 7. model year: multi-valued discrete 8. origin: multi-valued discrete 9. car name: string (unique for each instance)

# load file
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data'
auto_mpg <- read.csv(url, header = F, stringsAsFactors = FALSE)

head(auto_mpg)
##                                                                                   V1
## 1 18.0   8   307.0      130.0      3504.      12.0   70  1\tchevrolet chevelle malibu
## 2         15.0   8   350.0      165.0      3693.      11.5   70  1\tbuick skylark 320
## 3        18.0   8   318.0      150.0      3436.      11.0   70  1\tplymouth satellite
## 4             16.0   8   304.0      150.0      3433.      12.0   70  1\tamc rebel sst
## 5               17.0   8   302.0      140.0      3449.      10.5   70  1\tford torino
## 6          15.0   8   429.0      198.0      4341.      10.0   70  1\tford galaxie 500
ncol(auto_mpg)
## [1] 1
#auto_mpg
typeof(auto_mpg)
## [1] "list"
# convert data file from list to a dataframe
df2 <- as.data.frame(str_split(auto_mpg$V1,"\\s+",n=Inf, simplify = TRUE))
head(df2)
##     V1 V2    V3    V4    V5   V6 V7 V8        V9       V10    V11 V12 V13
## 1 18.0  8 307.0 130.0 3504. 12.0 70  1 chevrolet  chevelle malibu        
## 2 15.0  8 350.0 165.0 3693. 11.5 70  1     buick   skylark    320        
## 3 18.0  8 318.0 150.0 3436. 11.0 70  1  plymouth satellite               
## 4 16.0  8 304.0 150.0 3433. 12.0 70  1       amc     rebel    sst        
## 5 17.0  8 302.0 140.0 3449. 10.5 70  1      ford    torino               
## 6 15.0  8 429.0 198.0 4341. 10.0 70  1      ford   galaxie    500        
##   V14
## 1    
## 2    
## 3    
## 4    
## 5    
## 6
#typeof(df2)
str(df2)
## 'data.frame':    398 obs. of  14 variables:
##  $ V1 : Factor w/ 129 levels "10.0","11.0",..: 17 7 17 9 13 7 5 5 5 7 ...
##  $ V2 : Factor w/ 5 levels "3","4","5","6",..: 5 5 5 5 5 5 5 5 5 5 ...
##  $ V3 : Factor w/ 82 levels "100.0","101.0",..: 50 53 51 48 47 59 61 60 62 57 ...
##  $ V4 : Factor w/ 94 levels "?","100.0","102.0",..: 17 35 29 29 24 42 47 46 48 40 ...
##  $ V5 : Factor w/ 351 levels "1613.","1649.",..: 248 266 242 241 245 319 320 316 328 279 ...
##  $ V6 : Factor w/ 96 levels "10.0","10.5",..: 10 8 3 10 2 1 94 93 1 93 ...
##  $ V7 : Factor w/ 13 levels "70","71","72",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ V8 : Factor w/ 3 levels "1","2","3": 1 1 1 1 1 1 1 1 1 1 ...
##  $ V9 : Factor w/ 37 levels "amc","audi","bmw",..: 8 4 26 1 14 14 8 26 27 1 ...
##  $ V10: Factor w/ 190 levels "","'cuda","100",..: 70 169 165 155 180 109 118 107 63 48 ...
##  $ V11: Factor w/ 101 levels "","(auto)","(diesel)",..: 68 20 1 88 1 25 1 57 1 44 ...
##  $ V12: Factor w/ 23 levels "","(diesel)",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ V13: Factor w/ 4 levels "","(sw)","(turbo)",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ V14: Factor w/ 2 levels "","(sw)": 1 1 1 1 1 1 1 1 1 1 ...
#df2$V1

# label the columns of the data frame for easy readability
#head(df2)
names(df2)
##  [1] "V1"  "V2"  "V3"  "V4"  "V5"  "V6"  "V7"  "V8"  "V9"  "V10" "V11"
## [12] "V12" "V13" "V14"
names(df2)[1]<-"MPG"
names(df2)[2]<-"CYL"
names(df2)[3]<-"CUBIC_INCH"
names(df2)[4]<-"HP"
names(df2)[5]<-"WGT"
names(df2)[6]<-"ACCEL"
names(df2)[7]<-"YEAR"
names(df2)[8]<-"ORIGIN"
names(df2)[9]<-"BRAND"
names(df2)[10]<-"MODEL"

# convert data frame factors to types usable in a model
df2$MPG <- as.numeric(as.character(df2$MPG))
df2$CYL <- as.numeric(as.character(df2$CYL))
df2$CUBIC_INCH <- as.numeric(as.character(df2$CUBIC_INCH))
df2$HP <- as.numeric(as.character(df2$HP))
## Warning: NAs introduced by coercion
df2$WGT <- as.numeric(as.character(df2$WGT))
df2$ACCEL <- as.numeric(as.character(df2$ACCEL))
df2$YEAR <- as.numeric(as.character(df2$YEAR))
df2$ORIGIN <- as.numeric(as.character(df2$ORIGIN))
df2$BRAND <- as.character(as.character(df2$BRAND))
df2$MODEL <- as.numeric(as.character(df2$MODEL))
## Warning: NAs introduced by coercion
# create a mult-factor model
attach(df2)
df2.lm <- lm(MPG ~ CYL + CUBIC_INCH + HP + WGT + YEAR + BRAND)
df2.lm
## 
## Call:
## lm(formula = MPG ~ CYL + CUBIC_INCH + HP + WGT + YEAR + BRAND)
## 
## Coefficients:
##        (Intercept)                 CYL          CUBIC_INCH  
##         -12.799114           -0.638287            0.025293  
##                 HP                 WGT                YEAR  
##          -0.028669           -0.006343            0.712411  
##          BRANDaudi            BRANDbmw          BRANDbuick  
##           4.180386            2.207914            1.431720  
##      BRANDcadillac          BRANDcapri      BRANDchevroelt  
##           4.224217            1.620488            0.604792  
##     BRANDchevrolet          BRANDchevy       BRANDchrysler  
##           1.396817            2.027207            0.310153  
##        BRANDdatsun          BRANDdodge           BRANDfiat  
##           5.883556            2.218713            4.736178  
##          BRANDford             BRANDhi          BRANDhonda  
##           0.786973            4.897095            5.208871  
##         BRANDmaxda          BRANDmazda       BRANDmercedes  
##           0.189448            3.898606            5.080784  
## BRANDmercedes-benz        BRANDmercury         BRANDnissan  
##           4.805567            0.544884            6.123773  
##    BRANDoldsmobile           BRANDopel        BRANDpeugeot  
##           2.681174            2.205215            3.827634  
##      BRANDplymouth        BRANDpontiac        BRANDrenault  
##           2.811592            2.977985            4.411274  
##          BRANDsaab         BRANDsubaru         BRANDtoyota  
##           3.351356            3.509270            3.537207  
##       BRANDtoyouta        BRANDtriumph      BRANDvokswagen  
##           2.700679            8.654728           -0.610845  
##    BRANDvolkswagen          BRANDvolvo             BRANDvw  
##           3.125480            1.716767           10.415292
# create a subset of the model for analysis and plotting
autovars <- c('MPG', 'CYL', 'CUBIC_INCH', 'HP', 'WGT','YEAR')
Autos <- df2[autovars]
str(Autos)
## 'data.frame':    398 obs. of  6 variables:
##  $ MPG       : num  18 15 18 16 17 15 14 14 14 15 ...
##  $ CYL       : num  8 8 8 8 8 8 8 8 8 8 ...
##  $ CUBIC_INCH: num  307 350 318 304 302 429 454 440 455 390 ...
##  $ HP        : num  130 165 150 150 140 198 220 215 225 190 ...
##  $ WGT       : num  3504 3693 3436 3433 3449 ...
##  $ YEAR      : num  70 70 70 70 70 70 70 70 70 70 ...
plot(Autos)

# note in plot that the relationships of MPG to cylinders(CYL), CUBIC_INCH, and WGT (weight) appear to be good.

summary(df2.lm)
## 
## Call:
## lm(formula = MPG ~ CYL + CUBIC_INCH + HP + WGT + YEAR + BRAND)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8.1889 -1.9365  0.0022  1.7178 14.1334 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        -1.280e+01  4.300e+00  -2.976 0.003119 ** 
## CYL                -6.383e-01  3.305e-01  -1.931 0.054267 .  
## CUBIC_INCH          2.529e-02  8.152e-03   3.103 0.002073 ** 
## HP                 -2.867e-02  1.136e-02  -2.523 0.012071 *  
## WGT                -6.343e-03  6.297e-04 -10.073  < 2e-16 ***
## YEAR                7.124e-01  5.335e-02  13.355  < 2e-16 ***
## BRANDaudi           4.180e+00  1.410e+00   2.966 0.003226 ** 
## BRANDbmw            2.208e+00  2.384e+00   0.926 0.355043    
## BRANDbuick          1.432e+00  1.008e+00   1.421 0.156256    
## BRANDcadillac       4.224e+00  2.337e+00   1.808 0.071512 .  
## BRANDcapri          1.620e+00  3.233e+00   0.501 0.616557    
## BRANDchevroelt      6.048e-01  3.242e+00   0.187 0.852125    
## BRANDchevrolet      1.397e+00  7.814e-01   1.788 0.074698 .  
## BRANDchevy          2.027e+00  1.956e+00   1.036 0.300780    
## BRANDchrysler       3.102e-01  1.479e+00   0.210 0.834056    
## BRANDdatsun         5.884e+00  9.933e-01   5.923 7.56e-09 ***
## BRANDdodge          2.219e+00  8.779e-01   2.527 0.011939 *  
## BRANDfiat           4.736e+00  1.337e+00   3.544 0.000448 ***
## BRANDford           7.870e-01  7.684e-01   1.024 0.306434    
## BRANDhi             4.897e+00  3.345e+00   1.464 0.144078    
## BRANDhonda          5.209e+00  1.131e+00   4.604 5.81e-06 ***
## BRANDmaxda          1.894e-01  2.360e+00   0.080 0.936067    
## BRANDmazda          3.899e+00  1.258e+00   3.098 0.002106 ** 
## BRANDmercedes       5.081e+00  3.259e+00   1.559 0.119895    
## BRANDmercedes-benz  4.806e+00  2.398e+00   2.004 0.045836 *  
## BRANDmercury        5.449e-01  1.136e+00   0.480 0.631668    
## BRANDnissan         6.124e+00  3.254e+00   1.882 0.060701 .  
## BRANDoldsmobile     2.681e+00  1.189e+00   2.254 0.024791 *  
## BRANDopel           2.205e+00  1.733e+00   1.272 0.204140    
## BRANDpeugeot        3.828e+00  1.379e+00   2.776 0.005806 ** 
## BRANDplymouth       2.812e+00  8.431e-01   3.335 0.000944 ***
## BRANDpontiac        2.978e+00  1.038e+00   2.868 0.004381 ** 
## BRANDrenault        4.411e+00  1.958e+00   2.252 0.024910 *  
## BRANDsaab           3.351e+00  1.796e+00   1.866 0.062859 .  
## BRANDsubaru         3.509e+00  1.731e+00   2.027 0.043394 *  
## BRANDtoyota         3.537e+00  9.564e-01   3.699 0.000252 ***
## BRANDtoyouta        2.701e+00  3.251e+00   0.831 0.406659    
## BRANDtriumph        8.655e+00  3.242e+00   2.670 0.007950 ** 
## BRANDvokswagen     -6.108e-01  3.240e+00  -0.189 0.850573    
## BRANDvolkswagen     3.125e+00  1.076e+00   2.905 0.003901 ** 
## BRANDvolvo          1.717e+00  1.538e+00   1.116 0.265148    
## BRANDvw             1.042e+01  1.472e+00   7.077 8.11e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.153 on 350 degrees of freedom
##   (6 observations deleted due to missingness)
## Multiple R-squared:  0.8539, Adjusted R-squared:  0.8368 
## F-statistic: 49.91 on 41 and 350 DF,  p-value: < 2.2e-16
# this is confirmed by reviewing the summary statistics. The CYL, and CUBIC_INCH, and WGT have values less than .05 which is significant.  The adjusted R value is .8368 which is good correlation.

# the sample model fits the theoretical between -2 and +2.
qqnorm(resid(df2.lm))
qqline(resid(df2.lm))

hist(df2.lm$residuals)

# the histogram of residuals shows even near normal distribution which shows good linearity

plot(fitted(df2.lm),resid(df2.lm))

# the residuals are randomly distributed on the right side of the residual chart which indicates good linearity
# check single variable model
# for MPG vs WGT (weight) correlation is still good at adjusted R-squared at .691, but not good as multiple factor model above.
MPG_WGT.lm <- lm(MPG ~ WGT)
MPG_WGT.lm
## 
## Call:
## lm(formula = MPG ~ WGT)
## 
## Coefficients:
## (Intercept)          WGT  
##   46.317364    -0.007677
summary(MPG_WGT.lm)
## 
## Call:
## lm(formula = MPG ~ WGT)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -12.012  -2.801  -0.351   2.114  16.480 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 46.3173644  0.7952452   58.24   <2e-16 ***
## WGT         -0.0076766  0.0002575  -29.81   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.345 on 396 degrees of freedom
## Multiple R-squared:  0.6918, Adjusted R-squared:  0.691 
## F-statistic: 888.9 on 1 and 396 DF,  p-value: < 2.2e-16
# correlation on the plot looks good on the plot of MPG vs WGT, especially as WGT increases
plot(WGT,MPG)
abline(MPG_WGT.lm)

# the sample model fits the theoretical between -2 and +2.
qqnorm(resid(MPG_WGT.lm))
qqline(resid(MPG_WGT.lm))

hist(MPG_WGT.lm$residuals)

# the histogram of residuals shows even near normal distribution which shows good linearity

# the residuals are randomly distributed on the right side of the residual chart which indicates good linearity
plot(fitted(MPG_WGT.lm),resid(MPG_WGT.lm))

** END **