1 Preliminaries

1.2 Clarify your workflow

1.3 Load packages

library(tidyverse) # install.packages("tidyverse")
Warning message:
In if (i == constCount) constCount <<- constCount + 1 :
  closing unused connection 3 (https://github.com/ds777/sample-datasets/blob/master/dataAuto.csv?raw=true)

2 Data Import and Tidying

# Download the auto dataset (CSV hosted on GitHub) and store it as dataAuto.
# NOTE(review): the URL points at the GitHub "blob" page and relies on
# `?raw=true` to be served the raw file -- confirm this still resolves.
dataAuto <- read_csv("https://github.com/ds777/sample-datasets/blob/master/dataAuto.csv?raw=true")

2.1 View tabular data

# Print the imported data to inspect its rows and columns
dataAuto

2.2 Transform data

# Recode the string column `make` as a factor, then print the updated data
dataAuto <- mutate(dataAuto, make = as.factor(make))
dataAuto

3 Exploratory Data Analysis

3.1 Summarize data

# Per-column summary: level counts for the factor `make`, and
# min / quartiles / mean / max for each numeric column
summary(dataAuto)
        make        mpg            weight        weight1     
 AMC      :3   Min.   :14.00   Min.   :2020   Min.   :2.020  
 Audi     :2   1st Qu.:17.25   1st Qu.:2642   1st Qu.:2.643  
 BMW      :1   Median :21.00   Median :3200   Median :3.200  
 Buick    :7   Mean   :20.92   Mean   :3099   Mean   :3.099  
 Cadillac :3   3rd Qu.:23.00   3rd Qu.:3610   3rd Qu.:3.610  
 Chevrolet:6   Max.   :35.00   Max.   :4330   Max.   :4.330  
 Datsun   :4                                                 
     price          foreign          repairs          length     
 Min.   : 3299   Min.   :0.0000   Min.   :2.000   Min.   :163.0  
 1st Qu.: 4466   1st Qu.:0.0000   1st Qu.:3.000   1st Qu.:173.2  
 Median : 5146   Median :0.0000   Median :3.000   Median :191.0  
 Mean   : 6652   Mean   :0.2692   Mean   :3.269   Mean   :190.1  
 3rd Qu.: 8054   3rd Qu.:0.7500   3rd Qu.:4.000   3rd Qu.:203.0  
 Max.   :15906   Max.   :1.0000   Max.   :5.000   Max.   :222.0  
                                                                 

3.2 Visualize data

# Plot the whole data frame at once to eyeball pairwise relationships
# between its columns
plot(dataAuto)

4 Linear Regression Modeling

4.1 Define variables

# Build the regression inputs as unnamed-column matrices taken from dataAuto:
#   Y  - response (mpg), one column
#   X1 - single predictor (weight1), one column
#   X  - predictors weight1, price and foreign, in that column order
Y <- cbind(dataAuto[["mpg"]])
X1 <- cbind(dataAuto[["weight1"]])
X <- cbind(
  dataAuto[["weight1"]],
  dataAuto[["price"]],
  dataAuto[["foreign"]]
)

4.2 Correlation among variables

# Correlation of the response Y (mpg) with each column of X
# (weight1, price, foreign, in that order); the result is a 1 x 3 matrix
cor(Y, X)
           [,1]       [,2]      [,3]
[1,] -0.8081609 -0.4384618 0.4003376

4.3 Visualize a scatterplot

# Scatterplot of mpg (Y) against weight1 (X1).
# NOTE(review): Y and X1 are standalone matrices, not columns of dataAuto,
# so `data = dataAuto` presumably has no effect here -- confirm and
# consider dropping it.
plot(Y ~ X1, data = dataAuto)

4.4 Simple linear regression

# Simple OLS regression of mpg (Y) on weight1 (X1); summary() prints the
# coefficient table, residual quantiles, R-squared and the overall F-test
olsreg1 <- lm(Y ~ X1)
summary(olsreg1)

Call:
lm(formula = Y ~ X1)

Residuals:
    Min      1Q  Median      3Q     Max 
-5.4123 -1.6073 -0.1043  0.9261  8.1072 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  38.0665     2.6112  14.578 2.02e-13 ***
X1           -5.5315     0.8229  -6.722 5.93e-07 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 2.86 on 24 degrees of freedom
Multiple R-squared:  0.6531,    Adjusted R-squared:  0.6387 
F-statistic: 45.19 on 1 and 24 DF,  p-value: 5.935e-07
# 95% confidence intervals for the intercept and the X1 slope
confint(olsreg1, level=0.95)
                2.5 %    97.5 %
(Intercept) 32.677256 43.455664
X1          -7.229797 -3.833196
# Analysis-of-variance table for the simple regression
# (sum of squares for X1 vs. residuals, with the F-test)
anova(olsreg1)
Analysis of Variance Table

Response: Y
          Df Sum Sq Mean Sq F value    Pr(>F)    
X1         1 369.57  369.57  45.189 5.935e-07 ***
Residuals 24 196.28    8.18                      
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

4.4.1 Plotting regression line

# Redraw the mpg-vs-weight1 scatterplot, then overlay the fitted line
# from olsreg1 on top of it.
# NOTE(review): Y and X1 are standalone matrices, so `data = dataAuto`
# presumably has no effect here -- confirm.
plot(Y ~ X1, data = dataAuto)
abline(olsreg1)

4.4.2 Predicted values for dependent variable

# Predicted mpg for each observation under the simple regression,
# followed by a summary of those predictions
Y1hat <- fitted(olsreg1)
summary(Y1hat)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  14.12   18.10   20.37   20.92   23.45   26.89 
# Plot the predicted values against the predictor weight1 (X1)
plot(Y1hat ~ X1)

4.4.3 Regression residuals

# Residuals of the simple regression, followed by a summary of their
# distribution
e1hat <- resid(olsreg1)
summary(e1hat)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
-5.4123 -1.6073 -0.1043  0.0000  0.9261  8.1072 
# Residuals against the predictor weight1 (X1), to look for leftover
# structure the straight-line fit did not capture
plot(e1hat ~ X1)

4.5 Multiple linear regression

# Multiple OLS regression of mpg (Y) on the three columns of the matrix X.
# X has no column names, so the output labels its columns X1, X2, X3,
# i.e. weight1, price and foreign respectively.
olsreg2 <- lm(Y ~ X)
summary(olsreg2)

Call:
lm(formula = Y ~ X)

Residuals:
    Min      1Q  Median      3Q     Max 
-4.6942 -1.1857 -0.0452  0.6433  8.6895 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept) 42.1661962  4.2647533   9.887 1.48e-09 ***
X1          -7.1211114  1.6046735  -4.438 0.000207 ***
X2           0.0002258  0.0002654   0.851 0.404002    
X3          -2.5071265  2.0565685  -1.219 0.235723    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 2.89 on 22 degrees of freedom
Multiple R-squared:  0.6752,    Adjusted R-squared:  0.6309 
F-statistic: 15.25 on 3 and 22 DF,  p-value: 1.374e-05
# 95% confidence intervals for the intercept and the three slopes
# (X1 = weight1, X2 = price, X3 = foreign)
confint(olsreg2, level=0.95)
                    2.5 %        97.5 %
(Intercept)  3.332164e+01 51.0107531780
X1          -1.044900e+01 -3.7932221856
X2          -3.245229e-04  0.0007760878
X3          -6.772188e+00  1.7579354345
# Analysis-of-variance table for the multiple regression; X entered the
# formula as one matrix term, so it appears as a single row with 3 Df
anova(olsreg2)
Analysis of Variance Table

Response: Y
          Df Sum Sq Mean Sq F value    Pr(>F)    
X          3 382.08 127.360  15.247 1.374e-05 ***
Residuals 22 183.77   8.353                      
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

4.5.1 Predicted values for dependent variable

# Predicted mpg for each observation under the multiple regression,
# followed by a summary of those predictions
Yhat <- fitted(olsreg2)
summary(Yhat)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  13.90   17.91   20.46   20.92   23.99   27.89 

4.5.2 Regression residuals

# Residuals of the multiple regression, followed by a summary of their
# distribution
ehat <- resid(olsreg2)
summary(ehat)

    Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
-4.69416 -1.18567 -0.04524  0.00000  0.64332  8.68946 

Good job! You successfully finished the lesson using a modern data science workflow.

LS0tCnRpdGxlOiAiV2hhdCBpcyB0aGUgcmVsYXRpb3NoaXAgYmV0d2VlbiBtaWxlcyBwZXIgZ2Fsb24gYW5kIGEgY2FyJ3Mgd2VpZ2h0PyIKc3VidGl0bGU6ICJBIGxpbmVhciByZWdyZXNzaW9uIGFwcHJvYWNoIgpvdXRwdXQ6CiAgaHRtbF9ub3RlYm9vazoKICAgIHRoZW1lOiBjb3NtbwogICAgaGlnaGxpZ2h0OiBtb25vY2hyb21lCiAgICB0b2M6IHRydWUKICAgIHRvY19mbG9hdDogdHJ1ZQogICAgdG9jX2RlcHRoOiA0CiAgICBudW1iZXJfc2VjdGlvbnM6IHRydWUKICAgIGNvZGVfZm9sZGluZzogc2hvdwogICAgZGZfcHJpbnQ6IGthYmxlCiAgZ2l0aHViX2RvY3VtZW50OiBkZWZhdWx0Ci0tLQoKIyBQcmVsaW1pbmFyaWVzIAoKIyMgV2F0Y2ggYW5kIGRpc2N1c3MKCi0gW0NvbmNlcHR1YWwgdW5kZXJzdGFuZGluZ10oaHR0cHM6Ly92aWFsb2d1ZXMuY29tL3ZpYWxvZ3Vlcy9wbGF5LzQzMjIzP2tleT1jM2I3NDllODYzZmQ3OTVhNzAyYjYyZmU3NDgyMjE2MDAwNmJmNWJmYjhjNTBmYWJhZTY1KQotIFtFeGFtcGxlXShodHRwczovL3ZpYWxvZ3Vlcy5jb20vdmlhbG9ndWVzL3BsYXkvNDMyMjQ/a2V5PWQzYTQ3MWUyMjBkNzY5MDgyMDliNDJhOGRmMWRkM2I5ZWYzY2MyNDEwOThmZmM0OGI2OGIpCi0gW1IgaW1wbGVtZW50YXRpb25dKGh0dHBzOi8vdmlhbG9ndWVzLmNvbS92aWFsb2d1ZXMvcGxheS80MzIyNT9rZXk9YWEwMTQ5ZjVjMWUzMzk2MTM3MTc3MDliNGUxZDE4OWNkMjA0MTUwNTIwZWEwZTgxNjIwMCkKLSBbWW91ciBUdXJuOiBFeGVjdXRlIHRoaXMgbm90ZWJvb2tdKGh0dHBzOi8vcnN0dWRpby5jbG91ZC9wcm9qZWN0LzIzNjEwKQoKIyMgQ2xhcmlmeSB5b3VyIHdvcmtmbG93CgohW1NvdXJjZTogaHR0cDovL3I0ZHMuaGFkLmNvLm56L2V4cGxvcmUtaW50cm8uaHRtbF0oaHR0cDovL3I0ZHMuaGFkLmNvLm56L2RpYWdyYW1zL2RhdGEtc2NpZW5jZS1leHBsb3JlLnBuZykKCiMjIExvYWQgcGFja2FnZXMKCmBgYHtyIG1lc3NhZ2U9RkFMU0UsIHdhcm5pbmc9RkFMU0V9CmxpYnJhcnkodGlkeXZlcnNlKSAjIGluc3RhbGwucGFja2FnZXMoInRpZHl2ZXJzZSIpCmBgYAoKIyBEYXRhIEltcG9ydCBhbmQgVGlkeWluZyAgCgpgYGB7ciBtZXNzYWdlPUZBTFNFLCB3YXJuaW5nPUZBTFNFfQpkYXRhQXV0byA8LSByZWFkX2NzdigiaHR0cHM6Ly9naXRodWIuY29tL2RzNzc3L3NhbXBsZS1kYXRhc2V0cy9ibG9iL21hc3Rlci9kYXRhQXV0by5jc3Y/cmF3PXRydWUiKQpgYGAKCiMjIFZpZXcgdGFidWxhciBkYXRhCgpgYGB7cn0KZGF0YUF1dG8KYGBgCgoKIyMgVHJhbnNmb3JtIGRhdGEKCmBgYHtyfQojIFR1cm4gc3RyaW5nIHZhcmlhYmxlIGludG8gYSBmYWN0b3IgdmFyaWFibGUKZGF0YUF1dG8gPC0gZGF0YUF1dG8gJT4lCiAgbXV0YXRlKG1ha2UgPSBhcy5mYWN0b3IobWFrZSkpCmRhdGFBdXRvCmBgYAoKCiMgRXhwbG9yYXRvcnkgRGF0YSBBbmFseXNpcwoKIyMgU3VtbWFyaXplIGRhdGEKCmBgYHtyfQpzdW1tYXJ5KGRhdGFBdXRvKQpg
YGAKCgojIyBWaXN1YWxpemUgZGF0YQoKYGBge3J9CnBsb3QoZGF0YUF1dG8pCmBgYAoKCgoKIyBMaW5lYXIgUmVncmVzc2lvbiBNb2RlbGluZyAKCiMjIERlZmluZSB2YXJpYWJsZXMKCmBgYHtyfQpZIDwtIGNiaW5kKGRhdGFBdXRvJG1wZykKWDEgPC0gY2JpbmQoZGF0YUF1dG8kd2VpZ2h0MSkKWCA8LSBjYmluZChkYXRhQXV0byR3ZWlnaHQxLCBkYXRhQXV0byRwcmljZSwgZGF0YUF1dG8kZm9yZWlnbikKYGBgCgoKIyMgQ29ycmVsYXRpb24gYW1vbmcgdmFyaWFibGVzCgpgYGB7cn0KY29yKFksIFgpCmBgYAoKCiMjIFZpc3VhbGl6ZSBhIHNjYXR0ZXJwbG90CgpgYGB7cn0KcGxvdChZIH4gWDEsIGRhdGEgPSBkYXRhQXV0bykKYGBgCgoKIyMgU2ltcGxlIGxpbmVhciByZWdyZXNzaW9uIAoKYGBge3J9Cm9sc3JlZzEgPC0gbG0oWSB+IFgxKQpzdW1tYXJ5KG9sc3JlZzEpCmNvbmZpbnQob2xzcmVnMSwgbGV2ZWw9MC45NSkKYW5vdmEob2xzcmVnMSkKYGBgCgoKIyMjIFBsb3R0aW5nIHJlZ3Jlc3Npb24gbGluZQoKYGBge3J9CnBsb3QoWSB+IFgxLCBkYXRhID0gZGF0YUF1dG8pCmFibGluZShvbHNyZWcxKQpgYGAKCgojIyMgUHJlZGljdGVkIHZhbHVlcyBmb3IgZGVwZW5kZW50IHZhcmlhYmxlCgpgYGB7cn0KWTFoYXQgPC0gZml0dGVkKG9sc3JlZzEpCnN1bW1hcnkoWTFoYXQpCnBsb3QoWTFoYXQgfiBYMSkKYGBgCgoKIyMjIFJlZ3Jlc3Npb24gcmVzaWR1YWxzCgpgYGB7cn0KZTFoYXQgPC0gcmVzaWQob2xzcmVnMSkKc3VtbWFyeShlMWhhdCkKcGxvdChlMWhhdCB+IFgxKQpgYGAKCgojIyBNdWx0aXBsZSBsaW5lYXIgcmVncmVzc2lvbgoKYGBge3J9Cm9sc3JlZzIgPC0gbG0oWSB+IFgpCnN1bW1hcnkob2xzcmVnMikKY29uZmludChvbHNyZWcyLCBsZXZlbD0wLjk1KQphbm92YShvbHNyZWcyKQpgYGAKCgojIyMgUHJlZGljdGVkIHZhbHVlcyBmb3IgZGVwZW5kZW50IHZhcmlhYmxlCgpgYGB7cn0KWWhhdCA8LSBmaXR0ZWQob2xzcmVnMikKc3VtbWFyeShZaGF0KQpgYGAKCgojIyMgUmVncmVzc2lvbiByZXNpZHVhbHMKCmBgYHtyfQplaGF0IDwtIHJlc2lkKG9sc3JlZzIpCnN1bW1hcnkoZWhhdCkKYGBgCgoKR29vZCBKb2IhISB5b3Ugc3VjY2VzZnVsbHkgZmluaXNoZWQgdGhlIGxlc3NvbiB1c2luZyBhIG1vZGVybiBkYXRhIHNjaWVuY2Ugd29ya2Zsb3cKCiFbU291cmNlOiBodHRwczovL21lZGlhLmdpcGh5LmNvbS9tZWRpYS9YcmVRbWs3RVRDYWswL2dpcGh5LmdpZl0oaHR0cHM6Ly9tZWRpYS5naXBoeS5jb20vbWVkaWEvWHJlUW1rN0VUQ2FrMC9naXBoeS5naWYpCg==