Author: Jay Liao (ID: RE6094028)

Load in the libraries

library(ISLR)
library(dplyr)
library(ggplot2)
library(corrplot)
library(GGally)

Exercise 2.9

This exercise involves the Auto data set studied in the lab. Make sure that missing values have been removed from the data.

dta <- Auto %>% na.omit()

Exercise 2.9 - (a)

Which of the predictors are quantitative, and which are qualitative?

dta$origin <- factor(dta$origin)
levels(dta$origin) <- c('American', 'European', 'Japanese')
str(dta)

'data.frame':   392 obs. of  9 variables:
 $ mpg         : num  18 15 18 16 17 15 14 14 14 15 ...
 $ cylinders   : num  8 8 8 8 8 8 8 8 8 8 ...
 $ displacement: num  307 350 318 304 302 429 454 440 455 390 ...
 $ horsepower  : num  130 165 150 150 140 198 220 215 225 190 ...
 $ weight      : num  3504 3693 3436 3433 3449 ...
 $ acceleration: num  12 11.5 11 12 10.5 10 9 8.5 10 8.5 ...
 $ year        : num  70 70 70 70 70 70 70 70 70 70 ...
 $ origin      : Factor w/ 3 levels "American","European",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ name        : Factor w/ 304 levels "amc ambassador brougham",..: 49 36 231 14 161 141 54 223 241 2 ...

[ANS] There are 7 quantitative variables: mpg, cylinders, displacement, horsepower, weight, acceleration, year. The variable origin, name are qualitative.

Exercise 2.9 - (b)

What is the range of each quantitative predictor? You can answer this using the range() function.

dta %>% select_if(is.numeric) %>% apply(., 2, range) %>% 
  apply(., 2, function(v) {v[2] - v[1]})

         mpg    cylinders displacement   horsepower       weight acceleration 
        37.6          5.0        387.0        184.0       3527.0         16.8 
        year 
        12.0

Exercise 2.9 - (c)

What is the mean and standard deviation of each quantitative predictor?

dta %>% select_if(is.numeric) %>% apply(., 2, mean)

         mpg    cylinders displacement   horsepower       weight acceleration 
   23.445918     5.471939   194.411990   104.469388  2977.584184    15.541327 
        year 
   75.979592

dta %>% select_if(is.numeric) %>% apply(., 2, sd)

         mpg    cylinders displacement   horsepower       weight acceleration 
    7.805007     1.705783   104.644004    38.491160   849.402560     2.758864 
        year 
    3.683737

Exercise 2.9 - (d)

Now remove the $10^{th}$ through $85^{th}$ observations. What is the range, mean, and standard deviation of each predictor in the subset of the data that remains?

dta_subset <- dta[-(round(nrow(dta)*.1):round(nrow(dta)*.8)),]
dta_subset %>% select_if(is.numeric) %>% apply(., 2, range) %>% 
  apply(., 2, function(v) {v[2] - v[1]})

         mpg    cylinders displacement   horsepower       weight acceleration 
        37.6          5.0        385.0        179.0       2977.0         16.6 
        year 
        12.0

dta_subset %>% select_if(is.numeric) %>% apply(., 2, mean)

         mpg    cylinders displacement   horsepower       weight acceleration 
   27.468103     5.051724   172.750000    99.000000  2738.146552    15.502586 
        year 
   77.560345

dta_subset %>% select_if(is.numeric) %>% apply(., 2, sd)

         mpg    cylinders displacement   horsepower       weight acceleration 
    8.640980     1.553928   100.658603    42.088726   703.735650     3.110332 
        year 
    5.180715

Exercise 2.9 - (e)

Using the full data set, investigate the predictors graphically, using scatter plots or other tools of your choice. Create some plots highlighting the relationships among the predictors. Comment on your findings.

Overview

library(corrplot)
corrplot(dta %>% select_if(is.numeric) %>% cor, method='square')

library(GGally)
ggpairs(dta %>% select_if(is.numeric))

Stratified view

plots <- split(dta, dta$origin) %>%
  lapply(., function(df) {
    cat(as.character(df$origin)[1], ' cars')
    plot_c <- corrplot(df %>% select_if(is.numeric) %>% cor,
                       method='square')
    plot_g <- ggpairs(df %>% select_if(is.numeric))
    print(plot_c); print(plot_g)
  })

American  cars

                    mpg  cylinders displacement horsepower     weight
mpg           1.0000000 -0.8245240   -0.8346281 -0.7515703 -0.8464240
cylinders    -0.8245240  1.0000000    0.9338854  0.8276464  0.8816087
displacement -0.8346281  0.9338854    1.0000000  0.9027437  0.9175882
horsepower   -0.7515703  0.8276464    0.9027437  1.0000000  0.8384500
weight       -0.8464240  0.8816087    0.9175882  0.8384500  1.0000000
acceleration  0.3772391 -0.5632928   -0.6198904 -0.7191910 -0.4402301
year          0.6486410 -0.4639865   -0.4975912 -0.4950090 -0.4063883
             acceleration       year
mpg             0.3772391  0.6486410
cylinders      -0.5632928 -0.4639865
displacement   -0.6198904 -0.4975912
horsepower     -0.7191910 -0.4950090
weight         -0.4402301 -0.4063883
acceleration    1.0000000  0.3808776
year            0.3808776  1.0000000

European  cars

                    mpg  cylinders displacement horsepower     weight
mpg           1.0000000 -0.2717195  -0.49559428 -0.6795748 -0.5120112
cylinders    -0.2717195  1.0000000   0.65779147  0.3926528  0.5747666
displacement -0.4955943  0.6577915   1.00000000  0.6220432  0.8915604
horsepower   -0.6795748  0.3926528   0.62204321  1.0000000  0.6116187
weight       -0.5120112  0.5747666   0.89156045  0.6116187  1.0000000
acceleration  0.2980474  0.0244526   0.03803164 -0.5443876  0.1663252
year          0.5042313  0.2628514   0.20693994 -0.1323300  0.1781785
             acceleration       year
mpg            0.29804738  0.5042313
cylinders      0.02445260  0.2628514
displacement   0.03803164  0.2069399
horsepower    -0.54438760 -0.1323300
weight         0.16632523  0.1781785
acceleration   1.00000000  0.1750765
year           0.17507653  1.0000000

Japanese  cars

                    mpg   cylinders displacement horsepower      weight
mpg           1.0000000 -0.13978821   -0.3660203 -0.6730950 -0.56410645
cylinders    -0.1397882  1.00000000    0.7209924  0.4317698  0.48918447
displacement -0.3660203  0.72099238    1.0000000  0.7301760  0.84142997
horsepower   -0.6730950  0.43176979    0.7301760  1.0000000  0.86758948
weight       -0.5641064  0.48918447    0.8414300  0.8675895  1.00000000
acceleration  0.4011143 -0.21967543   -0.5355895 -0.7201492 -0.56768166
year          0.5686621  0.08598536    0.1208350 -0.2152642  0.04536495
              acceleration          year
mpg           0.4011143078  0.5686620898
cylinders    -0.2196754275  0.0859853563
displacement -0.5355895142  0.1208349880
horsepower   -0.7201491696 -0.2152642313
weight       -0.5676816565  0.0453649531
acceleration  1.0000000000 -0.0009436914
year         -0.0009436914  1.0000000000

[ANS] In general, according to the correlation coefficient plots and scatter plots, we can find that all correlation coefficients between each two of cylinders, displacement, horsepower, and weight are highly positive. While all of these four variables are highly negatively correlated with mpg. Also, mpg is slightly positively correlated with acceleration and year.

However, this situation is not so obvious in subset of European cars and Japanese cars. It may result from the imbalance sample size.

table(dta$origin)


American European Japanese 
     245       68       79

Exercise 2.9 - (f)

Supposed that we wish to predict gas mileage (mpg) on the basis of the other variables. Do your plot suggest that any of the other variables might be useful in predicting mpg? Justify your answer.

[ANS] According the high correlation coefficient and the differences among the origin, variables cylinders, displacement, horsepower, weight, origin might be useful in predicting mpg.

Exercise 2.10

This exercise involves the Boston housing data set.

Exercise 2.10 - (a)

To begin, load in the Boston data set. The Boston data set is part of the MASS library in R.

library(MASS)

Now the data set is contained in the object Boston.

Boston

Read about the data set:

?Boston

How many rows are in this data set? How many columns? What do the rows and columns represent?

dim(Boston)

[1] 506  14

[ANS] There are 506 rows and 14 columns in the data set. A row represents a town in Boston and a column is an index (a variable) related to demography and sociology.

Exercise 2.10 - (b)

Make some pairwise scatter plots of the predictors (columns) in this data set. Describe your findings.

dta <- Boston
?Boston
dta$chas <- factor(dta$chas) # By ?Boston, we can know that the variable chas is actually binary.

Solution

Since there are too many variables in the data set, it may be more appropriate to explore which variables have associations interesting enough to be plotted as scatter plots.

mtx_cor <- Boston %>% select_if(is.numeric) %>% cor
corrplot(mtx_cor, diag = FALSE, type = 'lower')

corrplot(mtx_cor * (abs(mtx_cor) >= .7),
         method = 'square', diag = FALSE, type = 'lower')

Some of the correlation coefficients of pairs of the following variables have relatively large coefficients (i.e., $|r| \geq 0.7$):

indus, nox, age, dis, rad, tax, lstat, medv

col_selected <- colnames(mtx_cor)[apply(abs(mtx_cor) >= .7, 1, sum) > 1]
corrplot(Boston %>% dplyr::select(col_selected) %>% cor,
         method = 'square', diag = FALSE, type = 'lower')

ggpairs(Boston %>% dplyr::select(col_selected))

[ANS]

dis (weighted mean of distances to five Boston employment centres) is relatively highly negatively correlated with indus, nox (nitrogen oxides concentration (parts per 10 million)), age (proportion of owner-occupied units built prior to 1940).
tax (full-value property-tax rate per $10,000) is relatively highly positively with indus and is extremely positively correlated with rad (index of accessibility to radial highways).
medv (median value of owner-occupied homes in $1000s) is extremely negatively correlated with lstat (ower status of the population (percent)).

Exercise 2.10 - (c)

Are any of the predictors associated with per capita crime rate? If so, explain the relationship.

data.frame(i = rownames(cor(Boston))[-1], r = cor(Boston)[,'crim'][-1]) %>%
  mutate(sign = r > 0) %>% 
  qplot(x = r, y = i, col=sign, data = ., xlim = c(-.75, .75),
        xlab = 'Correlation coefficent', ylab = 'Variable') +
    ggtitle('Correlation coefficent between crim and other variables') +
    geom_segment(aes(xend = 0, yend = i)) +
    geom_vline(xintercept = 0, color = 'grey50') +
    theme(legend.position = 'none')

Each of the following variables has a medium negative correlation with crim (per capita crime rate by town):

medv: median value of owner-occupied homes in $1000s
dis: weighted mean of distances to five Boston employment centres
black: $1000 \times (B_k - 0.63)^2$ where $B_k$ is the proportion of blacks by town

Each of the following variables has a medium positive correlation with crim (per capita crime rate by town):

tax: full-value property-tax rate per $$10,000$
rad: index of accessibility to radial highways
nox: nitrogen oxides concentration (parts per 10 million)
lstat: lower status of the population (percent)
indus: proportion of non-retail business acres per town

Exercise 2.10 - (d)

Do any of the suburbs of Boston appear to have particularly high crime rates? Tax rates? Pupil-teacher ratios? Comment on the range of each predictor.

par(mfcol = c(1, 3))
boxplot(Boston$crim)
boxplot(Boston$tax)
boxplot(Boston$ptratio)

mtx_output <- apply(Boston[, c('crim', 'tax', 'ptratio')], 2, function(v) {
  s <- robustHD::standardize(v)
  c(range(v)[2] - range(v)[1], range(s)[2] - range(s)[1])
})
rownames(mtx_output) <- c('range', 'range of standardized score')
mtx_output

                                crim        tax  ptratio
range                       88.96988 524.000000 9.400000
range of standardized score 10.34348   3.109107 4.341911

According to the box plots, some suburbs of Boston appear to have particularly high crime rates. No suburb of Boston appears to have particularly high tax rates or pupil-teacher ratios. Ranges of standardized score also demonstrate this while ranges do not.

Exercise 2.10 - (e)

How many of the suburbs in this data set bound the Charles river?

sum(Boston$chas)

[1] 35

There are 35 suburbs in this data set bound the Charles river.

Exercise 2.10 - (f)

What is the median pupil-teacher ratio among the towns in this data set?

median(Boston$ptratio)

[1] 19.05

The median pupil-teacher ratio among the towns in this data set is 19.05.

Exercise 2.10 - (g)

Which suburb of Boston has the lowest median value of owner-occupied homes? What are the values of the other predictors for that suburb, and how do those values compare to the overall ranges for those predictors? Comment on your findings.

which.min(Boston$medv)

[1] 399

The suburb of no. 399 has the lowest median value of owner-occupied homes.

Boston[which.min(Boston$medv),]

Other predictors of this suburb is shown above.

apply(Boston, 2,  range)

         crim  zn indus chas   nox    rm   age     dis rad tax ptratio  black
[1,]  0.00632   0  0.46    0 0.385 3.561   2.9  1.1296   1 187    12.6   0.32
[2,] 88.97620 100 27.74    1 0.871 8.780 100.0 12.1265  24 711    22.0 396.90
     lstat medv
[1,]  1.73    5
[2,] 37.97   50

Boston[which.min(Boston$medv),]

robustHD::standardize(Boston)[which.min(Boston$medv),]

robustHD::standardize(Boston)[which.min(Boston$medv),] %>% as.numeric() %>%
  data.frame(x = colnames(Boston), z = .) %>%
  qplot(x = z, y = x, data = ., xlim = c(-5, 5),
        xlab = 'Standardized score', ylab = 'Variable') +
    ggtitle('Standardized score of the suburb with the lowest median value\nof owner-occupied homes') +
    geom_segment(aes(xend = 0, yend = x)) +
    geom_vline(xintercept = 0, color = 'grey50')

Compared to the global, this suburb have much higher crim (per capita crime rate by town) and lstat (lower status of the population (percent)), have relatively higher tax (full-value property-tax rate per $10,000) and rad (index of accessibility to radial highways), and have relatively lower rm (average number of rooms per dwelling) and dis (weighted mean of distances to five Boston employment centres).

Exercise 2.10 - (h)

In this data set, how many of the suburbs average more than seven rooms per dwelling? More than eight rooms per dwelling? Comment on the suburbs that average more than eight rooms per dwelling.

sum(Boston$rm > 7)

[1] 64

sum(Boston$rm > 8)

[1] 13

There are 64 suburbs average more than seven rooms per dwelling and 13 suburbs average more than seven rooms per dwelling.

sub_8rm_z <- robustHD::standardize(Boston)[Boston$rm > 8,]
summary(sub_8rm_z)

      crim                zn               indus              chas        
 Min.   :-0.41777   Min.   :-0.48724   Min.   :-1.2327   Min.   :-0.2723  
 1st Qu.:-0.38157   1st Qu.:-0.48724   1st Qu.:-1.0447   1st Qu.:-0.2723  
 Median :-0.35963   Median :-0.48724   Median :-0.7196   Median :-0.2723  
 Mean   :-0.33654   Mean   : 0.09655   Mean   :-0.5916   Mean   : 0.3334  
 3rd Qu.:-0.35286   3rd Qu.: 0.37030   3rd Qu.:-0.7196   3rd Qu.:-0.2723  
 Max.   :-0.01619   Max.   : 3.58609   Max.   : 1.2307   Max.   : 3.6648  
      nox                rm             age                dis          
 Min.   :-1.1960   Min.   :2.490   Min.   :-2.13774   Min.   :-0.94697  
 1st Qu.:-0.4375   1st Qu.:2.793   1st Qu.: 0.06484   1st Qu.:-0.71546  
 Median :-0.4116   Median :2.864   Median : 0.34549   Median :-0.42771  
 Mean   :-0.1334   Mean   :2.937   Mean   : 0.10528   Mean   :-0.17327  
 3rd Qu.: 0.4341   3rd Qu.:3.008   3rd Qu.: 0.63680   3rd Qu.:-0.06798  
 Max.   : 1.4093   Max.   :3.552   Max.   : 0.89968   Max.   : 2.42752  
      rad               tax             ptratio            black         
 Min.   :-0.8670   Min.   :-1.0932   Min.   :-2.5199   Min.   :-0.02327  
 1st Qu.:-0.5225   1st Qu.:-0.8558   1st Qu.:-1.7347   1st Qu.: 0.30523  
 Median :-0.2928   Median :-0.6007   Median :-0.4876   Median : 0.33064  
 Mean   :-0.2398   Mean   :-0.4934   Mean   :-0.9672   Mean   : 0.31258  
 3rd Qu.:-0.1779   3rd Qu.:-0.6007   3rd Qu.:-0.4876   3rd Qu.: 0.36175  
 Max.   : 1.6596   Max.   : 1.5294   Max.   : 0.8058   Max.   : 0.44062  
     lstat             medv         
 Min.   :-1.426   Min.   :-0.06881  
 1st Qu.:-1.307   1st Qu.: 2.08405  
 Median :-1.192   Median : 2.80166  
 Mean   :-1.168   Mean   : 2.35587  
 3rd Qu.:-1.055   3rd Qu.: 2.98650  
 Max.   :-0.730   Max.   : 2.98650

apply(sub_8rm_z, 2, mean) %>% as.numeric() %>%
  data.frame(x = colnames(Boston), z = .) %>%
  qplot(x = z, y = x, data = ., xlim = c(-5, 5),
        xlab = 'Standardized score', ylab = 'Variable') +
    ggtitle('Standardized score of the suburb with the lowest median value\nof owner-occupied homes') +
    geom_segment(aes(xend = 0, yend = x)) +
    geom_vline(xintercept = 0, color = 'grey50')

The standardized scores show that the suburbs that average more than eight rooms per dwelling have relatively higher rm (average number of rooms per dwelling.) and medv (median value of owner-occupied homes in $1000s).

Machine Learning - HW2