Summary

Exercises from the second chapter of An Introduction to Statistical Learning with Applications in R (ISLR).

College data - Applied 8

# (a) / (b) / (c)
# Numerical summary of every column of the College data set.
# NOTE(review): assumes `College` is already loaded (presumably from the ISLR
# package) -- no library() call is visible in this file; confirm.
summary(College)
##  Private        Apps           Accept          Enroll       Top10perc    
##  No :212   Min.   :   81   Min.   :   72   Min.   :  35   Min.   : 1.00  
##  Yes:565   1st Qu.:  776   1st Qu.:  604   1st Qu.: 242   1st Qu.:15.00  
##            Median : 1558   Median : 1110   Median : 434   Median :23.00  
##            Mean   : 3002   Mean   : 2019   Mean   : 780   Mean   :27.56  
##            3rd Qu.: 3624   3rd Qu.: 2424   3rd Qu.: 902   3rd Qu.:35.00  
##            Max.   :48094   Max.   :26330   Max.   :6392   Max.   :96.00  
##    Top25perc      F.Undergrad     P.Undergrad         Outstate    
##  Min.   :  9.0   Min.   :  139   Min.   :    1.0   Min.   : 2340  
##  1st Qu.: 41.0   1st Qu.:  992   1st Qu.:   95.0   1st Qu.: 7320  
##  Median : 54.0   Median : 1707   Median :  353.0   Median : 9990  
##  Mean   : 55.8   Mean   : 3700   Mean   :  855.3   Mean   :10441  
##  3rd Qu.: 69.0   3rd Qu.: 4005   3rd Qu.:  967.0   3rd Qu.:12925  
##  Max.   :100.0   Max.   :31643   Max.   :21836.0   Max.   :21700  
##    Room.Board       Books           Personal         PhD        
##  Min.   :1780   Min.   :  96.0   Min.   : 250   Min.   :  8.00  
##  1st Qu.:3597   1st Qu.: 470.0   1st Qu.: 850   1st Qu.: 62.00  
##  Median :4200   Median : 500.0   Median :1200   Median : 75.00  
##  Mean   :4358   Mean   : 549.4   Mean   :1341   Mean   : 72.66  
##  3rd Qu.:5050   3rd Qu.: 600.0   3rd Qu.:1700   3rd Qu.: 85.00  
##  Max.   :8124   Max.   :2340.0   Max.   :6800   Max.   :103.00  
##     Terminal       S.F.Ratio      perc.alumni        Expend     
##  Min.   : 24.0   Min.   : 2.50   Min.   : 0.00   Min.   : 3186  
##  1st Qu.: 71.0   1st Qu.:11.50   1st Qu.:13.00   1st Qu.: 6751  
##  Median : 82.0   Median :13.60   Median :21.00   Median : 8377  
##  Mean   : 79.7   Mean   :14.09   Mean   :22.74   Mean   : 9660  
##  3rd Qu.: 92.0   3rd Qu.:16.50   3rd Qu.:31.00   3rd Qu.:10830  
##  Max.   :100.0   Max.   :39.80   Max.   :64.00   Max.   :56233  
##    Grad.Rate     
##  Min.   : 10.00  
##  1st Qu.: 53.00  
##  Median : 65.00  
##  Mean   : 65.46  
##  3rd Qu.: 78.00  
##  Max.   :118.00
# Scatterplot matrix of the first ten columns of College
pairs(College[, seq_len(10)])

# Boxplots of Outstate, one box per level of Private
ggplot(data = College, mapping = aes(x = Private, y = Outstate)) +
  geom_boxplot() +
  ggtitle(label = "Out-of-state tuition vs private college")

# Work on a copy so the original College data stays untouched.
college <- College

# Elite is "Yes" when Top10perc exceeds 50 and "No" otherwise.
# ifelse() only needs the two values to choose from; assigning to
# college$Elite inside its arguments (as before) overwrote the whole column as
# a side effect before the real result was stored.
college$Elite <- factor(ifelse(college$Top10perc > 50, "Yes", "No"))

# Number of colleges in each Elite group
summary(college$Elite)
##  No Yes 
## 699  78
# Boxplots of Outstate, one box per level of Elite
ggplot(college, aes(x = Elite, y = Outstate)) +
  geom_boxplot() +
  ggtitle("Out-of-state tuition vs Elite")

# Histograms of four College variables, arranged in a 2 x 2 grid.
# Every histogram sets `bins` explicitly: leaving it out makes ggplot2 fall
# back to 30 bins and emit a "`stat_bin()` using `bins = 30`" message for each
# plot. The plot objects get descriptive names instead of a/b/c/d, which also
# avoids shadowing base::c().
hist_accept <- ggplot(college, aes(x = Accept)) + geom_histogram(bins = 30)
hist_enroll <- ggplot(college, aes(x = Enroll)) + geom_histogram(bins = 50)
hist_books <- ggplot(college, aes(x = Books)) + geom_histogram(bins = 40)
hist_phd <- ggplot(college, aes(x = PhD)) + geom_histogram(bins = 30)
grid.arrange(hist_accept, hist_enroll, hist_books, hist_phd, nrow = 2, ncol = 2)

Auto data - Applied 9

# Numerical summary of every column of the Auto data set.
# NOTE(review): assumes `Auto` is already loaded (presumably from the ISLR
# package) -- no library() call is visible in this file; confirm.
summary(Auto)
##       mpg          cylinders      displacement     horsepower   
##  Min.   : 9.00   Min.   :3.000   Min.   : 68.0   Min.   : 46.0  
##  1st Qu.:17.00   1st Qu.:4.000   1st Qu.:105.0   1st Qu.: 75.0  
##  Median :22.75   Median :4.000   Median :151.0   Median : 93.5  
##  Mean   :23.45   Mean   :5.472   Mean   :194.4   Mean   :104.5  
##  3rd Qu.:29.00   3rd Qu.:8.000   3rd Qu.:275.8   3rd Qu.:126.0  
##  Max.   :46.60   Max.   :8.000   Max.   :455.0   Max.   :230.0  
##                                                                 
##      weight      acceleration        year           origin     
##  Min.   :1613   Min.   : 8.00   Min.   :70.00   Min.   :1.000  
##  1st Qu.:2225   1st Qu.:13.78   1st Qu.:73.00   1st Qu.:1.000  
##  Median :2804   Median :15.50   Median :76.00   Median :1.000  
##  Mean   :2978   Mean   :15.54   Mean   :75.98   Mean   :1.577  
##  3rd Qu.:3615   3rd Qu.:17.02   3rd Qu.:79.00   3rd Qu.:2.000  
##  Max.   :5140   Max.   :24.80   Max.   :82.00   Max.   :3.000  
##                                                                
##                  name    
##  amc matador       :  5  
##  ford pinto        :  5  
##  toyota corolla    :  5  
##  amc gremlin       :  4  
##  amc hornet        :  4  
##  chevrolet chevette:  4  
##  (Other)           :365
# (a)
# all the columns are quantitative except "name" and "origin", which are qualitative

# (b)
# Range of each of the first seven columns (row 1 = minimum, row 2 = maximum)
vapply(Auto[, 1:7], range, FUN.VALUE = numeric(2))
##       mpg cylinders displacement horsepower weight acceleration year
## [1,]  9.0         3           68         46   1613          8.0   70
## [2,] 46.6         8          455        230   5140         24.8   82
# (c)
# Mean of each of the first seven columns
vapply(Auto[, 1:7], mean, FUN.VALUE = numeric(1))
##          mpg    cylinders displacement   horsepower       weight 
##    23.445918     5.471939   194.411990   104.469388  2977.584184 
## acceleration         year 
##    15.541327    75.979592
# Standard deviation of each of the first seven columns
vapply(Auto[, 1:7], sd, FUN.VALUE = numeric(1))
##          mpg    cylinders displacement   horsepower       weight 
##     7.805007     1.705783   104.644004    38.491160   849.402560 
## acceleration         year 
##     2.758864     3.683737
# (d)
# Remove the 10th through 85th observations, then recompute the range, mean
# and standard deviation of the first seven columns on what is left
auto <- Auto[-(10:85), ]
vapply(auto[, 1:7], range, FUN.VALUE = numeric(2))
##       mpg cylinders displacement horsepower weight acceleration year
## [1,] 11.0         3           68         46   1649          8.5   70
## [2,] 46.6         8          455        230   4997         24.8   82
vapply(auto[, 1:7], mean, FUN.VALUE = numeric(1))
##          mpg    cylinders displacement   horsepower       weight 
##    24.404430     5.373418   187.240506   100.721519  2935.971519 
## acceleration         year 
##    15.726899    77.145570
vapply(auto[, 1:7], sd, FUN.VALUE = numeric(1))
##          mpg    cylinders displacement   horsepower       weight 
##     7.867283     1.654179    99.678367    35.708853   811.300208 
## acceleration         year 
##     2.693721     3.106217
# (e)
# Graphical exploration of the predictors on the full data set:
# a scatterplot matrix of every column, then three individual scatterplots
pairs(x = Auto)

plot(x = Auto$horsepower, y = Auto$weight)

plot(x = Auto$mpg, y = Auto$weight)

plot(x = Auto$mpg, y = Auto$year)

# (f)
# Judging from the pairs plot, every predictor appears to be related to mpg
# to some degree

Boston data - Applied 10

# (a)
# Work on a local copy of the Boston data set from the MASS package
boston <- MASS::Boston

# There are 506 rows and 14 columns
# Each row represents a suburb (town) of Boston; each column is a variable recorded for that suburb


# (b)
# Pairwise scatterplots of selected predictors, each followed by a short finding

ggplot(data = boston, mapping = aes(x = nox, y = age)) +
  geom_point() +
  ggtitle(label = "nitrogen oxides concentration vs proportion of owner-occupied units built prior to 1940")

# The nitrogen oxides concentration increases with the proportion of older buildings


ggplot(data = boston, mapping = aes(x = lstat, y = medv)) +
  geom_point() +
  ggtitle(label = "lower status of the population (percent) vs median value of owner-occupied homes in $1000s")

# As the percentage of lower-status population increases, the median value of owner-occupied homes decreases


ggplot(data = boston, mapping = aes(x = nox, y = dis)) +
  geom_point() +
  ggtitle(label = "nitrogen oxides concentration vs weighted mean of distances to five Boston employment centres")

# The nitrogen oxides concentration increases as we get closer to the Boston employment centres


ggplot(data = boston, mapping = aes(x = lstat, y = age)) +
  geom_point() +
  ggtitle(label = "lower status of the population (percent) vs proportion of owner-occupied units built prior to 1940")

# Lower-status population tends to live in older buildings


ggplot(data = boston, mapping = aes(x = rm, y = medv)) +
  geom_point() +
  ggtitle(label = "average number of rooms per dwelling vs median value of owner-occupied homes in $1000s")

# As the average number of rooms per dwelling goes up, the median home value increases

# (c)
# Are any of the predictors associated with per capita crime rate?
# Start with a scatterplot matrix of every column, then plot crim against
# individual predictors.
pairs(boston)

ggplot(boston, aes(x = age, y = crim)) +
  geom_point() +
  ggtitle("per capita crime rate by town vs proportion of owner-occupied units built prior to 1940")

# Higher crime rate in areas with a larger share of older buildings

ggplot(boston, aes(x = zn, y = crim)) +
  geom_point() +
  ggtitle("per capita crime rate by town vs proportion of residential land zoned for lots over 25,000 sq.ft.")

# Lower crime rate in towns with more residential land zoned for large lots
# (zn measures residential zoning, not industrial land)

ggplot(boston, aes(x = tax, y = crim)) +
  geom_point() +
  ggtitle("per capita crime rate by town vs full-value property-tax rate per $10,000")

# Higher crime rate where the property-tax rate is higher

ggplot(boston, aes(x = dis, y = crim)) +
  geom_point() +
  ggtitle("per capita crime rate by town vs weighted mean of distances to five Boston employment centres")

# Higher crime rate closer to the Boston employment centres


ggplot(boston, aes(x = rad, y = crim)) +
  geom_point() +
  ggtitle("per capita crime rate by town vs index of accessibility to radial highways.")

# Higher crime rate as highway accessibility increases


ggplot(boston, aes(x = lstat, y = crim)) +
  geom_point() +
  ggtitle("per capita crime rate by town vs lower status of the population (percent)")

# Higher crime rate where the percentage of lower-status population is higher
# (this lstat plot used to be drawn a second time, unchanged; the duplicate
# has been removed)

# (d)
# Do any of the Boston suburbs appear to have particularly high crime rates? Tax rates? Pupil-teacher ratios?


# Crime rate: boxplot of all suburbs, then a histogram restricted to crim > 1
boxplot(x = boston$crim, main = "per capita crime rate by town")

hist(x = boston$crim[boston$crim > 1], breaks = 40)

# A few towns/suburbs have a high per capita crime rate

# Tax rate: boxplot and histogram
boxplot(x = boston$tax, main = "full-value property-tax rate per $10,000.")

hist(x = boston$tax, breaks = 40)

# Similarly, a few places have a high full-value property-tax rate per $10,000 (around 680)


# Pupil-teacher ratio: histogram
hist(x = boston$ptratio, breaks = 40)

# A few places have a higher pupil-teacher ratio (although it is mostly within a smaller range: 14 - 22)

# (e)
# How many of the suburbs in this data set bound the Charles river?
# Adding up the chas column gives the count

sum(boston[["chas"]])
## [1] 35
# (f)
# What is the median pupil-teacher ratio among the towns in this data set?

median(boston[["ptratio"]])
## [1] 19.05
# (g)
# Which suburb of Boston has lowest median value of owner-occupied homes? what are the values of the other predictors for that suburb, and how do those values compare to the overall ranges for those predictors?

min(boston$medv)
## [1] 5
# Column-wise medians over all suburbs, as a one-column matrix
median_boston <- as.matrix(apply(boston, 2, FUN = "median"))

# which.min() returns the row *position* of the lowest medv. Indexing with
# min(boston$medv) instead uses the minimum *value* (5) as a row number and
# silently returns row 5, which is not the lowest-medv suburb.
# If several suburbs tie for the minimum, which.min() keeps only the first.
min_medv <- t(as.matrix(boston[which.min(boston$medv), ]))

# Side-by-side comparison: overall medians vs the lowest-medv suburb
table_min_median <- cbind(median_boston, min_medv)
colnames(table_min_median) <- c("median_Boston", "Min medv")
print(table_min_median)
# NOTE(review): the output previously pasted here belonged to row 5
# (medv = 36.2) because of the indexing bug, so that table and the
# interpretation based on it ("a good place to live, low crime ...") did not
# describe the lowest-medv suburb and have been dropped. Re-run this chunk,
# paste the new table, and describe how the lowest-medv suburb compares with
# the Boston medians.


# (h)
# In this dataset, how many of the suburbs average more than seven rooms per dwelling? more than 8 rooms per dwelling? Comment on the suburbs that average more than 8 rooms per dwelling

# The question asks for *more than* seven/eight rooms, so the comparisons are
# strict. nrow(boston) replaces the hard-coded row count of 506.
sum(boston$rm > 7) / nrow(boston) * 100
## [1] 12.64822
# 12.6% (64 suburbs) average more than 7 rooms per dwelling

sum(boston$rm > 8) / nrow(boston) * 100
## [1] 2.56917
# 2.6% (13 suburbs) average more than 8 rooms per dwelling

# Medians of every column for the suburbs averaging more than 8 rooms, next to
# the overall medians (median_boston, computed in part (g)).
# Base subsetting is used instead of filter(): unless dplyr is attached,
# filter() resolves to stats::filter() and the call errors.
dwelling_8_rooms <- boston[boston$rm > 8, ]
dwelling_8_rooms <- apply(dwelling_8_rooms, 2, FUN = "median")
dwelling_8_rooms <- as.matrix(dwelling_8_rooms)
dwelling_8_rooms <- cbind(median_boston, dwelling_8_rooms)
colnames(dwelling_8_rooms) <- c("Median Boston", "> 8 rooms")
# Show the comparison table that the conclusion below is drawn from
print(dwelling_8_rooms)

# lower crime and lower "lower status of the population (percent)" and higher median value of owner-occupied homes in $1000s