Exercises from the second chapter of "An Introduction to Statistical Learning with Applications in R" (ISLR).
# (a) / (b) / (c)
# Numerical summary of every column of the College data set
# (the `##` lines that follow are the printed output).
summary(College)
## Private Apps Accept Enroll Top10perc
## No :212 Min. : 81 Min. : 72 Min. : 35 Min. : 1.00
## Yes:565 1st Qu.: 776 1st Qu.: 604 1st Qu.: 242 1st Qu.:15.00
## Median : 1558 Median : 1110 Median : 434 Median :23.00
## Mean : 3002 Mean : 2019 Mean : 780 Mean :27.56
## 3rd Qu.: 3624 3rd Qu.: 2424 3rd Qu.: 902 3rd Qu.:35.00
## Max. :48094 Max. :26330 Max. :6392 Max. :96.00
## Top25perc F.Undergrad P.Undergrad Outstate
## Min. : 9.0 Min. : 139 Min. : 1.0 Min. : 2340
## 1st Qu.: 41.0 1st Qu.: 992 1st Qu.: 95.0 1st Qu.: 7320
## Median : 54.0 Median : 1707 Median : 353.0 Median : 9990
## Mean : 55.8 Mean : 3700 Mean : 855.3 Mean :10441
## 3rd Qu.: 69.0 3rd Qu.: 4005 3rd Qu.: 967.0 3rd Qu.:12925
## Max. :100.0 Max. :31643 Max. :21836.0 Max. :21700
## Room.Board Books Personal PhD
## Min. :1780 Min. : 96.0 Min. : 250 Min. : 8.00
## 1st Qu.:3597 1st Qu.: 470.0 1st Qu.: 850 1st Qu.: 62.00
## Median :4200 Median : 500.0 Median :1200 Median : 75.00
## Mean :4358 Mean : 549.4 Mean :1341 Mean : 72.66
## 3rd Qu.:5050 3rd Qu.: 600.0 3rd Qu.:1700 3rd Qu.: 85.00
## Max. :8124 Max. :2340.0 Max. :6800 Max. :103.00
## Terminal S.F.Ratio perc.alumni Expend
## Min. : 24.0 Min. : 2.50 Min. : 0.00 Min. : 3186
## 1st Qu.: 71.0 1st Qu.:11.50 1st Qu.:13.00 1st Qu.: 6751
## Median : 82.0 Median :13.60 Median :21.00 Median : 8377
## Mean : 79.7 Mean :14.09 Mean :22.74 Mean : 9660
## 3rd Qu.: 92.0 3rd Qu.:16.50 3rd Qu.:31.00 3rd Qu.:10830
## Max. :100.0 Max. :39.80 Max. :64.00 Max. :56233
## Grad.Rate
## Min. : 10.00
## 1st Qu.: 53.00
## Median : 65.00
## Mean : 65.46
## 3rd Qu.: 78.00
## Max. :118.00
# Scatterplot matrix of the first ten columns of College
pairs(College[, 1:10])
# Boxplots of Outstate, one box per level of Private
ggplot(data = College, mapping = aes(x = Private, y = Outstate)) +
  geom_boxplot() +
  ggtitle("Out-of-state tuition vs private college")
# Work on a copy so the College data frame itself is not modified
college <- College
# Label a college "Yes" when its Top10perc exceeds 50, "No" otherwise.
# The yes/no arguments of ifelse() are plain values: assigning to
# college$Elite inside them would overwrite the whole column as a side
# effect each time an argument is evaluated, and only produce the right
# result by accident.
college$Elite <- factor(ifelse(college$Top10perc > 50, "Yes", "No"))
# Count of colleges per Elite level
summary(college$Elite)
## No Yes
## 699 78
# Boxplots of Outstate split by the Elite factor
ggplot(data = college, mapping = aes(x = Elite, y = Outstate)) +
  geom_boxplot() +
  ggtitle("Out-of-state tuition vs Elite")
# Four histograms: a and d use the default number of bins,
# b and c set the number of bins explicitly
a <- ggplot(data = college, mapping = aes(x = Accept)) +
  geom_histogram()
b <- ggplot(data = college, mapping = aes(x = Enroll)) +
  geom_histogram(bins = 50)
c <- ggplot(data = college, mapping = aes(x = Books)) +
  geom_histogram(bins = 40)
d <- ggplot(data = college, mapping = aes(x = PhD)) +
  geom_histogram()
# Arrange the four histograms on a 2 x 2 grid
grid.arrange(a, b, c, d, nrow = 2, ncol = 2)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Numerical summary of every column of the Auto data set
# (the `##` lines that follow are the printed output).
summary(Auto)
## mpg cylinders displacement horsepower
## Min. : 9.00 Min. :3.000 Min. : 68.0 Min. : 46.0
## 1st Qu.:17.00 1st Qu.:4.000 1st Qu.:105.0 1st Qu.: 75.0
## Median :22.75 Median :4.000 Median :151.0 Median : 93.5
## Mean :23.45 Mean :5.472 Mean :194.4 Mean :104.5
## 3rd Qu.:29.00 3rd Qu.:8.000 3rd Qu.:275.8 3rd Qu.:126.0
## Max. :46.60 Max. :8.000 Max. :455.0 Max. :230.0
##
## weight acceleration year origin
## Min. :1613 Min. : 8.00 Min. :70.00 Min. :1.000
## 1st Qu.:2225 1st Qu.:13.78 1st Qu.:73.00 1st Qu.:1.000
## Median :2804 Median :15.50 Median :76.00 Median :1.000
## Mean :2978 Mean :15.54 Mean :75.98 Mean :1.577
## 3rd Qu.:3615 3rd Qu.:17.02 3rd Qu.:79.00 3rd Qu.:2.000
## Max. :5140 Max. :24.80 Max. :82.00 Max. :3.000
##
## name
## amc matador : 5
## ford pinto : 5
## toyota corolla : 5
## amc gremlin : 4
## amc hornet : 4
## chevrolet chevette: 4
## (Other) :365
# (a)
# All the columns are quantitative except "name" and "origin"
# (origin is stored as a number, see the summary above, but is treated as a category)
# (b)
# Range (minimum, maximum) of each of the first seven columns
vapply(Auto[, 1:7], range, numeric(2))
## mpg cylinders displacement horsepower weight acceleration year
## [1,] 9.0 3 68 46 1613 8.0 70
## [2,] 46.6 8 455 230 5140 24.8 82
# (c)
# Mean and standard deviation of each of the first seven columns
vapply(Auto[, 1:7], mean, numeric(1))
## mpg cylinders displacement horsepower weight
## 23.445918 5.471939 194.411990 104.469388 2977.584184
## acceleration year
## 15.541327 75.979592
vapply(Auto[, 1:7], sd, numeric(1))
## mpg cylinders displacement horsepower weight
## 7.805007 1.705783 104.644004 38.491160 849.402560
## acceleration year
## 2.758864 3.683737
# (d)
# Drop the 10th through 85th observations, then recompute the same statistics
auto <- Auto[-(10:85), ]
vapply(auto[, 1:7], range, numeric(2))
## mpg cylinders displacement horsepower weight acceleration year
## [1,] 11.0 3 68 46 1649 8.5 70
## [2,] 46.6 8 455 230 4997 24.8 82
vapply(auto[, 1:7], mean, numeric(1))
## mpg cylinders displacement horsepower weight
## 24.404430 5.373418 187.240506 100.721519 2935.971519
## acceleration year
## 15.726899 77.145570
vapply(auto[, 1:7], sd, numeric(1))
## mpg cylinders displacement horsepower weight
## 7.867283 1.654179 99.678367 35.708853 811.300208
## acceleration year
## 2.693721 3.106217
# (e)
# Using the full dataset, investigate the predictors graphically
# Scatterplot matrix of every column of Auto
pairs(Auto)
# Individual scatterplots: first argument on the x axis, second on the y axis
plot(Auto$horsepower, Auto$weight)
plot(Auto$mpg, Auto$weight)
plot(Auto$mpg, Auto$year)
# (f)
# As the pairs plot suggests, all the predictors appear to be related to mpg
# (a)
# Take the Boston data set directly from the MASS namespace (no library() call needed)
boston <- MASS::Boston
# There are 506 rows and 14 columns
# Each row represents a suburb (town) of Boston; medv is the median value of owner-occupied homes in that suburb
# (b)
# Pairwise scatterplots of selected Boston columns
ggplot(data = boston, mapping = aes(x = nox, y = age)) +
  geom_point() +
  ggtitle("nitrogen oxides concentration vs proportion of owner-occupied units built prior to 1940")
# The nitrogen oxides concentration increases with the proportion of older buildings
ggplot(data = boston, mapping = aes(x = lstat, y = medv)) +
  geom_point() +
  ggtitle("lower status of the population (percent) vs median value of owner-occupied homes in $1000s")
# As the percentage of lower-status population increases, the median value of owner-occupied homes decreases
ggplot(data = boston, mapping = aes(x = nox, y = dis)) +
  geom_point() +
  ggtitle("nitrogen oxides concentration vs weighted mean of distances to five Boston employment centres")
# The nitrogen oxides concentration increases as we get closer to the Boston employment centres
ggplot(data = boston, mapping = aes(x = lstat, y = age)) +
  geom_point() +
  ggtitle("lower status of the population (percent) vs proportion of owner-occupied units built prior to 1940")
# Lower-status population tends to live in older buildings
ggplot(data = boston, mapping = aes(x = rm, y = medv)) +
  geom_point() +
  ggtitle("average number of rooms per dwelling vs median value of owner-occupied homes in $1000s")
# The more rooms per dwelling, the higher the median home value
# (c)
# Are any of the predictors associated with per capita crime rate?
# Scatterplot matrix of every column first, then crim against individual predictors
pairs(boston)
ggplot(boston, aes(x = age, y = crim)) +
  geom_point() +
  ggtitle("per capita crime rate by town vs proportion of owner-occupied units built prior to 1940")
# Higher crime rate in areas with a larger proportion of older buildings
ggplot(boston, aes(x = zn, y = crim)) +
  geom_point() +
  ggtitle("per capita crime rate by town vs proportion of residential land zoned for lots over 25,000 sq.ft.")
# Lower crime rate where more residential land is zoned for large lots
# (zn is a residential zoning share, not a measure of industrial land)
ggplot(boston, aes(x = tax, y = crim)) +
  geom_point() +
  ggtitle("per capita crime rate by town vs full-value property-tax rate per $10,000")
# Higher crime rate where the property-tax rate is higher
ggplot(boston, aes(x = dis, y = crim)) +
  geom_point() +
  ggtitle("per capita crime rate by town vs weighted mean of distances to five Boston employment centres")
# Higher crime rate closer to the Boston employment centres
ggplot(boston, aes(x = rad, y = crim)) +
  geom_point() +
  ggtitle("per capita crime rate by town vs index of accessibility to radial highways.")
# Higher crime rate where highway accessibility is higher
# The lstat plot is drawn once only; the original drew the identical plot twice
ggplot(boston, aes(x = lstat, y = crim)) +
  geom_point() +
  ggtitle("per capita crime rate by town vs lower status of the population (percent)")
# Higher crime rate where the percentage of lower-status population is higher
# (d)
# Do any of the Boston suburbs appear to have particularly high crime rates? Tax rates? Pupil-teacher ratios?
boxplot(boston$crim, main = "per capita crime rate by town")
# Histogram restricted to the suburbs whose crime rate exceeds 1
hist(boston$crim[boston$crim > 1], breaks= 40)
# We can see a few towns/suburbs with a high per capita crime rate
boxplot(boston$tax, main = "full-value property-tax rate per $10,000.")
hist(boston$tax, breaks= 40)
# Similarly, a few places have a high tax rate: full-value property-tax rate per \$10,000 around 680
hist(boston$ptratio, breaks= 40)
# A few places have a higher pupil-teacher ratio (although it mostly stays within a smaller range: 14 - 22)
# (e)
# How many of the suburbs in this data set bound the Charles river?
# Summing chas counts the matching suburbs, assuming chas is a 0/1 indicator (TODO confirm)
sum(boston$chas)
## [1] 35
# (f)
# What is the median pupil-teacher ratio among the towns in this data set?
median(boston$ptratio)
## [1] 19.05
# (g)
# Which suburb of Boston has the lowest median value of owner-occupied homes? What are the values of the other predictors for that suburb, and how do those values compare to the overall ranges for those predictors?
min(boston$medv)
## [1] 5
# Column-wise medians of the whole data set, as a one-column matrix
median_boston <- as.matrix(vapply(boston, median, numeric(1)))
# which.min() returns the ROW INDEX of the smallest medv. Indexing with
# min(boston$medv) itself would use the value 5 as a row number and select
# row 5, which is not the suburb with the lowest medv.
# which.min() keeps only the first row when several rows tie for the minimum.
min_medv <- t(as.matrix(boston[which.min(boston$medv), ]))
table_min_median <- cbind(median_boston, min_medv)
colnames(table_min_median) <- c("median_Boston", "Min medv")
print(table_min_median)
## median_Boston Min medv
## crim 0.25651 38.3518
## zn 0.00000 0.0000
## indus 9.69000 18.1000
## chas 0.00000 0.0000
## nox 0.53800 0.6930
## rm 6.20850 5.4530
## age 77.50000 100.0000
## dis 3.20745 1.4896
## rad 5.00000 24.0000
## tax 330.00000 666.0000
## ptratio 19.05000 20.2000
## black 391.44000 396.9000
## lstat 11.36000 30.5900
## medv 21.20000 5.0000
# The suburb with the lowest median home value compares poorly with the Boston medians: its crime rate is far higher (38.35 vs 0.26), it has more non-retail business and a higher nitrogen oxides concentration, all of its owner-occupied units were built before 1940, it is closer to the five Boston employment centres, its highway accessibility index and property-tax rate are much higher, and its lower-status percentage (30.59) is almost three times the Boston median
# (h)
# In this dataset, how many of the suburbs average more than seven rooms per dwelling? More than 8 rooms per dwelling? Comment on the suburbs that average more than 8 rooms per dwelling
# Share (in %) of suburbs averaging more than seven rooms; the strict ">"
# matches the question, and nrow() replaces the hard-coded row count
sum(boston$rm > 7) / nrow(boston) * 100
## [1] 12.64822
# 12.6% (64 suburbs) average more than 7 rooms per dwelling
sum(boston$rm > 8) / nrow(boston) * 100
## [1] 2.56917
# 2.6% (13 suburbs) average more than 8 rooms per dwelling
# Base subsetting is used instead of filter(): without dplyr attached,
# filter() resolves to stats::filter, which is a different function
dwelling_8_rooms <- boston[boston$rm > 8, ]
# Column-wise medians of those suburbs, placed next to the overall medians
dwelling_8_rooms <- vapply(dwelling_8_rooms, median, numeric(1))
dwelling_8_rooms <- as.matrix(dwelling_8_rooms)
dwelling_8_rooms <- cbind(median_boston, dwelling_8_rooms)
colnames(dwelling_8_rooms) <- c("Median Boston", "> 8 rooms")
# Lower crime, a lower "lower status of the population (percent)" and a higher median value of owner-occupied homes in \$1000s