library(ggplot2)
library(dplyr)
library(corrr)
library(tidyr)
library(ISLR)
library(ISLR2)
library(dplyr)
library(Hmisc)
library(pheatmap)
data(Wage)
#Figure 1.1 Left
age.wage = data.frame(group_by(Wage, age) %>% summarise(mean.wage=mean(wage)))
p <- ggplot(data=Wage, aes(x=age, y=wage)) + geom_point() + geom_point(data=age.wage, aes(y=mean.wage), colour='red') + geom_smooth(colour='red',se=FALSE)
p
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
#Figure 1.1 Center
p2 <- ggplot(data=Wage, aes(x=year, y=wage)) + geom_point() + geom_smooth(method=lm, se=FALSE)
p2
## `geom_smooth()` using formula 'y ~ x'
#Figure 1.1 Right
p3<-ggplot(Wage, aes(x=education, y=wage, fill=education)) +
geom_boxplot() +
scale_x_discrete(labels=c("1","2","3","4","5"))
p3
college <- read.csv("https://www.statlearning.com/s/College.csv")
rownames(college) <- college[,1]
View(college)
college <- college[,-1]
View(college)
#i.)
summary(college)
## Private Apps Accept Enroll
## Length:777 Min. : 81 Min. : 72 Min. : 35
## Class :character 1st Qu.: 776 1st Qu.: 604 1st Qu.: 242
## Mode :character Median : 1558 Median : 1110 Median : 434
## Mean : 3002 Mean : 2019 Mean : 780
## 3rd Qu.: 3624 3rd Qu.: 2424 3rd Qu.: 902
## Max. :48094 Max. :26330 Max. :6392
## Top10perc Top25perc F.Undergrad P.Undergrad
## Min. : 1.00 Min. : 9.0 Min. : 139 Min. : 1.0
## 1st Qu.:15.00 1st Qu.: 41.0 1st Qu.: 992 1st Qu.: 95.0
## Median :23.00 Median : 54.0 Median : 1707 Median : 353.0
## Mean :27.56 Mean : 55.8 Mean : 3700 Mean : 855.3
## 3rd Qu.:35.00 3rd Qu.: 69.0 3rd Qu.: 4005 3rd Qu.: 967.0
## Max. :96.00 Max. :100.0 Max. :31643 Max. :21836.0
## Outstate Room.Board Books Personal
## Min. : 2340 Min. :1780 Min. : 96.0 Min. : 250
## 1st Qu.: 7320 1st Qu.:3597 1st Qu.: 470.0 1st Qu.: 850
## Median : 9990 Median :4200 Median : 500.0 Median :1200
## Mean :10441 Mean :4358 Mean : 549.4 Mean :1341
## 3rd Qu.:12925 3rd Qu.:5050 3rd Qu.: 600.0 3rd Qu.:1700
## Max. :21700 Max. :8124 Max. :2340.0 Max. :6800
## PhD Terminal S.F.Ratio perc.alumni
## Min. : 8.00 Min. : 24.0 Min. : 2.50 Min. : 0.00
## 1st Qu.: 62.00 1st Qu.: 71.0 1st Qu.:11.50 1st Qu.:13.00
## Median : 75.00 Median : 82.0 Median :13.60 Median :21.00
## Mean : 72.66 Mean : 79.7 Mean :14.09 Mean :22.74
## 3rd Qu.: 85.00 3rd Qu.: 92.0 3rd Qu.:16.50 3rd Qu.:31.00
## Max. :103.00 Max. :100.0 Max. :39.80 Max. :64.00
## Expend Grad.Rate
## Min. : 3186 Min. : 10.00
## 1st Qu.: 6751 1st Qu.: 53.00
## Median : 8377 Median : 65.00
## Mean : 9660 Mean : 65.46
## 3rd Qu.:10830 3rd Qu.: 78.00
## Max. :56233 Max. :118.00
#ii.)
college$Private <- as.factor(college$Private)
pairs(college[,1:10])
#iii.)
p <- ggplot(data=college, aes(x=Private, y=Outstate)) + geom_point() +
ylab("Out-of-state tuition")
p
#iv.)
Elite <- rep("No", nrow(college))
Elite[college$Top10perc > 50] <- "Yes"
Elite <- as.factor(Elite)
college <- data.frame(college, Elite)
summary(college$Elite) #There are 78 elite colleges
## No Yes
## 699 78
p2 <- ggplot(college, aes(x=Elite, y=Outstate, fill=Elite)) +
geom_boxplot() + ylab("Out-of-state tuition")
p2
There are 78 elite colleges.
#v.)
par(mfrow=c(2,2))
hist(college$Apps, xlab = "Apps Received", main = "")
hist(college$perc.alumni, col=1, xlab = "% of alumni who donate", main = "")
hist(college$S.F.Ratio, col=2, breaks=10, xlab = "Student/faculty ratio", main = "")
hist(college$Expend, breaks=100, xlab = "Instructional expenditure/student", main = "")
#vi.) Continue exploring the data and provide brief summary
p3 <- ggplot(college, aes(x=Private, y=Room.Board, fill=Private)) +
geom_boxplot() + ylab("Room and Board Costs")
p3
It appears that private schools tend to have most expensive room and board. This begs the question: What are you paying for?
Maybe they are paying for better student faculty ratios? Let’s see.
p4 <- ggplot(college, aes(x=Private, y=S.F.Ratio, fill=Private)) +
geom_boxplot() + ylab("Student-Faculty Ratio")
p4
Private schools have lower student-faculty ratio. But only by about 5 students.
Maybe private schools spend more on each student. Let’s see.
p5 <- ggplot(college, aes(x=Private, y=Expend, fill=Private)) + geom_boxplot()
p5
It appears that Private schools tend to expend more on each student than do public schools. I wonder what that the itemized expense report looks like.
Auto <- read.csv("https://www.statlearning.com/s/Auto.csv",
header = TRUE, na.strings = "?")
Auto <- na.omit(Auto)
summary(Auto)
## mpg cylinders displacement horsepower weight
## Min. : 9.00 Min. :3.000 Min. : 68.0 Min. : 46.0 Min. :1613
## 1st Qu.:17.00 1st Qu.:4.000 1st Qu.:105.0 1st Qu.: 75.0 1st Qu.:2225
## Median :22.75 Median :4.000 Median :151.0 Median : 93.5 Median :2804
## Mean :23.45 Mean :5.472 Mean :194.4 Mean :104.5 Mean :2978
## 3rd Qu.:29.00 3rd Qu.:8.000 3rd Qu.:275.8 3rd Qu.:126.0 3rd Qu.:3615
## Max. :46.60 Max. :8.000 Max. :455.0 Max. :230.0 Max. :5140
## acceleration year origin name
## Min. : 8.00 Min. :70.00 Min. :1.000 Length:392
## 1st Qu.:13.78 1st Qu.:73.00 1st Qu.:1.000 Class :character
## Median :15.50 Median :76.00 Median :1.000 Mode :character
## Mean :15.54 Mean :75.98 Mean :1.577
## 3rd Qu.:17.02 3rd Qu.:79.00 3rd Qu.:2.000
## Max. :24.80 Max. :82.00 Max. :3.000
It appears that all predictors are quantitative except for origin, which was coded as 1=usa, 2=europe, 3=japan and name: name of car make
qual.columns <- which(names(Auto) %in% c("name", "origin"))
quantitative.Auto <- Auto[,-qual.columns]
sapply(quantitative.Auto, range)
## mpg cylinders displacement horsepower weight acceleration year
## [1,] 9.0 3 68 46 1613 8.0 70
## [2,] 46.6 8 455 230 5140 24.8 82
#c.)
sapply(quantitative.Auto, mean)
## mpg cylinders displacement horsepower weight acceleration
## 23.445918 5.471939 194.411990 104.469388 2977.584184 15.541327
## year
## 75.979592
sapply(quantitative.Auto, sd)
## mpg cylinders displacement horsepower weight acceleration
## 7.805007 1.705783 104.644004 38.491160 849.402560 2.758864
## year
## 3.683737
#d.)
sapply(quantitative.Auto[-c(10:85),], range)
## mpg cylinders displacement horsepower weight acceleration year
## [1,] 11.0 3 68 46 1649 8.5 70
## [2,] 46.6 8 455 230 4997 24.8 82
sapply(quantitative.Auto[-c(10:85),], mean)
## mpg cylinders displacement horsepower weight acceleration
## 24.404430 5.373418 187.240506 100.721519 2935.971519 15.726899
## year
## 77.145570
sapply(quantitative.Auto[-c(10:85),], sd)
## mpg cylinders displacement horsepower weight acceleration
## 7.867283 1.654179 99.678367 35.708853 811.300208 2.693721
## year
## 3.106217
Let’s do some exploration. Would like to know if cars become more efficient in this timeframe?
p6 <- ggplot(quantitative.Auto, aes(x=as.factor(year), y=mpg)) + geom_boxplot() +
xlab("Year")
p6
It appears that yes, on the average cars became more fuel efficient.
Would like to know: Which countries made the most efficient cars?
#which country made the most efficient cars?
Auto$origin <- as.factor(Auto$origin)
p7 <- ggplot(Auto, aes(x=origin, y=mpg, fill=origin)) + geom_boxplot() +
scale_x_discrete(labels=c('USA', 'Europe', 'Japan'))
p7
Not a surprise that USA came last in fuel efficiency. This leads us to our discussion about mpg in Problem 9f.)
pheatmap(cor(quantitative.Auto, use="pairwise.complete.obs"))
Per the heatmap there appears to be strong negative correlation between mpg and horsepower, weight, cylinders, and displacement. Furthermore, there appears to be strong negative correlation between mpg and year and acceleration. These all intuitively make sense.
We will now focus on the correlations between predictors and mpg.
auto.corr = quantitative.Auto %>% correlate() %>% focus(mpg)
auto.corr[which(abs(auto.corr[,2]) > 0.35),]
## # A tibble: 6 x 2
## term mpg
## <chr> <dbl>
## 1 cylinders -0.778
## 2 displacement -0.805
## 3 horsepower -0.778
## 4 weight -0.832
## 5 acceleration 0.423
## 6 year 0.581
As we can there are moderate to strong correlations between all quantitative predictors and the mpg. These numerical findings match the heatmap above.
For example, let’s look at cylinders and mpg.
p <- ggplot(Auto, aes(x=as.factor(cylinders), y=mpg, fill=as.factor(cylinders))) + geom_boxplot() + scale_x_discrete(labels=c('3', '4', '5', '6', '8')) +
xlab('# of Cylinders') + guides(fill=guide_legend(title='# of Cylinders'))
p
It appears the as the number of cylinders increase, the mpg decreases. In other words they are negatively correlated. Thus, it is reasonable to believe that cylinders could be used to predict the mpg.
To be fair though, since cylinders is strongly positively correlated with horsepower or weight, we could also use horsepower or weight to predict mpg. A learning algorithm would be needed to see which predictor variable is most informational for predicting mpg.
dim(Boston)
## [1] 506 13
There are 506 rows and 14 columns. Each row represents an observation where each observation is housing values in a particular suburb of Boston. The 14 columns are the house characteristics for that particular suburb. For example, data on crime rate of town, proximity to Charles River, proximity to economic areas or highways is included.
Will focus on the relationship between crime rate and industrial zoning.
p <- ggplot(Boston, aes(x=indus, y=crim)) + geom_point()
p
large.crime = which(Boston$crim > 10)
p2 <- ggplot(Boston[-large.crime,], aes(x=indus, y=crim)) + geom_point()
p2
There doesn’t appear to be any clear relationship between the industrial proportion (the proportion of non-retail business acres per town) and the crime rate. We might’ve guessed that crime happens in the more ‘industrial’ parts of town. Yet that cannot be argued very well.
All that can be said is those neighborhoods with industrial proportion of between 17 and 20 have particularly high crime rates. All that we can guess it that it must be a bunched neighborhoods around a hot crime spot
crim.corr = Boston %>% correlate() %>% focus(crim)
crim.corr[which(abs(crim.corr[,2]) > 0.35),]
## # A tibble: 8 x 2
## term crim
## <chr> <dbl>
## 1 indus 0.407
## 2 nox 0.421
## 3 age 0.353
## 4 dis -0.380
## 5 rad 0.626
## 6 tax 0.583
## 7 lstat 0.456
## 8 medv -0.388
We can see that many of the predictor variables are moderately correlated with crime rate.
qplot(Boston$crim, binwidth=4 , xlab = "Crime rate", ylab="# of Suburbs" )
It is interesting to note that there is a long skewed right tail. That means that yes, there are a couple neighborhoods with relatively high crime rates.
qplot(Boston$tax, binwidth=40 , xlab = "Full-value property-tax rate per $10,000", ylab="# of Suburbs")
There appears to be a bimodal distribution. This suggests there is a large gap between the lower-middle class and the upper class.
qplot(Boston$ptratio, binwidth=3, xlab ="Pupil-teacher ratio by town", ylab="Number of Suburbs")
There doesn’t appear to be any interesting to note or any outliers to mention in Pupil-teacher ratio by town. Most seem to be a tight range of 15-22.
summary(Boston$chas==1)
## Mode FALSE TRUE
## logical 471 35
There are 35 suburbs that border the Charles River.
median(Boston$ptratio)
## [1] 19.05
The median P-T ratio is 19.05
min.medv.tract = Boston[which.min(Boston$medv),]
par(mfrow=c(5,3), mar=c(2, 2, 1, 0))
for (i in 1:ncol(Boston)){
hist(Boston[, i], main=colnames(Boston)[i], breaks="FD")
abline(v=Boston[which.min(Boston$medv), i], col="blue", lw=3)
}
From our graphic above, we can state the folling about this particular suburb.
This neighborhood has a very high crime rate compared to the median of the city. There are few to no zoned lots over 25,000 square feet. The proportion of industrial acres per town is above average. The neighborhood doesn’t border the Charles River. The nitrogen oxide concentration is above the median of the city. The neighborhood has one of the lower rooms per dwelling (i.e. each dwelling is a single or double unit).
sum(Boston$rm > 7)
## [1] 64
sum(Boston$rm > 8)
## [1] 13
more8.rooms = Boston[which(Boston$rm>8),]
par(mfrow=c(5,3), mar=c(2, 2, 1, 0))
for (i in 1:ncol(Boston)){
hist(Boston[, i], main=colnames(Boston)[i], breaks="FD")
abline(v=Boston[which(Boston$rm>8), i], col="red", lw=1)
}
One of the more telling stories of this graphic is that these neighborhooods tend to be in some of the higher status neighborhoods (see the lstat histogram). This makes sense since if a dwelling has more than 8 rooms than it most likely in a very economically rich neighborhood.