Math 533 HW1

Libraries Used

library(ggplot2)
library(dplyr)
library(corrr)
library(tidyr)
library(ISLR)
library(ISLR2)
library(dplyr)
library(Hmisc)
library(pheatmap)

2.) Recreate Figure 1.1 from ISLR

data(Wage)

#Figure 1.1 Left
age.wage = data.frame(group_by(Wage, age) %>% summarise(mean.wage=mean(wage)))
p <- ggplot(data=Wage, aes(x=age, y=wage)) + geom_point() + geom_point(data=age.wage, aes(y=mean.wage), colour='red') + geom_smooth(colour='red',se=FALSE)
p

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

#Figure 1.1 Center
p2 <- ggplot(data=Wage, aes(x=year, y=wage)) + geom_point() + geom_smooth(method=lm, se=FALSE)
p2

## `geom_smooth()` using formula 'y ~ x'

#Figure 1.1 Right
p3<-ggplot(Wage, aes(x=education, y=wage, fill=education)) +
  geom_boxplot() + 
  scale_x_discrete(labels=c("1","2","3","4","5"))
p3

Solve Problems 8, 9, 10

Problem 8

8a.)

college <- read.csv("https://www.statlearning.com/s/College.csv")

8b.)

rownames(college) <- college[,1]
View(college)

college <- college[,-1]
View(college)

8c.) (i-iv)

#i.)
summary(college)

##    Private               Apps           Accept          Enroll    
##  Length:777         Min.   :   81   Min.   :   72   Min.   :  35  
##  Class :character   1st Qu.:  776   1st Qu.:  604   1st Qu.: 242  
##  Mode  :character   Median : 1558   Median : 1110   Median : 434  
##                     Mean   : 3002   Mean   : 2019   Mean   : 780  
##                     3rd Qu.: 3624   3rd Qu.: 2424   3rd Qu.: 902  
##                     Max.   :48094   Max.   :26330   Max.   :6392  
##    Top10perc       Top25perc      F.Undergrad     P.Undergrad     
##  Min.   : 1.00   Min.   :  9.0   Min.   :  139   Min.   :    1.0  
##  1st Qu.:15.00   1st Qu.: 41.0   1st Qu.:  992   1st Qu.:   95.0  
##  Median :23.00   Median : 54.0   Median : 1707   Median :  353.0  
##  Mean   :27.56   Mean   : 55.8   Mean   : 3700   Mean   :  855.3  
##  3rd Qu.:35.00   3rd Qu.: 69.0   3rd Qu.: 4005   3rd Qu.:  967.0  
##  Max.   :96.00   Max.   :100.0   Max.   :31643   Max.   :21836.0  
##     Outstate       Room.Board       Books           Personal   
##  Min.   : 2340   Min.   :1780   Min.   :  96.0   Min.   : 250  
##  1st Qu.: 7320   1st Qu.:3597   1st Qu.: 470.0   1st Qu.: 850  
##  Median : 9990   Median :4200   Median : 500.0   Median :1200  
##  Mean   :10441   Mean   :4358   Mean   : 549.4   Mean   :1341  
##  3rd Qu.:12925   3rd Qu.:5050   3rd Qu.: 600.0   3rd Qu.:1700  
##  Max.   :21700   Max.   :8124   Max.   :2340.0   Max.   :6800  
##       PhD            Terminal       S.F.Ratio      perc.alumni   
##  Min.   :  8.00   Min.   : 24.0   Min.   : 2.50   Min.   : 0.00  
##  1st Qu.: 62.00   1st Qu.: 71.0   1st Qu.:11.50   1st Qu.:13.00  
##  Median : 75.00   Median : 82.0   Median :13.60   Median :21.00  
##  Mean   : 72.66   Mean   : 79.7   Mean   :14.09   Mean   :22.74  
##  3rd Qu.: 85.00   3rd Qu.: 92.0   3rd Qu.:16.50   3rd Qu.:31.00  
##  Max.   :103.00   Max.   :100.0   Max.   :39.80   Max.   :64.00  
##      Expend        Grad.Rate     
##  Min.   : 3186   Min.   : 10.00  
##  1st Qu.: 6751   1st Qu.: 53.00  
##  Median : 8377   Median : 65.00  
##  Mean   : 9660   Mean   : 65.46  
##  3rd Qu.:10830   3rd Qu.: 78.00  
##  Max.   :56233   Max.   :118.00

#ii.) 
college$Private <- as.factor(college$Private)
pairs(college[,1:10])

#iii.) 
p <- ggplot(data=college, aes(x=Private, y=Outstate)) + geom_point() + 
  ylab("Out-of-state tuition")
p

#iv.) 
Elite <- rep("No", nrow(college))
Elite[college$Top10perc > 50] <- "Yes"
Elite <- as.factor(Elite)
college <- data.frame(college, Elite)

summary(college$Elite) #There are 78 elite colleges

##  No Yes 
## 699  78

p2 <- ggplot(college, aes(x=Elite, y=Outstate, fill=Elite)) + 
  geom_boxplot() + ylab("Out-of-state tuition")
p2

There are 78 elite colleges.

8c.) (v)

#v.) 
par(mfrow=c(2,2))
hist(college$Apps, xlab = "Apps Received", main = "")
hist(college$perc.alumni, col=1, xlab = "% of alumni who donate", main = "")
hist(college$S.F.Ratio, col=2, breaks=10, xlab = "Student/faculty ratio", main = "")
hist(college$Expend, breaks=100, xlab = "Instructional expenditure/student", main = "")

8c.) (vi.)

#vi.) Continue exploring the data and provide brief summary
p3 <- ggplot(college, aes(x=Private, y=Room.Board, fill=Private)) + 
  geom_boxplot() + ylab("Room and Board Costs")
p3

It appears that private schools tend to have most expensive room and board. This begs the question: What are you paying for?
Maybe they are paying for better student faculty ratios? Let’s see.

p4 <- ggplot(college, aes(x=Private, y=S.F.Ratio, fill=Private)) + 
  geom_boxplot() + ylab("Student-Faculty Ratio")
p4

Private schools have lower student-faculty ratio. But only by about 5 students.

Maybe private schools spend more on each student. Let’s see.

p5 <- ggplot(college, aes(x=Private, y=Expend, fill=Private)) + geom_boxplot()
p5

It appears that Private schools tend to expend more on each student than do public schools. I wonder what that the itemized expense report looks like.

Problem 9

Problem 9a.)

Auto <- read.csv("https://www.statlearning.com/s/Auto.csv", 
                 header = TRUE, na.strings = "?")
Auto <- na.omit(Auto)

summary(Auto)

##       mpg          cylinders      displacement     horsepower        weight    
##  Min.   : 9.00   Min.   :3.000   Min.   : 68.0   Min.   : 46.0   Min.   :1613  
##  1st Qu.:17.00   1st Qu.:4.000   1st Qu.:105.0   1st Qu.: 75.0   1st Qu.:2225  
##  Median :22.75   Median :4.000   Median :151.0   Median : 93.5   Median :2804  
##  Mean   :23.45   Mean   :5.472   Mean   :194.4   Mean   :104.5   Mean   :2978  
##  3rd Qu.:29.00   3rd Qu.:8.000   3rd Qu.:275.8   3rd Qu.:126.0   3rd Qu.:3615  
##  Max.   :46.60   Max.   :8.000   Max.   :455.0   Max.   :230.0   Max.   :5140  
##   acceleration        year           origin          name          
##  Min.   : 8.00   Min.   :70.00   Min.   :1.000   Length:392        
##  1st Qu.:13.78   1st Qu.:73.00   1st Qu.:1.000   Class :character  
##  Median :15.50   Median :76.00   Median :1.000   Mode  :character  
##  Mean   :15.54   Mean   :75.98   Mean   :1.577                     
##  3rd Qu.:17.02   3rd Qu.:79.00   3rd Qu.:2.000                     
##  Max.   :24.80   Max.   :82.00   Max.   :3.000

It appears that all predictors are quantitative except for origin, which was coded as 1=usa, 2=europe, 3=japan and name: name of car make

Problem 9b.)

qual.columns <- which(names(Auto) %in% c("name", "origin"))
quantitative.Auto <- Auto[,-qual.columns]
sapply(quantitative.Auto, range)

##       mpg cylinders displacement horsepower weight acceleration year
## [1,]  9.0         3           68         46   1613          8.0   70
## [2,] 46.6         8          455        230   5140         24.8   82

Problem 9c.)

#c.) 
sapply(quantitative.Auto, mean)

##          mpg    cylinders displacement   horsepower       weight acceleration 
##    23.445918     5.471939   194.411990   104.469388  2977.584184    15.541327 
##         year 
##    75.979592

sapply(quantitative.Auto, sd)

##          mpg    cylinders displacement   horsepower       weight acceleration 
##     7.805007     1.705783   104.644004    38.491160   849.402560     2.758864 
##         year 
##     3.683737

Problem 9d.)

#d.) 
sapply(quantitative.Auto[-c(10:85),], range)

##       mpg cylinders displacement horsepower weight acceleration year
## [1,] 11.0         3           68         46   1649          8.5   70
## [2,] 46.6         8          455        230   4997         24.8   82

sapply(quantitative.Auto[-c(10:85),], mean)

##          mpg    cylinders displacement   horsepower       weight acceleration 
##    24.404430     5.373418   187.240506   100.721519  2935.971519    15.726899 
##         year 
##    77.145570

sapply(quantitative.Auto[-c(10:85),], sd)

##          mpg    cylinders displacement   horsepower       weight acceleration 
##     7.867283     1.654179    99.678367    35.708853   811.300208     2.693721 
##         year 
##     3.106217

Problem 9e.)

Let’s do some exploration. Would like to know if cars become more efficient in this timeframe?

p6 <- ggplot(quantitative.Auto, aes(x=as.factor(year), y=mpg)) + geom_boxplot() +
  xlab("Year") 
p6

It appears that yes, on the average cars became more fuel efficient.

Would like to know: Which countries made the most efficient cars?

#which country made the most efficient cars? 
Auto$origin <- as.factor(Auto$origin)
p7 <- ggplot(Auto, aes(x=origin, y=mpg, fill=origin)) + geom_boxplot() + 
  scale_x_discrete(labels=c('USA', 'Europe', 'Japan'))
p7

Not a surprise that USA came last in fuel efficiency. This leads us to our discussion about mpg in Problem 9f.)

pheatmap(cor(quantitative.Auto, use="pairwise.complete.obs"))

Per the heatmap there appears to be strong negative correlation between mpg and horsepower, weight, cylinders, and displacement. Furthermore, there appears to be strong negative correlation between mpg and year and acceleration. These all intuitively make sense.

We will now focus on the correlations between predictors and mpg.

auto.corr = quantitative.Auto %>% correlate() %>% focus(mpg)
auto.corr[which(abs(auto.corr[,2]) > 0.35),]

## # A tibble: 6 x 2
##   term            mpg
##   <chr>         <dbl>
## 1 cylinders    -0.778
## 2 displacement -0.805
## 3 horsepower   -0.778
## 4 weight       -0.832
## 5 acceleration  0.423
## 6 year          0.581

As we can there are moderate to strong correlations between all quantitative predictors and the mpg. These numerical findings match the heatmap above.

For example, let’s look at cylinders and mpg.

p <- ggplot(Auto, aes(x=as.factor(cylinders), y=mpg, fill=as.factor(cylinders))) + geom_boxplot() + scale_x_discrete(labels=c('3', '4', '5', '6', '8')) + 
  xlab('# of Cylinders')  + guides(fill=guide_legend(title='# of Cylinders'))
p

It appears the as the number of cylinders increase, the mpg decreases. In other words they are negatively correlated. Thus, it is reasonable to believe that cylinders could be used to predict the mpg.

To be fair though, since cylinders is strongly positively correlated with horsepower or weight, we could also use horsepower or weight to predict mpg. A learning algorithm would be needed to see which predictor variable is most informational for predicting mpg.

Problem 10

Problem 10a.)

dim(Boston)

## [1] 506  13

There are 506 rows and 14 columns. Each row represents an observation where each observation is housing values in a particular suburb of Boston. The 14 columns are the house characteristics for that particular suburb. For example, data on crime rate of town, proximity to Charles River, proximity to economic areas or highways is included.

Problem 10b.)

Will focus on the relationship between crime rate and industrial zoning.

p <- ggplot(Boston, aes(x=indus, y=crim)) + geom_point()
p

large.crime = which(Boston$crim > 10)

p2 <- ggplot(Boston[-large.crime,], aes(x=indus, y=crim)) + geom_point()
p2

There doesn’t appear to be any clear relationship between the industrial proportion (the proportion of non-retail business acres per town) and the crime rate. We might’ve guessed that crime happens in the more ‘industrial’ parts of town. Yet that cannot be argued very well.

All that can be said is those neighborhoods with industrial proportion of between 17 and 20 have particularly high crime rates. All that we can guess it that it must be a bunched neighborhoods around a hot crime spot

Problem 10c.)

crim.corr = Boston %>% correlate() %>% focus(crim)
crim.corr[which(abs(crim.corr[,2]) > 0.35),]

## # A tibble: 8 x 2
##   term    crim
##   <chr>  <dbl>
## 1 indus  0.407
## 2 nox    0.421
## 3 age    0.353
## 4 dis   -0.380
## 5 rad    0.626
## 6 tax    0.583
## 7 lstat  0.456
## 8 medv  -0.388

We can see that many of the predictor variables are moderately correlated with crime rate.

Problem 10d.)

qplot(Boston$crim, binwidth=4 , xlab = "Crime rate", ylab="# of Suburbs" )

It is interesting to note that there is a long skewed right tail. That means that yes, there are a couple neighborhoods with relatively high crime rates.

qplot(Boston$tax, binwidth=40 , xlab = "Full-value property-tax rate per $10,000", ylab="# of Suburbs")

There appears to be a bimodal distribution. This suggests there is a large gap between the lower-middle class and the upper class.

qplot(Boston$ptratio, binwidth=3, xlab ="Pupil-teacher ratio by town", ylab="Number of Suburbs")

There doesn’t appear to be any interesting to note or any outliers to mention in Pupil-teacher ratio by town. Most seem to be a tight range of 15-22.

Problem 10e.)

summary(Boston$chas==1)

##    Mode   FALSE    TRUE 
## logical     471      35

There are 35 suburbs that border the Charles River.

Problem 10f.)

median(Boston$ptratio)

## [1] 19.05

The median P-T ratio is 19.05

Problem 10g.)

min.medv.tract = Boston[which.min(Boston$medv),]
par(mfrow=c(5,3), mar=c(2, 2, 1, 0))
for (i in 1:ncol(Boston)){
  hist(Boston[, i], main=colnames(Boston)[i], breaks="FD")
  abline(v=Boston[which.min(Boston$medv), i], col="blue", lw=3)
}

From our graphic above, we can state the folling about this particular suburb.

This neighborhood has a very high crime rate compared to the median of the city. There are few to no zoned lots over 25,000 square feet. The proportion of industrial acres per town is above average. The neighborhood doesn’t border the Charles River. The nitrogen oxide concentration is above the median of the city. The neighborhood has one of the lower rooms per dwelling (i.e. each dwelling is a single or double unit).

Problem 10h.)

sum(Boston$rm > 7)

## [1] 64

sum(Boston$rm > 8)

## [1] 13

more8.rooms = Boston[which(Boston$rm>8),]
par(mfrow=c(5,3), mar=c(2, 2, 1, 0))
for (i in 1:ncol(Boston)){
  hist(Boston[, i], main=colnames(Boston)[i], breaks="FD")
  abline(v=Boston[which(Boston$rm>8), i], col="red", lw=1)
}

One of the more telling stories of this graphic is that these neighborhooods tend to be in some of the higher status neighborhoods (see the lstat histogram). This makes sense since if a dwelling has more than 8 rooms than it most likely in a very economically rich neighborhood.