# Homework #2 Script
library(tidyverse)
## ── Attaching packages ─────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2 ✓ purrr 0.3.4
## ✓ tibble 3.0.3 ✓ dplyr 1.0.2
## ✓ tidyr 1.1.2 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.5.0
## ── Conflicts ────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
# Load the Auto data set; "?" entries in the raw file are read in as NA.
Auto <- read.table(
  file = "http://faculty.marshall.usc.edu/gareth-james/ISL/Auto.data",
  header = TRUE,
  na.strings = "?"
)
# Section A ----
# Inspect the column types and the first few values of every variable.
utils::str(Auto)
## 'data.frame': 397 obs. of 9 variables:
## $ mpg : num 18 15 18 16 17 15 14 14 14 15 ...
## $ cylinders : int 8 8 8 8 8 8 8 8 8 8 ...
## $ displacement: num 307 350 318 304 302 429 454 440 455 390 ...
## $ horsepower : num 130 165 150 150 140 198 220 215 225 190 ...
## $ weight : num 3504 3693 3436 3433 3449 ...
## $ acceleration: num 12 11.5 11 12 10.5 10 9 8.5 10 8.5 ...
## $ year : int 70 70 70 70 70 70 70 70 70 70 ...
## $ origin : int 1 1 1 1 1 1 1 1 1 1 ...
## $ name : chr "chevrolet chevelle malibu" "buick skylark 320" "plymouth satellite" "amc rebel sst" ...
# Show the first six observations.
head(Auto, n = 6L)
## mpg cylinders displacement horsepower weight acceleration year origin
## 1 18 8 307 130 3504 12.0 70 1
## 2 15 8 350 165 3693 11.5 70 1
## 3 18 8 318 150 3436 11.0 70 1
## 4 16 8 304 150 3433 12.0 70 1
## 5 17 8 302 140 3449 10.5 70 1
## 6 15 8 429 198 4341 10.0 70 1
## name
## 1 chevrolet chevelle malibu
## 2 buick skylark 320
## 3 plymouth satellite
## 4 amc rebel sst
## 5 ford torino
## 6 ford galaxie 500
# Quantitative variables include: mpg, cylinders, displacement, horsepower,
# weight, acceleration, and year.
# Qualitative variables are origin and name.
# Section B ----
# Ranges of all quantitative variables.
# na.rm = TRUE is needed because horsepower contains missing values (the "?"
# entries in the raw file are read as NA); without it range() returns NA NA.
# cylinders is included so this section covers the same variables as the
# mean/sd calculations.
range(Auto$mpg, na.rm = TRUE)
## [1] 9.0 46.6
range(Auto$cylinders, na.rm = TRUE)
## [1] 3 8
range(Auto$displacement, na.rm = TRUE)
## [1] 68 455
range(Auto$horsepower, na.rm = TRUE)
## [1] 46 230
range(Auto$weight, na.rm = TRUE)
## [1] 1613 5140
range(Auto$acceleration, na.rm = TRUE)
## [1] 8.0 24.8
range(Auto$year, na.rm = TRUE)
## [1] 70 82
# Section C ----
# Mean and standard deviation of each quantitative variable.
# Missing values are dropped (na.rm = TRUE) before computing each statistic.
mean(Auto[["mpg"]], na.rm = TRUE)
## [1] 23.51587
sd(Auto[["mpg"]], na.rm = TRUE)
## [1] 7.825804
mean(Auto[["cylinders"]], na.rm = TRUE)
## [1] 5.458438
sd(Auto[["cylinders"]], na.rm = TRUE)
## [1] 1.701577
mean(Auto[["displacement"]], na.rm = TRUE)
## [1] 193.5327
sd(Auto[["displacement"]], na.rm = TRUE)
## [1] 104.3796
mean(Auto[["horsepower"]], na.rm = TRUE)
## [1] 104.4694
sd(Auto[["horsepower"]], na.rm = TRUE)
## [1] 38.49116
mean(Auto[["weight"]], na.rm = TRUE)
## [1] 2970.262
sd(Auto[["weight"]], na.rm = TRUE)
## [1] 847.9041
mean(Auto[["acceleration"]], na.rm = TRUE)
## [1] 15.55567
sd(Auto[["acceleration"]], na.rm = TRUE)
## [1] 2.749995
mean(Auto[["year"]], na.rm = TRUE)
## [1] 75.99496
sd(Auto[["year"]], na.rm = TRUE)
## [1] 3.690005
# Section D ----
# Range, mean, and sd of the quantitative variables after removing the
# 10th through 85th observations.
#
# The trailing comma matters: Auto[-(10:85), ] drops ROWS 10-85. Without it,
# Auto[-c(10:85)] is a column index, and because Auto has only 9 columns
# nothing is removed, so every statistic equals the full-data value.
#
# NOTE: output is not shown here. The previously recorded values were computed
# on the full data set (no rows had been removed), so re-run this section to
# regenerate them.
indexedAuto <- Auto[-(10:85), ]

range(indexedAuto$mpg, na.rm = TRUE)
mean(indexedAuto$mpg, na.rm = TRUE)
sd(indexedAuto$mpg, na.rm = TRUE)

range(indexedAuto$cylinders, na.rm = TRUE)
mean(indexedAuto$cylinders, na.rm = TRUE)
sd(indexedAuto$cylinders, na.rm = TRUE)

range(indexedAuto$displacement, na.rm = TRUE)
mean(indexedAuto$displacement, na.rm = TRUE)
sd(indexedAuto$displacement, na.rm = TRUE)

range(indexedAuto$horsepower, na.rm = TRUE)
mean(indexedAuto$horsepower, na.rm = TRUE)
sd(indexedAuto$horsepower, na.rm = TRUE)

range(indexedAuto$weight, na.rm = TRUE)
mean(indexedAuto$weight, na.rm = TRUE)
sd(indexedAuto$weight, na.rm = TRUE)

range(indexedAuto$acceleration, na.rm = TRUE)
mean(indexedAuto$acceleration, na.rm = TRUE)
sd(indexedAuto$acceleration, na.rm = TRUE)

range(indexedAuto$year, na.rm = TRUE)
mean(indexedAuto$year, na.rm = TRUE)
sd(indexedAuto$year, na.rm = TRUE)
# Section E ----
# Scatterplot matrix of the first six columns of Auto.
pairs(Auto[, 1:6])
# This shows a moderate positive correlation between weight and horsepower.
ggplot(data = Auto, mapping = aes(x = weight, y = horsepower)) +
  geom_point()
## Warning: Removed 5 rows containing missing values (geom_point).
# This shows a moderate negative correlation between mpg and weight.
ggplot(data = Auto, mapping = aes(x = mpg, y = weight)) +
  geom_point()
# This shows a small negative correlation between displacement and acceleration.
ggplot(data = Auto, mapping = aes(x = displacement, y = acceleration)) +
  geom_point()
# Section F ----
# Scatterplot matrix of mpg against the other numeric predictors
# (the first six columns of Auto, selected here by name).
pairs(Auto[, c("mpg", "cylinders", "displacement",
               "horsepower", "weight", "acceleration")])
# Judging from these plots, displacement, horsepower, and weight look like
# useful predictors of mpg: they show the strongest associations with it.
# Problem 2: College Data ----
# Section A ----
# Load the College data set and store the Private indicator as a factor.
college <- read.csv(
  file = "http://faculty.marshall.usc.edu/gareth-james/ISL/College.csv",
  header = TRUE
)
college[["Private"]] <- as.factor(college[["Private"]])
# Section B ----
# Use the first column (the college name) as the row names, then drop that
# column so it is no longer treated as a variable.
rownames(college) <- college[[1]]
college <- college[, -1]
# View(college)
# Section C ----
# a.) Numerical summary of every variable in the college data.
summary(object = college)
## Private Apps Accept Enroll Top10perc
## No :212 Min. : 81 Min. : 72 Min. : 35 Min. : 1.00
## Yes:565 1st Qu.: 776 1st Qu.: 604 1st Qu.: 242 1st Qu.:15.00
## Median : 1558 Median : 1110 Median : 434 Median :23.00
## Mean : 3002 Mean : 2019 Mean : 780 Mean :27.56
## 3rd Qu.: 3624 3rd Qu.: 2424 3rd Qu.: 902 3rd Qu.:35.00
## Max. :48094 Max. :26330 Max. :6392 Max. :96.00
## Top25perc F.Undergrad P.Undergrad Outstate
## Min. : 9.0 Min. : 139 Min. : 1.0 Min. : 2340
## 1st Qu.: 41.0 1st Qu.: 992 1st Qu.: 95.0 1st Qu.: 7320
## Median : 54.0 Median : 1707 Median : 353.0 Median : 9990
## Mean : 55.8 Mean : 3700 Mean : 855.3 Mean :10441
## 3rd Qu.: 69.0 3rd Qu.: 4005 3rd Qu.: 967.0 3rd Qu.:12925
## Max. :100.0 Max. :31643 Max. :21836.0 Max. :21700
## Room.Board Books Personal PhD
## Min. :1780 Min. : 96.0 Min. : 250 Min. : 8.00
## 1st Qu.:3597 1st Qu.: 470.0 1st Qu.: 850 1st Qu.: 62.00
## Median :4200 Median : 500.0 Median :1200 Median : 75.00
## Mean :4358 Mean : 549.4 Mean :1341 Mean : 72.66
## 3rd Qu.:5050 3rd Qu.: 600.0 3rd Qu.:1700 3rd Qu.: 85.00
## Max. :8124 Max. :2340.0 Max. :6800 Max. :103.00
## Terminal S.F.Ratio perc.alumni Expend
## Min. : 24.0 Min. : 2.50 Min. : 0.00 Min. : 3186
## 1st Qu.: 71.0 1st Qu.:11.50 1st Qu.:13.00 1st Qu.: 6751
## Median : 82.0 Median :13.60 Median :21.00 Median : 8377
## Mean : 79.7 Mean :14.09 Mean :22.74 Mean : 9660
## 3rd Qu.: 92.0 3rd Qu.:16.50 3rd Qu.:31.00 3rd Qu.:10830
## Max. :100.0 Max. :39.80 Max. :64.00 Max. :56233
## Grad.Rate
## Min. : 10.00
## 1st Qu.: 53.00
## Median : 65.00
## Mean : 65.46
## 3rd Qu.: 78.00
## Max. :118.00
# b.) Scatterplot matrix of the first ten columns.
pairs(college[, 1:10])
# c.) Side-by-side boxplots of out-of-state tuition for private vs. public.
ggplot(
  data = college,
  mapping = aes(x = Outstate, y = Private, fill = Private)
) +
  geom_boxplot()
# d.)
# Build a factor Elite that is "Yes" for colleges where more than 50% of new
# students came from the top 10% of their high school class, "No" otherwise,
# then append it to the college data frame.
Elite <- as.factor(
  replace(rep("No", nrow(college)), college$Top10perc > 50, "Yes")
)
college <- data.frame(college, Elite)
summary(Elite)
## No Yes
## 699 78
ggplot(
  data = college,
  mapping = aes(x = Outstate, y = Elite, fill = Elite)
) +
  geom_boxplot()