R Markdown

# Homework #2 Script

library(tidyverse)
## ── Attaching packages ─────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2     ✓ purrr   0.3.4
## ✓ tibble  3.0.3     ✓ dplyr   1.0.2
## ✓ tidyr   1.1.2     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.5.0
## ── Conflicts ────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
# Load the Auto data set; the file marks missing values with "?", which are
# read in as NA.
Auto <- read.table(
  file = "http://faculty.marshall.usc.edu/gareth-james/ISL/Auto.data",
  header = TRUE,
  na.strings = "?"
)

Section A

# Show the storage type and the first few values of every column in Auto
str(Auto)
## 'data.frame':    397 obs. of  9 variables:
##  $ mpg         : num  18 15 18 16 17 15 14 14 14 15 ...
##  $ cylinders   : int  8 8 8 8 8 8 8 8 8 8 ...
##  $ displacement: num  307 350 318 304 302 429 454 440 455 390 ...
##  $ horsepower  : num  130 165 150 150 140 198 220 215 225 190 ...
##  $ weight      : num  3504 3693 3436 3433 3449 ...
##  $ acceleration: num  12 11.5 11 12 10.5 10 9 8.5 10 8.5 ...
##  $ year        : int  70 70 70 70 70 70 70 70 70 70 ...
##  $ origin      : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ name        : chr  "chevrolet chevelle malibu" "buick skylark 320" "plymouth satellite" "amc rebel sst" ...
# Preview the first six rows of Auto
head(Auto)
##   mpg cylinders displacement horsepower weight acceleration year origin
## 1  18         8          307        130   3504         12.0   70      1
## 2  15         8          350        165   3693         11.5   70      1
## 3  18         8          318        150   3436         11.0   70      1
## 4  16         8          304        150   3433         12.0   70      1
## 5  17         8          302        140   3449         10.5   70      1
## 6  15         8          429        198   4341         10.0   70      1
##                        name
## 1 chevrolet chevelle malibu
## 2         buick skylark 320
## 3        plymouth satellite
## 4             amc rebel sst
## 5               ford torino
## 6          ford galaxie 500
# Quantitative variables: mpg, cylinders, displacement, horsepower, weight,
# acceleration, and year.

# Qualitative variables: origin and name.

Section B

# Ranges of all quantitative variables.
# na.rm = TRUE is passed throughout because horsepower contains missing
# values, which would otherwise make range() return NA NA.

range(Auto$mpg, na.rm = TRUE)
## [1]  9.0 46.6
range(Auto$cylinders, na.rm = TRUE)
## [1] 3 8
range(Auto$displacement, na.rm = TRUE)
## [1]  68 455
range(Auto$horsepower, na.rm = TRUE)
## [1]  46 230
range(Auto$weight, na.rm = TRUE)
## [1] 1613 5140
range(Auto$acceleration, na.rm = TRUE)
## [1]  8.0 24.8
range(Auto$year, na.rm = TRUE)
## [1] 70 82

Section C

# Mean and standard deviation of all quantitative variables
# (na.rm = TRUE skips the missing horsepower values):

mean(Auto[["mpg"]], na.rm = TRUE)
## [1] 23.51587
sd(Auto[["mpg"]], na.rm = TRUE)
## [1] 7.825804
mean(Auto[["cylinders"]], na.rm = TRUE)
## [1] 5.458438
sd(Auto[["cylinders"]], na.rm = TRUE)
## [1] 1.701577
mean(Auto[["displacement"]], na.rm = TRUE)
## [1] 193.5327
sd(Auto[["displacement"]], na.rm = TRUE)
## [1] 104.3796
mean(Auto[["horsepower"]], na.rm = TRUE)
## [1] 104.4694
sd(Auto[["horsepower"]], na.rm = TRUE)
## [1] 38.49116
mean(Auto[["weight"]], na.rm = TRUE)
## [1] 2970.262
sd(Auto[["weight"]], na.rm = TRUE)
## [1] 847.9041
mean(Auto[["acceleration"]], na.rm = TRUE)
## [1] 15.55567
sd(Auto[["acceleration"]], na.rm = TRUE)
## [1] 2.749995
mean(Auto[["year"]], na.rm = TRUE)
## [1] 75.99496
sd(Auto[["year"]], na.rm = TRUE)
## [1] 3.690005

Section D

# Range, mean, and sd of the quantitative variables after removing
# observations (rows) 10 through 85.

# The trailing comma makes the negative index apply to rows. Without it,
# Auto[-c(10:85)] indexes columns, drops nothing from the 9-column data frame,
# and every statistic comes out identical to the full-data value.
indexedAuto <- Auto[-c(10:85), ]

# NOTE(review): the "##" results below were printed by the earlier
# column-indexing version, so they still show the full-data values.
# Re-knit the document to refresh them.

range(indexedAuto$mpg, na.rm = TRUE)
## [1]  9.0 46.6
mean(indexedAuto$mpg, na.rm = TRUE)
## [1] 23.51587
sd(indexedAuto$mpg, na.rm = TRUE)
## [1] 7.825804
range(indexedAuto$cylinders, na.rm = TRUE)
## [1] 3 8
mean(indexedAuto$cylinders, na.rm = TRUE)
## [1] 5.458438
sd(indexedAuto$cylinders, na.rm = TRUE)
## [1] 1.701577
range(indexedAuto$displacement, na.rm = TRUE)
## [1]  68 455
mean(indexedAuto$displacement, na.rm = TRUE)
## [1] 193.5327
sd(indexedAuto$displacement, na.rm = TRUE)
## [1] 104.3796
range(indexedAuto$horsepower, na.rm = TRUE)
## [1]  46 230
mean(indexedAuto$horsepower, na.rm = TRUE)
## [1] 104.4694
sd(indexedAuto$horsepower, na.rm = TRUE)
## [1] 38.49116
range(indexedAuto$weight, na.rm = TRUE)
## [1] 1613 5140
mean(indexedAuto$weight, na.rm = TRUE)
## [1] 2970.262
sd(indexedAuto$weight, na.rm = TRUE)
## [1] 847.9041
range(indexedAuto$acceleration, na.rm = TRUE)
## [1]  8.0 24.8
mean(indexedAuto$acceleration, na.rm = TRUE)
## [1] 15.55567
sd(indexedAuto$acceleration, na.rm = TRUE)
## [1] 2.749995
range(indexedAuto$year, na.rm = TRUE)
## [1] 70 82
mean(indexedAuto$year, na.rm = TRUE)
## [1] 75.99496
sd(indexedAuto$year, na.rm = TRUE)
## [1] 3.690005

Section E

# Scatterplot matrix of the first six columns (mpg through acceleration).
pairs(Auto[, 1:6])

# This shows a moderate positive correlation between weight and horsepower.
ggplot(data = Auto, mapping = aes(x = weight, y = horsepower)) +
  geom_point()
## Warning: Removed 5 rows containing missing values (geom_point).

# This shows a moderate negative correlation between mpg and weight.
ggplot(data = Auto, mapping = aes(x = mpg, y = weight)) +
  geom_point()

# This shows a small negative correlation between displacement and acceleration.
ggplot(data = Auto, mapping = aes(x = displacement, y = acceleration)) +
  geom_point()

Section F

# Scatterplot matrix of columns 1-6 (mpg through acceleration), used to judge
# which variables are associated with mpg.
pairs(Auto[, 1:6])

# Based on these plots, displacement, horsepower, and weight look like the
# most useful predictors of mpg: they have the strongest associations with it.

Problem 2: College Data

Section A

# Read the College data and store the Private column as a factor.
college <- read.csv(
  file = "http://faculty.marshall.usc.edu/gareth-james/ISL/College.csv",
  header = TRUE
)
college$Private <- as.factor(college$Private)

Section B

# Use the college names (first column) as the row names, then drop that
# column so it is no longer treated as a variable.
row.names(college) <- college[[1]]
college <- college[, -1]

# View(college)

Section C

# a.)
# Numerical summary of every column in the college data
summary(college)
##  Private        Apps           Accept          Enroll       Top10perc    
##  No :212   Min.   :   81   Min.   :   72   Min.   :  35   Min.   : 1.00  
##  Yes:565   1st Qu.:  776   1st Qu.:  604   1st Qu.: 242   1st Qu.:15.00  
##            Median : 1558   Median : 1110   Median : 434   Median :23.00  
##            Mean   : 3002   Mean   : 2019   Mean   : 780   Mean   :27.56  
##            3rd Qu.: 3624   3rd Qu.: 2424   3rd Qu.: 902   3rd Qu.:35.00  
##            Max.   :48094   Max.   :26330   Max.   :6392   Max.   :96.00  
##    Top25perc      F.Undergrad     P.Undergrad         Outstate    
##  Min.   :  9.0   Min.   :  139   Min.   :    1.0   Min.   : 2340  
##  1st Qu.: 41.0   1st Qu.:  992   1st Qu.:   95.0   1st Qu.: 7320  
##  Median : 54.0   Median : 1707   Median :  353.0   Median : 9990  
##  Mean   : 55.8   Mean   : 3700   Mean   :  855.3   Mean   :10441  
##  3rd Qu.: 69.0   3rd Qu.: 4005   3rd Qu.:  967.0   3rd Qu.:12925  
##  Max.   :100.0   Max.   :31643   Max.   :21836.0   Max.   :21700  
##    Room.Board       Books           Personal         PhD        
##  Min.   :1780   Min.   :  96.0   Min.   : 250   Min.   :  8.00  
##  1st Qu.:3597   1st Qu.: 470.0   1st Qu.: 850   1st Qu.: 62.00  
##  Median :4200   Median : 500.0   Median :1200   Median : 75.00  
##  Mean   :4358   Mean   : 549.4   Mean   :1341   Mean   : 72.66  
##  3rd Qu.:5050   3rd Qu.: 600.0   3rd Qu.:1700   3rd Qu.: 85.00  
##  Max.   :8124   Max.   :2340.0   Max.   :6800   Max.   :103.00  
##     Terminal       S.F.Ratio      perc.alumni        Expend     
##  Min.   : 24.0   Min.   : 2.50   Min.   : 0.00   Min.   : 3186  
##  1st Qu.: 71.0   1st Qu.:11.50   1st Qu.:13.00   1st Qu.: 6751  
##  Median : 82.0   Median :13.60   Median :21.00   Median : 8377  
##  Mean   : 79.7   Mean   :14.09   Mean   :22.74   Mean   : 9660  
##  3rd Qu.: 92.0   3rd Qu.:16.50   3rd Qu.:31.00   3rd Qu.:10830  
##  Max.   :100.0   Max.   :39.80   Max.   :64.00   Max.   :56233  
##    Grad.Rate     
##  Min.   : 10.00  
##  1st Qu.: 53.00  
##  Median : 65.00  
##  Mean   : 65.46  
##  3rd Qu.: 78.00  
##  Max.   :118.00
# b.)
# Scatterplot matrix of the first ten columns
pairs(college[, 1:10])

# c.)
# Boxplots of Outstate for each level of Private
ggplot(
  data = college,
  mapping = aes(x = Outstate, y = Private, fill = Private)
) +
  geom_boxplot()

# d.)

# Create a factor called Elite that flags the colleges where more than half
# the students were in the top 10% of their high school class.

# which() keeps only the rows where the comparison is TRUE, so every other row
# stays "No".
Elite <- as.factor(
  replace(rep("No", nrow(college)), which(college$Top10perc > 50), "Yes")
)
college <- data.frame(college, Elite)

summary(Elite)
##  No Yes 
## 699  78
ggplot(data = college, mapping = aes(x = Outstate, y = Elite, fill = Elite)) +
  geom_boxplot()