by Niko Hellman

Part 1: Auto

library(tidyverse)
## ── Attaching packages ───────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2     ✓ purrr   0.3.4
## ✓ tibble  3.0.3     ✓ dplyr   1.0.2
## ✓ tidyr   1.1.2     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.5.0
## ── Conflicts ──────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(ggplot2)

# Read the Auto data; "?" marks missing values in the raw file.
# stringsAsFactors = TRUE is spelled out because the default became FALSE in
# R 4.0.0: without it `name` is read as character, str() no longer reports a
# factor, and pairs(Auto) stops with "non-numeric argument to 'pairs'".
Auto <- read.table(
  "http://faculty.marshall.usc.edu/gareth-james/ISL/Auto.data",
  header = TRUE,
  na.strings = "?",
  stringsAsFactors = TRUE
)

# Drop every row that contains at least one missing value.
Auto <- na.omit(Auto)
head(Auto)
##   mpg cylinders displacement horsepower weight acceleration year origin
## 1  18         8          307        130   3504         12.0   70      1
## 2  15         8          350        165   3693         11.5   70      1
## 3  18         8          318        150   3436         11.0   70      1
## 4  16         8          304        150   3433         12.0   70      1
## 5  17         8          302        140   3449         10.5   70      1
## 6  15         8          429        198   4341         10.0   70      1
##                        name
## 1 chevrolet chevelle malibu
## 2         buick skylark 320
## 3        plymouth satellite
## 4             amc rebel sst
## 5               ford torino
## 6          ford galaxie 500

A. Looking at our predictors!

str(Auto)
## 'data.frame':    392 obs. of  9 variables:
##  $ mpg         : num  18 15 18 16 17 15 14 14 14 15 ...
##  $ cylinders   : int  8 8 8 8 8 8 8 8 8 8 ...
##  $ displacement: num  307 350 318 304 302 429 454 440 455 390 ...
##  $ horsepower  : num  130 165 150 150 140 198 220 215 225 190 ...
##  $ weight      : num  3504 3693 3436 3433 3449 ...
##  $ acceleration: num  12 11.5 11 12 10.5 10 9 8.5 10 8.5 ...
##  $ year        : int  70 70 70 70 70 70 70 70 70 70 ...
##  $ origin      : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ name        : Factor w/ 304 levels "amc ambassador brougham",..: 49 36 231 14 161 141 54 223 241 2 ...
##  - attr(*, "na.action")= 'omit' Named int  33 127 331 337 355
##   ..- attr(*, "names")= chr  "33" "127" "331" "337" ...

The quantitative predictors are mpg, cylinders, displacement, horsepower, weight, acceleration, and year. The qualitative predictors are name and origin. (Note: origin is an int, but when looking up the data itself, we see that it has values 1, 2, and 3 to represent different countries, making it qualitative)

B. Range of quantitative predictors:

# Smallest and largest observed value of each quantitative column in Auto.
range(Auto$mpg)
## [1]  9.0 46.6
range(Auto$cylinders)
## [1] 3 8
range(Auto$displacement)
## [1]  68 455
range(Auto$horsepower)
## [1]  46 230
range(Auto$weight)
## [1] 1613 5140
range(Auto$acceleration)
## [1]  8.0 24.8
range(Auto$year)
## [1] 70 82

C. Mean and standard deviation:

# Mean followed by standard deviation for each quantitative column in Auto.
mean(Auto$mpg)
## [1] 23.44592
sd(Auto$mpg)
## [1] 7.805007
mean(Auto$cylinders)
## [1] 5.471939
sd(Auto$cylinders)
## [1] 1.705783
mean(Auto$displacement)
## [1] 194.412
sd(Auto$displacement)
## [1] 104.644
mean(Auto$horsepower)
## [1] 104.4694
sd(Auto$horsepower)
## [1] 38.49116
mean(Auto$weight)
## [1] 2977.584
sd(Auto$weight)
## [1] 849.4026
mean(Auto$acceleration)
## [1] 15.54133
sd(Auto$acceleration)
## [1] 2.758864
mean(Auto$year)
## [1] 75.97959
sd(Auto$year)
## [1] 3.683737

D. Autosub, a subset of Auto excluding observations 10-85:

# Drop rows 10 through 85 (by position) and keep every column.
Autosub <- Auto[-seq(10, 85), ]
head(Autosub)
##   mpg cylinders displacement horsepower weight acceleration year origin
## 1  18         8          307        130   3504         12.0   70      1
## 2  15         8          350        165   3693         11.5   70      1
## 3  18         8          318        150   3436         11.0   70      1
## 4  16         8          304        150   3433         12.0   70      1
## 5  17         8          302        140   3449         10.5   70      1
## 6  15         8          429        198   4341         10.0   70      1
##                        name
## 1 chevrolet chevelle malibu
## 2         buick skylark 320
## 3        plymouth satellite
## 4             amc rebel sst
## 5               ford torino
## 6          ford galaxie 500

Range, mean, and standard deviations of predictors in Autosub:

# Range, mean and standard deviation, in that order, for each quantitative
# column of Autosub.
range(Autosub$mpg)
## [1] 11.0 46.6
mean(Autosub$mpg)
## [1] 24.40443
sd(Autosub$mpg)
## [1] 7.867283
range(Autosub$cylinders)
## [1] 3 8
mean(Autosub$cylinders)
## [1] 5.373418
sd(Autosub$cylinders)
## [1] 1.654179
range(Autosub$displacement)
## [1]  68 455
mean(Autosub$displacement)
## [1] 187.2405
sd(Autosub$displacement)
## [1] 99.67837
range(Autosub$horsepower)
## [1]  46 230
mean(Autosub$horsepower)
## [1] 100.7215
sd(Autosub$horsepower)
## [1] 35.70885
range(Autosub$weight)
## [1] 1649 4997
mean(Autosub$weight)
## [1] 2935.972
sd(Autosub$weight)
## [1] 811.3002
range(Autosub$acceleration)
## [1]  8.5 24.8
mean(Autosub$acceleration)
## [1] 15.7269
sd(Autosub$acceleration)
## [1] 2.693721
range(Autosub$year)
## [1] 70 82
mean(Autosub$year)
## [1] 77.14557
sd(Autosub$year)
## [1] 3.106217

E. Visual representations:

pairs(Auto)

Starting here, we can identify interesting relationships to further investigate.

# Scatter of mpg against model year, coloured by origin.
# origin is stored as an integer code (1, 2, 3); wrapping it in factor() makes
# ggplot2 use one discrete colour per origin instead of a continuous gradient.
ggplot(Auto, aes(x = year, y = mpg, color = factor(origin))) +
  geom_point() +
  labs(color = "origin")  # keep the legend title as the plain column name

In this plot, we see an upward trend in mpg over the years, with Japanese cars (3) generally having the highest mpg and American cars (1) the lowest in each year.

# One bar per distinct mpg value; bar height is the number of cars with it.
ggplot(Auto) +
  geom_bar(aes(x = mpg))

This plot gives us a sense of the most common mpg values for cars in our data set. We see that 13 mpg is the most common with 20 cars, followed by 14 mpg and 18 mpg.

# Scatter of displacement (y) against weight (x) for every car in Auto.
ggplot(Auto) +
  geom_point(aes(x = weight, y = displacement))

Here we see an upward trend where displacement increases as weight increases.

F. Based on our first pairs() plot, we are able to view the relationships mpg holds with all other variables. We see clear negative relationships with displacement, horsepower, and weight. There appear to be positive relationships with acceleration and year. My first ggplot() shows the clear relationship between mpg and time. This relationship would lead us to predict a higher mpg as time increases. The representation of countries of origin (by color) would also lead us to predict a higher mpg for Japanese cars and a lower mpg for American cars.

Part 2: College

A. Loading our data set:

# Read the College data set.
# stringsAsFactors = TRUE is spelled out because the default became FALSE in
# R 4.0.0: summary() only reports the No/Yes counts for Private when it is a
# factor, and pairs() rejects character columns.
college <- read.csv(
  "http://faculty.marshall.usc.edu/gareth-james/ISL/College.csv",
  header = TRUE,
  stringsAsFactors = TRUE
)

B. Viewing and adjusting (note: View() has been replaced by head() for RPubs)

head(college)
##                              X Private Apps Accept Enroll Top10perc Top25perc
## 1 Abilene Christian University     Yes 1660   1232    721        23        52
## 2           Adelphi University     Yes 2186   1924    512        16        29
## 3               Adrian College     Yes 1428   1097    336        22        50
## 4          Agnes Scott College     Yes  417    349    137        60        89
## 5    Alaska Pacific University     Yes  193    146     55        16        44
## 6            Albertson College     Yes  587    479    158        38        62
##   F.Undergrad P.Undergrad Outstate Room.Board Books Personal PhD Terminal
## 1        2885         537     7440       3300   450     2200  70       78
## 2        2683        1227    12280       6450   750     1500  29       30
## 3        1036          99    11250       3750   400     1165  53       66
## 4         510          63    12960       5450   450      875  92       97
## 5         249         869     7560       4120   800     1500  76       72
## 6         678          41    13500       3335   500      675  67       73
##   S.F.Ratio perc.alumni Expend Grad.Rate
## 1      18.1          12   7041        60
## 2      12.2          16  10527        56
## 3      12.9          30   8735        54
## 4       7.7          37  19016        59
## 5      11.9           2  10922        15
## 6       9.4          11   9727        55
# Use the values of the first column as the row labels of the data frame.
row.names(college) <- college[[1]]
head(college)
##                                                         X Private Apps Accept
## Abilene Christian University Abilene Christian University     Yes 1660   1232
## Adelphi University                     Adelphi University     Yes 2186   1924
## Adrian College                             Adrian College     Yes 1428   1097
## Agnes Scott College                   Agnes Scott College     Yes  417    349
## Alaska Pacific University       Alaska Pacific University     Yes  193    146
## Albertson College                       Albertson College     Yes  587    479
##                              Enroll Top10perc Top25perc F.Undergrad P.Undergrad
## Abilene Christian University    721        23        52        2885         537
## Adelphi University              512        16        29        2683        1227
## Adrian College                  336        22        50        1036          99
## Agnes Scott College             137        60        89         510          63
## Alaska Pacific University        55        16        44         249         869
## Albertson College               158        38        62         678          41
##                              Outstate Room.Board Books Personal PhD Terminal
## Abilene Christian University     7440       3300   450     2200  70       78
## Adelphi University              12280       6450   750     1500  29       30
## Adrian College                  11250       3750   400     1165  53       66
## Agnes Scott College             12960       5450   450      875  92       97
## Alaska Pacific University        7560       4120   800     1500  76       72
## Albertson College               13500       3335   500      675  67       73
##                              S.F.Ratio perc.alumni Expend Grad.Rate
## Abilene Christian University      18.1          12   7041        60
## Adelphi University                12.2          16  10527        56
## Adrian College                    12.9          30   8735        54
## Agnes Scott College                7.7          37  19016        59
## Alaska Pacific University         11.9           2  10922        15
## Albertson College                  9.4          11   9727        55
# Remove the X column now that it is duplicated in the row names.
# Dropping by name rather than by position makes this safe to re-run: a second
# positional drop would silently remove Private instead.
college$X <- NULL
head(college)
##                              Private Apps Accept Enroll Top10perc Top25perc
## Abilene Christian University     Yes 1660   1232    721        23        52
## Adelphi University               Yes 2186   1924    512        16        29
## Adrian College                   Yes 1428   1097    336        22        50
## Agnes Scott College              Yes  417    349    137        60        89
## Alaska Pacific University        Yes  193    146     55        16        44
## Albertson College                Yes  587    479    158        38        62
##                              F.Undergrad P.Undergrad Outstate Room.Board Books
## Abilene Christian University        2885         537     7440       3300   450
## Adelphi University                  2683        1227    12280       6450   750
## Adrian College                      1036          99    11250       3750   400
## Agnes Scott College                  510          63    12960       5450   450
## Alaska Pacific University            249         869     7560       4120   800
## Albertson College                    678          41    13500       3335   500
##                              Personal PhD Terminal S.F.Ratio perc.alumni Expend
## Abilene Christian University     2200  70       78      18.1          12   7041
## Adelphi University               1500  29       30      12.2          16  10527
## Adrian College                   1165  53       66      12.9          30   8735
## Agnes Scott College               875  92       97       7.7          37  19016
## Alaska Pacific University        1500  76       72      11.9           2  10922
## Albertson College                 675  67       73       9.4          11   9727
##                              Grad.Rate
## Abilene Christian University        60
## Adelphi University                  56
## Adrian College                      54
## Agnes Scott College                 59
## Alaska Pacific University           15
## Albertson College                   55

C. Additional tasks:

  1. Numerical summary of variables
summary(college)
##  Private        Apps           Accept          Enroll       Top10perc    
##  No :212   Min.   :   81   Min.   :   72   Min.   :  35   Min.   : 1.00  
##  Yes:565   1st Qu.:  776   1st Qu.:  604   1st Qu.: 242   1st Qu.:15.00  
##            Median : 1558   Median : 1110   Median : 434   Median :23.00  
##            Mean   : 3002   Mean   : 2019   Mean   : 780   Mean   :27.56  
##            3rd Qu.: 3624   3rd Qu.: 2424   3rd Qu.: 902   3rd Qu.:35.00  
##            Max.   :48094   Max.   :26330   Max.   :6392   Max.   :96.00  
##    Top25perc      F.Undergrad     P.Undergrad         Outstate    
##  Min.   :  9.0   Min.   :  139   Min.   :    1.0   Min.   : 2340  
##  1st Qu.: 41.0   1st Qu.:  992   1st Qu.:   95.0   1st Qu.: 7320  
##  Median : 54.0   Median : 1707   Median :  353.0   Median : 9990  
##  Mean   : 55.8   Mean   : 3700   Mean   :  855.3   Mean   :10441  
##  3rd Qu.: 69.0   3rd Qu.: 4005   3rd Qu.:  967.0   3rd Qu.:12925  
##  Max.   :100.0   Max.   :31643   Max.   :21836.0   Max.   :21700  
##    Room.Board       Books           Personal         PhD        
##  Min.   :1780   Min.   :  96.0   Min.   : 250   Min.   :  8.00  
##  1st Qu.:3597   1st Qu.: 470.0   1st Qu.: 850   1st Qu.: 62.00  
##  Median :4200   Median : 500.0   Median :1200   Median : 75.00  
##  Mean   :4358   Mean   : 549.4   Mean   :1341   Mean   : 72.66  
##  3rd Qu.:5050   3rd Qu.: 600.0   3rd Qu.:1700   3rd Qu.: 85.00  
##  Max.   :8124   Max.   :2340.0   Max.   :6800   Max.   :103.00  
##     Terminal       S.F.Ratio      perc.alumni        Expend     
##  Min.   : 24.0   Min.   : 2.50   Min.   : 0.00   Min.   : 3186  
##  1st Qu.: 71.0   1st Qu.:11.50   1st Qu.:13.00   1st Qu.: 6751  
##  Median : 82.0   Median :13.60   Median :21.00   Median : 8377  
##  Mean   : 79.7   Mean   :14.09   Mean   :22.74   Mean   : 9660  
##  3rd Qu.: 92.0   3rd Qu.:16.50   3rd Qu.:31.00   3rd Qu.:10830  
##  Max.   :100.0   Max.   :39.80   Max.   :64.00   Max.   :56233  
##    Grad.Rate     
##  Min.   : 10.00  
##  1st Qu.: 53.00  
##  Median : 65.00  
##  Mean   : 65.46  
##  3rd Qu.: 78.00  
##  Max.   :118.00
  2. Scatterplot of first 10 variables
pairs(college[,1:10])

  3. Side-by-side boxplot of Outstate vs. Private
# One Outstate boxplot per level of Private, drawn side by side.
ggplot(college) +
  geom_boxplot(aes(x = Private, y = Outstate))

  4. New variable “Elite”
# Label each row "Yes" when its Top10perc exceeds 50 and "No" otherwise,
# then attach the label to college as a factor column named Elite.
Elite <- replace(
  rep("No", nrow(college)),
  which(college$Top10perc > 50),
  "Yes"
)
Elite <- factor(Elite)
college <- data.frame(college, Elite = Elite)

summary(college)
##  Private        Apps           Accept          Enroll       Top10perc    
##  No :212   Min.   :   81   Min.   :   72   Min.   :  35   Min.   : 1.00  
##  Yes:565   1st Qu.:  776   1st Qu.:  604   1st Qu.: 242   1st Qu.:15.00  
##            Median : 1558   Median : 1110   Median : 434   Median :23.00  
##            Mean   : 3002   Mean   : 2019   Mean   : 780   Mean   :27.56  
##            3rd Qu.: 3624   3rd Qu.: 2424   3rd Qu.: 902   3rd Qu.:35.00  
##            Max.   :48094   Max.   :26330   Max.   :6392   Max.   :96.00  
##    Top25perc      F.Undergrad     P.Undergrad         Outstate    
##  Min.   :  9.0   Min.   :  139   Min.   :    1.0   Min.   : 2340  
##  1st Qu.: 41.0   1st Qu.:  992   1st Qu.:   95.0   1st Qu.: 7320  
##  Median : 54.0   Median : 1707   Median :  353.0   Median : 9990  
##  Mean   : 55.8   Mean   : 3700   Mean   :  855.3   Mean   :10441  
##  3rd Qu.: 69.0   3rd Qu.: 4005   3rd Qu.:  967.0   3rd Qu.:12925  
##  Max.   :100.0   Max.   :31643   Max.   :21836.0   Max.   :21700  
##    Room.Board       Books           Personal         PhD        
##  Min.   :1780   Min.   :  96.0   Min.   : 250   Min.   :  8.00  
##  1st Qu.:3597   1st Qu.: 470.0   1st Qu.: 850   1st Qu.: 62.00  
##  Median :4200   Median : 500.0   Median :1200   Median : 75.00  
##  Mean   :4358   Mean   : 549.4   Mean   :1341   Mean   : 72.66  
##  3rd Qu.:5050   3rd Qu.: 600.0   3rd Qu.:1700   3rd Qu.: 85.00  
##  Max.   :8124   Max.   :2340.0   Max.   :6800   Max.   :103.00  
##     Terminal       S.F.Ratio      perc.alumni        Expend     
##  Min.   : 24.0   Min.   : 2.50   Min.   : 0.00   Min.   : 3186  
##  1st Qu.: 71.0   1st Qu.:11.50   1st Qu.:13.00   1st Qu.: 6751  
##  Median : 82.0   Median :13.60   Median :21.00   Median : 8377  
##  Mean   : 79.7   Mean   :14.09   Mean   :22.74   Mean   : 9660  
##  3rd Qu.: 92.0   3rd Qu.:16.50   3rd Qu.:31.00   3rd Qu.:10830  
##  Max.   :100.0   Max.   :39.80   Max.   :64.00   Max.   :56233  
##    Grad.Rate      Elite    
##  Min.   : 10.00   No :699  
##  1st Qu.: 53.00   Yes: 78  
##  Median : 65.00            
##  Mean   : 65.46            
##  3rd Qu.: 78.00            
##  Max.   :118.00
# One Outstate boxplot per level of Elite, drawn side by side.
ggplot(college) +
  geom_boxplot(aes(x = Elite, y = Outstate))