Getting set up for problem 1: I loaded the tidyverse and read in the Auto data set. Then, I did a listwise deletion of all rows with missing values.
library(tidyverse)
## -- Attaching packages ---------------------------------------------------------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.2 v purrr 0.3.4
## v tibble 3.0.3 v dplyr 1.0.2
## v tidyr 1.1.1 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.5.0
## -- Conflicts ------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
# Read the Auto data; the raw file marks missing values with "?".
# Rows containing any NA are then dropped (listwise deletion).
Auto <- na.omit(
  read.table(
    "http://faculty.marshall.usc.edu/gareth-james/ISL/Auto.data",
    header = TRUE,
    na.strings = "?"
  )
)
Problem 1: Auto Data
#problem 1a
# Show the structure (column names and types) of the cleaned data.
Auto %>%
  str()
## 'data.frame': 392 obs. of 9 variables:
## $ mpg : num 18 15 18 16 17 15 14 14 14 15 ...
## $ cylinders : int 8 8 8 8 8 8 8 8 8 8 ...
## $ displacement: num 307 350 318 304 302 429 454 440 455 390 ...
## $ horsepower : num 130 165 150 150 140 198 220 215 225 190 ...
## $ weight : num 3504 3693 3436 3433 3449 ...
## $ acceleration: num 12 11.5 11 12 10.5 10 9 8.5 10 8.5 ...
## $ year : int 70 70 70 70 70 70 70 70 70 70 ...
## $ origin : int 1 1 1 1 1 1 1 1 1 1 ...
## $ name : chr "chevrolet chevelle malibu" "buick skylark 320" "plymouth satellite" "amc rebel sst" ...
## - attr(*, "na.action")= 'omit' Named int [1:5] 33 127 331 337 355
## ..- attr(*, "names")= chr [1:5] "33" "127" "331" "337" ...
#problem 1b
# Range (minimum, then maximum) of each non-name column, one call per column.
summarise(Auto, range(mpg))
## range(mpg)
## 1 9.0
## 2 46.6
summarise(Auto, range(cylinders))
## range(cylinders)
## 1 3
## 2 8
summarise(Auto, range(displacement))
## range(displacement)
## 1 68
## 2 455
summarise(Auto, range(horsepower))
## range(horsepower)
## 1 46
## 2 230
summarise(Auto, range(weight))
## range(weight)
## 1 1613
## 2 5140
summarise(Auto, range(acceleration))
## range(acceleration)
## 1 8.0
## 2 24.8
summarise(Auto, range(year))
## range(year)
## 1 70
## 2 82
summarise(Auto, range(origin))
## range(origin)
## 1 1
## 2 3
#problem 1c
# Mean and standard deviation of each non-name column, one call per column.
summarise(Auto, mean(mpg), sd(mpg))
## mean(mpg) sd(mpg)
## 1 23.44592 7.805007
summarise(Auto, mean(cylinders), sd(cylinders))
## mean(cylinders) sd(cylinders)
## 1 5.471939 1.705783
summarise(Auto, mean(displacement), sd(displacement))
## mean(displacement) sd(displacement)
## 1 194.412 104.644
summarise(Auto, mean(horsepower), sd(horsepower))
## mean(horsepower) sd(horsepower)
## 1 104.4694 38.49116
summarise(Auto, mean(weight), sd(weight))
## mean(weight) sd(weight)
## 1 2977.584 849.4026
summarise(Auto, mean(acceleration), sd(acceleration))
## mean(acceleration) sd(acceleration)
## 1 15.54133 2.758864
summarise(Auto, mean(year), sd(year))
## mean(year) sd(year)
## 1 75.97959 3.683737
summarise(Auto, mean(origin), sd(origin))
## mean(origin) sd(origin)
## 1 1.576531 0.8055182
#problem 1d
# Drop observations 10 through 85 (inclusive) and recompute the mean and
# standard deviation of each column on the 316 rows that remain.
# Negative indexing removes exactly rows 10..85; note that an index such as
# c(1:10, 85:392) would still keep rows 10 and 85.
# NOTE(review): assumes the task is ISLR exercise 2.9(d) ("remove the 10th
# through 85th observations") -- confirm against the assignment text.
Auto_rowsremoved <- Auto[-(10:85), ]
Auto_rowsremoved %>%
  summarise(mean(mpg), sd(mpg))
## mean(mpg) sd(mpg)
## 1 24.40443 7.867283
Auto_rowsremoved %>%
  summarise(mean(cylinders), sd(cylinders))
## mean(cylinders) sd(cylinders)
## 1 5.373418 1.654179
Auto_rowsremoved %>%
  summarise(mean(displacement), sd(displacement))
## mean(displacement) sd(displacement)
## 1 187.2405 99.67837
Auto_rowsremoved %>%
  summarise(mean(horsepower), sd(horsepower))
## mean(horsepower) sd(horsepower)
## 1 100.7215 35.70885
Auto_rowsremoved %>%
  summarise(mean(weight), sd(weight))
## mean(weight) sd(weight)
## 1 2935.972 811.3002
Auto_rowsremoved %>%
  summarise(mean(acceleration), sd(acceleration))
## mean(acceleration) sd(acceleration)
## 1 15.7269 2.693721
Auto_rowsremoved %>%
  summarise(mean(year), sd(year))
## mean(year) sd(year)
## 1 77.14557 3.106217
Auto_rowsremoved %>%
  summarise(mean(origin), sd(origin))
## mean(origin) sd(origin)
## 1 1.601266 0.81991
In problem 1e, I used scatter plots to look at relationships between variables. I first wanted to know if there was a relationship between time and MPG. The first plot looks at MPG over time and also takes weight into account. It looks like there is a relationship between time and MPG: as time increases, MPG increases. The colors of this plot lead me to my next question: is there a relationship between MPG and weight? I plotted them next and found that yes, not only is there a relationship between weight and MPG, but also with the origin of the car (in color). Then, I wanted to know if horsepower has changed over time, so I plotted this relationship and found that there isn’t really any relationship here, but there is more variation in the early 70s than in the early 80s.
#problem 1e
# Scatter plots exploring how mpg, weight, horsepower and year relate.
library(ggplot2)
# mpg against year, shaded by weight (a continuous colour scale).
ggplot(Auto, aes(x = year, y = mpg, color = weight)) +
  geom_point() +
  expand_limits(y = 0)
# mpg against weight, coloured by origin. origin is stored as an integer
# code (values 1 to 3), so it is mapped as a factor to get one distinct
# colour per origin instead of a continuous gradient; labs() keeps the
# legend title as "origin".
ggplot(Auto, aes(x = weight, y = mpg, color = factor(origin))) +
  geom_point() +
  labs(color = "origin") +
  expand_limits(y = 0)
# horsepower against year.
ggplot(Auto, aes(x = year, y = horsepower)) +
  geom_point() +
  expand_limits(y = 0)
Problem 1f asks us to find any predictors that are related to mpg. I found that mpg has a positive relationship with year and a negative relationship with weight. The origin of the car is also related to MPG.
Problem 2: College Data
#Problem 2a
# Read the College data set.
college <- read.csv(
  "http://faculty.marshall.usc.edu/gareth-james/ISL/College.csv",
  header = TRUE
)
#Problem 2b
# Use the first column as the row names (presumably the college names --
# verify against the data), then drop that column from the data frame.
#View(college)
rownames(college) <- college[[1]]
#View(college)
college <- college[-1]
str(college)
## 'data.frame': 777 obs. of 18 variables:
## $ Private : chr "Yes" "Yes" "Yes" "Yes" ...
## $ Apps : int 1660 2186 1428 417 193 587 353 1899 1038 582 ...
## $ Accept : int 1232 1924 1097 349 146 479 340 1720 839 498 ...
## $ Enroll : int 721 512 336 137 55 158 103 489 227 172 ...
## $ Top10perc : int 23 16 22 60 16 38 17 37 30 21 ...
## $ Top25perc : int 52 29 50 89 44 62 45 68 63 44 ...
## $ F.Undergrad: int 2885 2683 1036 510 249 678 416 1594 973 799 ...
## $ P.Undergrad: int 537 1227 99 63 869 41 230 32 306 78 ...
## $ Outstate : int 7440 12280 11250 12960 7560 13500 13290 13868 15595 10468 ...
## $ Room.Board : int 3300 6450 3750 5450 4120 3335 5720 4826 4400 3380 ...
## $ Books : int 450 750 400 450 800 500 500 450 300 660 ...
## $ Personal : int 2200 1500 1165 875 1500 675 1500 850 500 1800 ...
## $ PhD : int 70 29 53 92 76 67 90 89 79 40 ...
## $ Terminal : int 78 30 66 97 72 73 93 100 84 41 ...
## $ S.F.Ratio : num 18.1 12.2 12.9 7.7 11.9 9.4 11.5 13.7 11.3 11.5 ...
## $ perc.alumni: int 12 16 30 37 2 11 26 37 23 15 ...
## $ Expend : int 7041 10527 8735 19016 10922 9727 8861 11487 11644 8991 ...
## $ Grad.Rate : int 60 56 54 59 15 55 63 73 80 52 ...
# Store Private as a factor rather than a character vector.
college$Private <- factor(college$Private)
#Problem 2c
#a- Summary
# Per-column summary of the whole data frame.
college %>%
  summary()
## Private Apps Accept Enroll Top10perc
## No :212 Min. : 81 Min. : 72 Min. : 35 Min. : 1.00
## Yes:565 1st Qu.: 776 1st Qu.: 604 1st Qu.: 242 1st Qu.:15.00
## Median : 1558 Median : 1110 Median : 434 Median :23.00
## Mean : 3002 Mean : 2019 Mean : 780 Mean :27.56
## 3rd Qu.: 3624 3rd Qu.: 2424 3rd Qu.: 902 3rd Qu.:35.00
## Max. :48094 Max. :26330 Max. :6392 Max. :96.00
## Top25perc F.Undergrad P.Undergrad Outstate
## Min. : 9.0 Min. : 139 Min. : 1.0 Min. : 2340
## 1st Qu.: 41.0 1st Qu.: 992 1st Qu.: 95.0 1st Qu.: 7320
## Median : 54.0 Median : 1707 Median : 353.0 Median : 9990
## Mean : 55.8 Mean : 3700 Mean : 855.3 Mean :10441
## 3rd Qu.: 69.0 3rd Qu.: 4005 3rd Qu.: 967.0 3rd Qu.:12925
## Max. :100.0 Max. :31643 Max. :21836.0 Max. :21700
## Room.Board Books Personal PhD
## Min. :1780 Min. : 96.0 Min. : 250 Min. : 8.00
## 1st Qu.:3597 1st Qu.: 470.0 1st Qu.: 850 1st Qu.: 62.00
## Median :4200 Median : 500.0 Median :1200 Median : 75.00
## Mean :4358 Mean : 549.4 Mean :1341 Mean : 72.66
## 3rd Qu.:5050 3rd Qu.: 600.0 3rd Qu.:1700 3rd Qu.: 85.00
## Max. :8124 Max. :2340.0 Max. :6800 Max. :103.00
## Terminal S.F.Ratio perc.alumni Expend
## Min. : 24.0 Min. : 2.50 Min. : 0.00 Min. : 3186
## 1st Qu.: 71.0 1st Qu.:11.50 1st Qu.:13.00 1st Qu.: 6751
## Median : 82.0 Median :13.60 Median :21.00 Median : 8377
## Mean : 79.7 Mean :14.09 Mean :22.74 Mean : 9660
## 3rd Qu.: 92.0 3rd Qu.:16.50 3rd Qu.:31.00 3rd Qu.:10830
## Max. :100.0 Max. :39.80 Max. :64.00 Max. :56233
## Grad.Rate
## Min. : 10.00
## 1st Qu.: 53.00
## Median : 65.00
## Mean : 65.46
## 3rd Qu.: 78.00
## Max. :118.00
#b- Scatter Plot Matrix
# Pairwise scatter plots of the first ten columns.
college_firstten <- college[, seq_len(10)]
pairs(college_firstten)
#c- boxplots
# One box of Outstate per level of Private.
college %>%
  ggplot(aes(x = Outstate, y = Private)) +
  geom_boxplot()
#Problem 2d
# Label each college "Yes" when Top10perc is above 50 and "No" otherwise,
# store the labels as a factor, and append them to the data as column Elite.
Elite <- rep("No", nrow(college)) %>%
  replace(college$Top10perc > 50, "Yes") %>%
  as.factor()
college <- data.frame(college, Elite = Elite)
# Number of colleges in each Elite group.
summary(Elite)
## No Yes
## 699 78
library(ggplot2)
# One box of Outstate per level of Elite.
college %>%
  ggplot(aes(x = Outstate, y = Elite)) +
  geom_boxplot()