In this assignment you will get to practice your R coding skills a little more and also learn some more useful functions! This assignment should be done in R Markdown (don’t worry we will cover that in next week’s lecture 3A. In the meantime, you can start on the code and just save it as an R script.)
This exercise involves the Auto data set that we studied during lab. Make sure that the missing values have been removed from the data.
Auto=read.table("Auto.data")
library(tidyverse)
## -- Attaching packages ---------------- tidyverse 1.2.1 --
## v ggplot2 3.2.1 v purrr 0.3.2
## v tibble 2.1.3 v dplyr 0.8.3
## v tidyr 0.8.3 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.4.0
## -- Conflicts ------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
Auto=read.table("Auto.data",header=T,na.strings ="?")
dim(Auto)
## [1] 397 9
Auto=na.omit(Auto)
dim(Auto)
## [1] 392 9
str(Auto)
## 'data.frame': 392 obs. of 9 variables:
## $ mpg : num 18 15 18 16 17 15 14 14 14 15 ...
## $ cylinders : int 8 8 8 8 8 8 8 8 8 8 ...
## $ displacement: num 307 350 318 304 302 429 454 440 455 390 ...
## $ horsepower : num 130 165 150 150 140 198 220 215 225 190 ...
## $ weight : num 3504 3693 3436 3433 3449 ...
## $ acceleration: num 12 11.5 11 12 10.5 10 9 8.5 10 8.5 ...
## $ year : int 70 70 70 70 70 70 70 70 70 70 ...
## $ origin : int 1 1 1 1 1 1 1 1 1 1 ...
## $ name : Factor w/ 304 levels "amc ambassador brougham",..: 49 36 231 14 161 141 54 223 241 2 ...
## - attr(*, "na.action")= 'omit' Named int 33 127 331 337 355
## ..- attr(*, "names")= chr "33" "127" "331" "337" ...
Quantitative: MPG, Displacement, Horsepower, Weight, and Acceleration. Qualitative: Cylinders, Year, Origin.
range(Auto$mpg)
## [1] 9.0 46.6
range(Auto$displacement)
## [1] 68 455
range(Auto$horsepower)
## [1] 46 230
range(Auto$weight)
## [1] 1613 5140
range(Auto$acceleration)
## [1] 8.0 24.8
attach(Auto)
## The following object is masked from package:ggplot2:
##
## mpg
mean(mpg)
## [1] 23.44592
mean(displacement)
## [1] 194.412
mean(horsepower)
## [1] 104.4694
mean(weight)
## [1] 2977.584
mean(acceleration)
## [1] 15.54133
sd(mpg)
## [1] 7.805007
sd(displacement)
## [1] 104.644
sd(horsepower)
## [1] 38.49116
sd(weight)
## [1] 849.4026
sd(acceleration)
## [1] 2.758864
AutoSub<-Auto[-c(10:85),]
dim(AutoSub)
## [1] 316 9
range(AutoSub$mpg)
## [1] 11.0 46.6
range(AutoSub$displacement)
## [1] 68 455
range(AutoSub$horsepower)
## [1] 46 230
range(AutoSub$weight)
## [1] 1649 4997
range(AutoSub$acceleration)
## [1] 8.5 24.8
mean(AutoSub$mpg)
## [1] 24.40443
mean(AutoSub$displacement)
## [1] 187.2405
mean(AutoSub$horsepower)
## [1] 100.7215
mean(AutoSub$weight)
## [1] 2935.972
mean(AutoSub$acceleration)
## [1] 15.7269
sd(AutoSub$mpg)
## [1] 7.867283
sd(AutoSub$displacement)
## [1] 99.67837
sd(AutoSub$horsepower)
## [1] 35.70885
sd(AutoSub$weight)
## [1] 811.3002
sd(AutoSub$acceleration)
## [1] 2.693721
ggplot(data = Auto) +
geom_point(mapping = aes(x= horsepower, y= acceleration, color = cylinders))
cor(horsepower, acceleration)
## [1] -0.6891955
ggplot(data = Auto) +
geom_smooth(mapping = aes(x= year, y= mpg))
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
cor(year, mpg)
## [1] 0.580541
ggplot(data = Auto, mapping = aes(x = weight, y = acceleration)) +
geom_point(mapping = aes(color = year)) +
geom_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
cor(weight, acceleration)
## [1] -0.4168392
# Box office Star Wars (in millions!)
new_hope <- c(460.998, 314.4)
empire_strikes <- c(290.475, 247.900)
return_jedi <- c(309.306, 165.8)
# Vectors region and titles, used for naming
region <- c("US", "non-US")
titles <- c("A New Hope", "The Empire Strikes Back", "Return of
the Jedi")
starWars<-matrix(data= c(new_hope, empire_strikes, return_jedi), nrow = 3, ncol = 2, byrow = TRUE)
starWars
## [,1] [,2]
## [1,] 460.998 314.4
## [2,] 290.475 247.9
## [3,] 309.306 165.8
starWars<-matrix(data= c(new_hope, empire_strikes, return_jedi), nrow = 3, ncol = 2, byrow = TRUE)
rownames(starWars) <- c("A New Hope", "The Empire Strikes Back", "Return of the Jedi")
colnames(starWars) <- c("US", "non-US")
starWars
## US non-US
## A New Hope 460.998 314.4
## The Empire Strikes Back 290.475 247.9
## Return of the Jedi 309.306 165.8
WorldwideBoxOffice<-rowSums(starWars)
WorldwideBoxOffice
## A New Hope The Empire Strikes Back Return of the Jedi
## 775.398 538.375 475.106
US<- c(460.998, 290.475, 309.306)
nonUS<- c(314.4, 247.9, 165.8)
WorldWideSales<- c(775.398, 538.375, 475.106)
starWars<- cbind(US, nonUS, WorldWideSales)
starWars<-matrix(data= c(new_hope, empire_strikes, return_jedi), nrow = 3, ncol = 3, byrow = TRUE)
colnames(starWars) <- c("US", "non-US", "WorldWideSales")
rownames(starWars) <- c("A New Hope", "The Empire Strikes Back", "Return of the Jedi")
starWars
## US non-US WorldWideSales
## A New Hope 460.998 314.400 290.475
## The Empire Strikes Back 247.900 309.306 165.800
## Return of the Jedi 460.998 314.400 290.475
# Prequels
phantom_menace <- c(474.5, 552.5, 1027.0)
attack_clones <- c(310.7, 338.7, 649.4)
revenge_sith <- c(380.3, 468.5, 848.8)
titles2<-c("The Phantom Menace”, “Attack of the Clones”, “Revenge of the Sith")
starWars2<- matrix(data= c(phantom_menace, attack_clones, revenge_sith), nrow = 3, ncol = 3, byrow = TRUE)
rownames(starWars2)<- c("The Phantom Menace", "Attack of the Clones", "revenge of the Sith")
colnames(starWars2) <- c("US", "non-US", "WroldWideSalesPrequels")
starWars2
## US non-US WroldWideSalesPrequels
## The Phantom Menace 474.5 552.5 1027.0
## Attack of the Clones 310.7 338.7 649.4
## revenge of the Sith 380.3 468.5 848.8
allStarWars<-rbind(starWars, starWars2)
allStarWars
## US non-US WorldWideSales
## A New Hope 460.998 314.400 290.475
## The Empire Strikes Back 247.900 309.306 165.800
## Return of the Jedi 460.998 314.400 290.475
## The Phantom Menace 474.500 552.500 1027.000
## Attack of the Clones 310.700 338.700 649.400
## revenge of the Sith 380.300 468.500 848.800
colSums(allStarWars)
## US non-US WorldWideSales
## 2335.396 2297.806 3271.950
college<-read.csv("College.csv", header = TRUE, na.strings = "?")
View(college)
rownames(college) <- college[,1]
View(college)
College <- college[,-1]
View(college)
summary(College)
## Private Apps Accept Enroll Top10perc
## No :212 Min. : 81 Min. : 72 Min. : 35 Min. : 1.00
## Yes:565 1st Qu.: 776 1st Qu.: 604 1st Qu.: 242 1st Qu.:15.00
## Median : 1558 Median : 1110 Median : 434 Median :23.00
## Mean : 3002 Mean : 2019 Mean : 780 Mean :27.56
## 3rd Qu.: 3624 3rd Qu.: 2424 3rd Qu.: 902 3rd Qu.:35.00
## Max. :48094 Max. :26330 Max. :6392 Max. :96.00
## Top25perc F.Undergrad P.Undergrad Outstate
## Min. : 9.0 Min. : 139 Min. : 1.0 Min. : 2340
## 1st Qu.: 41.0 1st Qu.: 992 1st Qu.: 95.0 1st Qu.: 7320
## Median : 54.0 Median : 1707 Median : 353.0 Median : 9990
## Mean : 55.8 Mean : 3700 Mean : 855.3 Mean :10441
## 3rd Qu.: 69.0 3rd Qu.: 4005 3rd Qu.: 967.0 3rd Qu.:12925
## Max. :100.0 Max. :31643 Max. :21836.0 Max. :21700
## Room.Board Books Personal PhD
## Min. :1780 Min. : 96.0 Min. : 250 Min. : 8.00
## 1st Qu.:3597 1st Qu.: 470.0 1st Qu.: 850 1st Qu.: 62.00
## Median :4200 Median : 500.0 Median :1200 Median : 75.00
## Mean :4358 Mean : 549.4 Mean :1341 Mean : 72.66
## 3rd Qu.:5050 3rd Qu.: 600.0 3rd Qu.:1700 3rd Qu.: 85.00
## Max. :8124 Max. :2340.0 Max. :6800 Max. :103.00
## Terminal S.F.Ratio perc.alumni Expend
## Min. : 24.0 Min. : 2.50 Min. : 0.00 Min. : 3186
## 1st Qu.: 71.0 1st Qu.:11.50 1st Qu.:13.00 1st Qu.: 6751
## Median : 82.0 Median :13.60 Median :21.00 Median : 8377
## Mean : 79.7 Mean :14.09 Mean :22.74 Mean : 9660
## 3rd Qu.: 92.0 3rd Qu.:16.50 3rd Qu.:31.00 3rd Qu.:10830
## Max. :100.0 Max. :39.80 Max. :64.00 Max. :56233
## Grad.Rate
## Min. : 10.00
## 1st Qu.: 53.00
## Median : 65.00
## Mean : 65.46
## 3rd Qu.: 78.00
## Max. :118.00
pairs(College[,1:10])
OvsP<- ggplot(data = College) +
geom_boxplot(mapping = aes(x = College$Private, y = College$Outstate))
OvsP<- OvsP + scale_x_discrete(name = "Private") +
scale_y_continuous(name = "OutState")
OvsP
Elite <- rep("No", nrow(College))
Elite[College$Top10perc > 50] = "Yes"
Elite <- as.factor(Elite)
College <- data.frame(College, Elite)
summary(College)
## Private Apps Accept Enroll Top10perc
## No :212 Min. : 81 Min. : 72 Min. : 35 Min. : 1.00
## Yes:565 1st Qu.: 776 1st Qu.: 604 1st Qu.: 242 1st Qu.:15.00
## Median : 1558 Median : 1110 Median : 434 Median :23.00
## Mean : 3002 Mean : 2019 Mean : 780 Mean :27.56
## 3rd Qu.: 3624 3rd Qu.: 2424 3rd Qu.: 902 3rd Qu.:35.00
## Max. :48094 Max. :26330 Max. :6392 Max. :96.00
## Top25perc F.Undergrad P.Undergrad Outstate
## Min. : 9.0 Min. : 139 Min. : 1.0 Min. : 2340
## 1st Qu.: 41.0 1st Qu.: 992 1st Qu.: 95.0 1st Qu.: 7320
## Median : 54.0 Median : 1707 Median : 353.0 Median : 9990
## Mean : 55.8 Mean : 3700 Mean : 855.3 Mean :10441
## 3rd Qu.: 69.0 3rd Qu.: 4005 3rd Qu.: 967.0 3rd Qu.:12925
## Max. :100.0 Max. :31643 Max. :21836.0 Max. :21700
## Room.Board Books Personal PhD
## Min. :1780 Min. : 96.0 Min. : 250 Min. : 8.00
## 1st Qu.:3597 1st Qu.: 470.0 1st Qu.: 850 1st Qu.: 62.00
## Median :4200 Median : 500.0 Median :1200 Median : 75.00
## Mean :4358 Mean : 549.4 Mean :1341 Mean : 72.66
## 3rd Qu.:5050 3rd Qu.: 600.0 3rd Qu.:1700 3rd Qu.: 85.00
## Max. :8124 Max. :2340.0 Max. :6800 Max. :103.00
## Terminal S.F.Ratio perc.alumni Expend
## Min. : 24.0 Min. : 2.50 Min. : 0.00 Min. : 3186
## 1st Qu.: 71.0 1st Qu.:11.50 1st Qu.:13.00 1st Qu.: 6751
## Median : 82.0 Median :13.60 Median :21.00 Median : 8377
## Mean : 79.7 Mean :14.09 Mean :22.74 Mean : 9660
## 3rd Qu.: 92.0 3rd Qu.:16.50 3rd Qu.:31.00 3rd Qu.:10830
## Max. :100.0 Max. :39.80 Max. :64.00 Max. :56233
## Grad.Rate Elite
## Min. : 10.00 No :699
## 1st Qu.: 53.00 Yes: 78
## Median : 65.00
## Mean : 65.46
## 3rd Qu.: 78.00
## Max. :118.00
OvsE<- ggplot(data = College) +
geom_boxplot(mapping = aes(x = College$Elite, y = College$Outstate))
OvsE<- OvsE + scale_x_discrete(name = "Elite") +
scale_y_continuous(name = "OutState")
OvsE