library(tidyverse)
## ── Attaching packages ───────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2 ✓ purrr 0.3.4
## ✓ tibble 3.0.3 ✓ dplyr 1.0.2
## ✓ tidyr 1.1.1 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.5.0
## ── Conflicts ──────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
# Read the Auto data, treating "?" as a missing value, and drop every row
# that contains an NA before inspecting the structure.
Auto <- na.omit(read.csv("auto.csv.xls", na.strings = "?"))
str(Auto)
## 'data.frame': 392 obs. of 9 variables:
## $ mpg : num 18 15 18 16 17 15 14 14 14 15 ...
## $ cylinders : int 8 8 8 8 8 8 8 8 8 8 ...
## $ displacement: num 307 350 318 304 302 429 454 440 455 390 ...
## $ horsepower : int 130 165 150 150 140 198 220 215 225 190 ...
## $ weight : int 3504 3693 3436 3433 3449 4341 4354 4312 4425 3850 ...
## $ acceleration: num 12 11.5 11 12 10.5 10 9 8.5 10 8.5 ...
## $ year : int 70 70 70 70 70 70 70 70 70 70 ...
## $ origin : int 1 1 1 1 1 1 1 1 1 1 ...
## $ name : chr "chevrolet chevelle malibu" "buick skylark 320" "plymouth satellite" "amc rebel sst" ...
## - attr(*, "na.action")= 'omit' Named int [1:5] 33 127 331 337 355
## ..- attr(*, "names")= chr [1:5] "33" "127" "331" "337" ...
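The quantitative variables in the Auto data set are mpg, cylinders, displacement, weight, acceleration, year, horsepower, and origin; name is qualitative. (Note that origin is stored as an integer code from 1 to 3, so it is treated as quantitative here even though it arguably identifies a category.)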
The range of each quantitative variable is as follows:
# Range (minimum, maximum) of every numeric column of Auto.
# horsepower is included so that each numeric column shown by str() is covered.
range(Auto$mpg)
## [1] 9.0 46.6
range(Auto$cylinders)
## [1] 3 8
range(Auto$displacement)
## [1] 68 455
range(Auto$weight)
## [1] 1613 5140
range(Auto$acceleration)
## [1] 8.0 24.8
range(Auto$year)
## [1] 70 82
range(Auto$horsepower)
## [1] 46 230
range(Auto$origin)
## [1] 1 3
Mean and standard deviations:
# Mean and standard deviation of every numeric column of Auto.
# horsepower is included so that each numeric column shown by str() is covered.
Auto %>% summarise(mean(mpg), sd(mpg))
## mean(mpg) sd(mpg)
## 1 23.44592 7.805007
Auto %>% summarise(mean(cylinders), sd(cylinders))
## mean(cylinders) sd(cylinders)
## 1 5.471939 1.705783
Auto %>% summarise(mean(displacement), sd(displacement))
## mean(displacement) sd(displacement)
## 1 194.412 104.644
Auto %>% summarise(mean(weight), sd(weight))
## mean(weight) sd(weight)
## 1 2977.584 849.4026
Auto %>% summarise(mean(acceleration), sd(acceleration))
## mean(acceleration) sd(acceleration)
## 1 15.54133 2.758864
Auto %>% summarise(mean(year), sd(year))
## mean(year) sd(year)
## 1 75.97959 3.683737
Auto %>% summarise(mean(horsepower), sd(horsepower))
## mean(horsepower) sd(horsepower)
## 1 104.4694 38.49116
Auto %>% summarise(mean(origin), sd(origin))
## mean(origin) sd(origin)
## 1 1.576531 0.8055182
# Drop rows 10 and 85, then recompute the mean and standard deviation of
# every numeric column (horsepower included) on the remaining rows.
# NOTE(review): -c(10, 85) removes only those two rows; if the intent was to
# remove rows 10 through 85, this should be -(10:85) -- confirm.
autoSubset <- Auto[-c(10, 85), ]
autoSubset %>% summarise(mean(mpg), sd(mpg))
## mean(mpg) sd(mpg)
## 1 23.49436 7.795198
autoSubset %>% summarise(mean(cylinders), sd(cylinders))
## mean(cylinders) sd(cylinders)
## 1 5.458974 1.700479
autoSubset %>% summarise(mean(displacement), sd(displacement))
## mean(displacement) sd(displacement)
## 1 193.5115 104.1407
autoSubset %>% summarise(mean(weight), sd(weight))
## mean(weight) sd(weight)
## 1 2972.469 848.5121
autoSubset %>% summarise(mean(acceleration), sd(acceleration))
## mean(acceleration) sd(acceleration)
## 1 15.5659 2.739672
autoSubset %>% summarise(mean(year), sd(year))
## mean(year) sd(year)
## 1 76.00256 3.677556
autoSubset %>% summarise(mean(horsepower), sd(horsepower))
## mean(horsepower) sd(horsepower)
## 1 104.0692 38.17633
autoSubset %>% summarise(mean(origin), sd(origin))
## mean(origin) sd(origin)
## 1 1.579487 0.8065221
# Scatter plot of mpg against year, coloured by origin.
# origin is an integer code (1 to 3), so it is mapped as a factor to get a
# discrete colour legend instead of a continuous gradient; the legend title
# is set explicitly so it still reads "origin".
ggplot(autoSubset, aes(x = year, y = mpg, color = factor(origin))) +
  geom_point() +
  labs(color = "origin") +
  ggtitle("Year vs. MPG")
This scatter plot compares year to mpg. There is a rough positive correlation between year and mpg.
# Histogram of year with one bin per year.
autoSubset %>%
  ggplot(aes(x = year)) +
  geom_histogram(binwidth = 1) +
  ggtitle("Year of Car")
This histogram shows how many cars in the data set belong to each year.
# One weight boxplot per cylinder count; grouping by cylinders is what splits
# the numeric x variable into separate boxes.
autoSubset %>%
  ggplot(
    aes(x = cylinders, y = weight, fill = cylinders, group = cylinders)
  ) +
  geom_boxplot() +
  ggtitle("Cylinders vs. Weight Boxplot")
This boxplot compares the number of cylinders in a car with its weight. In general, weight increases as the number of cylinders increases.
Based on the first plot, mpg appears to be roughly positively correlated with the year of the car, meaning that the newer the car, the higher its mpg is likely to be.
# Read the College data from the given URL (the first line holds the column
# names) and open it in the data viewer.
college <- read.csv(
  "http://faculty.marshall.usc.edu/gareth-james/ISL/College.csv",
  header = TRUE
)
view(college)
# Use the first column as the row names; the column itself stays in the
# data frame, so it still appears in the summary.
rownames(college) <- college[[1]]
summary(college)
## X Private Apps Accept
## Length:777 Length:777 Min. : 81 Min. : 72
## Class :character Class :character 1st Qu.: 776 1st Qu.: 604
## Mode :character Mode :character Median : 1558 Median : 1110
## Mean : 3002 Mean : 2019
## 3rd Qu.: 3624 3rd Qu.: 2424
## Max. :48094 Max. :26330
## Enroll Top10perc Top25perc F.Undergrad
## Min. : 35 Min. : 1.00 Min. : 9.0 Min. : 139
## 1st Qu.: 242 1st Qu.:15.00 1st Qu.: 41.0 1st Qu.: 992
## Median : 434 Median :23.00 Median : 54.0 Median : 1707
## Mean : 780 Mean :27.56 Mean : 55.8 Mean : 3700
## 3rd Qu.: 902 3rd Qu.:35.00 3rd Qu.: 69.0 3rd Qu.: 4005
## Max. :6392 Max. :96.00 Max. :100.0 Max. :31643
## P.Undergrad Outstate Room.Board Books
## Min. : 1.0 Min. : 2340 Min. :1780 Min. : 96.0
## 1st Qu.: 95.0 1st Qu.: 7320 1st Qu.:3597 1st Qu.: 470.0
## Median : 353.0 Median : 9990 Median :4200 Median : 500.0
## Mean : 855.3 Mean :10441 Mean :4358 Mean : 549.4
## 3rd Qu.: 967.0 3rd Qu.:12925 3rd Qu.:5050 3rd Qu.: 600.0
## Max. :21836.0 Max. :21700 Max. :8124 Max. :2340.0
## Personal PhD Terminal S.F.Ratio
## Min. : 250 Min. : 8.00 Min. : 24.0 Min. : 2.50
## 1st Qu.: 850 1st Qu.: 62.00 1st Qu.: 71.0 1st Qu.:11.50
## Median :1200 Median : 75.00 Median : 82.0 Median :13.60
## Mean :1341 Mean : 72.66 Mean : 79.7 Mean :14.09
## 3rd Qu.:1700 3rd Qu.: 85.00 3rd Qu.: 92.0 3rd Qu.:16.50
## Max. :6800 Max. :103.00 Max. :100.0 Max. :39.80
## perc.alumni Expend Grad.Rate
## Min. : 0.00 Min. : 3186 Min. : 10.00
## 1st Qu.:13.00 1st Qu.: 6751 1st Qu.: 53.00
## Median :21.00 Median : 8377 Median : 65.00
## Mean :22.74 Mean : 9660 Mean : 65.46
## 3rd Qu.:31.00 3rd Qu.:10830 3rd Qu.: 78.00
## Max. :64.00 Max. :56233 Max. :118.00
# Scatterplot matrix of columns 3 through 13 of college.
pairs(college[3:13])
# Boxplots of Outstate, one per level of Private.
college %>%
  ggplot(aes(x = Private, y = Outstate, group = Private, fill = Private)) +
  geom_boxplot()
# Build a Yes/No factor that is "Yes" wherever Top10perc exceeds 50, append
# it to college as a new column, and count how many fall in each level.
Elite <- as.factor(
  replace(rep("No", nrow(college)), college$Top10perc > 50, "Yes")
)
college <- data.frame(college, Elite)
summary(Elite)
## No Yes
## 699 78
# Boxplots of Outstate for elite versus non-elite universities.
college %>%
  ggplot(aes(x = Elite, y = Outstate, group = Elite, fill = Elite)) +
  geom_boxplot() +
  ggtitle("Elite University vs Outstate Tuition")
Based on our criterion (Top10perc greater than 50), 699 universities are not considered elite and 78 are considered elite.