library(tidyverse)
## ── Attaching packages ──────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2 ✓ purrr 0.3.4
## ✓ tibble 3.0.3 ✓ dplyr 1.0.2
## ✓ tidyr 1.1.2 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.5.0
## ── Conflicts ─────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(ggplot2)
# Read the Auto data, treating "?" as NA, and drop every row that
# contains a missing value before inspecting the structure.
# View(Auto)
Auto <- na.omit(
  read.table(
    file = "http://faculty.marshall.usc.edu/gareth-james/ISL/Auto.data",
    header = TRUE,
    na.strings = "?"
  )
)
str(Auto)
## 'data.frame': 392 obs. of 9 variables:
## $ mpg : num 18 15 18 16 17 15 14 14 14 15 ...
## $ cylinders : int 8 8 8 8 8 8 8 8 8 8 ...
## $ displacement: num 307 350 318 304 302 429 454 440 455 390 ...
## $ horsepower : num 130 165 150 150 140 198 220 215 225 190 ...
## $ weight : num 3504 3693 3436 3433 3449 ...
## $ acceleration: num 12 11.5 11 12 10.5 10 9 8.5 10 8.5 ...
## $ year : int 70 70 70 70 70 70 70 70 70 70 ...
## $ origin : int 1 1 1 1 1 1 1 1 1 1 ...
## $ name : chr "chevrolet chevelle malibu" "buick skylark 320" "plymouth satellite" "amc rebel sst" ...
## - attr(*, "na.action")= 'omit' Named int [1:5] 33 127 331 337 355
## ..- attr(*, "names")= chr [1:5] "33" "127" "331" "337" ...
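All of the predictors except name are stored as numbers, so I treat them as quantitative; name is the qualitative predictor. (Note that origin is an integer code that only takes values from 1 to 3, so it could arguably be treated as qualitative as well.)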
range(Auto$mpg)
## [1] 9.0 46.6
The range of Mpg is from 9 to 46.6
range(Auto$cylinders)
## [1] 3 8
The range of Cylinders is from 3 to 8
range(Auto$displacement)
## [1] 68 455
The range of displacement is from 68 to 455
range(Auto$horsepower)
## [1] 46 230
The range of horsepower is from 46 to 230
range(Auto$weight)
## [1] 1613 5140
The range of Weight is from 1613 to 5140
range(Auto$acceleration)
## [1] 8.0 24.8
The range of Acceleration is from 8.0 to 24.8
range(Auto$year)
## [1] 70 82
The range of Year is from 70 to 82
range(Auto$origin)
## [1] 1 3
The range of Origin is from 1 to 3
mean(Auto$mpg) #23.44592
## [1] 23.44592
sd(Auto$mpg) #7.805007
## [1] 7.805007
The mean for mpg is 23.44592 and the standard deviation for mpg is 7.805007
mean(Auto$cylinders)
## [1] 5.471939
sd(Auto$cylinders)
## [1] 1.705783
The mean for cylinders is 5.471939 and the standard deviation for cylinders is 1.705783
mean(Auto$displacement)
## [1] 194.412
sd(Auto$displacement)
## [1] 104.644
The mean for displacement is 194.412 and the standard deviation for displacement is 104.644
mean(Auto$horsepower)
## [1] 104.4694
sd(Auto$horsepower)
## [1] 38.49116
The mean for horsepower is 104.4694 and the standard deviation for horsepower is 38.49116
mean(Auto$weight)
## [1] 2977.584
sd(Auto$weight)
## [1] 849.4026
The mean for weight is 2977.584 and the standard deviation for weight is 849.4026
mean(Auto$acceleration)
## [1] 15.54133
sd(Auto$acceleration)
## [1] 2.758864
The mean for acceleration is 15.54133 and the standard deviation for acceleration is 2.758864
mean(Auto$year)
## [1] 75.97959
sd(Auto$year)
## [1] 3.683737
The mean for year is 75.97959 and the standard deviation for year is 3.683737
mean(Auto$origin)
## [1] 1.576531
sd(Auto$origin)
## [1] 0.8055182
The mean for origin is 1.576531 and the standard deviation for origin is 0.8055182
# Drop the rows at positions 10 through 84 of the NA-free data. Indexing is
# by position, not by row label: na.omit() already removed the row labelled
# 33, so these positions carry the original row labels 10 through 85 and
# the row labelled 86 is kept.
r <- Auto[-(10:84), ]
# view(r)  # opens the interactive data viewer; enable only when exploring by hand
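I used 10:84 rather than 10:85 because rows are removed by position, not by row label. na.omit() had already removed the row labelled 33, so every later row sits one position earlier than its label. Positions 10:84 therefore drop the rows labelled 10 through 85, whereas 10:85 would also have dropped the row labelled 86, which I wanted to keep.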
range(r$mpg)
## [1] 11.0 46.6
mean(r$mpg)
## [1] 24.36845
sd(r$mpg)
## [1] 7.880898
range = 11 to 46.6, mean = 24.36845, standard deviation = 7.880898
range(r$cylinders)
## [1] 3 8
mean(r$cylinders)
## [1] 5.381703
sd(r$cylinders)
## [1] 1.658135
range = 3 to 8, mean = 5.381703, standard deviation = 1.658135
range(r$displacement)
## [1] 68 455
mean(r$displacement)
## [1] 187.7539
sd(r$displacement)
## [1] 99.93949
range = 68 to 455, mean = 187.7539, standard deviation = 99.93949
range(r$horsepower)
## [1] 46 230
mean(r$horsepower)
## [1] 100.9558
sd(r$horsepower)
## [1] 35.89557
range = 46 to 230, mean = 100.9558, standard deviation = 35.89557
range(r$weight)
## [1] 1649 4997
mean(r$weight)
## [1] 2939.644
sd(r$weight)
## [1] 812.6496
range = 1649 to 4997, mean = 2939.644, standard deviation = 812.6496
range(r$acceleration)
## [1] 8.5 24.8
mean(r$acceleration)
## [1] 15.7183
sd(r$acceleration)
## [1] 2.693813
range = 8.5 to 24.8, mean = 15.7183, standard deviation = 2.693813
range(r$year)
## [1] 70 82
mean(r$year)
## [1] 77.13249
sd(r$year)
## [1] 3.110026
range = 70 to 82, mean = 77.13249, standard deviation = 3.110026
range(r$origin)
## [1] 1 3
mean(r$origin)
## [1] 1.599369
sd(r$origin)
## [1] 0.8193079
range = 1 to 3, mean = 1.599369, standard deviation = 0.8193079
# Scatterplot matrix of mpg, weight and acceleration from the Auto data.
pairs(~ mpg + weight + acceleration, data = Auto)
We can see from the graph that as weight increases, mpg decreases. We can also see that cars with more weight have lower acceleration values. Finally, the plot shows a slight positive linear relationship between mpg and acceleration.
My plot suggests that if you want a car with good mpg, you should not get a heavy car. Also, a car with good acceleration most likely has good mpg as well, since the two are positively related.
# Load the College data. The first CSV column holds the row labels, so it
# is read directly as row names instead of being copied over and dropped
# afterwards.
college <- read.csv(
  "http://faculty.marshall.usc.edu/gareth-james/ISL/College.csv",
  header = TRUE,
  row.names = 1
)
# view(college)  # opens the interactive data viewer; enable only when exploring by hand
summary(college)
## Private Apps Accept Enroll
## Length:777 Min. : 81 Min. : 72 Min. : 35
## Class :character 1st Qu.: 776 1st Qu.: 604 1st Qu.: 242
## Mode :character Median : 1558 Median : 1110 Median : 434
## Mean : 3002 Mean : 2019 Mean : 780
## 3rd Qu.: 3624 3rd Qu.: 2424 3rd Qu.: 902
## Max. :48094 Max. :26330 Max. :6392
## Top10perc Top25perc F.Undergrad P.Undergrad
## Min. : 1.00 Min. : 9.0 Min. : 139 Min. : 1.0
## 1st Qu.:15.00 1st Qu.: 41.0 1st Qu.: 992 1st Qu.: 95.0
## Median :23.00 Median : 54.0 Median : 1707 Median : 353.0
## Mean :27.56 Mean : 55.8 Mean : 3700 Mean : 855.3
## 3rd Qu.:35.00 3rd Qu.: 69.0 3rd Qu.: 4005 3rd Qu.: 967.0
## Max. :96.00 Max. :100.0 Max. :31643 Max. :21836.0
## Outstate Room.Board Books Personal
## Min. : 2340 Min. :1780 Min. : 96.0 Min. : 250
## 1st Qu.: 7320 1st Qu.:3597 1st Qu.: 470.0 1st Qu.: 850
## Median : 9990 Median :4200 Median : 500.0 Median :1200
## Mean :10441 Mean :4358 Mean : 549.4 Mean :1341
## 3rd Qu.:12925 3rd Qu.:5050 3rd Qu.: 600.0 3rd Qu.:1700
## Max. :21700 Max. :8124 Max. :2340.0 Max. :6800
## PhD Terminal S.F.Ratio perc.alumni
## Min. : 8.00 Min. : 24.0 Min. : 2.50 Min. : 0.00
## 1st Qu.: 62.00 1st Qu.: 71.0 1st Qu.:11.50 1st Qu.:13.00
## Median : 75.00 Median : 82.0 Median :13.60 Median :21.00
## Mean : 72.66 Mean : 79.7 Mean :14.09 Mean :22.74
## 3rd Qu.: 85.00 3rd Qu.: 92.0 3rd Qu.:16.50 3rd Qu.:31.00
## Max. :103.00 Max. :100.0 Max. :39.80 Max. :64.00
## Expend Grad.Rate
## Min. : 3186 Min. : 10.00
## 1st Qu.: 6751 1st Qu.: 53.00
## Median : 8377 Median : 65.00
## Mean : 9660 Mean : 65.46
## 3rd Qu.:10830 3rd Qu.: 78.00
## Max. :56233 Max. :118.00
str(college)
## 'data.frame': 777 obs. of 18 variables:
## $ Private : chr "Yes" "Yes" "Yes" "Yes" ...
## $ Apps : int 1660 2186 1428 417 193 587 353 1899 1038 582 ...
## $ Accept : int 1232 1924 1097 349 146 479 340 1720 839 498 ...
## $ Enroll : int 721 512 336 137 55 158 103 489 227 172 ...
## $ Top10perc : int 23 16 22 60 16 38 17 37 30 21 ...
## $ Top25perc : int 52 29 50 89 44 62 45 68 63 44 ...
## $ F.Undergrad: int 2885 2683 1036 510 249 678 416 1594 973 799 ...
## $ P.Undergrad: int 537 1227 99 63 869 41 230 32 306 78 ...
## $ Outstate : int 7440 12280 11250 12960 7560 13500 13290 13868 15595 10468 ...
## $ Room.Board : int 3300 6450 3750 5450 4120 3335 5720 4826 4400 3380 ...
## $ Books : int 450 750 400 450 800 500 500 450 300 660 ...
## $ Personal : int 2200 1500 1165 875 1500 675 1500 850 500 1800 ...
## $ PhD : int 70 29 53 92 76 67 90 89 79 40 ...
## $ Terminal : int 78 30 66 97 72 73 93 100 84 41 ...
## $ S.F.Ratio : num 18.1 12.2 12.9 7.7 11.9 9.4 11.5 13.7 11.3 11.5 ...
## $ perc.alumni: int 12 16 30 37 2 11 26 37 23 15 ...
## $ Expend : int 7041 10527 8735 19016 10922 9727 8861 11487 11644 8991 ...
## $ Grad.Rate : int 60 56 54 59 15 55 63 73 80 52 ...
# Convert Private to a factor so it can define the boxplot groups.
college[["Private"]] <- as.factor(college[["Private"]])
# Scatterplot matrix of the first ten columns.
pairs(college[, seq_len(10)])
# Boxplots of Outstate for each level of Private.
ggplot(data = college, mapping = aes(x = Private, y = Outstate)) +
  geom_boxplot()
#2c(d).
# Label a college "Yes" when Top10perc exceeds 50 and "No" otherwise,
# store the labels as a factor, and append them to college as column Elite.
Elite <- as.factor(
  replace(rep("No", nrow(college)), college$Top10perc > 50, "Yes")
)
college <- data.frame(college, Elite = Elite)
# Count of colleges in each Elite level.
summary(Elite)
## No Yes
## 699 78
# Boxplots of Outstate for each level of the Elite factor.
ggplot(data = college, mapping = aes(x = Elite, y = Outstate)) +
  geom_boxplot()