#install.packages("ISLR")
library(ISLR)
# Load the College data twice: `College` comes from the ISLR package, while
# `college` is read from a local CSV copy (college.csv must be in the working
# directory). Row names are not set here (the lines doing so are commented
# out), so the first CSV column stays in the data frame as a regular column.
data("College")
college <- read.csv("college.csv", header = TRUE)
#fix(college)
#rownames(college)<-college[,1]
#college<-college[,-1]
#fix(college)
# Per-column summary of the CSV-backed data frame.
summary(college)
## X Private Apps Accept
## Length:777 Length:777 Min. : 81 Min. : 72
## Class :character Class :character 1st Qu.: 776 1st Qu.: 604
## Mode :character Mode :character Median : 1558 Median : 1110
## Mean : 3002 Mean : 2019
## 3rd Qu.: 3624 3rd Qu.: 2424
## Max. :48094 Max. :26330
## Enroll Top10perc Top25perc F.Undergrad
## Min. : 35 Min. : 1.00 Min. : 9.0 Min. : 139
## 1st Qu.: 242 1st Qu.:15.00 1st Qu.: 41.0 1st Qu.: 992
## Median : 434 Median :23.00 Median : 54.0 Median : 1707
## Mean : 780 Mean :27.56 Mean : 55.8 Mean : 3700
## 3rd Qu.: 902 3rd Qu.:35.00 3rd Qu.: 69.0 3rd Qu.: 4005
## Max. :6392 Max. :96.00 Max. :100.0 Max. :31643
## P.Undergrad Outstate Room.Board Books
## Min. : 1.0 Min. : 2340 Min. :1780 Min. : 96.0
## 1st Qu.: 95.0 1st Qu.: 7320 1st Qu.:3597 1st Qu.: 470.0
## Median : 353.0 Median : 9990 Median :4200 Median : 500.0
## Mean : 855.3 Mean :10441 Mean :4358 Mean : 549.4
## 3rd Qu.: 967.0 3rd Qu.:12925 3rd Qu.:5050 3rd Qu.: 600.0
## Max. :21836.0 Max. :21700 Max. :8124 Max. :2340.0
## Personal PhD Terminal S.F.Ratio
## Min. : 250 Min. : 8.00 Min. : 24.0 Min. : 2.50
## 1st Qu.: 850 1st Qu.: 62.00 1st Qu.: 71.0 1st Qu.:11.50
## Median :1200 Median : 75.00 Median : 82.0 Median :13.60
## Mean :1341 Mean : 72.66 Mean : 79.7 Mean :14.09
## 3rd Qu.:1700 3rd Qu.: 85.00 3rd Qu.: 92.0 3rd Qu.:16.50
## Max. :6800 Max. :103.00 Max. :100.0 Max. :39.80
## perc.alumni Expend Grad.Rate
## Min. : 0.00 Min. : 3186 Min. : 10.00
## 1st Qu.:13.00 1st Qu.: 6751 1st Qu.: 53.00
## Median :21.00 Median : 8377 Median : 65.00
## Mean :22.74 Mean : 9660 Mean : 65.46
## 3rd Qu.:31.00 3rd Qu.:10830 3rd Qu.: 78.00
## Max. :64.00 Max. :56233 Max. :118.00
# Scatterplot matrix of the first ten columns of the ISLR College data.
pairs(College[, 1:10])
#### 8c iii
# Confirm that Private is a factor before plotting Outstate against it.
str(College$Private)
## Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 2 2 2 ...
# Out-of-state tuition split by the Private factor. TRUE is spelled out
# because the shorthand T is an ordinary variable that can be reassigned.
plot(
  College$Private, College$Outstate,
  col = c(2, 3), varwidth = TRUE,
  xlab = "Private University",
  ylab = "OutofState Tuition in USD",
  main = "Outstate Tuition plot"
)
#### 8c iv
# Flag every row whose Top10perc exceeds 50 as "Yes", all others as "No".
# The flag is derived from `college`, the same data frame it is attached to
# below, so the rows are guaranteed to line up.
Elite <- rep("No", nrow(college))
Elite[college$Top10perc > 50] <- "Yes"
Elite <- as.factor(Elite)
college <- data.frame(college, Elite)
summary(Elite)
## No Yes
## 699 78
##there are 78 elite universities
#Elite is already a factor variable
# Out-of-state tuition split by the new Elite factor.
plot(
  college$Elite, college$Outstate,
  col = c(2, 3), varwidth = TRUE,
  xlab = "Elite University",
  ylab = "OutofState Tuition in USD",
  main = "Outstate Tuition plot"
)
#### 8c v
# Draw four histograms in a 2x2 grid. par() returns the previous settings;
# they are restored afterwards so that later plots in this script are not
# squeezed into one quarter of the device.
old_par <- par(mfrow = c(2, 2))
hist(college$Books, col = 2, breaks = 50, xlab = "Books", ylab = "Count")
hist(college$PhD, col = 3, breaks = 50, xlab = "PhD", ylab = "Count")
hist(college$Grad.Rate, col = 4, breaks = 50, xlab = "Grad Rate", ylab = "Count")
hist(college$perc.alumni, col = 6, breaks = 50, xlab = "% alumni who donate", ylab = "Count")
par(old_par)
#### 8c vi Example exploration: to find the universities with the highest
#### graduation rate, start with the distribution of Grad.Rate.
summary(college[["Grad.Rate"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 10.00 53.00 65.00 65.46 78.00 118.00
The range is from a minimum of 10 to a maximum of 118. Check the number of colleges with a graduation rate of 118.
# Keep the rows whose graduation rate equals 118, then count them.
subset1 <- college[college$Grad.Rate == 118, ]
nrow(subset1)
## [1] 1
#There is only 1 university which seems to be an outlier. #Now we want to find out which university that row belongs to
# Row names were never set on `college`, so row.names() only gives the row
# index, not the university.
row.names(subset1)
## [1] "96"
# The name itself is held in the first column of the CSV, which summary()
# lists as the character column X.
subset1$X
Use the str() function to determine the data type of each variable.
# Compact display of the Auto data frame: one line per column with its type
# and first values.
utils::str(Auto)
## 'data.frame': 392 obs. of 9 variables:
## $ mpg : num 18 15 18 16 17 15 14 14 14 15 ...
## $ cylinders : num 8 8 8 8 8 8 8 8 8 8 ...
## $ displacement: num 307 350 318 304 302 429 454 440 455 390 ...
## $ horsepower : num 130 165 150 150 140 198 220 215 225 190 ...
## $ weight : num 3504 3693 3436 3433 3449 ...
## $ acceleration: num 12 11.5 11 12 10.5 10 9 8.5 10 8.5 ...
## $ year : num 70 70 70 70 70 70 70 70 70 70 ...
## $ origin : num 1 1 1 1 1 1 1 1 1 1 ...
## $ name : Factor w/ 304 levels "amc ambassador brougham",..: 49 36 231 14 161 141 54 223 241 2 ...
What is the range of each quantitative predictor?
# range() called with several vectors pools them and returns a single global
# min/max, which does not answer the question. Apply range() to each of the
# eight numeric columns (mpg through origin) separately instead; row 1 is the
# minimum and row 2 the maximum of each column.
vapply(Auto[, 1:8], range, numeric(2))
## mpg cylinders displacement horsepower weight acceleration year origin
## [1,] 9.0 3 68 46 1613 8.0 70 1
## [2,] 46.6 8 455 230 5140 24.8 82 3
What is the mean and standard deviation of each quantitative predictor?
# summary() shows the mean of every numeric column (plus quartiles), but it
# does not report the standard deviation.
summary(Auto)
## mpg cylinders displacement horsepower weight
## Min. : 9.00 Min. :3.000 Min. : 68.0 Min. : 46.0 Min. :1613
## 1st Qu.:17.00 1st Qu.:4.000 1st Qu.:105.0 1st Qu.: 75.0 1st Qu.:2225
## Median :22.75 Median :4.000 Median :151.0 Median : 93.5 Median :2804
## Mean :23.45 Mean :5.472 Mean :194.4 Mean :104.5 Mean :2978
## 3rd Qu.:29.00 3rd Qu.:8.000 3rd Qu.:275.8 3rd Qu.:126.0 3rd Qu.:3615
## Max. :46.60 Max. :8.000 Max. :455.0 Max. :230.0 Max. :5140
##
## acceleration year origin name
## Min. : 8.00 Min. :70.00 Min. :1.000 amc matador : 5
## 1st Qu.:13.78 1st Qu.:73.00 1st Qu.:1.000 ford pinto : 5
## Median :15.50 Median :76.00 Median :1.000 toyota corolla : 5
## Mean :15.54 Mean :75.98 Mean :1.577 amc gremlin : 4
## 3rd Qu.:17.02 3rd Qu.:79.00 3rd Qu.:2.000 amc hornet : 4
## Max. :24.80 Max. :82.00 Max. :3.000 chevrolet chevette: 4
## (Other) :365
# Mean and standard deviation of each numeric column (mpg through origin);
# the ninth column, name, is a factor and is left out.
vapply(Auto[, 1:8], mean, numeric(1))
vapply(Auto[, 1:8], sd, numeric(1))
Now remove the 10th through 85th observations. What is the range, mean, and standard deviation of each predictor in the subset of the data that remains?
# Drop observations 10 through 85 with negative indexing. The subset() call
# around it had no condition and returned its input unchanged, so it is gone.
subset1 <- Auto[-(10:85), ]
summary(subset1)
## mpg cylinders displacement horsepower weight
## Min. :11.00 Min. :3.000 Min. : 68.0 Min. : 46.0 Min. :1649
## 1st Qu.:18.00 1st Qu.:4.000 1st Qu.:100.2 1st Qu.: 75.0 1st Qu.:2214
## Median :23.95 Median :4.000 Median :145.5 Median : 90.0 Median :2792
## Mean :24.40 Mean :5.373 Mean :187.2 Mean :100.7 Mean :2936
## 3rd Qu.:30.55 3rd Qu.:6.000 3rd Qu.:250.0 3rd Qu.:115.0 3rd Qu.:3508
## Max. :46.60 Max. :8.000 Max. :455.0 Max. :230.0 Max. :4997
##
## acceleration year origin
## Min. : 8.50 Min. :70.00 Min. :1.000
## 1st Qu.:14.00 1st Qu.:75.00 1st Qu.:1.000
## Median :15.50 Median :77.00 Median :1.000
## Mean :15.73 Mean :77.15 Mean :1.601
## 3rd Qu.:17.30 3rd Qu.:80.00 3rd Qu.:2.000
## Max. :24.80 Max. :82.00 Max. :3.000
##
## name
## ford pinto : 5
## toyota corolla : 5
## amc matador : 4
## chevrolet chevette : 4
## amc hornet : 3
## chevrolet caprice classic: 3
## (Other) :292
# summary() gives no standard deviation and only implies the range, so compute
# range, mean and standard deviation per numeric column of the remaining rows.
vapply(subset1[, 1:8], range, numeric(2))
vapply(subset1[, 1:8], mean, numeric(1))
vapply(subset1[, 1:8], sd, numeric(1))
Using the full data set, investigate the predictors graphically, using scatterplots or other tools of your choice. Create some plots highlighting the relationships among the predictors. Comment on your findings.
# Scatterplot matrix of the eight numeric Auto columns (mpg through origin);
# the factor column name is left out.
pairs(Auto[, 1:8])
# 9f Suppose that we wish to predict gas mileage (mpg) on the basis of the other variables. Do your plots suggest that any of the other variables might be useful in predicting mpg? Justify your answer.
We want to predict mpg as the response, using the other variables in the data set as predictors. The plot shows a linear relationship between cylinders and mpg, so cylinders should be useful for predicting mpg.
This exercise involves the Boston housing data set. (a) To begin, load in the Boston data set. The Boston data set is part of the MASS library in R.
# Attach MASS, load its Boston data set and inspect the column types.
library(MASS)
data("Boston")
str(Boston)
## 'data.frame': 506 obs. of 14 variables:
## $ crim : num 0.00632 0.02731 0.02729 0.03237 0.06905 ...
## $ zn : num 18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
## $ indus : num 2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
## $ chas : int 0 0 0 0 0 0 0 0 0 0 ...
## $ nox : num 0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
## $ rm : num 6.58 6.42 7.18 7 7.15 ...
## $ age : num 65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
## $ dis : num 4.09 4.97 4.97 6.06 6.06 ...
## $ rad : int 1 2 2 3 3 3 5 5 5 5 ...
## $ tax : num 296 242 242 222 222 222 311 311 311 311 ...
## $ ptratio: num 15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
## $ black : num 397 397 393 395 397 ...
## $ lstat : num 4.98 9.14 4.03 2.94 5.33 ...
## $ medv : num 24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...
# How many rows are in this data set? How many columns? What do the rows and columns represent?
# dim() answers the first two questions directly: 506 rows (observations) and
# 14 columns (variables), matching the str() header above.
dim(Boston)
## [1] 506 14
Make some pairwise scatterplots of the predictors (columns) in this data set. Describe your findings.
# Scatterplot matrix covering every pair of columns in Boston.
graphics::pairs(Boston)
#10c Are any of the predictors associated with per capita crime rate? If so, explain the relationship.
# Histogram of the per capita crime rate column (crim), requesting about 50
# bins. This shows the distribution of crim on its own, not its relationship
# to the other predictors.
hist(Boston$crim, breaks = 50)
We see that about 80% of the data falls in the low-crime range, with a crime rate below 20.
#10d
Do any of the suburbs of Boston appear to have particularly high crime rates? Tax rates? Pupil-teacher ratios? Comment on the range of each predictor.
# Scatterplot matrix of all Boston columns, as in the earlier part.
pairs(Boston)
# The plot alone does not give the range of each predictor, so compute it per
# column: row 1 holds the minimum and row 2 the maximum. The crim, tax and
# ptratio columns are the ones to compare for unusually high values.
boston_ranges <- vapply(Boston, range, numeric(2))
boston_ranges