• Private : Public/private indicator • Apps : Number of applications received • Accept : Number of applicants accepted • Enroll : Number of new students enrolled • Top10perc : New students from top 10 % of high school class • Top25perc : New students from top 25 % of high school class • F.Undergrad : Number of full-time undergraduates • P.Undergrad : Number of part-time undergraduates • Outstate : Out-of-state tuition • Room.Board : Room and board costs • Books : Estimated book costs • Personal : Estimated personal spending • PhD : Percent of faculty with Ph.D.’s • Terminal : Percent of faculty with terminal degree • S.F.Ratio : Student/faculty ratio • perc.alumni : Percent of alumni who donate • Expend : Instructional expenditure per student • Grad.Rate : Graduation rate
# Install ISLR only when it is not already available, so that re-running the
# script does not download and reinstall the package every time.
if (!requireNamespace("ISLR", quietly = TRUE)) {
  install.packages("ISLR")
}
##
## The downloaded binary packages are in
## /var/folders/71/vnhmr1ts6w354s54vd5n1xw80000gn/T//RtmpvTz9R3/downloaded_packages
# Attach the ISLR package, which ships the data sets used in these exercises.
library(ISLR)
# Load the College data frame into the workspace.
data("College")
# Per-column summary: level counts for the factor column, quartiles and mean
# for the numeric columns.
summary(College)
## Private Apps Accept Enroll Top10perc
## No :212 Min. : 81 Min. : 72 Min. : 35 Min. : 1.00
## Yes:565 1st Qu.: 776 1st Qu.: 604 1st Qu.: 242 1st Qu.:15.00
## Median : 1558 Median : 1110 Median : 434 Median :23.00
## Mean : 3002 Mean : 2019 Mean : 780 Mean :27.56
## 3rd Qu.: 3624 3rd Qu.: 2424 3rd Qu.: 902 3rd Qu.:35.00
## Max. :48094 Max. :26330 Max. :6392 Max. :96.00
## Top25perc F.Undergrad P.Undergrad Outstate
## Min. : 9.0 Min. : 139 Min. : 1.0 Min. : 2340
## 1st Qu.: 41.0 1st Qu.: 992 1st Qu.: 95.0 1st Qu.: 7320
## Median : 54.0 Median : 1707 Median : 353.0 Median : 9990
## Mean : 55.8 Mean : 3700 Mean : 855.3 Mean :10441
## 3rd Qu.: 69.0 3rd Qu.: 4005 3rd Qu.: 967.0 3rd Qu.:12925
## Max. :100.0 Max. :31643 Max. :21836.0 Max. :21700
## Room.Board Books Personal PhD
## Min. :1780 Min. : 96.0 Min. : 250 Min. : 8.00
## 1st Qu.:3597 1st Qu.: 470.0 1st Qu.: 850 1st Qu.: 62.00
## Median :4200 Median : 500.0 Median :1200 Median : 75.00
## Mean :4358 Mean : 549.4 Mean :1341 Mean : 72.66
## 3rd Qu.:5050 3rd Qu.: 600.0 3rd Qu.:1700 3rd Qu.: 85.00
## Max. :8124 Max. :2340.0 Max. :6800 Max. :103.00
## Terminal S.F.Ratio perc.alumni Expend
## Min. : 24.0 Min. : 2.50 Min. : 0.00 Min. : 3186
## 1st Qu.: 71.0 1st Qu.:11.50 1st Qu.:13.00 1st Qu.: 6751
## Median : 82.0 Median :13.60 Median :21.00 Median : 8377
## Mean : 79.7 Mean :14.09 Mean :22.74 Mean : 9660
## 3rd Qu.: 92.0 3rd Qu.:16.50 3rd Qu.:31.00 3rd Qu.:10830
## Max. :100.0 Max. :39.80 Max. :64.00 Max. :56233
## Grad.Rate
## Min. : 10.00
## 1st Qu.: 53.00
## Median : 65.00
## Mean : 65.46
## 3rd Qu.: 78.00
## Max. :118.00
# Read the CSV copy of the College data.
# - row.names = 1 uses the first column (the college name) as row names and
#   drops it from the data, replacing the manual rownames()/[, -1] steps.
# - stringsAsFactors = TRUE is spelled out because read.csv() stopped
#   converting strings to factors by default in R 4.0.0; without it `Private`
#   is read as character and the factor-based boxplots below fail.
# - The path is specific to the original author's machine; the space in
#   "Google Drive" needs no backslash inside an R string.
college_csv <- "/Users/gopakumargeetha/Google Drive/Predictive-Modelling-in-R/ISLR/College.csv"
college <- read.csv(
  college_csv,
  header = TRUE,
  row.names = 1,
  stringsAsFactors = TRUE
)
summary(college)
## Private Apps Accept Enroll Top10perc
## No :212 Min. : 81 Min. : 72 Min. : 35 Min. : 1.00
## Yes:565 1st Qu.: 776 1st Qu.: 604 1st Qu.: 242 1st Qu.:15.00
## Median : 1558 Median : 1110 Median : 434 Median :23.00
## Mean : 3002 Mean : 2019 Mean : 780 Mean :27.56
## 3rd Qu.: 3624 3rd Qu.: 2424 3rd Qu.: 902 3rd Qu.:35.00
## Max. :48094 Max. :26330 Max. :6392 Max. :96.00
## Top25perc F.Undergrad P.Undergrad Outstate
## Min. : 9.0 Min. : 139 Min. : 1.0 Min. : 2340
## 1st Qu.: 41.0 1st Qu.: 992 1st Qu.: 95.0 1st Qu.: 7320
## Median : 54.0 Median : 1707 Median : 353.0 Median : 9990
## Mean : 55.8 Mean : 3700 Mean : 855.3 Mean :10441
## 3rd Qu.: 69.0 3rd Qu.: 4005 3rd Qu.: 967.0 3rd Qu.:12925
## Max. :100.0 Max. :31643 Max. :21836.0 Max. :21700
## Room.Board Books Personal PhD
## Min. :1780 Min. : 96.0 Min. : 250 Min. : 8.00
## 1st Qu.:3597 1st Qu.: 470.0 1st Qu.: 850 1st Qu.: 62.00
## Median :4200 Median : 500.0 Median :1200 Median : 75.00
## Mean :4358 Mean : 549.4 Mean :1341 Mean : 72.66
## 3rd Qu.:5050 3rd Qu.: 600.0 3rd Qu.:1700 3rd Qu.: 85.00
## Max. :8124 Max. :2340.0 Max. :6800 Max. :103.00
## Terminal S.F.Ratio perc.alumni Expend
## Min. : 24.0 Min. : 2.50 Min. : 0.00 Min. : 3186
## 1st Qu.: 71.0 1st Qu.:11.50 1st Qu.:13.00 1st Qu.: 6751
## Median : 82.0 Median :13.60 Median :21.00 Median : 8377
## Mean : 79.7 Mean :14.09 Mean :22.74 Mean : 9660
## 3rd Qu.: 92.0 3rd Qu.:16.50 3rd Qu.:31.00 3rd Qu.:10830
## Max. :100.0 Max. :39.80 Max. :64.00 Max. :56233
## Grad.Rate
## Min. : 10.00
## 1st Qu.: 53.00
## Median : 65.00
## Mean : 65.46
## 3rd Qu.: 78.00
## Max. :118.00
# Scatterplot matrix of the first ten columns of the data.
pairs(college[,1:10])
# Side-by-side boxplots of out-of-state tuition for each level of Private.
plot(college$Private,college$Outstate,col = c("red","green")) # plot() draws boxplots only when the first argument is a factor
# Label a college "Yes" when more than 50% of its new students come from the
# top 10% of their high school class, and "No" otherwise.
Elite <- replace(rep("No", nrow(college)), which(college$Top10perc > 50), "Yes")
# Store the labels as a factor so they can drive summaries and boxplots.
Elite <- as.factor(Elite)
# Append the factor to the data frame as a new column named Elite.
college <- data.frame(college, Elite)
Use the summary() function to see how many elite universities there are. Now use the plot() function to produce side-by-side boxplots of Outstate versus Elite.
# Number of colleges in each Elite level.
summary(college$Elite)
## No Yes
## 699 78
# Side-by-side boxplots of out-of-state tuition for non-elite vs. elite colleges.
plot(college$Elite,college$Outstate,col=c("brown","red"))
v. Use the hist() function to produce some histograms with differing numbers of bins for a few of the quantitative variables. You may find the command par(mfrow=c(2,2)) useful: it will divide the print window into four regions so that four plots can be made simultaneously. Modifying the arguments to this function will divide the screen in other ways. vi. Continue exploring the data, and provide a brief summary of what you discover.
# Draw four histograms in a 2 x 2 grid. par() returns the previous settings,
# which are restored afterwards so later plots are not squeezed into the grid.
old_par <- par(mfrow = c(2, 2))
hist(college$Accept, main = "Number of applications accepted", col = "red", breaks = 50)
# hist() has no `bin` argument: the original `bin = 100` was passed on as a
# graphical parameter, ignored, and reported as "not a graphical parameter".
# `breaks` is the argument that suggests the number of cells.
hist(college$Enroll, main = "Number of new students enrolled", col = "green", breaks = 100)
hist(college$PhD, main = "Percent of faculties with PhD", col = "red", breaks = 20)
hist(college$perc.alumni, main = "Percent of alumini donate", col = "green")
par(old_par)
# Five-number summary and mean of the number of applications received.
summary(college$Apps)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 81 776 1558 3002 3624 48090
Which of the predictors are quantitative, and which are qualitative? Quantitative variables: mpg, cylinders, displacement, horsepower, weight, acceleration. Qualitative variables: year, origin, name.
What is the range of each quantitative predictor? You can answer this using the range() function. range()
What is the mean and standard deviation of each quantitative predictor?
# Load the Auto data frame (shipped with the ISLR package attached earlier).
data("Auto")
# Per-column summary: quartiles and mean for numeric columns, most frequent
# levels for the name factor.
summary(Auto)
## mpg cylinders displacement horsepower
## Min. : 9.00 Min. :3.000 Min. : 68.0 Min. : 46.0
## 1st Qu.:17.00 1st Qu.:4.000 1st Qu.:105.0 1st Qu.: 75.0
## Median :22.75 Median :4.000 Median :151.0 Median : 93.5
## Mean :23.45 Mean :5.472 Mean :194.4 Mean :104.5
## 3rd Qu.:29.00 3rd Qu.:8.000 3rd Qu.:275.8 3rd Qu.:126.0
## Max. :46.60 Max. :8.000 Max. :455.0 Max. :230.0
##
## weight acceleration year origin
## Min. :1613 Min. : 8.00 Min. :70.00 Min. :1.000
## 1st Qu.:2225 1st Qu.:13.78 1st Qu.:73.00 1st Qu.:1.000
## Median :2804 Median :15.50 Median :76.00 Median :1.000
## Mean :2978 Mean :15.54 Mean :75.98 Mean :1.577
## 3rd Qu.:3615 3rd Qu.:17.02 3rd Qu.:79.00 3rd Qu.:2.000
## Max. :5140 Max. :24.80 Max. :82.00 Max. :3.000
##
## name
## amc matador : 5
## ford pinto : 5
## toyota corolla : 5
## amc gremlin : 4
## amc hornet : 4
## chevrolet chevette: 4
## (Other) :365
# Smallest and largest mpg in the data.
range(Auto$mpg)
## [1] 9.0 46.6
# Smallest and largest cylinder count in the data.
range(Auto$cylinders)
## [1] 3 8
# Columns 7 to 9 (year, origin, name) are the ones treated as qualitative here.
summary(Auto[, 7:9])
## year origin name
## Min. :70.00 Min. :1.000 amc matador : 5
## 1st Qu.:73.00 1st Qu.:1.000 ford pinto : 5
## Median :76.00 Median :1.000 toyota corolla : 5
## Mean :75.98 Mean :1.577 amc gremlin : 4
## 3rd Qu.:79.00 3rd Qu.:2.000 amc hornet : 4
## Max. :82.00 Max. :3.000 chevrolet chevette: 4
## (Other) :365
# Columns 1 to 6 (mpg through acceleration) are the quantitative predictors.
summary(Auto[, 1:6])
## mpg cylinders displacement horsepower
## Min. : 9.00 Min. :3.000 Min. : 68.0 Min. : 46.0
## 1st Qu.:17.00 1st Qu.:4.000 1st Qu.:105.0 1st Qu.: 75.0
## Median :22.75 Median :4.000 Median :151.0 Median : 93.5
## Mean :23.45 Mean :5.472 Mean :194.4 Mean :104.5
## 3rd Qu.:29.00 3rd Qu.:8.000 3rd Qu.:275.8 3rd Qu.:126.0
## Max. :46.60 Max. :8.000 Max. :455.0 Max. :230.0
## weight acceleration
## Min. :1613 Min. : 8.00
## 1st Qu.:2225 1st Qu.:13.78
## Median :2804 Median :15.50
## Mean :2978 Mean :15.54
## 3rd Qu.:3615 3rd Qu.:17.02
## Max. :5140 Max. :24.80
# Range of each quantitative predictor: row 1 is the minimum, row 2 the maximum.
sapply(Auto[, 1:6], range)
## mpg cylinders displacement horsepower weight acceleration
## [1,] 9.0 3 68 46 1613 8.0
## [2,] 46.6 8 455 230 5140 24.8
# Mean of each quantitative predictor (the standard deviations follow below).
sapply(Auto[, 1:6], mean)
## mpg cylinders displacement horsepower weight
## 23.445918 5.471939 194.411990 104.469388 2977.584184
## acceleration
## 15.541327
# Standard deviation of each quantitative predictor.
sapply(Auto[, 1:6], sd)
## mpg cylinders displacement horsepower weight
## 7.805007 1.705783 104.644004 38.491160 849.402560
## acceleration
## 2.758864
# Drop observations 10 through 85; negative row indices remove rows.
new.auto <- Auto[-(10:85), ]
# Range of every remaining column except column 9 (the name factor).
sapply(new.auto[, -9], range)
## mpg cylinders displacement horsepower weight acceleration year
## [1,] 11.0 3 68 46 1649 8.5 70
## [2,] 46.6 8 455 230 4997 24.8 82
## origin
## [1,] 1
## [2,] 3
# Mean of every column of the reduced data except column 9 (the name factor).
sapply(new.auto[, -9], mean)
## mpg cylinders displacement horsepower weight
## 24.404430 5.373418 187.240506 100.721519 2935.971519
## acceleration year origin
## 15.726899 77.145570 1.601266
# Standard deviation of every column of the reduced data except column 9.
sapply(new.auto[, -9], sd)
## mpg cylinders displacement horsepower weight
## 7.867283 1.654179 99.678367 35.708853 811.300208
## acceleration year origin
## 2.693721 3.106217 0.819910
# Scatterplot matrix of every pair of columns, with a smoothed trend line in each panel.
pairs(Auto,panel=panel.smooth,main="scatter plots of all pairs of variables")
# mpg seems to increase up to 4 cylinders and then drop, so a larger number of cylinders does not mean better fuel efficiency.
plot(as.factor(Auto$cylinders),Auto$mpg) # boxplots of mpg per cylinder count show this clearly
# mpg vs. weight of the car (mpg on the x-axis, weight on the y-axis)
plot(Auto$mpg,Auto$weight)
# This shows the inverse relationship we expect between mpg and the weight of the car.
# To see how mpg relates to the most important parameters, redo the pairs plot for a few columns only.
pairs(~ mpg + horsepower + weight + displacement, data = Auto, panel = panel.smooth)
# mpg decreases as horsepower, weight and displacement increase.
# Horsepower, weight and displacement increase together.
# Now compare mpg across the origin of the car.
# Here 1 is American, 2 is European and 3 is Japanese
plot(factor(Auto$origin),Auto$mpg,names=(c("American","European","Japanese")))
# Japanese cars tend to have higher miles per gallon, as expected in comparison with the US gas guzzlers.
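From the above plots and the discussion of them, we can see that mpg (miles per gallon) is related to variables such as number of cylinders, horsepower, weight and displacement: mpg decreases as horsepower, weight and displacement increase, and it is lower for cars with more than four cylinders.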
# Attach MASS, which provides the Boston housing data.
library(MASS)
# Load the Boston data frame into the workspace.
data(Boston)
# Quartiles and mean of every column.
summary(Boston)
## crim zn indus chas
## Min. : 0.00632 Min. : 0.00 Min. : 0.46 Min. :0.00000
## 1st Qu.: 0.08204 1st Qu.: 0.00 1st Qu.: 5.19 1st Qu.:0.00000
## Median : 0.25651 Median : 0.00 Median : 9.69 Median :0.00000
## Mean : 3.61352 Mean : 11.36 Mean :11.14 Mean :0.06917
## 3rd Qu.: 3.67708 3rd Qu.: 12.50 3rd Qu.:18.10 3rd Qu.:0.00000
## Max. :88.97620 Max. :100.00 Max. :27.74 Max. :1.00000
## nox rm age dis
## Min. :0.3850 Min. :3.561 Min. : 2.90 Min. : 1.130
## 1st Qu.:0.4490 1st Qu.:5.886 1st Qu.: 45.02 1st Qu.: 2.100
## Median :0.5380 Median :6.208 Median : 77.50 Median : 3.207
## Mean :0.5547 Mean :6.285 Mean : 68.57 Mean : 3.795
## 3rd Qu.:0.6240 3rd Qu.:6.623 3rd Qu.: 94.08 3rd Qu.: 5.188
## Max. :0.8710 Max. :8.780 Max. :100.00 Max. :12.127
## rad tax ptratio black
## Min. : 1.000 Min. :187.0 Min. :12.60 Min. : 0.32
## 1st Qu.: 4.000 1st Qu.:279.0 1st Qu.:17.40 1st Qu.:375.38
## Median : 5.000 Median :330.0 Median :19.05 Median :391.44
## Mean : 9.549 Mean :408.2 Mean :18.46 Mean :356.67
## 3rd Qu.:24.000 3rd Qu.:666.0 3rd Qu.:20.20 3rd Qu.:396.23
## Max. :24.000 Max. :711.0 Max. :22.00 Max. :396.90
## lstat medv
## Min. : 1.73 Min. : 5.00
## 1st Qu.: 6.95 1st Qu.:17.02
## Median :11.36 Median :21.20
## Mean :12.65 Mean :22.53
## 3rd Qu.:16.95 3rd Qu.:25.00
## Max. :37.97 Max. :50.00
str(Boston) # 506 rows (observations) and 14 columns (variables)
## 'data.frame': 506 obs. of 14 variables:
## $ crim : num 0.00632 0.02731 0.02729 0.03237 0.06905 ...
## $ zn : num 18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
## $ indus : num 2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
## $ chas : int 0 0 0 0 0 0 0 0 0 0 ...
## $ nox : num 0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
## $ rm : num 6.58 6.42 7.18 7 7.15 ...
## $ age : num 65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
## $ dis : num 4.09 4.97 4.97 6.06 6.06 ...
## $ rad : int 1 2 2 3 3 3 5 5 5 5 ...
## $ tax : num 296 242 242 222 222 222 311 311 311 311 ...
## $ ptratio: num 15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
## $ black : num 397 397 393 395 397 ...
## $ lstat : num 4.98 9.14 4.03 2.94 5.33 ...
## $ medv : num 24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...
pairs(Boston) # all 14 columns at once is very difficult to read, so look at a few variables at a time below
# Crime rate against lstat and black, with a smoothed trend line in each panel.
pairs(~ crim + lstat + black, data = Boston, panel = panel.smooth)
# black, median home value, distance to employment centres and age, pairwise.
pairs(~ black + medv + dis + age , data = Boston, panel = panel.smooth)
# Boxplots split by chas (1 = tract bounds the Charles River, 0 = otherwise).
plot(as.factor(Boston$chas),Boston$age) # buildings appear older near the river
plot(as.factor(Boston$chas),Boston$lstat) # lower-status population appears smaller near the river
plot(as.factor(Boston$chas),Boston$medv) # homes appear more expensive near the river, but there are expensive outliers away from the river too
(c) Are any of the predictors associated with per capita crime rate? If so, explain the relationship. From the above plots we can roughly say that the crime rate tends to be higher in suburbs with a larger lower-status population.
# Crime rate split by chas (1 = tract bounds the Charles River).
plot(as.factor(Boston$chas),Boston$crim) # crime appears higher away from the river
# Crime rate against dis, the distance to the five Boston employment centres.
plot(Boston$dis,Boston$crim) # crime appears highest close to the employment centres
(d) Do any of the suburbs of Boston appear to have particularly high crime rates? Tax rates? Pupil-teacher ratios? Comment on the range of each predictor.
# Tax rate split by chas (1 = tract bounds the Charles River).
plot(as.factor(Boston$chas),Boston$tax) # surprisingly, the tax appears lower near the river
# Smallest and largest per capita crime rate.
range(Boston$crim)
## [1] 0.00632 88.97620
# Distribution of the tax rate.
hist(Boston$tax)
# Distribution of the crime rate; 50 suggested bins because the data are heavily skewed towards small values.
hist(Boston$crim,breaks=50)
(e) How many of the suburbs in this data set bound the Charles river?
table(Boston$chas) # 35 suburbs bound the Charles River (chas = 1), 471 do not
##
## 0 1
## 471 35
# Median pupil-teacher ratio across all suburbs.
median(Boston$ptratio)
## [1] 19.05
plot(as.factor(Boston$chas),Boston$medv) # median value of owner-occupied homes appears higher for tracts bounding the Charles River
# Row index of the suburb with the lowest median home value.
which.min(Boston$medv)
## [1] 399
Boston[which.min(Boston$medv),] # chas = 0, so this suburb does not bound the Charles River; its rad of 24 equals the maximum of that column
## crim zn indus chas nox rm age dis rad tax ptratio black
## 399 38.3518 0 18.1 0 0.693 5.453 100 1.4896 24 666 20.2 396.9
## lstat medv
## 399 30.59 5
summary(Boston$crim) # max is 88.98 and the median is about 0.26; the suburb with the lowest medv has crim 38.35, far above the median
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00632 0.08204 0.25650 3.61400 3.67700 88.98000
summary(Boston$rm) # the mean of rm is about 6.285 rooms
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 3.561 5.886 6.208 6.285 6.624 8.780
table(Boston$rm > 7) # 64 suburbs have rm above 7 (each row is a suburb, not a single house)
##
## FALSE TRUE
## 442 64
table(Boston$rm >8) # 13 suburbs have rm above 8
##
## FALSE TRUE
## 493 13
# Keep only the suburbs whose rm value exceeds 8, then summarise every column.
rooms8 <- Boston[Boston$rm > 8, ]
summary(rooms8)
## crim zn indus chas
## Min. :0.02009 Min. : 0.00 Min. : 2.680 Min. :0.0000
## 1st Qu.:0.33147 1st Qu.: 0.00 1st Qu.: 3.970 1st Qu.:0.0000
## Median :0.52014 Median : 0.00 Median : 6.200 Median :0.0000
## Mean :0.71879 Mean :13.62 Mean : 7.078 Mean :0.1538
## 3rd Qu.:0.57834 3rd Qu.:20.00 3rd Qu.: 6.200 3rd Qu.:0.0000
## Max. :3.47428 Max. :95.00 Max. :19.580 Max. :1.0000
## nox rm age dis
## Min. :0.4161 Min. :8.034 Min. : 8.40 Min. :1.801
## 1st Qu.:0.5040 1st Qu.:8.247 1st Qu.:70.40 1st Qu.:2.288
## Median :0.5070 Median :8.297 Median :78.30 Median :2.894
## Mean :0.5392 Mean :8.349 Mean :71.54 Mean :3.430
## 3rd Qu.:0.6050 3rd Qu.:8.398 3rd Qu.:86.50 3rd Qu.:3.652
## Max. :0.7180 Max. :8.780 Max. :93.90 Max. :8.907
## rad tax ptratio black
## Min. : 2.000 Min. :224.0 Min. :13.00 Min. :354.6
## 1st Qu.: 5.000 1st Qu.:264.0 1st Qu.:14.70 1st Qu.:384.5
## Median : 7.000 Median :307.0 Median :17.40 Median :386.9
## Mean : 7.462 Mean :325.1 Mean :16.36 Mean :385.2
## 3rd Qu.: 8.000 3rd Qu.:307.0 3rd Qu.:17.40 3rd Qu.:389.7
## Max. :24.000 Max. :666.0 Max. :20.20 Max. :396.9
## lstat medv
## Min. :2.47 Min. :21.9
## 1st Qu.:3.32 1st Qu.:41.7
## Median :4.14 Median :48.3
## Mean :4.31 Mean :44.2
## 3rd Qu.:5.12 3rd Qu.:50.0
## Max. :7.44 Max. :50.0
# Crime seems lower in the suburbs with rm above 8: their maximum crim is 3.47, against 88.98 over all suburbs.
table(rooms8$chas)
##
## 0 1
## 11 2
# 11 of the 13 suburbs with rm above 8 do not bound the Charles River (only 2 do).
summary(rooms8$black)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 354.6 384.5 386.9 385.2 389.7 396.9
summary(Boston$black)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.32 375.40 391.40 356.70 396.20 396.90
# For the rooms8 suburbs `black` stays between 354.6 and 396.9, whereas over all suburbs it goes as low as 0.32.