Preparatory Steps

Option 1: Download the College.csv file from here. Either download the college data set manually and save the file in your working directory, or use the download and load commands below.

#NOTE(review): this path is specific to the original author's machine; point it at your own working directory.
setwd("C:/Users/kannanramachandra.sr/Desktop/ISLR/Practice")
#download.file("http://www-bcf.usc.edu/~gareth/ISL/College.csv",destfile="./college.csv",mode="wb")
#for Windows machines use mode = "wb"
#A CSV file is read with read.csv(); load() only restores objects written by save(), so it cannot read it:
#college <- read.csv("./college.csv", header = TRUE)

Option 2: There is an alternate way to get this data. If you install the ISLR package, you can load the College dataset directly. If you do this, the row names are already set for you, so you do not have to assign them yourself.

#install.packages("ISLR")
library(ISLR)
## Warning: package 'ISLR' was built under R version 3.2.5
data(College)

Question 8

#loading data using read.csv command
#NOTE(review): the download command in the preparatory steps saves the file as ./college.csv; adjust this path if your copy is not under ./ch2/
#stringsAsFactors = TRUE keeps Private as a factor on R >= 4.0, where the default became FALSE;
#the scatterplot matrix in 8c.ii and the box plot in 8c.iii both need it to be a factor
college <- read.csv("./ch2/college.csv", header = TRUE, stringsAsFactors = TRUE)
#fix() opens the interactive data editor, so it is only called when a user is present
if (interactive()) fix(college)
#the first column holds the university names: use them as row names, then drop the column
rownames(college) <- as.character(college[, 1])
college <- college[, -1]
if (interactive()) fix(college)
# you can also use View(college)

8c.i

summary(college)
##  Private        Apps           Accept          Enroll       Top10perc    
##  No :212   Min.   :   81   Min.   :   72   Min.   :  35   Min.   : 1.00  
##  Yes:565   1st Qu.:  776   1st Qu.:  604   1st Qu.: 242   1st Qu.:15.00  
##            Median : 1558   Median : 1110   Median : 434   Median :23.00  
##            Mean   : 3002   Mean   : 2019   Mean   : 780   Mean   :27.56  
##            3rd Qu.: 3624   3rd Qu.: 2424   3rd Qu.: 902   3rd Qu.:35.00  
##            Max.   :48094   Max.   :26330   Max.   :6392   Max.   :96.00  
##    Top25perc      F.Undergrad     P.Undergrad         Outstate    
##  Min.   :  9.0   Min.   :  139   Min.   :    1.0   Min.   : 2340  
##  1st Qu.: 41.0   1st Qu.:  992   1st Qu.:   95.0   1st Qu.: 7320  
##  Median : 54.0   Median : 1707   Median :  353.0   Median : 9990  
##  Mean   : 55.8   Mean   : 3700   Mean   :  855.3   Mean   :10441  
##  3rd Qu.: 69.0   3rd Qu.: 4005   3rd Qu.:  967.0   3rd Qu.:12925  
##  Max.   :100.0   Max.   :31643   Max.   :21836.0   Max.   :21700  
##    Room.Board       Books           Personal         PhD        
##  Min.   :1780   Min.   :  96.0   Min.   : 250   Min.   :  8.00  
##  1st Qu.:3597   1st Qu.: 470.0   1st Qu.: 850   1st Qu.: 62.00  
##  Median :4200   Median : 500.0   Median :1200   Median : 75.00  
##  Mean   :4358   Mean   : 549.4   Mean   :1341   Mean   : 72.66  
##  3rd Qu.:5050   3rd Qu.: 600.0   3rd Qu.:1700   3rd Qu.: 85.00  
##  Max.   :8124   Max.   :2340.0   Max.   :6800   Max.   :103.00  
##     Terminal       S.F.Ratio      perc.alumni        Expend     
##  Min.   : 24.0   Min.   : 2.50   Min.   : 0.00   Min.   : 3186  
##  1st Qu.: 71.0   1st Qu.:11.50   1st Qu.:13.00   1st Qu.: 6751  
##  Median : 82.0   Median :13.60   Median :21.00   Median : 8377  
##  Mean   : 79.7   Mean   :14.09   Mean   :22.74   Mean   : 9660  
##  3rd Qu.: 92.0   3rd Qu.:16.50   3rd Qu.:31.00   3rd Qu.:10830  
##  Max.   :100.0   Max.   :39.80   Max.   :64.00   Max.   :56233  
##    Grad.Rate     
##  Min.   : 10.00  
##  1st Qu.: 53.00  
##  Median : 65.00  
##  Mean   : 65.46  
##  3rd Qu.: 78.00  
##  Max.   :118.00

8c.ii

pairs(college[,1:10])

8c.iii

str(college$Private) #check to make sure we have a factor variable
##  Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 2 2 2 ...
#box plot of out-of-state tuition for private vs. public universities;
#varwidth = TRUE makes each box's width reflect the number of universities in that group
plot(college$Private, college$Outstate,
     col = c(2, 3), varwidth = TRUE,
     xlab = "Private University", ylab = "Out of State Tuition in USD",
     main = "Outstate Tuition plot")

#to generate a box plot, the first variable in the plot function should be a factor variable

8c.iv

#flag a university as Elite when more than 50% of its new students come from the top 10% of their high school class
Elite <- rep("No", nrow(college))
Elite[college$Top10perc > 50] <- "Yes"
Elite <- as.factor(Elite)
#append the new factor to the data frame as the column Elite
college <- data.frame(college, Elite)
summary(Elite) #there are 78 elite universities
##  No Yes 
## 699  78
#Elite is already a factor variable, so plot() draws a box plot per level
plot(college$Elite, college$Outstate,
     col = c(2, 3), varwidth = TRUE,
     xlab = "Elite University", ylab = "Out of State Tuition in USD",
     main = "Outstate Tuition plot")

8c.v

#draw the four histograms in a 2 x 2 grid; par() returns the previous settings so they can be restored afterwards
old_par <- par(mfrow = c(2, 2))
hist(college$Books, col = 2, breaks = 50, xlab = "Books", ylab = "Count")
hist(college$PhD, col = 3, breaks = 50, xlab = "PhD", ylab = "Count")
hist(college$Grad.Rate, col = 4, breaks = 50, xlab = "Grad Rate", ylab = "Count")
hist(college$perc.alumni, col = 6, breaks = 50, xlab = "% alumni who donate", ylab = "Count")
#go back to the previous layout so that later plots are not squeezed into the 2 x 2 grid
par(old_par)

# mfrow fills the grid row by row. To fill it column by column instead, use -
#par(mfcol = c(2,2))
#Use the help screen to experiment with different parameters.
#?par

8c.vi

#One possible exploration (what you look at depends on the question you want this dataset to answer).
#Here: which universities have the highest % of faculty with PhDs?
summary(college$PhD)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    8.00   62.00   75.00   72.66   85.00  103.00
#The % of faculty with PhDs runs from 8 to 103 with a median of 75. A value of 103% looks like a data integrity problem (outlier).
#Pull out the rows with that 103% value first, then count them
subset1 <- college[college$PhD == 103, ]
nrow(subset1)
## [1] 1
#Only 1 university has it, so it is clearly an outlier. In the final analysis we can either correct it to 100% or drop the record and carry on with the rest of the data.
#The row name tells us which university it is
rownames(subset1)
## [1] "Texas A&M University at Galveston"

Question 9

#loading the data
library(ISLR)
data(Auto)

9.a

Use the str function to determine data type for all the variables

str(Auto)
## 'data.frame':    392 obs. of  9 variables:
##  $ mpg         : num  18 15 18 16 17 15 14 14 14 15 ...
##  $ cylinders   : num  8 8 8 8 8 8 8 8 8 8 ...
##  $ displacement: num  307 350 318 304 302 429 454 440 455 390 ...
##  $ horsepower  : num  130 165 150 150 140 198 220 215 225 190 ...
##  $ weight      : num  3504 3693 3436 3433 3449 ...
##  $ acceleration: num  12 11.5 11 12 10.5 10 9 8.5 10 8.5 ...
##  $ year        : num  70 70 70 70 70 70 70 70 70 70 ...
##  $ origin      : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ name        : Factor w/ 304 levels "amc ambassador brougham",..: 49 36 231 14 161 141 54 223 241 2 ...
#all variables except name are numeric.

9.b

Easier option is to use the summary function to determine the range of all numeric variables. The Min and Max give you the range.

summary(Auto)
##       mpg          cylinders      displacement     horsepower   
##  Min.   : 9.00   Min.   :3.000   Min.   : 68.0   Min.   : 46.0  
##  1st Qu.:17.00   1st Qu.:4.000   1st Qu.:105.0   1st Qu.: 75.0  
##  Median :22.75   Median :4.000   Median :151.0   Median : 93.5  
##  Mean   :23.45   Mean   :5.472   Mean   :194.4   Mean   :104.5  
##  3rd Qu.:29.00   3rd Qu.:8.000   3rd Qu.:275.8   3rd Qu.:126.0  
##  Max.   :46.60   Max.   :8.000   Max.   :455.0   Max.   :230.0  
##                                                                 
##      weight      acceleration        year           origin     
##  Min.   :1613   Min.   : 8.00   Min.   :70.00   Min.   :1.000  
##  1st Qu.:2225   1st Qu.:13.78   1st Qu.:73.00   1st Qu.:1.000  
##  Median :2804   Median :15.50   Median :76.00   Median :1.000  
##  Mean   :2978   Mean   :15.54   Mean   :75.98   Mean   :1.577  
##  3rd Qu.:3615   3rd Qu.:17.02   3rd Qu.:79.00   3rd Qu.:2.000  
##  Max.   :5140   Max.   :24.80   Max.   :82.00   Max.   :3.000  
##                                                                
##                  name    
##  amc matador       :  5  
##  ford pinto        :  5  
##  toyota corolla    :  5  
##  amc gremlin       :  4  
##  amc hornet        :  4  
##  chevrolet chevette:  4  
##  (Other)           :365

9.c

The sapply function applies a function to each element of a list or vector (here, to each column of a data frame) and simplifies the result. Use the help ?sapply to learn more.

sapply(Auto[ ,(1:8)], mean)
##          mpg    cylinders displacement   horsepower       weight 
##    23.445918     5.471939   194.411990   104.469388  2977.584184 
## acceleration         year       origin 
##    15.541327    75.979592     1.576531
sapply(Auto[ ,(1:8)], sd)
##          mpg    cylinders displacement   horsepower       weight 
##    7.8050075    1.7057832  104.6440039   38.4911599  849.4025600 
## acceleration         year       origin 
##    2.7588641    3.6837365    0.8055182

9.d

#negative indices drop rows 10 through 85; the column means are then recomputed on the remaining rows
subset1 <- Auto[-(10:85), ]
sapply(subset1[, 1:8], mean)
##          mpg    cylinders displacement   horsepower       weight 
##    24.404430     5.373418   187.240506   100.721519  2935.971519 
## acceleration         year       origin 
##    15.726899    77.145570     1.601266
sapply(subset1[ ,(1:8)], sd)
##          mpg    cylinders displacement   horsepower       weight 
##     7.867283     1.654179    99.678367    35.708853   811.300208 
## acceleration         year       origin 
##     2.693721     3.106217     0.819910

9.e

pairs(Auto[1:8]) 

#this doesn't seem to be very helpful to me.
str(Auto$name) 
##  Factor w/ 304 levels "amc ambassador brougham",..: 49 36 231 14 161 141 54 223 241 2 ...
# there are 304 different levels. Not very useful for grouping by name

CYLINDERS

str(Auto$cylinders) #cylinders should be a factor variable. Let us convert it.
##  num [1:392] 8 8 8 8 8 8 8 8 8 8 ...
Auto$cylinders <- as.factor(Auto$cylinders)
#with a factor on the x axis plot() draws one box per cylinder count; varwidth = TRUE scales box width by group size
plot(Auto$cylinders, Auto$mpg,
     xlab = "Cylinders", ylab = "Mileage",
     varwidth = TRUE, col = 2:6)

Conclusion: We seem to get more mileage per gallon on a 4 cyl vehicle than the others.

*** WEIGHT, DISPLACEMENT, HORSEPOWER ***

plot(Auto$horsepower, Auto$mpg, xlab = "Horsepower", ylab ="Mileage", col = "black", pch=19)

plot(Auto$weight, Auto$mpg, xlab = "Weight", ylab ="Mileage", col = "blue", pch=19)

plot(Auto$displacement, Auto$mpg, xlab = "Displacement", ylab ="Mileage", col = "blue", pch=19)

Weight, displacement and horsepower all seem to have an inverse relationship with mpg. The three plots have almost the same shape, so I am going to test whether these variables are correlated with each other.

#pairwise scatter plots of the three predictors; each axis label names the variable actually plotted on that axis
plot(Auto$weight, Auto$horsepower, xlab = "Weight", ylab = "Horsepower", col = "blue", pch = 19)

plot(Auto$displacement, Auto$horsepower, xlab = "Displacement", ylab = "Horsepower", col = "blue", pch = 19)

plot(Auto$displacement, Auto$weight, xlab = "Displacement", ylab = "Weight", col = "blue", pch = 19)

#cor function to find the correlation.
cor(Auto$weight, Auto$horsepower) 
## [1] 0.8645377
#highly correlated. So we may not need to use both variables when predicting mpg.
cor(Auto$displacement, Auto$horsepower) 
## [1] 0.897257
#highly correlated. So we may not need to use both variables when predicting mpg.
cor(Auto$displacement, Auto$weight) 
## [1] 0.9329944
#highly correlated. So we may not need to use both variables when predicting mpg.

Because these 3 variables are highly correlated with each other, it would be unnecessary to use them all for predicting MPG. We can pick just one

*** YEAR ***

str(Auto$year)
##  num [1:392] 70 70 70 70 70 70 70 70 70 70 ...
#treat the model year as a category so that plot() draws one box of mpg values per year
Auto$year <- as.factor(Auto$year)
plot(Auto$year, Auto$mpg,
     varwidth = TRUE, xlab = "Year", ylab = "Mileage",
     col = "blue", pch = 19)

Overall increase in mpg over the years. Almost doubled in 1 decade.

*** ORIGIN ***

# ?Auto shows that the origin variable refers to 1 = usa; 2 = europe; 3 = japan
Auto$origin <- as.factor(Auto$origin)
#xaxt = "n" suppresses the default 1/2/3 axis so that region names can be written in its place
plot(Auto$origin, Auto$mpg,
     varwidth = TRUE, xaxt = "n", xlab = "", ylab = "Mileage",
     col = "blue", pch = 19)
#one label per box, in factor-level order; the name avoids masking base::labels()
origin_labels <- c("USA", "Europe", "Japan")
#par("usr")[3] is the bottom edge of the plot region; xpd = TRUE allows drawing just below it
text(x = seq_along(origin_labels), y = par("usr")[3] - 1, srt = 0,
     labels = origin_labels, xpd = TRUE)

Conclusion: Japanese cars have higher mpg than US or European cars.

*** ACCELERATION ***

plot(Auto$acceleration, Auto$mpg, xlab = "Acceleration", ylab ="Mileage", col = "black", pch=19)

cor(Auto$acceleration, Auto$mpg)
## [1] 0.4233285

Not a strong correlation. I am going to leave this variable out when predicting mpg because it doesn’t show me a good pattern.

9.f

From the plots above, the following variables can be used as predictors: cylinders, horsepower, year and origin, because their box plots and scatter plots against mpg show clear patterns.

Question 10

library(MASS)
#print(Boston)
#View(Boston)
#?Boston opens the help page for the dataset; ??Boston would instead run a search across all installed help pages
?Boston
## starting httpd help server ...
##  done
#506 rows and 14 columns.

10.b

#load the dataset
data(Boston)

pairs(Boston)

#chas is a 0/1 indicator, so store it as a factor
Boston$chas <- as.factor(Boston$chas)
#crim is kept in its original units on purpose: the cut-offs used below (6, 10 and 20) are on that scale,
#and dividing crim by 100 would make crim<6 match every row and crim>20 match none

#plot crim against every other column, one predictor at a time, in column order
for (predictor in setdiff(names(Boston), "crim")) {
  #reformulate() builds the one-sided formula ~ <predictor> + crim
  plot(reformulate(c(predictor, "crim")), Boston, col = "#00000022", pch = 19)
}

Hard to predict anything about the crime rate from the scatterplot matrices or from individual graphs.

#attach() takes a snapshot of Boston's columns as they are at this point, so it must come after the changes above
attach(Boston)
hist(crim, breaks = 50)

pairs(Boston[crim<10,])

nrow(Boston[crim<6,])/nrow(Boston)
#NOTE(review): re-run to get the exact proportion; it should come out close to 0.8, in line with the statement below

80% of neighborhoods have crime rate per capita of less than 6%

10.c

hist(crim, breaks = 50)

#most suburbs have a very low crime rate. About 96% of the data falls in crim<20
pairs(Boston[crim<20,])

After drawing scatter plots using pairs function above, we can eyeball the scatter plots and guess that maybe there is a relationship between crim and nox, rm, age, dis, lstat and medv

10.d

nrow(Boston[crim>20,])
## [1] 18
nrow(Boston[crim>20,])/nrow(Boston)
## [1] 0.03557312
#about 3.6% of Boston's suburbs have a crime rate of more than 20
range(Boston[crim>20,]$crim)
## [1] 20.0849 88.9762
range(Boston[crim>20,]$tax)
## [1] 666 666
range(Boston[crim>20,]$ptratio)
## [1] 20.2 20.2
hist(Boston[tax==666,]$crim)

hist(Boston[ptratio>15,]$crim)

#Use the range function to find the range of numerical variables

Comment: For the 18 suburbs with crime rate> 20, the tax rate is 666, and the ptratio is 20.2. This is not representative of anything particular, in my opinion.

10.e

nrow(Boston[chas==1,])
## [1] 35

Answer : 35

10.f

median(ptratio)
## [1] 19.05

Answer : 19

10.g

#min(medv) is a value, not a row position: Boston[min(medv),] would return row 5 only because the minimum happens to be 5.
#Select the rows whose medv equals the minimum instead.
lowest_medv <- Boston[medv == min(medv),]
row.names(lowest_medv)
## [1] "399" "406"
#row.names returns the row labels of the selected rows; two suburbs share the lowest median home value.
#range for tax in the entire data set
range(tax)
## [1] 187 711
#value of tax when medv is minimum
lowest_medv$tax
## [1] 666 666
#do the same for other predictors as well.

10.h

nrow(Boston[rm>8,])
## [1] 13
#13 suburbs have more than 8 rooms per dwelling
range(Boston[rm>8,]$crim)
## [1] 0.02009 3.47428
#low crime rate
median(Boston[rm>8,]$ptratio)
## [1] 17.4
median(ptratio)
## [1] 19.05
#pt ratio is not the lowest, but lower than the median of the full data
median(Boston[rm>8,]$tax)
## [1] 307
median(tax)
## [1] 330

Median tax per $10k is still around the same.

Continue to do this for the other predictors. You could use the median/range/mean functions along with sapply for all the columns to find data for all columns at the same time.

Don’t forget to detach after you attach once.

detach(Boston)