#install.packages("ISLR")
library(ISLR)
data("College")

Question 8a,b

# Load the College data from a local CSV file; the first row holds the column
# names.
college <- read.csv("college.csv", header = TRUE)
# The first CSV column is read in as a character column named `X` (presumably
# the college names -- confirm against the file). The commented-out lines below
# would move it into the row names and drop it from the data. They are left
# disabled, so later steps see `X` as a regular column and the default integer
# row names.
# fix(college)
# rownames(college) <- college[, 1]
# college <- college[, -1]
# fix(college)

# Question 8c

# Print per-column summary statistics for the CSV-loaded data frame.
summary(college)
##       X               Private               Apps           Accept     
##  Length:777         Length:777         Min.   :   81   Min.   :   72  
##  Class :character   Class :character   1st Qu.:  776   1st Qu.:  604  
##  Mode  :character   Mode  :character   Median : 1558   Median : 1110  
##                                        Mean   : 3002   Mean   : 2019  
##                                        3rd Qu.: 3624   3rd Qu.: 2424  
##                                        Max.   :48094   Max.   :26330  
##      Enroll       Top10perc       Top25perc      F.Undergrad   
##  Min.   :  35   Min.   : 1.00   Min.   :  9.0   Min.   :  139  
##  1st Qu.: 242   1st Qu.:15.00   1st Qu.: 41.0   1st Qu.:  992  
##  Median : 434   Median :23.00   Median : 54.0   Median : 1707  
##  Mean   : 780   Mean   :27.56   Mean   : 55.8   Mean   : 3700  
##  3rd Qu.: 902   3rd Qu.:35.00   3rd Qu.: 69.0   3rd Qu.: 4005  
##  Max.   :6392   Max.   :96.00   Max.   :100.0   Max.   :31643  
##   P.Undergrad         Outstate       Room.Board       Books       
##  Min.   :    1.0   Min.   : 2340   Min.   :1780   Min.   :  96.0  
##  1st Qu.:   95.0   1st Qu.: 7320   1st Qu.:3597   1st Qu.: 470.0  
##  Median :  353.0   Median : 9990   Median :4200   Median : 500.0  
##  Mean   :  855.3   Mean   :10441   Mean   :4358   Mean   : 549.4  
##  3rd Qu.:  967.0   3rd Qu.:12925   3rd Qu.:5050   3rd Qu.: 600.0  
##  Max.   :21836.0   Max.   :21700   Max.   :8124   Max.   :2340.0  
##     Personal         PhD            Terminal       S.F.Ratio    
##  Min.   : 250   Min.   :  8.00   Min.   : 24.0   Min.   : 2.50  
##  1st Qu.: 850   1st Qu.: 62.00   1st Qu.: 71.0   1st Qu.:11.50  
##  Median :1200   Median : 75.00   Median : 82.0   Median :13.60  
##  Mean   :1341   Mean   : 72.66   Mean   : 79.7   Mean   :14.09  
##  3rd Qu.:1700   3rd Qu.: 85.00   3rd Qu.: 92.0   3rd Qu.:16.50  
##  Max.   :6800   Max.   :103.00   Max.   :100.0   Max.   :39.80  
##   perc.alumni        Expend        Grad.Rate     
##  Min.   : 0.00   Min.   : 3186   Min.   : 10.00  
##  1st Qu.:13.00   1st Qu.: 6751   1st Qu.: 53.00  
##  Median :21.00   Median : 8377   Median : 65.00  
##  Mean   :22.74   Mean   : 9660   Mean   : 65.46  
##  3rd Qu.:31.00   3rd Qu.:10830   3rd Qu.: 78.00  
##  Max.   :64.00   Max.   :56233   Max.   :118.00

#### 8c ii

# Scatterplot matrix of the first ten columns of the ISLR `College` data.
pairs(College[, seq_len(10)])

#### 8c iii

# Confirm that `Private` is a factor, so that plot() draws one boxplot per
# level rather than a scatterplot.
str(College$Private)
##  Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 2 2 2 ...
plot(
  College$Private, College$Outstate,
  col = c(2, 3),
  varwidth = TRUE, # box widths reflect the number of colleges in each group
  xlab = "Private University",
  ylab = "OutofState Tuition in USD",
  main = "Outstate Tuition plot"
)

#### 8c iv

# Flag colleges whose Top10perc exceeds 50 as "elite".
# The indicator is built from `college` itself (not the separately loaded ISLR
# `College`) so it is guaranteed to line up row-for-row with the data frame it
# is attached to.
Elite <- rep("No", nrow(college))
Elite[college$Top10perc > 50] <- "Yes"
Elite <- as.factor(Elite)
# Assign by name so that re-running the script replaces the column instead of
# appending a duplicate one.
college$Elite <- Elite
summary(Elite)
##  No Yes 
## 699  78

## There are 78 elite universities.

# Elite is already a factor, so plot() draws one boxplot per level.
plot(
  college$Elite, college$Outstate,
  col = c(2, 3),
  varwidth = TRUE, # box widths reflect the number of colleges in each group
  xlab = "Elite University",
  ylab = "OutofState Tuition in USD",
  main = "Outstate Tuition plot"
)

#### 8c v

# Draw four histograms on a 2x2 grid. par() returns the previous settings, which
# are restored afterwards so that later plots are not squeezed into the grid.
old_par <- par(mfrow = c(2, 2))
hist(college$Books, col = 2, breaks = 50, xlab = "Books", ylab = "Count")
hist(college$PhD, col = 3, breaks = 50, xlab = "PhD", ylab = "Count")
hist(college$Grad.Rate, col = 4, breaks = 50, xlab = "Grad Rate", ylab = "Count")
hist(college$perc.alumni, col = 6, breaks = 50, xlab = "% alumni who donate", ylab = "Count")
par(old_par)

#### 8c vi For example, if we want to know which universities have the highest graduation rate, we explore the data.

# Summary statistics (min, quartiles, mean, max) of the Grad.Rate column.
summary(college$Grad.Rate)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   10.00   53.00   65.00   65.46   78.00  118.00

# The range is from a minimum of 10 to a maximum of 118. Check the number of colleges with a graduation rate of 118.

# Select the rows whose Grad.Rate equals 118, then count them.
subset1 <- college[college$Grad.Rate == 118, ]
nrow(subset1)
## [1] 1

# There is only one such university, which seems to be an outlier.
# Now we want to find out which university that row belongs to.

# The row names were never set from the first CSV column (that step is
# commented out after read.csv), so row.names() only gives the row index.
row.names(subset1)
## [1] "96"
# The college name presumably sits in the first CSV column, which was read in
# as `X` -- confirm against the file.
subset1$X

Question 9

# We download the Auto data set into our working directory.

9a

Use the str function to determine data type for all the variables

# Show the number of observations and the type of every column in Auto.
str(Auto)
## 'data.frame':    392 obs. of  9 variables:
##  $ mpg         : num  18 15 18 16 17 15 14 14 14 15 ...
##  $ cylinders   : num  8 8 8 8 8 8 8 8 8 8 ...
##  $ displacement: num  307 350 318 304 302 429 454 440 455 390 ...
##  $ horsepower  : num  130 165 150 150 140 198 220 215 225 190 ...
##  $ weight      : num  3504 3693 3436 3433 3449 ...
##  $ acceleration: num  12 11.5 11 12 10.5 10 9 8.5 10 8.5 ...
##  $ year        : num  70 70 70 70 70 70 70 70 70 70 ...
##  $ origin      : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ name        : Factor w/ 304 levels "amc ambassador brougham",..: 49 36 231 14 161 141 54 223 241 2 ...

# We can see that all variables are numeric except name, which is categorical.

9b

What is the range of each quantitative predictor?

# Passing all columns to a single range() call pools them and returns only one
# overall minimum and maximum. Apply range() to each of the eight numeric
# columns instead, giving one min/max pair per predictor.
vapply(Auto[, 1:8], range, c(min = 0, max = 0))
##      mpg cylinders displacement horsepower weight acceleration year origin
## min  9.0         3           68         46   1613          8.0   70      1
## max 46.6         8          455        230   5140         24.8   82      3

9c

What is the mean and standard deviation of each quantitative predictor?

# summary() reports quartiles and means but no standard deviation, so compute
# the mean and standard deviation of each of the eight numeric columns
# explicitly.
vapply(
  Auto[, 1:8],
  function(x) c(mean = mean(x), sd = sd(x)),
  c(mean = 0, sd = 0)
)
# Quartiles and mean for every column, kept for reference.
summary(Auto)
##       mpg          cylinders      displacement     horsepower        weight    
##  Min.   : 9.00   Min.   :3.000   Min.   : 68.0   Min.   : 46.0   Min.   :1613  
##  1st Qu.:17.00   1st Qu.:4.000   1st Qu.:105.0   1st Qu.: 75.0   1st Qu.:2225  
##  Median :22.75   Median :4.000   Median :151.0   Median : 93.5   Median :2804  
##  Mean   :23.45   Mean   :5.472   Mean   :194.4   Mean   :104.5   Mean   :2978  
##  3rd Qu.:29.00   3rd Qu.:8.000   3rd Qu.:275.8   3rd Qu.:126.0   3rd Qu.:3615  
##  Max.   :46.60   Max.   :8.000   Max.   :455.0   Max.   :230.0   Max.   :5140  
##                                                                                
##   acceleration        year           origin                      name    
##  Min.   : 8.00   Min.   :70.00   Min.   :1.000   amc matador       :  5  
##  1st Qu.:13.78   1st Qu.:73.00   1st Qu.:1.000   ford pinto        :  5  
##  Median :15.50   Median :76.00   Median :1.000   toyota corolla    :  5  
##  Mean   :15.54   Mean   :75.98   Mean   :1.577   amc gremlin       :  4  
##  3rd Qu.:17.02   3rd Qu.:79.00   3rd Qu.:2.000   amc hornet        :  4  
##  Max.   :24.80   Max.   :82.00   Max.   :3.000   chevrolet chevette:  4  
##                                                  (Other)           :365

9d

Now remove the 10th through 85th observations. What is the range, mean, and standard deviation of each predictor in the subset of the data that remains?

# Drop observations 10 through 85 by negative row indexing. The subset()
# wrapper had no condition, so it returned its input unchanged and is removed.
subset1 <- Auto[-(10:85), ]
# Range, mean and standard deviation of each of the eight numeric columns in
# the remaining rows (summary() alone gives no standard deviation).
vapply(
  subset1[, 1:8],
  function(x) c(min = min(x), max = max(x), mean = mean(x), sd = sd(x)),
  c(min = 0, max = 0, mean = 0, sd = 0)
)
summary(subset1)
##       mpg          cylinders      displacement     horsepower        weight    
##  Min.   :11.00   Min.   :3.000   Min.   : 68.0   Min.   : 46.0   Min.   :1649  
##  1st Qu.:18.00   1st Qu.:4.000   1st Qu.:100.2   1st Qu.: 75.0   1st Qu.:2214  
##  Median :23.95   Median :4.000   Median :145.5   Median : 90.0   Median :2792  
##  Mean   :24.40   Mean   :5.373   Mean   :187.2   Mean   :100.7   Mean   :2936  
##  3rd Qu.:30.55   3rd Qu.:6.000   3rd Qu.:250.0   3rd Qu.:115.0   3rd Qu.:3508  
##  Max.   :46.60   Max.   :8.000   Max.   :455.0   Max.   :230.0   Max.   :4997  
##                                                                                
##   acceleration        year           origin     
##  Min.   : 8.50   Min.   :70.00   Min.   :1.000  
##  1st Qu.:14.00   1st Qu.:75.00   1st Qu.:1.000  
##  Median :15.50   Median :77.00   Median :1.000  
##  Mean   :15.73   Mean   :77.15   Mean   :1.601  
##  3rd Qu.:17.30   3rd Qu.:80.00   3rd Qu.:2.000  
##  Max.   :24.80   Max.   :82.00   Max.   :3.000  
##                                                 
##                         name    
##  ford pinto               :  5  
##  toyota corolla           :  5  
##  amc matador              :  4  
##  chevrolet chevette       :  4  
##  amc hornet               :  3  
##  chevrolet caprice classic:  3  
##  (Other)                  :292

9e

Using the full data set, investigate the predictors graphically, using scatterplots or other tools of your choice. Create some plots highlighting the relationships among the predictors. Comment on your findings.

# Scatterplot matrix of the first eight columns of Auto (everything but name).
pairs(Auto[, seq_len(8)])

# 9f Suppose that we wish to predict gas mileage (mpg) on the basis of the other variables. Do your plots suggest that any of the other variables might be useful in predicting mpg? Justify your answer.

# We want to predict mpg as the response from the other variables in the data set. The plots show a linear relationship between cylinders and mpg, so cylinders should be a useful predictor.

Question 10

This exercise involves the Boston housing data set. (a) To begin, load in the Boston data set. The Boston data set is part of the MASS library in R.

# Attach MASS, which provides the Boston data set, then load the data.
library(MASS)
data("Boston")
# str() reports the number of rows and columns and the type of every column.
str(Boston)
## 'data.frame':    506 obs. of  14 variables:
##  $ crim   : num  0.00632 0.02731 0.02729 0.03237 0.06905 ...
##  $ zn     : num  18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
##  $ indus  : num  2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
##  $ chas   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ nox    : num  0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
##  $ rm     : num  6.58 6.42 7.18 7 7.15 ...
##  $ age    : num  65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
##  $ dis    : num  4.09 4.97 4.97 6.06 6.06 ...
##  $ rad    : int  1 2 2 3 3 3 5 5 5 5 ...
##  $ tax    : num  296 242 242 222 222 222 311 311 311 311 ...
##  $ ptratio: num  15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
##  $ black  : num  397 397 393 395 397 ...
##  $ lstat  : num  4.98 9.14 4.03 2.94 5.33 ...
##  $ medv   : num  24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...
# How many rows are in this data set? How many columns? What do the rows and columns represent?

10b

Make some pairwise scatterplots of the predictors (columns) in this data set. Describe your findings.

# Scatterplot matrix of every pair of columns in Boston.
pairs(Boston)

#10c Are any of the predictors associated with per capita crime rate? If so, explain the relationship.

# Histogram of the per-capita crime rate column, using about 50 bins.
hist(Boston$crim, breaks = 50)

# We see that about 80% of the data falls in the low-crime range, with a crime rate below 20.

#10d

Do any of the suburbs of Boston appear to have particularly high crime rates? Tax rates? Pupil-teacher ratios? Comment on the range of each predictor.

# Scatterplot matrix of every pair of columns in Boston.
# NOTE(review): this repeats the plot from 10b and does not by itself report the
# range of each predictor that the question asks about -- consider adding a
# per-column range().
pairs(Boston)