The variables are:
# Pick a CRAN mirror by index, without opening the interactive mirror menu.
chooseCRANmirror(graphics = FALSE, ind = 1)
# Import the College dataset. stringsAsFactors = TRUE keeps `Private` a factor
# on R >= 4.0.0 (where the default became FALSE): summary() needs that to
# report the No/Yes counts, and pairs() stops on character columns.
college <- read.csv("College.csv", stringsAsFactors = TRUE)
# Use the first column (the college names) as row names ...
rownames(college) <- college[, 1]
# ... then drop that column so only the measured variables remain.
college <- college[, -1]
summary(college)
## Private Apps Accept Enroll Top10perc
## No :212 Min. : 81 Min. : 72 Min. : 35 Min. : 1.00
## Yes:565 1st Qu.: 776 1st Qu.: 604 1st Qu.: 242 1st Qu.:15.00
## Median : 1558 Median : 1110 Median : 434 Median :23.00
## Mean : 3002 Mean : 2019 Mean : 780 Mean :27.56
## 3rd Qu.: 3624 3rd Qu.: 2424 3rd Qu.: 902 3rd Qu.:35.00
## Max. :48094 Max. :26330 Max. :6392 Max. :96.00
## Top25perc F.Undergrad P.Undergrad Outstate
## Min. : 9.0 Min. : 139 Min. : 1.0 Min. : 2340
## 1st Qu.: 41.0 1st Qu.: 992 1st Qu.: 95.0 1st Qu.: 7320
## Median : 54.0 Median : 1707 Median : 353.0 Median : 9990
## Mean : 55.8 Mean : 3700 Mean : 855.3 Mean :10441
## 3rd Qu.: 69.0 3rd Qu.: 4005 3rd Qu.: 967.0 3rd Qu.:12925
## Max. :100.0 Max. :31643 Max. :21836.0 Max. :21700
## Room.Board Books Personal PhD
## Min. :1780 Min. : 96.0 Min. : 250 Min. : 8.00
## 1st Qu.:3597 1st Qu.: 470.0 1st Qu.: 850 1st Qu.: 62.00
## Median :4200 Median : 500.0 Median :1200 Median : 75.00
## Mean :4358 Mean : 549.4 Mean :1341 Mean : 72.66
## 3rd Qu.:5050 3rd Qu.: 600.0 3rd Qu.:1700 3rd Qu.: 85.00
## Max. :8124 Max. :2340.0 Max. :6800 Max. :103.00
## Terminal S.F.Ratio perc.alumni Expend
## Min. : 24.0 Min. : 2.50 Min. : 0.00 Min. : 3186
## 1st Qu.: 71.0 1st Qu.:11.50 1st Qu.:13.00 1st Qu.: 6751
## Median : 82.0 Median :13.60 Median :21.00 Median : 8377
## Mean : 79.7 Mean :14.09 Mean :22.74 Mean : 9660
## 3rd Qu.: 92.0 3rd Qu.:16.50 3rd Qu.:31.00 3rd Qu.:10830
## Max. :100.0 Max. :39.80 Max. :64.00 Max. :56233
## Grad.Rate
## Min. : 10.00
## 1st Qu.: 53.00
## Median : 65.00
## Mean : 65.46
## 3rd Qu.: 78.00
## Max. :118.00
# Scatterplot matrix of the first ten columns of the data.
pairs(college[, 1:10])
# Side-by-side boxplots of Outstate for each level of Private. The variables
# are resolved through `data = college` instead of attach(), so nothing is
# added to the search path and no "objects are masked" message is produced.
# The y axis shows Outstate, so it is labelled as such.
boxplot(
  Outstate ~ Private,
  data = college,
  xlab = "Private",
  ylab = "Outstate",
  col = c("red", "blue")
)
# Elite is a FALSE/TRUE factor marking the colleges whose Top10perc exceeds 50.
college[["Elite"]] <- as.factor(college[["Top10perc"]] > 50)
# Number of colleges at each level of Elite.
summary(college[["Elite"]])
## FALSE TRUE
## 699 78
# Divide the plotting window into a 2 x 2 grid so the four histograms below
# can be compared simultaneously.
par(mfrow = c(2, 2))
?hist
?plot
# Applications received, in bins of 1,000 (x axis shown up to 30,000).
hist(
  college$Apps,
  breaks = seq(0, 50000, 1000),
  xlim = c(0, 30000),
  col = "blue",
  main = "Histogram of College Applications",
  xlab = "Number of Applications received"
)
# Applications accepted, in bins of 500 (x axis shown up to 20,000).
hist(
  college$Accept,
  breaks = seq(0, 30000, 500),
  xlim = c(0, 20000),
  col = "red",
  main = "Histogram of College Acceptance",
  xlab = "Number of Applications accepted"
)
# New students enrolled, in bins of 100.
hist(
  college$Enroll,
  breaks = seq(0, 7000, 100),
  xlim = c(0, 7000),
  col = "orange",
  main = "Histogram of College Enrollments",
  xlab = "Number of new students enrolled"
)
# Graduation rate, in bins of 5 points (x axis shown up to 100).
hist(
  college$Grad.Rate,
  breaks = seq(0, 120, 5),
  xlim = c(0, 100),
  col = "green",
  main = "Histogram of College Graduation",
  xlab = "Graduation Rate %"
)
###9. This exercise involves the Auto data set studied in the lab.
# Load the Auto data; "?" marks a missing value in the raw file. Strings are
# read as factors so that `name` is a factor on R >= 4.0.0 as well (the default
# for stringsAsFactors became FALSE in that release).
auto <- read.table(
  "Auto.data.txt",
  header = TRUE,
  na.strings = "?",
  stringsAsFactors = TRUE
)
# Drop every row that contains a missing value.
auto <- na.omit(auto)
str(auto)
## 'data.frame': 392 obs. of 9 variables:
## $ mpg : num 18 15 18 16 17 15 14 14 14 15 ...
## $ cylinders : int 8 8 8 8 8 8 8 8 8 8 ...
## $ displacement: num 307 350 318 304 302 429 454 440 455 390 ...
## $ horsepower : num 130 165 150 150 140 198 220 215 225 190 ...
## $ weight : num 3504 3693 3436 3433 3449 ...
## $ acceleration: num 12 11.5 11 12 10.5 10 9 8.5 10 8.5 ...
## $ year : int 70 70 70 70 70 70 70 70 70 70 ...
## $ origin : int 1 1 1 1 1 1 1 1 1 1 ...
## $ name : Factor w/ 304 levels "amc ambassador brougham",..: 49 36 231 14 161 141 54 223 241 2 ...
## - attr(*, "na.action")=Class 'omit' Named int [1:5] 33 127 331 337 355
## .. ..- attr(*, "names")= chr [1:5] "33" "127" "331" "337" ...
# For convenience, qualitative predictors are stored as factors and
# quantitative predictors as numerics: `cylinders` and `horsepower` become
# numeric, `year` and `origin` become factors.
auto[c("cylinders", "horsepower")] <-
  lapply(auto[c("cylinders", "horsepower")], as.numeric)
auto[c("year", "origin")] <-
  lapply(auto[c("year", "origin")], as.factor)
str(auto)
## 'data.frame': 392 obs. of 9 variables:
## $ mpg : num 18 15 18 16 17 15 14 14 14 15 ...
## $ cylinders : num 8 8 8 8 8 8 8 8 8 8 ...
## $ displacement: num 307 350 318 304 302 429 454 440 455 390 ...
## $ horsepower : num 130 165 150 150 140 198 220 215 225 190 ...
## $ weight : num 3504 3693 3436 3433 3449 ...
## $ acceleration: num 12 11.5 11 12 10.5 10 9 8.5 10 8.5 ...
## $ year : Factor w/ 13 levels "70","71","72",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ origin : Factor w/ 3 levels "1","2","3": 1 1 1 1 1 1 1 1 1 1 ...
## $ name : Factor w/ 304 levels "amc ambassador brougham",..: 49 36 231 14 161 141 54 223 241 2 ...
## - attr(*, "na.action")=Class 'omit' Named int [1:5] 33 127 331 337 355
## .. ..- attr(*, "names")= chr [1:5] "33" "127" "331" "337" ...
Mpg, cylinders, displacement, horsepower, weight and acceleration are quantitative.
Year, origin and name are qualitative.
# Range of each quantitative predictor (row 1 = minimum, row 2 = maximum).
# The columns are taken from `auto` directly, so attach() is not needed and
# the search path is left untouched.
apply(auto[, 1:6], 2, range)
## mpg cylinders displacement horsepower weight acceleration
## [1,] 9.0 3 68 46 1613 8.0
## [2,] 46.6 8 455 230 5140 24.8
?colMeans
?apply
# Mean of each of the first six (quantitative) columns.
colMeans(auto[1:6])
## mpg cylinders displacement horsepower weight
## 23.445918 5.471939 194.411990 104.469388 2977.584184
## acceleration
## 15.541327
# Standard deviation of each of the first six (quantitative) columns.
vapply(auto[1:6], sd, numeric(1))
## mpg cylinders displacement horsepower weight
## 7.805007 1.705783 104.644004 38.491160 849.402560
## acceleration
## 2.758864
# auto_subset is a new dataset without the 10th through 85th observations.
auto_subset <- auto[-(10:85), ]
print("Range")
## [1] "Range"
# Minimum (row 1) and maximum (row 2) of each of the first six columns.
vapply(auto_subset[1:6], range, numeric(2))
## mpg cylinders displacement horsepower weight acceleration
## [1,] 11.0 3 68 46 1649 8.5
## [2,] 46.6 8 455 230 4997 24.8
print("Means")
## [1] "Means"
colMeans(auto_subset[1:6])
## mpg cylinders displacement horsepower weight
## 24.404430 5.373418 187.240506 100.721519 2935.971519
## acceleration
## 15.726899
print("Standard Deviation")
## [1] "Standard Deviation"
vapply(auto_subset[1:6], sd, numeric(1))
## mpg cylinders displacement horsepower weight
## 7.867283 1.654179 99.678367 35.708853 811.300208
## acceleration
## 2.693721
# Scatterplot matrix of mpg, the other quantitative predictors and year.
pairs(
  ~ mpg + cylinders + displacement + horsepower + weight + acceleration + year,
  data = auto
)
Yes, it appears that most of the other variables would be decent predictors of mpg. Acceleration and year have a positive relationship with mpg. On the other hand, cylinders, displacement, horsepower and weight have a negative relationship with mpg.
How many rows are in this data set? How many columns? What do the rows and columns represent?
There are 506 rows and 14 columns in the Boston dataset. Each row represents a suburb of Boston, and each column is a variable measured for that suburb, such as the crime rate or the share of industry.
library(MASS)
# Local copy of the Boston data frame that MASS provides.
boston <- Boston
# Number of rows and columns.
dim(Boston)
## [1] 506 14
# Variable names.
colnames(Boston)
## [1] "crim" "zn" "indus" "chas" "nox" "rm" "age"
## [8] "dis" "rad" "tax" "ptratio" "black" "lstat" "medv"
# Scatterplot matrix of the crime rate against a handful of other predictors.
pairs(~ crim + age + tax + ptratio + lstat, data = boston)
Age, tax, ptratio and lstat seem to be associated with the per capita crime rate.
# 2 x 2 grid: crime rate (full view, then zoomed), tax rate and pupil-teacher
# ratio.
par(mfrow = c(2, 2))
# Full y-range, so the long right tail of the crime rate is visible. (This
# panel used to repeat the zoomed plot below with a different colour.)
hist(
  boston$crim,
  main = "Crime Rates\n (long tail)",
  breaks = "FD",
  col = "red"
)
# Same histogram with the y axis capped at 30 so that the sparsely populated
# high-crime bins can be seen.
hist(
  boston$crim,
  main = "Crime Rates with y-axis limited\n (high crime-rate outliers)",
  ylim = c(0, 30),
  breaks = "FD",
  col = "blue"
)
# Tax rate in bins of 50.
hist(
  boston$tax,
  main = "Tax rates\n (high-tax outliers)",
  breaks = seq(150, 750, 50),
  col = "green"
)
# Pupil-teacher ratio with the y axis capped at 80.
hist(
  boston$ptratio,
  main = "Pupil-teacher ratio with y-axis limited",
  breaks = "FD",
  col = "yellow",
  ylim = c(0, 80)
)
35 suburbs bound the Charles River.
?Boston
# Number of suburbs at each value of the chas indicator (0 or 1).
table(boston[["chas"]])
##
## 0 1
## 471 35
The median pupil-teacher ratio among the towns is 19.05.
# Median of ptratio across all suburbs.
median(boston[["ptratio"]])
## [1] 19.05
# Row index of the suburb with the lowest median value of owner-occupied homes
# (the first such row if several tie).
which.min(boston[["medv"]])
## [1] 399
What are the values of the other predictors for that suburb, and how do those values compare to the overall ranges for those predictors?
# Values of all predictors for the suburb with the lowest medv. The row is
# looked up with which.min() rather than a hard-coded index, so the result
# stays correct if the data are reordered or filtered.
boston[which.min(boston$medv), ]
## crim zn indus chas nox rm age dis rad tax ptratio black
## 399 38.3518 0 18.1 0 0.693 5.453 100 1.4896 24 666 20.2 396.9
## lstat medv
## 399 30.59 5
# Overall distribution of every variable, for comparison with the row above.
summary(boston)
## crim zn indus chas
## Min. : 0.00632 Min. : 0.00 Min. : 0.46 Min. :0.00000
## 1st Qu.: 0.08204 1st Qu.: 0.00 1st Qu.: 5.19 1st Qu.:0.00000
## Median : 0.25651 Median : 0.00 Median : 9.69 Median :0.00000
## Mean : 3.61352 Mean : 11.36 Mean :11.14 Mean :0.06917
## 3rd Qu.: 3.67708 3rd Qu.: 12.50 3rd Qu.:18.10 3rd Qu.:0.00000
## Max. :88.97620 Max. :100.00 Max. :27.74 Max. :1.00000
## nox rm age dis
## Min. :0.3850 Min. :3.561 Min. : 2.90 Min. : 1.130
## 1st Qu.:0.4490 1st Qu.:5.886 1st Qu.: 45.02 1st Qu.: 2.100
## Median :0.5380 Median :6.208 Median : 77.50 Median : 3.207
## Mean :0.5547 Mean :6.285 Mean : 68.57 Mean : 3.795
## 3rd Qu.:0.6240 3rd Qu.:6.623 3rd Qu.: 94.08 3rd Qu.: 5.188
## Max. :0.8710 Max. :8.780 Max. :100.00 Max. :12.127
## rad tax ptratio black
## Min. : 1.000 Min. :187.0 Min. :12.60 Min. : 0.32
## 1st Qu.: 4.000 1st Qu.:279.0 1st Qu.:17.40 1st Qu.:375.38
## Median : 5.000 Median :330.0 Median :19.05 Median :391.44
## Mean : 9.549 Mean :408.2 Mean :18.46 Mean :356.67
## 3rd Qu.:24.000 3rd Qu.:666.0 3rd Qu.:20.20 3rd Qu.:396.23
## Max. :24.000 Max. :711.0 Max. :22.00 Max. :396.90
## lstat medv
## Min. : 1.73 Min. : 5.00
## 1st Qu.: 6.95 1st Qu.:17.02
## Median :11.36 Median :21.20
## Mean :12.65 Mean :22.53
## 3rd Qu.:16.95 3rd Qu.:25.00
## Max. :37.97 Max. :50.00
# One histogram per variable on a 5 x 3 grid with tight margins. The red line
# marks where the suburb with the lowest medv falls within each distribution.
par(mfrow = c(5, 3), mar = c(2, 2, 1, 0))
# Look the row up once instead of hard-coding its index.
lowest_medv_row <- which.min(Boston$medv)
for (i in seq_along(Boston)) {
  hist(Boston[, i], main = colnames(Boston)[i], breaks = "FD")
  # `lwd` is the line-width graphical parameter. The abbreviation `lw` is not
  # recognised: it only raises a warning and leaves the default width.
  abline(v = Boston[lowest_medv_row, i], col = "red", lwd = 3)
}
?abline # Adds one or more straight lines through the current plot.
?Boston
# Count of suburbs with rm above 7 (TRUE) versus not (FALSE).
summary(boston[["rm"]] > 7)
## Mode FALSE TRUE
## logical 442 64
# Count of suburbs with rm above 8 (TRUE) versus not (FALSE).
summary(boston[["rm"]] > 8)
## Mode FALSE TRUE
## logical 493 13
# All variables for the suburbs with rm above 8.
subset(boston, rm > 8)
## crim zn indus chas nox rm age dis rad tax ptratio black
## 98 0.12083 0 2.89 0 0.4450 8.069 76.0 3.4952 2 276 18.0 396.90
## 164 1.51902 0 19.58 1 0.6050 8.375 93.9 2.1620 5 403 14.7 388.45
## 205 0.02009 95 2.68 0 0.4161 8.034 31.9 5.1180 4 224 14.7 390.55
## 225 0.31533 0 6.20 0 0.5040 8.266 78.3 2.8944 8 307 17.4 385.05
## 226 0.52693 0 6.20 0 0.5040 8.725 83.0 2.8944 8 307 17.4 382.00
## 227 0.38214 0 6.20 0 0.5040 8.040 86.5 3.2157 8 307 17.4 387.38
## 233 0.57529 0 6.20 0 0.5070 8.337 73.3 3.8384 8 307 17.4 385.91
## 234 0.33147 0 6.20 0 0.5070 8.247 70.4 3.6519 8 307 17.4 378.95
## 254 0.36894 22 5.86 0 0.4310 8.259 8.4 8.9067 7 330 19.1 396.90
## 258 0.61154 20 3.97 0 0.6470 8.704 86.9 1.8010 5 264 13.0 389.70
## 263 0.52014 20 3.97 0 0.6470 8.398 91.5 2.2885 5 264 13.0 386.86
## 268 0.57834 20 3.97 0 0.5750 8.297 67.0 2.4216 5 264 13.0 384.54
## 365 3.47428 0 18.10 1 0.7180 8.780 82.9 1.9047 24 666 20.2 354.55
## lstat medv
## 98 4.21 38.7
## 164 3.32 50.0
## 205 2.88 50.0
## 225 4.14 44.8
## 226 4.63 50.0
## 227 3.13 37.6
## 233 2.47 41.7
## 234 3.95 48.3
## 254 3.54 42.8
## 258 5.12 50.0
## 263 5.91 48.8
## 268 7.44 50.0
## 365 5.29 21.9