# stringsAsFactors = TRUE keeps Private as a factor so summary() reports class counts (R >= 4.0 would otherwise read it as character)
college <- read.csv("C:/Users/Kajal/Downloads/College.csv", stringsAsFactors = TRUE)
summary(college)
## Private Apps Accept Enroll Top10perc
## No :212 Min. : 81 Min. : 72 Min. : 35 Min. : 1.00
## Yes:565 1st Qu.: 776 1st Qu.: 604 1st Qu.: 242 1st Qu.:15.00
## Median : 1558 Median : 1110 Median : 434 Median :23.00
## Mean : 3002 Mean : 2019 Mean : 780 Mean :27.56
## 3rd Qu.: 3624 3rd Qu.: 2424 3rd Qu.: 902 3rd Qu.:35.00
## Max. :48094 Max. :26330 Max. :6392 Max. :96.00
## Top25perc F.Undergrad P.Undergrad Outstate
## Min. : 9.0 Min. : 139 Min. : 1.0 Min. : 2340
## 1st Qu.: 41.0 1st Qu.: 992 1st Qu.: 95.0 1st Qu.: 7320
## Median : 54.0 Median : 1707 Median : 353.0 Median : 9990
## Mean : 55.8 Mean : 3700 Mean : 855.3 Mean :10441
## 3rd Qu.: 69.0 3rd Qu.: 4005 3rd Qu.: 967.0 3rd Qu.:12925
## Max. :100.0 Max. :31643 Max. :21836.0 Max. :21700
## Room.Board Books Personal PhD
## Min. :1780 Min. : 96.0 Min. : 250 Min. : 8.00
## 1st Qu.:3597 1st Qu.: 470.0 1st Qu.: 850 1st Qu.: 62.00
## Median :4200 Median : 500.0 Median :1200 Median : 75.00
## Mean :4358 Mean : 549.4 Mean :1341 Mean : 72.66
## 3rd Qu.:5050 3rd Qu.: 600.0 3rd Qu.:1700 3rd Qu.: 85.00
## Max. :8124 Max. :2340.0 Max. :6800 Max. :103.00
## Terminal S.F.Ratio perc.alumni Expend
## Min. : 24.0 Min. : 2.50 Min. : 0.00 Min. : 3186
## 1st Qu.: 71.0 1st Qu.:11.50 1st Qu.:13.00 1st Qu.: 6751
## Median : 82.0 Median :13.60 Median :21.00 Median : 8377
## Mean : 79.7 Mean :14.09 Mean :22.74 Mean : 9660
## 3rd Qu.: 92.0 3rd Qu.:16.50 3rd Qu.:31.00 3rd Qu.:10830
## Max. :100.0 Max. :39.80 Max. :64.00 Max. :56233
## Grad.Rate
## Min. : 10.00
## 1st Qu.: 53.00
## Median : 65.00
## Mean : 65.46
## 3rd Qu.: 78.00
## Max. :118.00
c.ii.)
pairs(college[, 1:10])
c.iii.)
boxplot(college$Outstate ~ college$Private, col = c("purple", "pink"), main = "Outstate versus Private",
xlab = "Private", ylab = "Outstate")
c.iv.) There are 78 elite universities and 699 non-elite universities, as the summary below confirms.
Elite <- rep("No", nrow(college))
Elite[college$Top10perc > 50] <- "Yes"   # elite = more than 50% of students from the top 10% of their high school class
Elite <- as.factor(Elite)
college <- data.frame(college, Elite)
fix(college)   # opens the data editor to inspect the new column
summary(college$Elite)
## No Yes
## 699 78
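As a side note, the same factor can be built in one line with ifelse(); a minimal equivalent sketch:
# one-line equivalent of the Elite construction above
college$Elite <- factor(ifelse(college$Top10perc > 50, "Yes", "No"))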
boxplot(college$Outstate ~ college$Elite, col = c("red", "orange"), main = "Outstate versus Elite",
xlab = "Elite", ylab = "Outstate")
c.v.)
par(mfcol = c(2, 3))
hist(college$Room.Board, breaks = 6, freq = TRUE, col = "pink", main = "Histogram of Room and Board Costs",
    xlab = "Room Board", ylab = "Frequency")
hist(college$Room.Board, breaks = 9, freq = TRUE, col = "purple", main = "Histogram of Room and Board Costs",
    xlab = "Room Board", ylab = "Frequency")
hist(college$Books, breaks = 6, freq = TRUE, col = "pink", main = "Histogram of Book Costs",
    xlab = "Books", ylab = "Frequency")
hist(college$Books, breaks = 9, freq = TRUE, col = "purple", main = "Histogram of Book Costs",
    xlab = "Books", ylab = "Frequency")
hist(college$Personal, breaks = 6, freq = TRUE, col = "pink", main = "Histogram of Personal Costs",
    xlab = "Personal", ylab = "Frequency")
hist(college$Personal, breaks = 9, freq = TRUE, col = "purple", main = "Histogram of Personal Costs",
    xlab = "Personal", ylab = "Frequency")
c.vi.) From this exercise it is clear that as the number of bins increases, the distributions become easier to read because the data is divided more finely. This is especially true for Books, where the coarser binning (breaks = 6) is too crude to show how much students typically pay for books. (A note on forcing exact bin counts follows the next pair of histograms.)
From the histograms above, and assuming Room.Board, Books, and Personal are costs, students on average spend the most on room and board, and that variable is also the closest to normally distributed. The histograms below reflect that, by definition, each college's Top25perc is at least its Top10perc; the Top25perc distribution looks roughly normal, while Top10perc is right-skewed.
par(mfcol = c(2, 2))
hist(college$Top10perc, breaks = 9, freq = TRUE, col = "red", main = "Histogram of Top 10%",
    xlab = "Top 10 Percent", ylab = "Frequency")
hist(college$Top25perc, breaks = 9, freq = TRUE, col = "blue", main = "Histogram of Top 25%",
    xlab = "Top 25 Percent", ylab = "Frequency")
library(class)
training_set <- read.csv("C:/Users/Kajal/Downloads/PA_HW1_train.csv")
ts_new <- cbind(training_set$x1, training_set$x2)   # feature matrix: x1 and x2 only
a <- knn(ts_new, ts_new, cl = training_set$col, k = 1, prob = TRUE)   # predict the training points themselves
table(a, training_set$col)
##
## a green red
## green 75 0
## red 0 100
With k = 1, each training point's nearest neighbor is itself, so the training confusion matrix is perfect by construction.
plot(training_set$x1, training_set$x2, col = as.character(a), pch = 16, main = "KNN Training Data k=1", xlab = "X1", ylab = "X2")
test_set <- read.csv("C:/Users/Kajal/Downloads/PA_HW1_test.csv")
test_set_new <- test_set[, -3]   # drop the col label (column 3) so only x1 and x2 remain
b <- knn(ts_new, test_set_new, cl = training_set$col, k = 1, prob = TRUE)
table(b, test_set$col)
##
## b green red
## green 398 367
## red 352 633
plot(test_set$x1, test_set$x2, col = as.character(b), pch = 16, main = "KNN Test Data k=1", xlab = "X1", ylab = "X2")
mean(b==test_set$col)
## [1] 0.5891429
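Only about 59% of test points are classified correctly at k = 1, so it is worth checking other values of k. A minimal sketch reusing ts_new, test_set_new, and the labels from above (the grid of k values is an arbitrary choice):
# test accuracy over a range of k values
ks <- c(1, 3, 5, 9, 15, 25, 51)
acc <- sapply(ks, function(k) {
    pred <- knn(ts_new, test_set_new, cl = training_set$col, k = k)
    mean(pred == test_set$col)
})
plot(ks, acc, type = "b", xlab = "k", ylab = "Test accuracy", main = "KNN Test Accuracy vs k")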
library(MASS)
color <- rep(NA, nrow(iris))   # one color per observation, keyed by species
color[iris$Species == "setosa"] <- "green"
color[iris$Species == "versicolor"] <- "blue"
color[iris$Species == "virginica"] <- "red"
plot(iris$Sepal.Length, iris$Sepal.Width, col = color, pch = 16, xlab = "Sepal Length", ylab = "Sepal Width", main = "Irises based on Sepal Length and Sepal Width")
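The same color assignment can be written more compactly by indexing a named vector with the species labels; an equivalent sketch:
# compact equivalent: look up each observation's color in a named vector
palette_map <- c(setosa = "green", versicolor = "blue", virginica = "red")
color <- unname(palette_map[as.character(iris$Species)])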
The LDA predictions on the training data are summarized in the confusion matrix below.
iris_data <- lda(Species ~ Sepal.Length + Sepal.Width, data = iris)   # reference columns through data= (not iris$) so predict() handles newdata correctly
predicted <- predict(iris_data, iris)
table(predicted$class, iris$Species)
##
## setosa versicolor virginica
## setosa 49 0 0
## versicolor 1 36 15
## virginica 0 14 35
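From the confusion matrix, overall training accuracy is (49 + 36 + 35) / 150 = 0.8; a one-line check:
# fraction of training observations classified correctly
mean(predicted$class == iris$Species)
## [1] 0.8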