######################################
# section 4.2.3
######################################
# Using R to Perform a K-Means Analysis
# install packages, if necessary
library(plyr)
library(ggplot2)
library(cluster)
library(lattice)
library(graphics)
library(grid)
library(gridExtra)
# Import the student grades. read.csv() already returns a data frame, so no
# further conversion is needed.
grade_input <- read.csv("grades_km_input.csv")
## Q1: Execute "nrow()" to find the sample size of the "grade_input" object.
nrow(grade_input)
## [1] 620
#How many students are included in this dataset?
# Matrix of the student id plus the three subject scores.
kmdata_orig <- as.matrix(
  grade_input[, c("Student", "English", "Math", "Science")]
)
# Clustering input: the three score columns only (the student id is dropped).
kmdata <- kmdata_orig[, c("English", "Math", "Science")]
# Preview the first ten rows.
kmdata[seq_len(10), ]
## English Math Science
## [1,] 99 96 97
## [2,] 99 96 97
## [3,] 98 97 97
## [4,] 95 100 95
## [5,] 95 96 96
## [6,] 96 97 96
## [7,] 100 96 97
## [8,] 95 98 98
## [9,] 98 96 96
## [10,] 99 99 95
#Q2: What is the English score of the seventh student?
#You must solve this by writing R commands.
#Hint: Subset the Student ID with value 7 or subset the 7th row.
# Row 7 of the score matrix, picking the English column by name.
kmdata[7, "English"]
## English
## 100
# Total within-cluster sum of squares (WSS) for each candidate number of
# clusters k = 1..15 (one value per k, not one per cluster).
# kmeans() chooses its starting centers at random, so the RNG is seeded first
# to make the elbow curve reproducible from run to run.
set.seed(1)
wss <- vapply(
  1:15,
  # tot.withinss is the sum of the per-cluster withinss values.
  function(k) kmeans(kmdata, centers = k, nstart = 25)$tot.withinss,
  numeric(1)
)
# Elbow plot: look for the k after which WSS stops dropping sharply.
plot(seq_along(wss), wss, type = "b",
     xlab = "Number of Clusters", ylab = "Within Sum of Squares")
# Seed the RNG so the random starting centers are repeatable, then fit the
# 3-cluster model using 25 random starts.
set.seed(1)
km <- kmeans(x = kmdata, centers = 3, nstart = 25)
# Show the fitted model (sizes, centers, assignments, sums of squares).
km
## K-means clustering with 3 clusters of sizes 158, 218, 244
##
## Cluster means:
## English Math Science
## 1 97.21519 93.37342 94.86076
## 2 73.22018 64.62844 65.84862
## 3 85.84426 79.68033 81.50820
##
## Clustering vector:
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [38] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [75] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [112] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [149] 1 1 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 1 3 3 3 3 3 3 3 3 3 3 1 1 3 3 1 3 3 3
## [186] 1 3 3 3 3 3 3 1 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## [223] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## [260] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## [297] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## [334] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## [371] 3 3 3 3 3 3 2 2 2 2 2 2 2 3 2 3 2 3 3 3 2 2 2 2 3 3 2 2 2 2 2 2 2 2 2 2 2
## [408] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [445] 2 2 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 2 2 2 3 2 2 2 2 2 2 2 2 3 2 2
## [482] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [519] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [556] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [593] 2 2 2 2 2 2 2 2 3 3 2 2 3 3 3 3 1 1 3 3 3 2 2 3 2 3 3 3
##
## Within cluster sum of squares by cluster:
## [1] 6692.589 34806.339 22984.131
## (between_SS / total_SS = 76.5 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
#Q3: How many clusters did you just create with "km = kmeans(kmdata,3, nstart=25)"?
#** Use ?kmeans to learn about this function.
#** Why was this value of k chosen, according to the textbook explanation?
# Sanity check: the elbow-curve value at k = 3 next to the total WSS of the
# fitted 3-cluster model.
c(wss[3], km$tot.withinss)
## [1] 64483.06 64483.06
# Prepare the student data and clustering results for plotting.
# `df` has one row per student: the three subject scores plus the assigned
# cluster as a factor. `centers` has one row per cluster centroid.
df <- as.data.frame(kmdata_orig[, 2:4])
df$cluster <- factor(km$cluster)
centers <- as.data.frame(km$centers)
# Row i of km$centers is the centroid of cluster i. Labelling the rows this way
# (instead of a hard-coded c(1, 2, 3)) lets the centroid layer inherit the
# plot-level colour mapping and keeps the plots working if k is changed.
centers$cluster <- factor(seq_len(nrow(centers)))

# One scatter plot per pair of subjects. Students are drawn as small points;
# centroids are drawn on top as large translucent discs with no legend entry
# of their own. The centroid layer inherits x, y and color from ggplot().
# Note: show.legend is spelled FALSE, never F (F is an ordinary variable that
# can be reassigned).
g1 <- ggplot(data = df, aes(x = English, y = Math, color = cluster)) +
  geom_point() +
  theme(legend.position = "right") +
  geom_point(data = centers, size = 10, alpha = 0.3, show.legend = FALSE)

g2 <- ggplot(data = df, aes(x = English, y = Science, color = cluster)) +
  geom_point() +
  geom_point(data = centers, size = 10, alpha = 0.3, show.legend = FALSE)

g3 <- ggplot(data = df, aes(x = Math, y = Science, color = cluster)) +
  geom_point() +
  geom_point(data = centers, size = 10, alpha = 0.3, show.legend = FALSE)
# Gtable built from the first plot. Nothing in the visible part of this script
# uses `tmp`; it is kept in case later code relies on it.
# NOTE(review): looks like a leftover from extracting a shared legend - confirm
# and remove if unused.
tmp <- ggplot_gtable(ggplot_build(g1))

# Stack the three scatter plots in a single column under a common title.
# The title is passed through `top`; the earlier version of this call used
# `main`, which is what made it fail, and the title was dropped as a result.
grid.arrange(g1, g2, g3, ncol = 1,
             top = "High School Student Cluster Analysis")

##Q4: Answer this question based on the three graphs you just created. How many students are struggling with science
#(acquired grades lower than 80) but excel in English (higher than or equal to 90)?