# kMeanCluster.R

######################################
# section 4.2.3
######################################

# Using R to Perform a K-Means Analysis

# install packages, if necessary
library(plyr)
library(ggplot2)
library(cluster)
library(lattice)
library(graphics)
library(grid)
library(gridExtra)

# Load the student grades from the CSV file.
# read.csv() already returns a data frame, so no extra coercion is needed.
grade_input <- read.csv("grades_km_input.csv")

## Q1: Execute "nrow()" to find the sample size of the "grade_input" object.
nrow(grade_input)

## [1] 620

# How many students are included in this dataset?
# Keep the student ID next to the three subject scores in one matrix, then
# take only the score columns (columns 2 through 4) as the k-means input.
kmdata_orig <- as.matrix(
  grade_input[, c("Student", "English", "Math", "Science")]
)
kmdata <- kmdata_orig[, c("English", "Math", "Science")]

# Show the first ten rows of the score matrix.
kmdata[seq_len(10), ]

##       English Math Science
##  [1,]      99   96      97
##  [2,]      99   96      97
##  [3,]      98   97      97
##  [4,]      95  100      95
##  [5,]      95   96      96
##  [6,]      96   97      96
##  [7,]     100   96      97
##  [8,]      95   98      98
##  [9,]      98   96      96
## [10,]      99   99      95

# Q2: What is the English score of the seventh student?
# You must solve this by writing R commands.
# Hint: Subset the Student ID with value 7 or subset the 7th row.
kmdata[7, "English"]

## English 
##     100

# Elbow analysis: for each candidate number of clusters k = 1..max_k, run
# k-means and record the total within-cluster sum of squares (WSS).
# The seed is fixed first because kmeans() chooses its starting centers at
# random; without it the curve (and the wss[3] check further down) is not
# guaranteed to be the same from run to run.
set.seed(1)
max_k <- 15
wss <- vapply(
  seq_len(max_k),
  function(k) kmeans(kmdata, centers = k, nstart = 25)$tot.withinss,
  numeric(1)
)

# Plot WSS against k; the point where the curve flattens (the "elbow")
# suggests a reasonable number of clusters.
plot(seq_len(max_k), wss, type = "b",
     xlab = "Number of Clusters", ylab = "Within Sum of Squares")

# Fit the final model with 3 clusters and 25 random starts.
# Seeding immediately before the call keeps the fit repeatable.
set.seed(1)
km <- kmeans(kmdata, centers = 3, nstart = 25)

# Print the fitted model: cluster sizes, centers, assignments and WSS.
km

## K-means clustering with 3 clusters of sizes 158, 218, 244
## 
## Cluster means:
##    English     Math  Science
## 1 97.21519 93.37342 94.86076
## 2 73.22018 64.62844 65.84862
## 3 85.84426 79.68033 81.50820
## 
## Clustering vector:
##   [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##  [38] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##  [75] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [112] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [149] 1 1 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 1 3 3 3 3 3 3 3 3 3 3 1 1 3 3 1 3 3 3
## [186] 1 3 3 3 3 3 3 1 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## [223] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## [260] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## [297] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## [334] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## [371] 3 3 3 3 3 3 2 2 2 2 2 2 2 3 2 3 2 3 3 3 2 2 2 2 3 3 2 2 2 2 2 2 2 2 2 2 2
## [408] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [445] 2 2 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 2 2 2 3 2 2 2 2 2 2 2 2 3 2 2
## [482] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [519] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [556] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [593] 2 2 2 2 2 2 2 2 3 3 2 2 3 3 3 3 1 1 3 3 3 2 2 3 2 3 3 3
## 
## Within cluster sum of squares by cluster:
## [1]  6692.589 34806.339 22984.131
##  (between_SS / total_SS =  76.5 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"

# Q3: How many clusters did you just create in the kmeans() call that produced "km"?
# ** Use ?kmeans to learn about this function.
# ** What is the reason for choosing this value of k, based on the textbook explanation?

# Sanity check: the WSS recorded for k = 3 in the elbow analysis next to the
# total within-cluster sum of squares of the fitted model.
c(wss[3], km$tot.withinss)

## [1] 64483.06 64483.06

# Prepare the student scores and clustering results for plotting.
# df has one row per student, with the assigned cluster stored as a factor.
df <- as.data.frame(kmdata_orig[, 2:4])
df$cluster <- factor(km$cluster)

# centers has one row per cluster centroid. Each centroid is labelled with its
# cluster number derived from the data (rather than a hard-coded c(1, 2, 3)),
# so centroid colours stay matched to the point colours for any number of
# clusters.
centers <- as.data.frame(km$centers)
centers$cluster <- factor(seq_len(nrow(centers)), levels = levels(df$cluster))

# Each plot draws the students as small points and overlays the centroids as
# large translucent points in the matching cluster colour. The centroid layer
# is kept out of the legend.

# English vs. Math
g1 <- ggplot(data = df, aes(x = English, y = Math, color = cluster)) +
  geom_point() +
  theme(legend.position = "right") +
  geom_point(data = centers, aes(x = English, y = Math, color = cluster),
             size = 10, alpha = .3, show.legend = FALSE)

# English vs. Science
g2 <- ggplot(data = df, aes(x = English, y = Science, color = cluster)) +
  geom_point() +
  geom_point(data = centers, aes(x = English, y = Science, color = cluster),
             size = 10, alpha = .3, show.legend = FALSE)

# Math vs. Science
g3 <- ggplot(data = df, aes(x = Math, y = Science, color = cluster)) +
  geom_point() +
  geom_point(data = centers, aes(x = Math, y = Science, color = cluster),
             size = 10, alpha = .3, show.legend = FALSE)

# Stack the three scatter plots in a single column under a shared title.
# The title is passed through `top`; the earlier version of this call used
# `main`, which current gridExtra releases no longer accept as a title
# argument -- presumably the reason that call failed.
grid.arrange(g1, g2, g3, ncol = 1,
             top = "High School Student Cluster Analysis")

## Q4: Answer this question based on the three graphs you just created. How many students are struggling with science
# (acquired grades lower than 80) but excel in English (higher than or equal to 90)?

# kMeanCluster.R

# sharl

# 2021-11-14