Cluster Analysis

Part # 1:

library(knitr)
library(ggplot2)

## Warning: package 'ggplot2' was built under R version 3.4.2

Using the mtcars data set

# Load the built-in mtcars data set into the workspace
data("mtcars")
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

set.seed(1234)

# Elbow plot data: for every candidate k, fit k-means on the first three
# columns of mtcars and keep the total within-cluster sum of squares.
# The previous version stored betweenss / totss, which is the share of
# variance *explained* and contradicted the axis label below.
# The kmeans() calls are made in the same order as before, so the random
# number stream seen by later code is unchanged.
wi <- numeric(length(2:10))

for (i in 2:10) {
  z <- kmeans(mtcars[, 1:3], i)
  wi[i - 1] <- z$tot.withinss
}
# report the list of within cluster sum of squares
plot(2:10, wi, type = "l",
     xlab = "k=2:10", ylab = "Within Sum of Squares")

Create a kmeans object from the first, second, and third columns. What is the size of each cluster?

# Fit k-means with 4 centers on columns 1-3 of mtcars, using 20 random starts
r <- kmeans(x = mtcars[, 1:3], centers = 4, nstart = 20)
# Number of cars assigned to each cluster
r[["size"]]

## [1]  8  4  9 11

What are the centers of each cluster?

# Coordinates of each cluster center (one row per cluster)
r[["centers"]]

##        mpg      cyl      disp
## 1 20.50000 5.500000 164.08750
## 2 13.67500 8.000000 443.00000
## 3 27.34444 4.000000  96.55556
## 4 16.19091 7.818182 311.76364

What is the average disp, wt, and qsec of each cluster?

# Mean disp, wt and qsec of the cars in each k-means cluster, as a table.
# summarise_at(vars(...), mean) replaces the deprecated
# summarise_each(funs(mean), ...) and yields the same columns without the
# deprecation warning.
mtcars %>%
    mutate(clusters = r$cluster) %>%
    group_by(clusters) %>%
    summarise_at(vars(disp, wt, qsec), mean) %>%
    kable()

clusters	disp	wt	qsec
1	164.08750	3.118125	18.66250
2	443.00000	4.966000	17.56750
3	96.55556	2.089222	18.62333
4	311.76364	3.576364	16.72545

Describe each cluster in English: As the table above shows, Cluster 1 has cars with the second-lowest disp and wt and the highest qsec. Cluster 2 has cars with the highest disp, the highest wt, and the second-lowest qsec. Cluster 3 has cars with the lowest disp and wt and the second-highest qsec. Cluster 4 has cars with the second-highest disp and wt and the lowest qsec.

Part 2:

Find a data set with at least 4 columns of numeric data and a categorical column

library(readr)
# Read the university rankings CSV into U with readr.
# NOTE(review): this is an absolute, machine-specific Windows path, so the
# script only runs where that file exists — confirm where timesData.csv
# should live before reusing this code.
U <- read_csv("C:/Users/PinakKumar/Desktop/timesData.csv")

## Parsed with column specification:
## cols(
##   world_rank = col_character(),
##   university_name = col_character(),
##   country = col_character(),
##   teaching = col_double(),
##   international = col_character(),
##   research = col_double(),
##   citations = col_double(),
##   income = col_character(),
##   total_score = col_character(),
##   num_students = col_number(),
##   student_staff_ratio = col_double(),
##   international_students = col_character(),
##   female_male_ratio = col_character(),
##   year = col_integer()
## )

## Get 2016's top 200 academic institutes: the first 200 rows whose year is
## 2016 (presumably the file is ordered by rank within a year -- confirm)
topU.2016 <- U[which(U[["year"]] == 2016), ][1:200, ]
## These columns were read as text; coerce them to numeric
## (entries that are not numbers become NA, with a coercion warning)
topU.2016[["international"]] <- as.numeric(topU.2016[["international"]])
topU.2016[["income"]] <- as.numeric(topU.2016[["income"]])

## Warning: NAs introduced by coercion

## total_score was also read as text; coerce it to numeric
topU.2016[["total_score"]] <- as.numeric(topU.2016[["total_score"]])
## Load stringr if it is installed
require(stringr)

## Loading required package: stringr

## num_students: strip any thousands separators, then coerce to numeric
topU.2016$num_students <-
    as.numeric(gsub(",", "", as.character(topU.2016$num_students)))
## international_students: drop the percent sign and convert to a fraction
topU.2016$international_students <-
    as.numeric(gsub("%", "", as.character(topU.2016$international_students))) / 100
## female_students: the number before the first space of female_male_ratio,
## as a fraction. The space position is computed per row; the previous
## `regexpr(...)[1]` reused the first row's position for every row, which
## truncated values of a different width and turned the whole column into NA
## whenever the first row was missing.
topU.2016$female_students <-
    (as.numeric(substr(as.character(topU.2016$female_male_ratio),
                       1,
                       regexpr(" ", as.character(topU.2016$female_male_ratio)) - 1))
     / 100)
## world_rank: remove the "=" that marks tied ranks, then coerce to numeric
topU.2016$world_rank <- as.numeric(gsub("=", "", as.character(topU.2016$world_rank)))
## Row names are not set here: assigning row names to a tibble is deprecated,
## and the subset above is already numbered 1..200.

## Running index 1..200 for the selected rows
topU.2016$seq <- seq_len(200)

## Map each country to its continent. Any country that is not listed below
## (including a missing value) is labelled "Europe".
topU.2016$continent <- local({
    listed.country <- c("South Africa",
                        "China", "Hong Kong", "Japan", "South Korea",
                        "Taiwan", "Singapore", "Israel",
                        "Canada", "United States of America",
                        "Australia", "New Zealand")
    listed.continent <- c("Africa",
                          rep("Asia", 7),
                          rep("North America", 2),
                          rep("Oceania", 2))
    hit <- match(topU.2016$country, listed.country)
    ifelse(is.na(hit), "Europe", listed.continent[hit])
})

Run several scatter plots of the data

require(ggplot2)
## Scatter plot of teaching against citations
ggplot(topU.2016, aes(x = teaching, y = citations)) +
    geom_point() +
    ggtitle("Teaching vs. Citations")

## Scatter plot of teaching against research
ggplot(topU.2016, aes(x = teaching, y = research)) +
    geom_point() +
    ggtitle("Teaching vs. Research")

## Scatter plot of teaching against international
ggplot(topU.2016, aes(x = teaching, y = international)) +
    geom_point() +
    ggtitle("Teaching vs. international")

Create a kmeans object from the numeric data; you can pick K to be whatever you want. Determine the size of each cluster.

## Standardize input variables (z-score: subtract the mean, divide by the sd).
## Columns are taken explicitly from topU.2016 rather than through
## attach()/detach(): with attach(), an object of the same name in the
## workspace would silently be used instead of the column, and an error
## between attach() and detach() would leave the data attached.
standardize <- function(x) {
    (x - mean(x)) / sd(x)
}

teaching.Z      <- standardize(topU.2016$teaching)
international.Z <- standardize(topU.2016$international)
research.Z      <- standardize(topU.2016$research)
citations.Z     <- standardize(topU.2016$citations)

## One column per standardized variable, named after the vectors above
topU.2016.stdz <- data.frame(teaching.Z,
                             international.Z,
                             research.Z,
                             citations.Z)

## Perform a 3-cluster K-means cluster analysis on the standardized scores
topU.2016.K3 <- kmeans(x = topU.2016.stdz, centers = 3)
## Show the fitted object (sizes, cluster means, assignments, sums of squares)
topU.2016.K3

## K-means clustering with 3 clusters of sizes 71, 50, 79
## 
## Cluster means:
##   teaching.Z international.Z research.Z citations.Z
## 1 -0.5524026       0.8828056 -0.4327508 -0.41902959
## 2  1.3879612       0.1635113  1.3535184  0.66985197
## 3 -0.3819933      -0.8968957 -0.4677292 -0.04736073
## 
## Clustering vector:
##   [1] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
##  [36] 2 2 2 2 2 2 2 2 2 1 3 2 2 2 3 2 1 2 2 1 1 1 3 1 1 3 3 3 3 1 3 3 3 1 1
##  [71] 1 1 1 3 3 1 3 3 3 1 3 1 1 3 3 1 3 3 1 3 1 3 1 3 3 1 1 1 3 3 1 1 3 3 1
## [106] 1 3 3 3 1 3 1 3 1 1 1 3 3 3 1 1 1 3 3 3 3 3 3 3 1 1 1 1 3 1 1 3 1 1 3
## [141] 1 3 1 1 3 1 3 3 3 1 1 1 3 3 1 3 1 3 3 1 1 3 3 3 1 3 3 1 3 1 1 1 1 3 3
## [176] 1 1 3 1 3 3 1 1 3 3 1 3 3 3 1 3 3 1 3 3 1 3 3 3 1
## 
## Within cluster sum of squares by cluster:
## [1] 125.3292 105.0441 158.6333
##  (between_SS / total_SS =  51.1 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"    
## [5] "tot.withinss" "betweenss"    "size"         "iter"        
## [9] "ifault"

Determine the centers of each cluster

## Number of institutes in each of the three clusters
## NOTE(review): the heading above asks for the cluster centers, but this
## prints the sizes; the centers in original units are computed further down.
topU.2016.K3[["size"]]

## [1] 71 50 79

Compare the clusters to the categorical data column as we did with the iris$Species column

## Cluster centers on the standardized scale, one row per cluster
K3.centers <- as.data.frame(topU.2016.K3$centers)

## Convert standardized scores back to original values: x = z * sd + mean.
## Columns are taken explicitly from topU.2016 rather than through
## attach()/detach(), so a same-named workspace object cannot be used by
## mistake and nothing stays attached if a step fails.
cntr.teaching <-
    K3.centers$teaching.Z * sd(topU.2016$teaching) + mean(topU.2016$teaching)
cntr.international <-
    K3.centers$international.Z * sd(topU.2016$international) +
    mean(topU.2016$international)
cntr.research <-
    K3.centers$research.Z * sd(topU.2016$research) + mean(topU.2016$research)
cntr.citations <-
    K3.centers$citations.Z * sd(topU.2016$citations) + mean(topU.2016$citations)

K3.centers <- data.frame(cntr.teaching,
                         cntr.international,
                         cntr.research,
                         cntr.citations)
row.names(K3.centers) <- 1:3

## Display cluster centers
K3.centers

##   cntr.teaching cntr.international cntr.research cntr.citations
## 1      41.25211           84.09296      45.35211       77.55211
## 2      72.85800           69.99400      80.65800       91.48800
## 3      44.02785           49.20886      44.66076       82.30886

## Store each institute's cluster label next to its other columns
topU.2016[["K3.clus"]] <- topU.2016.K3[["cluster"]]
## Cross-tabulate cluster membership (rows) against continent (columns)
table(topU.2016[["K3.clus"]], topU.2016[["continent"]])

##    
##     Africa Asia Europe North America Oceania
##   1      1    4     53             5       8
##   2      0    4     17            28       1
##   3      0    7     35            37       0

Part 3:

Describe what each row of data represents: The dataset was downloaded from www.kaggle.com. The Times Higher Education World University Ranking is widely regarded as one of the most influential and widely observed university measures. Each row represents one university in one ranking year. This dataset has 14 columns, as shown in the summary below. There are 13 performance indicators captured; performance is scored on a 0-100 scale, with higher scores giving a higher ranking. The performance indicators are assigned different weights and grouped into five areas: teaching, research, citations, international outlook, and industry income.

## Per-column summary of the full data set (all years)
summary(object = U)

##   world_rank        university_name      country             teaching   
##  Length:2603        Length:2603        Length:2603        Min.   : 9.9  
##  Class :character   Class :character   Class :character   1st Qu.:24.7  
##  Mode  :character   Mode  :character   Mode  :character   Median :33.9  
##                                                           Mean   :37.8  
##                                                           3rd Qu.:46.4  
##                                                           Max.   :99.7  
##                                                                         
##  international         research       citations         income         
##  Length:2603        Min.   : 2.90   Min.   :  1.20   Length:2603       
##  Class :character   1st Qu.:19.60   1st Qu.: 45.50   Class :character  
##  Mode  :character   Median :30.50   Median : 62.50   Mode  :character  
##                     Mean   :35.91   Mean   : 60.92                     
##                     3rd Qu.:47.25   3rd Qu.: 79.05                     
##                     Max.   :99.40   Max.   :100.00                     
##                                                                        
##  total_score         num_students    student_staff_ratio
##  Length:2603        Min.   :   462   Min.   :  0.60     
##  Class :character   1st Qu.: 12638   1st Qu.: 11.97     
##  Mode  :character   Median : 20851   Median : 16.10     
##                     Mean   : 23874   Mean   : 18.45     
##                     3rd Qu.: 29991   3rd Qu.: 21.50     
##                     Max.   :379231   Max.   :162.60     
##                     NA's   :59       NA's   :59         
##  international_students female_male_ratio       year     
##  Length:2603            Length:2603        Min.   :2011  
##  Class :character       Class :character   1st Qu.:2013  
##  Mode  :character       Mode  :character   Median :2014  
##                                            Mean   :2014  
##                                            3rd Qu.:2016  
##                                            Max.   :2016  
##

Describe the size and means of clusters

## Number of institutes in each of the three clusters
topU.2016.K3[["size"]]

## [1] 71 50 79

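Cluster 1: Diversified (highest international score). Cluster 2: Top tier (highest teaching, research, and citations scores). Cluster 3: Traditional (lowest international score).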

Cluster Analysis

AG

13 December 2017