Part 1:
library(knitr)
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.4.2
Using the mtcars data set
data('mtcars')
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
set.seed(1234)
# Candidate numbers of clusters to compare.
k_values <- 2:10
# For each k, fit k-means on the first three mtcars columns and keep the total
# within-cluster sum of squares, which is the quantity the plot is labelled
# with (the between/total ratio is a different measure).
wi <- vapply(
  k_values,
  function(k) kmeans(mtcars[, 1:3], centers = k)$tot.withinss,
  numeric(1)
)
# report the list of within cluster sum of squares
plot(k_values, wi, type = "l", xlab = "k=2:10", ylab = "Within Sum of Squares")
Create a kmeans object from the first, second, and third columns. What is the size of each cluster?
# Fit a 4-cluster k-means model on the first three mtcars columns, keeping the
# best of 20 random starts, then show how many cars fall in each cluster.
r <- kmeans(x = mtcars[, seq_len(3)], centers = 4, nstart = 20)
r[["size"]]
## [1] 8 4 9 11
What are the centers of each cluster?
# Per-cluster means of the clustered columns (one row per cluster).
r$centers
## mpg cyl disp
## 1 20.50000 5.500000 164.08750
## 2 13.67500 8.000000 443.00000
## 3 27.34444 4.000000 96.55556
## 4 16.19091 7.818182 311.76364
What is the average disp, wt, and qsec of each cluster?
# Attach each car's cluster label, then average disp, wt and qsec per cluster.
# summarise_at() replaces the deprecated summarise_each()/funs() combination.
mtcars %>%
  mutate(clusters = r$cluster) %>%
  group_by(clusters) %>%
  summarise_at(vars(disp, wt, qsec), mean) %>%
  kable()
| clusters | disp | wt | qsec |
|---|---|---|---|
| 1 | 164.08750 | 3.118125 | 18.66250 |
| 2 | 443.00000 | 4.966000 | 17.56750 |
| 3 | 96.55556 | 2.089222 | 18.62333 |
| 4 | 311.76364 | 3.576364 | 16.72545 |
Describe each cluster in English: As the table above shows, Cluster 1 has cars with medium disp and wt and the highest qsec. Cluster 2 has cars with the highest disp, the highest wt, and the second-lowest qsec. Cluster 3 has cars with the lowest disp and wt and a medium qsec. Cluster 4 has cars with medium disp, the second-highest wt, and the lowest qsec.
Part 2:
Find a data set with at least 4 columns of numeric data and a categorical column
library(readr)
# NOTE(review): absolute, machine-specific Windows path; the script only runs
# where this exact file location exists. Consider a relative path.
U <- read_csv("C:/Users/PinakKumar/Desktop/timesData.csv")
## Parsed with column specification:
## cols(
## world_rank = col_character(),
## university_name = col_character(),
## country = col_character(),
## teaching = col_double(),
## international = col_character(),
## research = col_double(),
## citations = col_double(),
## income = col_character(),
## total_score = col_character(),
## num_students = col_number(),
## student_staff_ratio = col_double(),
## international_students = col_character(),
## female_male_ratio = col_character(),
## year = col_integer()
## )
topU.2016 <- U[which(U$year == 2016), ][1:200, ]
## Get 2016's top 200 academic institutes
## Score columns that were parsed as character are converted to numeric;
## entries that cannot be parsed become NA (see the coercion warning below).
topU.2016$international <- as.numeric(as.character(topU.2016$international))
topU.2016$income <- as.numeric(as.character(topU.2016$income))
## Warning: NAs introduced by coercion
topU.2016$total_score <- as.numeric(as.character(topU.2016$total_score))
require(stringr)
## Loading required package: stringr
## Strip thousands separators and percent signs before converting.
topU.2016$num_students <- as.numeric(gsub(",", "", as.character(topU.2016$num_students)))
topU.2016$international_students <-
  as.numeric(gsub("%", "", as.character(topU.2016$international_students))) / 100
## Keep the text before the first space of each ratio string and read it as a
## percentage. The cut point is found per row, not taken from the first row
## only, so values with a different number of digits are handled correctly.
topU.2016$female_students <-
  as.numeric(sub(" .*$", "", as.character(topU.2016$female_male_ratio))) / 100
## Tied ranks carry a leading "="; drop it so the rank is numeric.
topU.2016$world_rank <- as.numeric(gsub("=", "", as.character(topU.2016$world_rank)))
## Row names are not reset: the subset above is already numbered 1..n, and
## setting row names on a tibble is deprecated.
topU.2016$seq <- seq_len(nrow(topU.2016))
## Map each country to a continent; any country not listed below is labelled
## "Europe".
country_groups <- list(
  "Africa" = c("South Africa"),
  "Asia" = c("China",
             "Hong Kong",
             "Japan",
             "South Korea",
             "Taiwan",
             "Singapore",
             "Israel"),
  "North America" = c("Canada",
                      "United States of America"),
  "Oceania" = c("Australia",
                "New Zealand")
)
continent_lookup <- rep("Europe", length(topU.2016$country))
for (continent_name in names(country_groups)) {
  in_group <- topU.2016$country %in% country_groups[[continent_name]]
  continent_lookup[in_group] <- continent_name
}
topU.2016$continent <- continent_lookup
Run several scatter plots of the data
require(ggplot2)
## Scatter plots of the teaching score against three other indicators.
ggplot(data = topU.2016, mapping = aes(x = teaching, y = citations)) +
  geom_point() +
  labs(title = "Teaching vs. Citations")
ggplot(data = topU.2016, mapping = aes(x = teaching, y = research)) +
  geom_point() +
  labs(title = "Teaching vs. Research")
ggplot(data = topU.2016, mapping = aes(x = teaching, y = international)) +
  geom_point() +
  labs(title = "Teaching vs. international")
Create a kmeans object from the numeric data; you can pick K to be whatever you want. Determine the size of each cluster.
## Standardize input variables
## Columns are referenced explicitly instead of attach()-ing the data frame,
## which would put its columns on the search path and risk name masking.
z_score <- function(x) {
  (x - mean(x)) / sd(x)
}
teaching.Z <- z_score(topU.2016$teaching)
international.Z <- z_score(topU.2016$international)
research.Z <- z_score(topU.2016$research)
citations.Z <- z_score(topU.2016$citations)
topU.2016.stdz <- as.data.frame(cbind(teaching.Z,
                                      international.Z,
                                      research.Z,
                                      citations.Z))
## Perform 3-cluster K-means cluster analysis
topU.2016.K3 <- kmeans(topU.2016.stdz,
                       centers = 3)
topU.2016.K3
## K-means clustering with 3 clusters of sizes 71, 50, 79
##
## Cluster means:
## teaching.Z international.Z research.Z citations.Z
## 1 -0.5524026 0.8828056 -0.4327508 -0.41902959
## 2 1.3879612 0.1635113 1.3535184 0.66985197
## 3 -0.3819933 -0.8968957 -0.4677292 -0.04736073
##
## Clustering vector:
## [1] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [36] 2 2 2 2 2 2 2 2 2 1 3 2 2 2 3 2 1 2 2 1 1 1 3 1 1 3 3 3 3 1 3 3 3 1 1
## [71] 1 1 1 3 3 1 3 3 3 1 3 1 1 3 3 1 3 3 1 3 1 3 1 3 3 1 1 1 3 3 1 1 3 3 1
## [106] 1 3 3 3 1 3 1 3 1 1 1 3 3 3 1 1 1 3 3 3 3 3 3 3 1 1 1 1 3 1 1 3 1 1 3
## [141] 1 3 1 1 3 1 3 3 3 1 1 1 3 3 1 3 1 3 3 1 1 3 3 3 1 3 3 1 3 1 1 1 1 3 3
## [176] 1 1 3 1 3 3 1 1 3 3 1 3 3 3 1 3 3 1 3 3 1 3 3 3 1
##
## Within cluster sum of squares by cluster:
## [1] 125.3292 105.0441 158.6333
## (between_SS / total_SS = 51.1 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss"
## [5] "tot.withinss" "betweenss" "size" "iter"
## [9] "ifault"
Determine the centers of each cluster
## NOTE(review): the heading above asks for the cluster centers, but this line
## prints the cluster sizes.
topU.2016.K3$size
## [1] 71 50 79
Compare the clusters to the categorical data column as we did with the iris$Species column
K3.centers <- as.data.frame(topU.2016.K3$centers)
## Convert standardized scores back to original values
## (inverse of the z-score: z * sd + mean of the unstandardized column).
## Columns are referenced explicitly rather than through attach().
to_original_scale <- function(z, x) {
  z * sd(x) + mean(x)
}
cntr.teaching <- to_original_scale(K3.centers$teaching.Z, topU.2016$teaching)
cntr.international <- to_original_scale(K3.centers$international.Z,
                                        topU.2016$international)
cntr.research <- to_original_scale(K3.centers$research.Z, topU.2016$research)
cntr.citations <- to_original_scale(K3.centers$citations.Z, topU.2016$citations)
K3.centers <- as.data.frame(cbind(cntr.teaching,
                                  cntr.international,
                                  cntr.research,
                                  cntr.citations))
## One row per cluster, numbered from 1.
row.names(K3.centers) <- seq_len(nrow(K3.centers))
## Display cluster centers
K3.centers
## cntr.teaching cntr.international cntr.research cntr.citations
## 1 41.25211 84.09296 45.35211 77.55211
## 2 72.85800 69.99400 80.65800 91.48800
## 3 44.02785 49.20886 44.66076 82.30886
## Store each row's cluster label, then cross-tabulate cluster against continent.
topU.2016[["K3.clus"]] <- topU.2016.K3[["cluster"]]
table(topU.2016[["K3.clus"]], topU.2016[["continent"]])
##
## Africa Asia Europe North America Oceania
## 1 1 4 53 5 8
## 2 0 4 17 28 1
## 3 0 7 35 37 0
Part 3:
Describe what each row of data represents: The dataset was downloaded from www.kaggle.com. The Times Higher Education World University Ranking is widely regarded as one of the most influential and widely observed university measures. Each row represents one university in one ranking year. The dataset has 14 columns, as shown in the summary below. There are 13 performance indicators; performance is scored on a 0-100 scale to determine the ranking. The performance indicators are assigned different weights and grouped into five areas: teaching, research, citations, international outlook, and industry income.
## Column-by-column summary of the full data set U (all years, not only the
## 2016 subset used for clustering).
summary(U)
## world_rank university_name country teaching
## Length:2603 Length:2603 Length:2603 Min. : 9.9
## Class :character Class :character Class :character 1st Qu.:24.7
## Mode :character Mode :character Mode :character Median :33.9
## Mean :37.8
## 3rd Qu.:46.4
## Max. :99.7
##
## international research citations income
## Length:2603 Min. : 2.90 Min. : 1.20 Length:2603
## Class :character 1st Qu.:19.60 1st Qu.: 45.50 Class :character
## Mode :character Median :30.50 Median : 62.50 Mode :character
## Mean :35.91 Mean : 60.92
## 3rd Qu.:47.25 3rd Qu.: 79.05
## Max. :99.40 Max. :100.00
##
## total_score num_students student_staff_ratio
## Length:2603 Min. : 462 Min. : 0.60
## Class :character 1st Qu.: 12638 1st Qu.: 11.97
## Mode :character Median : 20851 Median : 16.10
## Mean : 23874 Mean : 18.45
## 3rd Qu.: 29991 3rd Qu.: 21.50
## Max. :379231 Max. :162.60
## NA's :59 NA's :59
## international_students female_male_ratio year
## Length:2603 Length:2603 Min. :2011
## Class :character Class :character 1st Qu.:2013
## Mode :character Mode :character Median :2014
## Mean :2014
## 3rd Qu.:2016
## Max. :2016
##
Describe the size and means of clusters
## Number of rows assigned to each of the 3 clusters.
topU.2016.K3$size
## [1] 71 50 79
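Based on the cluster centers displayed above: Cluster 2 (50 universities): Top tier, with the highest teaching, research, and citations scores. Cluster 1 (71 universities): Diversified, with the highest international score. Cluster 3 (79 universities): Traditional, with the lowest international score.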