This is the CODE-ONLY cribsheet for the Novice workshop. Full document to be provided during & after workshop.

Schedule

– 6:30-6:35pm: A. RStudio - Basics
– 6:35-6:40pm: B. Pre-loaded data - Examine
– 6:40-6:50pm: C. Make your own data - Geo Chart
– 6:50-6:55pm: D. ggplot2 - Set-up
– 6:55-7:15pm: E. ggplot2 - Data Viz + Extensions
– 7:15-7:30pm: BREAK
– 7:30-8:15pm: F. Machine Learning technique - Clustering + Extension

A. RStudio - Basics

(i) Open & Save new R Script

(ii) Running code

(iii) Creating R Objects

object <- 3 + 5 

object
## [1] 8

B. Pre-loaded Data - Examine

data()
data("iris")
data("mtcars")
data("longley")
data("USArrests")
data("VADeaths")
class()
dim()
str()
summary()
head()
tail()
View()
?<name of data set>

C. Make your own data - Geo Chart

(i)

# Seven countries to show on the map.
Country <- c(
  "United Kingdom", "France", "Spain", "Germany",
  "US", "Australia", "Thailand"
)
# Left empty on purpose: supply one value per country (seven in total,
# presumably numeric scores) in the same order as Country.
Popularity <- c()
geodata <- data.frame(Country, Popularity)

View(geodata)
class(geodata)

(ii)

# Install googleVis only if it is not already available, so re-running the
# script does not download the package again; then load it.
if (!requireNamespace("googleVis", quietly = TRUE)) {
  install.packages("googleVis")
}
library(googleVis)

(iii)

?gvisGeoChart
args(gvisGeoChart)

(iv)

geochart <- gvisGeoChart(geodata, 
                         locationvar = "Country",
                         colorvar = "Popularity")
plot(geochart)

D. ggplot2 - Set-up

(i)

# Install ggplot2 only if it is not already available, so re-running the
# script does not download the package again.
if (!requireNamespace("ggplot2", quietly = TRUE)) {
  install.packages("ggplot2")
}

(ii)

library(ggplot2)

(iii)

?ggplot
data("diamonds", "economics")

E. ggplot2 - Data Viz + Extensions

(i) Line Chart

line.graph <- ggplot(data = economics, aes(x = date, y = pop)) + geom_line()

plot(line.graph)

(ii) Bar Chart

bar.chart <- ggplot(data = diamonds, aes(x = cut)) + geom_bar()

plot(bar.chart)

(iii) Box Plot

box.plot <- ggplot(data = diamonds, aes(x = cut, y = price)) + geom_boxplot()

plot(box.plot)

(iv) Scatterplot

scatterplot <- ggplot(data = diamonds, aes(x = price, y = carat)) + geom_point()

plot(scatterplot)

Extensions!!

–

+ ggtitle("insert title here")
+ xlab("insert x-axis label here")
+ ylab("insert y-axis label here")

–

+ geom_line(colour = "tomato")
+ geom_bar(colour = "slategrey", fill = "peachpuff")
+ geom_boxplot(colour = "navy", fill = "oldlace")

–

# Bar chart of cut, with the bar fill mapped to clarity.
bar.chart.subset <- ggplot(data = diamonds, aes(x = cut)) +
  geom_bar(aes(fill = clarity))

plot(bar.chart.subset)

# Scatterplot with price on x and carat on y, point colour mapped to cut.
scatterplot.subset <- ggplot(data = diamonds, aes(x = price, y = carat)) +
  geom_point(aes(colour = cut))

plot(scatterplot.subset)

–

# Version 1: bar chart of cut, faceted by color.
bar.chart.facet <- ggplot(data = diamonds, aes(x = cut)) +
  geom_bar() +
  facet_grid(color ~ .)
# Version 2: same facets, with the bar fill also mapped to color. It re-uses
# the same object name, so it replaces version 1 and is what gets plotted.
bar.chart.facet <- ggplot(data = diamonds, aes(x = cut)) +
  geom_bar(aes(fill = color)) +
  facet_grid(color ~ .)

plot(bar.chart.facet)

F. Machine Learning technique - Clustering + Extension

Intro + Steps

Steps:
(i) Load & Explore data
(ii) Clean data
(iii) Standardise data
(iv) Calculate Euclidean Distance Matrix
(v) Calculate Clusters
(vi) Make Dendrogram data viz
(vii) Define appropriate no. of Clusters
(viii) Variable combinations determining Cluster Membership
(ix) Create final Cluster Membership data

(i) Load & Explore data

data(mtcars)
View(mtcars)
?mtcars

(ii) Clean data

# Drop the `vs` and `am` columns (columns 8 and 9 of mtcars) by name, so the
# selection does not depend on column positions.
mtcars1 <- mtcars[, setdiff(names(mtcars), c("vs", "am"))]

View(mtcars1)

(iii) Standardise data

# Per-column centre and spread: the median and the median absolute
# deviation (MAD) of every column of mtcars1.
medians <- apply(mtcars1, MARGIN = 2, FUN = median)
mads <- apply(mtcars1, MARGIN = 2, FUN = mad)

# Standardise: subtract each column's median and divide by its MAD.
mtcars2 <- scale(mtcars1, center = medians, scale = mads)
print(mtcars2, digits = 2)
##                      mpg   cyl  disp    hp  drat    wt  qsec  gear  carb
## Mazda RX4          0.333  0.00 -0.26 -0.17  0.29 -0.92 -0.88  0.00  1.35
## Mazda RX4 Wag      0.333  0.00 -0.26 -0.17  0.29 -0.59 -0.49  0.00  1.35
## Datsun 710         0.665 -0.67 -0.63 -0.39  0.22 -1.31  0.64  0.00 -0.67
## Hornet 4 Drive     0.407  0.00  0.44 -0.17 -0.87 -0.14  1.22 -0.67 -0.67
## Hornet Sportabout -0.092  0.67  1.17  0.67 -0.77  0.15 -0.49 -0.67  0.00

(iv) Calculate Euclidean Distance Matrix

mtcars3 <- dist(mtcars2, method = "euclidean")
print(mtcars3, digits=2)

(v) Calculate Clusters

clusters <- hclust(mtcars3, method = "ward.D2")

(vi) Make Dendrogram data viz

plot(clusters)
plot(clusters, hang = -1)

(vii) Define appropriate no. of Clusters

# Draw rectangles around six clusters on the dendrogram currently plotted.
rect.hclust(tree = clusters, k = 6)

# Cut the same tree into six groups and keep the group number of each row.
clusters.6 <- cutree(tree = clusters, k = 6)

(viii) Variable combinations determining Cluster Membership

means <- aggregate(mtcars2, list(clusters.6), mean)

options(digits = 2)
means
##   Group.1   mpg   cyl  disp     hp   drat    wt   qsec  gear  carb
## 1       1  0.10  0.00 -0.26  0.067  0.223 -0.39 -0.335  0.13  1.62
## 2       2  0.48 -0.48 -0.26 -0.352 -0.153 -0.50  1.595 -0.29 -0.39
## 3       3 -0.45  0.67  0.85  0.610 -0.916  0.47 -0.275 -0.67  0.29
## 4       4 -0.85  0.67  1.03  1.936 -0.028  0.28 -1.852  0.00  2.02
## 5       5 -1.36  0.67  1.86  1.215 -0.911  2.63  0.021 -0.67  1.35
## 6       6  2.01 -0.67 -0.78 -0.616  0.790 -1.89  0.486  0.22 -0.34
# Cluster means on the original (unstandardised) values. Re-use mtcars1, which
# already holds mtcars without columns 8 and 9, instead of repeating the subset.
means2 <- aggregate(mtcars1, list(clusters.6), mean)

means2
##   Group.1 mpg cyl disp  hp drat  wt qsec gear carb
## 1       1  20 6.0  160 128  3.9 3.0   17  4.2  4.4
## 2       2  22 4.6  160  96  3.6 2.9   20  3.6  1.4
## 3       3  17 8.0  316 170  3.0 3.7   17  3.0  2.4
## 4       4  15 8.0  340 272  3.7 3.5   15  4.0  5.0
## 5       5  12 8.0  457 217  3.1 5.3   18  3.0  4.0
## 6       6  30 4.0   87  76  4.3 1.9   18  4.3  1.5

(ix) Create final Cluster Membership data

mtcars.membership <- cbind(clusters.6, mtcars)
mtcars.membership
##                   clusters.6 mpg cyl disp  hp drat  wt qsec vs am gear
## Mazda RX4                  1  21   6  160 110  3.9 2.6   16  0  1    4
## Mazda RX4 Wag              1  21   6  160 110  3.9 2.9   17  0  1    4
## Datsun 710                 2  23   4  108  93  3.9 2.3   19  1  1    4
## Hornet 4 Drive             2  21   6  258 110  3.1 3.2   19  1  0    3
## Hornet Sportabout          3  19   8  360 175  3.1 3.4   17  0  0    3
## Valiant                    2  18   6  225 105  2.8 3.5   20  1  0    3

Extension!!

# Install sparcl only if it is not already available, so re-running the
# script does not download the package again; then load it.
if (!requireNamespace("sparcl", quietly = TRUE)) {
  install.packages("sparcl")
}
library(sparcl)

ColorDendrogram(clusters, y = clusters.6, labels = names(clusters.6), branchlength = 5)