R Markdown

This is an code profolio for EDA class assignment. Including week7 to week 11 topics.

R for Data Science Ch3: data visualization

library(tidyverse)
## -- Attaching packages ----------------- tidyverse 1.2.1 --
## v ggplot2 3.2.0     v purrr   0.3.2
## v tibble  2.1.3     v dplyr   0.8.3
## v tidyr   0.8.3     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.4.0
## -- Conflicts -------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
ggplot2::mpg
## # A tibble: 234 x 11
##    manufacturer model displ  year   cyl trans drv     cty   hwy fl    class
##    <chr>        <chr> <dbl> <int> <int> <chr> <chr> <int> <int> <chr> <chr>
##  1 audi         a4      1.8  1999     4 auto~ f        18    29 p     comp~
##  2 audi         a4      1.8  1999     4 manu~ f        21    29 p     comp~
##  3 audi         a4      2    2008     4 manu~ f        20    31 p     comp~
##  4 audi         a4      2    2008     4 auto~ f        21    30 p     comp~
##  5 audi         a4      2.8  1999     6 auto~ f        16    26 p     comp~
##  6 audi         a4      2.8  1999     6 manu~ f        18    26 p     comp~
##  7 audi         a4      3.1  2008     6 auto~ f        18    27 p     comp~
##  8 audi         a4 q~   1.8  1999     4 manu~ 4        18    26 p     comp~
##  9 audi         a4 q~   1.8  1999     4 auto~ 4        16    25 p     comp~
## 10 audi         a4 q~   2    2008     4 manu~ 4        20    28 p     comp~
## # ... with 224 more rows
##create a ggplot for mpg data set, displ on x, hwy on y 
ggplot(data = mpg) + 
  geom_point(mapping = aes(x = displ, y = hwy))

#The plot shows a negative relationship between engine size (displ) and fuel efficiency (hwy)

##graphing template 
#ggplot(data = <DATA>) + 
#  <GEOM_FUNCTION>(mapping = aes(<MAPPINGS>))

ggplot(data = mpg) + 
  geom_point(mapping = aes(x = displ, y = hwy, color = class))

# map the colors of the points to the class variable to reveal the class of each car

ggplot(data = mpg) + 
  geom_point(mapping = aes(x = displ, y = hwy, size = class))
## Warning: Using size for a discrete variable is not advised.

# mapped class to the size

ggplot(data = mpg) + 
  geom_point(mapping = aes(x = displ, y = hwy, alpha = class))
## Warning: Using alpha for a discrete variable is not advised.

#map class to the alpha aesthetic, which controls the transparency of the points


ggplot(data = mpg) + 
  geom_point(mapping = aes(x = displ, y = hwy, shape = class))
## Warning: The shape palette can deal with a maximum of 6 discrete values
## because more than 6 becomes difficult to discriminate; you have 7.
## Consider specifying shapes manually if you must have them.
## Warning: Removed 62 rows containing missing values (geom_point).

#map class to the shape aesthetic


ggplot(data = mpg) + 
  geom_point(mapping = aes(x = displ, y = hwy), color = "blue")

# set the point blue


##Facets
ggplot(data = mpg) + 
  geom_point(mapping = aes(x = displ, y = hwy)) + 
  facet_wrap(~ class, nrow = 2)

#split your plot into facets, subplots that each display one subset of the data.


ggplot(data = mpg) + 
  geom_point(mapping = aes(x = displ, y = hwy)) + 
  facet_grid(drv ~ cyl)

#contain two variable names separated by a ~
#If prefer to not facet in the rows or columns dimension, use a . instead of a variable name, e.g. + facet_grid(. ~ cyl)

#To change the geom in your plot, change the geom function that you add to ggplot()

ggplot(data = mpg) + 
  geom_point(mapping = aes(x = displ, y = hwy))

#dot plot

ggplot(data = mpg) + 
  geom_smooth(mapping = aes(x = displ, y = hwy))
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

#line plot


#geom_smooth() will draw a different line, with a different linetype, for each unique value of the variable that you map to linetype

ggplot(data = mpg) + 
  geom_smooth(mapping = aes(x = displ, y = hwy, linetype = drv))
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

#separates the cars into three lines based on their drv value

ggplot(data = mpg) +
  geom_smooth(mapping = aes(x = displ, y = hwy))
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

ggplot(data = mpg) +
  geom_smooth(mapping = aes(x = displ, y = hwy, group = drv))
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

ggplot(data = mpg) +
  geom_smooth(
    mapping = aes(x = displ, y = hwy, color = drv),
    show.legend = FALSE
  )
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

ggplot(data = mpg) + 
  geom_point(mapping = aes(x = displ, y = hwy)) +
  geom_smooth(mapping = aes(x = displ, y = hwy))
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

#display multiple geoms in the same plot


ggplot(data = mpg, mapping = aes(x = displ, y = hwy)) + 
  geom_point() + 
  geom_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

#passing a set of mappings to ggplot()

ggplot(data = mpg, mapping = aes(x = displ, y = hwy)) + 
  geom_point(mapping = aes(color = class)) + 
  geom_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

#display different aesthetics in different layers


ggplot(data = mpg, mapping = aes(x = displ, y = hwy)) + 
  geom_point(mapping = aes(color = class)) + 
  geom_smooth(data = filter(mpg, class == "subcompact"), se = FALSE)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

#specify different data for each layer

ggplot(data = mpg, mapping = aes(x = displ, y = hwy)) + 
  geom_point(mapping = aes(color = class)) + 
  geom_smooth(data = filter(mpg, class == "subcompact"), se = FALSE)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

data(diamonds)

ggplot(data = diamonds) + 
  geom_bar(mapping = aes(x = cut))

#more diamonds are available with high quality cuts than with low quality cuts

ggplot(data = diamonds) + 
  stat_count(mapping = aes(x = cut))

#recreate the previous plot using stat_count()



demo <- tribble(
  ~cut,         ~freq,
  "Fair",       1610,
  "Good",       4906,
  "Very Good",  12082,
  "Premium",    13791,
  "Ideal",      21551
)

ggplot(data = demo) +
  geom_bar(mapping = aes(x = cut, y = freq), stat = "identity")

#change the stat of geom_bar() from count (the default) to identity to map the height of the bars to the raw values of a y variable

ggplot(data = diamonds) + 
  geom_bar(mapping = aes(x = cut, y = ..prop.., group = 1))

#display a bar chart of proportion, rather than count

ggplot(data = diamonds) + 
  stat_summary(
    mapping = aes(x = cut, y = depth),
    fun.ymin = min,
    fun.ymax = max,
    fun.y = median
  )

#summarises the y values for each unique x value


ggplot(data = diamonds) + 
  geom_bar(mapping = aes(x = cut, colour = cut))

ggplot(data = diamonds) + 
  geom_bar(mapping = aes(x = cut, fill = cut))

#colour a bar chart using either the colour aesthetic, or fill

ggplot(data = diamonds) + 
  geom_bar(mapping = aes(x = cut, fill = clarity))

#the bars are automatically stacked by clarity 

ggplot(data = diamonds, mapping = aes(x = cut, fill = clarity)) + 
  geom_bar(alpha = 1/5, position = "identity")

ggplot(data = diamonds, mapping = aes(x = cut, colour = clarity)) + 
  geom_bar(fill = NA, position = "identity")

#to make the bars slightly transparent by setting alpha to a small value, or completely transparent by setting fill = NA

ggplot(data = diamonds) + 
  geom_bar(mapping = aes(x = cut, fill = clarity), position = "fill")

#position = "fill" works like stacking, but makes each set of stacked bars the same height,This makes it easier to compare proportions across groups.

ggplot(data = diamonds) + 
  geom_bar(mapping = aes(x = cut, fill = clarity), position = "dodge")

#position = "fill" works like stacking, but makes each set of stacked bars the same height



ggplot(data = mpg) + 
  geom_point(mapping = aes(x = displ, y = hwy), position = "jitter")

#set the position adjustment to “jitter”. position = "jitter" adds a small amount of random noise to each point. This spreads the points out because no two points are likely to receive the same amount of random noise.


ggplot(data = mpg, mapping = aes(x = class, y = hwy)) + 
  geom_boxplot()

ggplot(data = mpg, mapping = aes(x = class, y = hwy)) + 
  geom_boxplot() +
  coord_flip()

#coord_flip() switches the x and y axes.


nz <- map_data("nz")

ggplot(nz, aes(long, lat, group = group)) +
  geom_polygon(fill = "white", colour = "black")

ggplot(nz, aes(long, lat, group = group)) +
  geom_polygon(fill = "white", colour = "black") +
  coord_quickmap()

#coord_quickmap() sets the aspect ratio correctly for maps


bar <- ggplot(data = diamonds) + 
  geom_bar(
    mapping = aes(x = cut, fill = cut), 
    show.legend = FALSE,
    width = 1
  ) + 
  theme(aspect.ratio = 1) +
  labs(x = NULL, y = NULL)

bar + coord_flip()

bar + coord_polar()

#coord_polar() uses polar coordinates. Polar coordinates reveal an interesting connection between a bar chart and a Coxcomb chart.

Peng, R . Chapter 7 Plotting Systems

data(cars)
# Create the plot / draw canvas
 with(cars, plot(speed, dist))
 title("Speed vs. Stopping distance")#Add annotation

library(lattice) 
state <- data.frame(state.x77, region = state.region)

xyplot(Life.Exp ~ Income | region, data = state, layout = c(4, 1))

#lattice plot that looks at the relationship between life expectancy and income and how that relationship varies by region in the United States

Peng, R. (2010). Chapter 8

with(faithful, plot(eruptions, waiting)) 
title(main = "Old Faithful Geyser data")  

# Annotate with a title
library(datasets)

## Create a plot
with(faithful, plot(eruptions, waiting))  
#Add a main title
title(main = "Old Faithful Geyser data")  

#copy plot to a PNG file
dev.copy(png, file = "geyserplot.png")  
## png 
##   3
## close the PNG device
dev.off()  
## png 
##   2

Peng, R. Chapter 12 K-Means Clustering

#Simulated dataset
set.seed(1234)
x <- rnorm(12, mean = rep(1:3, each = 4), sd = 0.2)
y <- rnorm(12, mean = rep(c(1, 2, 1), each = 4), sd = 0.2)
plot(x, y, col = "blue", pch = 19, cex = 2)
text(x + 0.05, y + 0.05, labels = as.character(1:12))

#implements the K-means algorithm
dataFrame <- data.frame(x, y)
kmeansObj <- kmeans(dataFrame, centers = 3)
#centers which is either an integer indicating the number of clusters or a matrix indicating the locations of the initial cluster centroids
names(kmeansObj)
## [1] "cluster"      "centers"      "totss"        "withinss"    
## [5] "tot.withinss" "betweenss"    "size"         "iter"        
## [9] "ifault"
kmeansObj$cluster
##  [1] 3 1 1 3 2 2 2 2 2 2 2 2
#see which cluster each data point got assigned to by looking at the cluster element of the list returned by the kmeans() function

set.seed(1234)
dataMatrix <- as.matrix(dataFrame)[sample(1:12), ]
kmeansObj <- kmeans(dataMatrix, centers = 3)
#find the K-means solution

par(mfrow = c(1, 2))
image(t(dataMatrix)[, nrow(dataMatrix):1], yaxt = "n", main = "Original Data")
image(t(dataMatrix)[, order(kmeansObj$cluster)], yaxt = "n", main = "Clustered Data")

#image plot using the K-means clusters