This is an code profolio for EDA class assignment. Including week7 to week 11 topics.
library(tidyverse)
## -- Attaching packages ----------------- tidyverse 1.2.1 --
## v ggplot2 3.2.0 v purrr 0.3.2
## v tibble 2.1.3 v dplyr 0.8.3
## v tidyr 0.8.3 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.4.0
## -- Conflicts -------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
ggplot2::mpg
## # A tibble: 234 x 11
## manufacturer model displ year cyl trans drv cty hwy fl class
## <chr> <chr> <dbl> <int> <int> <chr> <chr> <int> <int> <chr> <chr>
## 1 audi a4 1.8 1999 4 auto~ f 18 29 p comp~
## 2 audi a4 1.8 1999 4 manu~ f 21 29 p comp~
## 3 audi a4 2 2008 4 manu~ f 20 31 p comp~
## 4 audi a4 2 2008 4 auto~ f 21 30 p comp~
## 5 audi a4 2.8 1999 6 auto~ f 16 26 p comp~
## 6 audi a4 2.8 1999 6 manu~ f 18 26 p comp~
## 7 audi a4 3.1 2008 6 auto~ f 18 27 p comp~
## 8 audi a4 q~ 1.8 1999 4 manu~ 4 18 26 p comp~
## 9 audi a4 q~ 1.8 1999 4 auto~ 4 16 25 p comp~
## 10 audi a4 q~ 2 2008 4 manu~ 4 20 28 p comp~
## # ... with 224 more rows
##create a ggplot for mpg data set, displ on x, hwy on y
ggplot(data = mpg) +
geom_point(mapping = aes(x = displ, y = hwy))
#The plot shows a negative relationship between engine size (displ) and fuel efficiency (hwy)
##graphing template
#ggplot(data = <DATA>) +
# <GEOM_FUNCTION>(mapping = aes(<MAPPINGS>))
ggplot(data = mpg) +
geom_point(mapping = aes(x = displ, y = hwy, color = class))
# map the colors of the points to the class variable to reveal the class of each car
ggplot(data = mpg) +
geom_point(mapping = aes(x = displ, y = hwy, size = class))
## Warning: Using size for a discrete variable is not advised.
# mapped class to the size
ggplot(data = mpg) +
geom_point(mapping = aes(x = displ, y = hwy, alpha = class))
## Warning: Using alpha for a discrete variable is not advised.
#map class to the alpha aesthetic, which controls the transparency of the points
ggplot(data = mpg) +
geom_point(mapping = aes(x = displ, y = hwy, shape = class))
## Warning: The shape palette can deal with a maximum of 6 discrete values
## because more than 6 becomes difficult to discriminate; you have 7.
## Consider specifying shapes manually if you must have them.
## Warning: Removed 62 rows containing missing values (geom_point).
#map class to the shape aesthetic
ggplot(data = mpg) +
geom_point(mapping = aes(x = displ, y = hwy), color = "blue")
# set the point blue
##Facets
ggplot(data = mpg) +
geom_point(mapping = aes(x = displ, y = hwy)) +
facet_wrap(~ class, nrow = 2)
#split your plot into facets, subplots that each display one subset of the data.
ggplot(data = mpg) +
geom_point(mapping = aes(x = displ, y = hwy)) +
facet_grid(drv ~ cyl)
#contain two variable names separated by a ~
#If prefer to not facet in the rows or columns dimension, use a . instead of a variable name, e.g. + facet_grid(. ~ cyl)
#To change the geom in your plot, change the geom function that you add to ggplot()
ggplot(data = mpg) +
geom_point(mapping = aes(x = displ, y = hwy))
#dot plot
ggplot(data = mpg) +
geom_smooth(mapping = aes(x = displ, y = hwy))
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
#line plot
#geom_smooth() will draw a different line, with a different linetype, for each unique value of the variable that you map to linetype
ggplot(data = mpg) +
geom_smooth(mapping = aes(x = displ, y = hwy, linetype = drv))
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
#separates the cars into three lines based on their drv value
ggplot(data = mpg) +
geom_smooth(mapping = aes(x = displ, y = hwy))
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
ggplot(data = mpg) +
geom_smooth(mapping = aes(x = displ, y = hwy, group = drv))
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
ggplot(data = mpg) +
geom_smooth(
mapping = aes(x = displ, y = hwy, color = drv),
show.legend = FALSE
)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
ggplot(data = mpg) +
geom_point(mapping = aes(x = displ, y = hwy)) +
geom_smooth(mapping = aes(x = displ, y = hwy))
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
#display multiple geoms in the same plot
ggplot(data = mpg, mapping = aes(x = displ, y = hwy)) +
geom_point() +
geom_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
#passing a set of mappings to ggplot()
ggplot(data = mpg, mapping = aes(x = displ, y = hwy)) +
geom_point(mapping = aes(color = class)) +
geom_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
#display different aesthetics in different layers
ggplot(data = mpg, mapping = aes(x = displ, y = hwy)) +
geom_point(mapping = aes(color = class)) +
geom_smooth(data = filter(mpg, class == "subcompact"), se = FALSE)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
#specify different data for each layer
ggplot(data = mpg, mapping = aes(x = displ, y = hwy)) +
geom_point(mapping = aes(color = class)) +
geom_smooth(data = filter(mpg, class == "subcompact"), se = FALSE)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
data(diamonds)
ggplot(data = diamonds) +
geom_bar(mapping = aes(x = cut))
#more diamonds are available with high quality cuts than with low quality cuts
ggplot(data = diamonds) +
stat_count(mapping = aes(x = cut))
#recreate the previous plot using stat_count()
demo <- tribble(
~cut, ~freq,
"Fair", 1610,
"Good", 4906,
"Very Good", 12082,
"Premium", 13791,
"Ideal", 21551
)
ggplot(data = demo) +
geom_bar(mapping = aes(x = cut, y = freq), stat = "identity")
#change the stat of geom_bar() from count (the default) to identity to map the height of the bars to the raw values of a y variable
ggplot(data = diamonds) +
geom_bar(mapping = aes(x = cut, y = ..prop.., group = 1))
#display a bar chart of proportion, rather than count
ggplot(data = diamonds) +
stat_summary(
mapping = aes(x = cut, y = depth),
fun.ymin = min,
fun.ymax = max,
fun.y = median
)
#summarises the y values for each unique x value
ggplot(data = diamonds) +
geom_bar(mapping = aes(x = cut, colour = cut))
ggplot(data = diamonds) +
geom_bar(mapping = aes(x = cut, fill = cut))
#colour a bar chart using either the colour aesthetic, or fill
ggplot(data = diamonds) +
geom_bar(mapping = aes(x = cut, fill = clarity))
#the bars are automatically stacked by clarity
ggplot(data = diamonds, mapping = aes(x = cut, fill = clarity)) +
geom_bar(alpha = 1/5, position = "identity")
ggplot(data = diamonds, mapping = aes(x = cut, colour = clarity)) +
geom_bar(fill = NA, position = "identity")
#to make the bars slightly transparent by setting alpha to a small value, or completely transparent by setting fill = NA
ggplot(data = diamonds) +
geom_bar(mapping = aes(x = cut, fill = clarity), position = "fill")
#position = "fill" works like stacking, but makes each set of stacked bars the same height,This makes it easier to compare proportions across groups.
ggplot(data = diamonds) +
geom_bar(mapping = aes(x = cut, fill = clarity), position = "dodge")
#position = "fill" works like stacking, but makes each set of stacked bars the same height
ggplot(data = mpg) +
geom_point(mapping = aes(x = displ, y = hwy), position = "jitter")
#set the position adjustment to “jitter”. position = "jitter" adds a small amount of random noise to each point. This spreads the points out because no two points are likely to receive the same amount of random noise.
ggplot(data = mpg, mapping = aes(x = class, y = hwy)) +
geom_boxplot()
ggplot(data = mpg, mapping = aes(x = class, y = hwy)) +
geom_boxplot() +
coord_flip()
#coord_flip() switches the x and y axes.
nz <- map_data("nz")
ggplot(nz, aes(long, lat, group = group)) +
geom_polygon(fill = "white", colour = "black")
ggplot(nz, aes(long, lat, group = group)) +
geom_polygon(fill = "white", colour = "black") +
coord_quickmap()
#coord_quickmap() sets the aspect ratio correctly for maps
bar <- ggplot(data = diamonds) +
geom_bar(
mapping = aes(x = cut, fill = cut),
show.legend = FALSE,
width = 1
) +
theme(aspect.ratio = 1) +
labs(x = NULL, y = NULL)
bar + coord_flip()
bar + coord_polar()
#coord_polar() uses polar coordinates. Polar coordinates reveal an interesting connection between a bar chart and a Coxcomb chart.
data(cars)
# Create the plot / draw canvas
with(cars, plot(speed, dist))
title("Speed vs. Stopping distance")#Add annotation
library(lattice)
state <- data.frame(state.x77, region = state.region)
xyplot(Life.Exp ~ Income | region, data = state, layout = c(4, 1))
#lattice plot that looks at the relationship between life expectancy and income and how that relationship varies by region in the United States
with(faithful, plot(eruptions, waiting))
title(main = "Old Faithful Geyser data")
# Annotate with a title
library(datasets)
## Create a plot
with(faithful, plot(eruptions, waiting))
#Add a main title
title(main = "Old Faithful Geyser data")
#copy plot to a PNG file
dev.copy(png, file = "geyserplot.png")
## png
## 3
## close the PNG device
dev.off()
## png
## 2
#Simulated dataset
set.seed(1234)
x <- rnorm(12, mean = rep(1:3, each = 4), sd = 0.2)
y <- rnorm(12, mean = rep(c(1, 2, 1), each = 4), sd = 0.2)
plot(x, y, col = "blue", pch = 19, cex = 2)
text(x + 0.05, y + 0.05, labels = as.character(1:12))
#implements the K-means algorithm
dataFrame <- data.frame(x, y)
kmeansObj <- kmeans(dataFrame, centers = 3)
#centers which is either an integer indicating the number of clusters or a matrix indicating the locations of the initial cluster centroids
names(kmeansObj)
## [1] "cluster" "centers" "totss" "withinss"
## [5] "tot.withinss" "betweenss" "size" "iter"
## [9] "ifault"
kmeansObj$cluster
## [1] 3 1 1 3 2 2 2 2 2 2 2 2
#see which cluster each data point got assigned to by looking at the cluster element of the list returned by the kmeans() function
set.seed(1234)
dataMatrix <- as.matrix(dataFrame)[sample(1:12), ]
kmeansObj <- kmeans(dataMatrix, centers = 3)
#find the K-means solution
par(mfrow = c(1, 2))
image(t(dataMatrix)[, nrow(dataMatrix):1], yaxt = "n", main = "Original Data")
image(t(dataMatrix)[, order(kmeansObj$cluster)], yaxt = "n", main = "Clustered Data")
#image plot using the K-means clusters