The following code should be run in RStudio. All corresponding documentation is available on GitHub here.
This markdown file can also be viewed on RPubs here.
The following packages may be useful to you but will not be the focus of the demonstrations during the Data Tools Forum:
library(swirl) # This is the tutorial we encourage you to go through prior to the session, specifically the first two modules
library(haven)
library(xlsx)
These packages will be used and discussed during the Wrangling portion of the presentation:
library(dplyr) # We will use this package during demonstrations
library(tidyr) # more data restructuring
library(stringr) # for regular expressions
These packages will be used and discussed during the Analysis through data visualization portion of the presentation:
library(ggplot2) # We will use this package during demonstrations
library(caret) # for classification and regression training
library(zoo) # for time series data
We’re going to be using the ‘diamonds’ dataset, which becomes available once ggplot2 is installed and loaded.
# Inspect the dataset's structure: str() lists every column together with
# its type and its first few values
str(object = diamonds)
## 'data.frame': 53940 obs. of 10 variables:
## $ carat : num 0.23 0.21 0.23 0.29 0.31 0.24 0.24 0.26 0.22 0.23 ...
## $ cut : Ord.factor w/ 5 levels "Fair"<"Good"<..: 5 4 2 4 2 3 3 3 1 3 ...
## $ color : Ord.factor w/ 7 levels "D"<"E"<"F"<"G"<..: 2 2 2 6 7 7 6 5 2 5 ...
## $ clarity: Ord.factor w/ 8 levels "I1"<"SI2"<"SI1"<..: 2 3 5 4 2 6 7 3 4 5 ...
## $ depth : num 61.5 59.8 56.9 62.4 63.3 62.8 62.3 61.9 65.1 59.4 ...
## $ table : num 55 61 65 58 58 57 57 55 61 61 ...
## $ price : int 326 326 327 334 335 336 336 337 337 338 ...
## $ x : num 3.95 3.89 4.05 4.2 4.34 3.94 3.95 4.07 3.87 4 ...
## $ y : num 3.98 3.84 4.07 4.23 4.35 3.96 3.98 4.11 3.78 4.05 ...
## $ z : num 2.43 2.31 2.31 2.63 2.75 2.48 2.47 2.53 2.49 2.39 ...
# Preview the first six records with head()
head(diamonds, n = 6L)
## carat cut color clarity depth table price x y z
## 1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43
## 2 0.21 Premium E SI1 59.8 61 326 3.89 3.84 2.31
## 3 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31
## 4 0.29 Premium I VS2 62.4 58 334 4.20 4.23 2.63
## 5 0.31 Good J SI2 63.3 58 335 4.34 4.35 2.75
## 6 0.24 Very Good J VVS2 62.8 57 336 3.94 3.96 2.48
Diamonds munging:
# Pull out rows 1 through 10 (all columns kept) and print them
diamonds[seq_len(10L), ]
## carat cut color clarity depth table price x y z
## 1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43
## 2 0.21 Premium E SI1 59.8 61 326 3.89 3.84 2.31
## 3 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31
## 4 0.29 Premium I VS2 62.4 58 334 4.20 4.23 2.63
## 5 0.31 Good J SI2 63.3 58 335 4.34 4.35 2.75
## 6 0.24 Very Good J VVS2 62.8 57 336 3.94 3.96 2.48
## 7 0.24 Very Good I VVS1 62.3 57 336 3.95 3.98 2.47
## 8 0.26 Very Good H SI1 61.9 55 337 4.07 4.11 2.53
## 9 0.22 Fair E VS2 65.1 61 337 3.87 3.78 2.49
## 10 0.23 Very Good H VS1 59.4 61 338 4.00 4.05 2.39
# Store the first 10 rows in a new object named di_sub
di_sub <- diamonds[seq_len(10L), ]
# Print the stored object (its structure is inspected with str() below)
print(di_sub)
## carat cut color clarity depth table price x y z
## 1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43
## 2 0.21 Premium E SI1 59.8 61 326 3.89 3.84 2.31
## 3 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31
## 4 0.29 Premium I VS2 62.4 58 334 4.20 4.23 2.63
## 5 0.31 Good J SI2 63.3 58 335 4.34 4.35 2.75
## 6 0.24 Very Good J VVS2 62.8 57 336 3.94 3.96 2.48
## 7 0.24 Very Good I VVS1 62.3 57 336 3.95 3.98 2.47
## 8 0.26 Very Good H SI1 61.9 55 337 4.07 4.11 2.53
## 9 0.22 Fair E VS2 65.1 61 337 3.87 3.78 2.49
## 10 0.23 Very Good H VS1 59.4 61 338 4.00 4.05 2.39
# Confirm the subset holds 10 observations of the same 10 variables
str(object = di_sub)
## 'data.frame': 10 obs. of 10 variables:
## $ carat : num 0.23 0.21 0.23 0.29 0.31 0.24 0.24 0.26 0.22 0.23
## $ cut : Ord.factor w/ 5 levels "Fair"<"Good"<..: 5 4 2 4 2 3 3 3 1 3
## $ color : Ord.factor w/ 7 levels "D"<"E"<"F"<"G"<..: 2 2 2 6 7 7 6 5 2 5
## $ clarity: Ord.factor w/ 8 levels "I1"<"SI2"<"SI1"<..: 2 3 5 4 2 6 7 3 4 5
## $ depth : num 61.5 59.8 56.9 62.4 63.3 62.8 62.3 61.9 65.1 59.4
## $ table : num 55 61 65 58 58 57 57 55 61 61
## $ price : int 326 326 327 334 335 336 336 337 337 338
## $ x : num 3.95 3.89 4.05 4.2 4.34 3.94 3.95 4.07 3.87 4
## $ y : num 3.98 3.84 4.07 4.23 4.35 3.96 3.98 4.11 3.78 4.05
## $ z : num 2.43 2.31 2.31 2.63 2.75 2.48 2.47 2.53 2.49 2.39
# To keep only some columns of the new data frame, pass their names to
# dplyr's select(); the result's columns follow the order they are listed in
di_sub2 <- di_sub %>%
  select(carat, cut, color, price, clarity)
print(di_sub2)
## carat cut color price clarity
## 1 0.23 Ideal E 326 SI2
## 2 0.21 Premium E 326 SI1
## 3 0.23 Good E 327 VS1
## 4 0.29 Premium I 334 VS2
## 5 0.31 Good J 335 SI2
## 6 0.24 Very Good J 336 VVS2
## 7 0.24 Very Good I 336 VVS1
## 8 0.26 Very Good H 337 SI1
## 9 0.22 Fair E 337 VS2
## 10 0.23 Very Good H 338 VS1
# Apply filter() to the full dataset, keeping only diamonds whose cut is "Ideal"
diamonds_ideal <- diamonds %>%
  filter(cut == "Ideal")
head(diamonds_ideal, n = 6L)
## carat cut color clarity depth table price x y z
## 1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43
## 2 0.23 Ideal J VS1 62.8 56 340 3.93 3.90 2.46
## 3 0.31 Ideal J SI2 62.2 54 344 4.35 4.37 2.71
## 4 0.30 Ideal I SI2 62.0 54 348 4.31 4.34 2.68
## 5 0.33 Ideal I SI2 61.8 55 403 4.49 4.51 2.78
## 6 0.33 Ideal I SI2 61.2 56 403 4.49 4.50 2.75
# mutate() appends a derived column: here, price divided by carat
diamonds_ideal <- diamonds_ideal %>%
  mutate(price_per_carat = price / carat)
head(diamonds_ideal, n = 6L)
## carat cut color clarity depth table price x y z
## 1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43
## 2 0.23 Ideal J VS1 62.8 56 340 3.93 3.90 2.46
## 3 0.31 Ideal J SI2 62.2 54 344 4.35 4.37 2.71
## 4 0.30 Ideal I SI2 62.0 54 348 4.31 4.34 2.68
## 5 0.33 Ideal I SI2 61.8 55 403 4.49 4.51 2.78
## 6 0.33 Ideal I SI2 61.2 56 403 4.49 4.50 2.75
## price_per_carat
## 1 1417.391
## 2 1478.261
## 3 1109.677
## 4 1160.000
## 5 1221.212
## 6 1221.212
# Write one of our new objects to a CSV file in your working directory.
# write.csv() is called only for its side effect: it returns NULL invisibly,
# so the result is not assigned (the old code stored that NULL under the
# name of the base function file()).
write.csv(di_sub, "di_sub.csv")
# Read in the CSV file you just created
di_sub2 <- read.csv("di_sub.csv")
# Subset the dataset you just read in and assign it to a new object.
# Column 1 holds the row names that write.csv() saved by default, so 2:10
# keeps carat through y; note that z, now column 11, is left out as well.
new_diamonds <- di_sub2[, 2:10]
# Print the new object
new_diamonds
## carat cut color clarity depth table price x y
## 1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98
## 2 0.21 Premium E SI1 59.8 61 326 3.89 3.84
## 3 0.23 Good E VS1 56.9 65 327 4.05 4.07
## 4 0.29 Premium I VS2 62.4 58 334 4.20 4.23
## 5 0.31 Good J SI2 63.3 58 335 4.34 4.35
## 6 0.24 Very Good J VVS2 62.8 57 336 3.94 3.96
## 7 0.24 Very Good I VVS1 62.3 57 336 3.95 3.98
## 8 0.26 Very Good H SI1 61.9 55 337 4.07 4.11
## 9 0.22 Fair E VS2 65.1 61 337 3.87 3.78
## 10 0.23 Very Good H VS1 59.4 61 338 4.00 4.05
There are many other functions available in dplyr for munging. Check out further documentation here.
We’ll start with some very simple base plots and move on to using ggplot2
# Base plotting for comparison.
# plot() draws on the active graphics device and returns NULL invisibly, so
# unlike a ggplot object there is nothing worth storing in a variable.
plot(diamonds$carat, diamonds$cut,
     xlab = "carat", ylab = "cut")
# Try another base plot: price against carat
plot(diamonds$carat, diamonds$price)
# ggplot2 works differently from base plotting: a plot is an object that can
# be stored, printed and extended.
# Box plot of carat for each cut
diamonds_box <- ggplot(data = diamonds, mapping = aes(x = cut, y = carat)) +
  geom_boxplot()
print(diamonds_box)
# A violin plot uses the same data and aesthetics; only the geom changes
diamonds_violin <- ggplot(data = diamonds, mapping = aes(x = cut, y = carat)) +
  geom_violin()
print(diamonds_violin)
# Layer a second geom (points) on top of the stored violin plot
diamonds_vpoint <- diamonds_violin + geom_point()
print(diamonds_vpoint)
A more advanced example: build a heatmap.
library(RColorBrewer) # for our color palette
# Keep just the three columns the heatmap needs, then check the result
diamonds_heat <- diamonds_ideal %>%
  select(color, clarity, price_per_carat)
str(object = diamonds_heat)
## 'data.frame': 21551 obs. of 3 variables:
## $ color : Ord.factor w/ 7 levels "D"<"E"<"F"<"G"<..: 2 7 7 6 6 6 7 4 6 6 ...
## $ clarity : Ord.factor w/ 8 levels "I1"<"SI2"<"SI1"<..: 2 5 2 2 2 2 3 5 3 2 ...
## $ price_per_carat: num 1417 1478 1110 1160 1221 ...
# Limit the heatmap data to the first 1500 rows
diamonds_heat <- diamonds_heat[seq_len(1500L), ]
# Palette function: the 11-colour "Spectral" brewer palette, reversed and
# interpolated in Lab colour space
myPalette <- colorRampPalette(
  rev(brewer.pal(n = 11, name = "Spectral")),
  space = "Lab"
)
# Build the heatmap as one chained expression: tiles filled by
# price_per_carat, a 100-step gradient from the palette, no padding around
# the discrete axes, a fixed 1:1 aspect ratio, and the black-and-white theme
hm <- ggplot(
  data = diamonds_heat,
  mapping = aes(x = color, y = clarity, fill = price_per_carat)
) +
  geom_tile() +
  scale_fill_gradientn(colours = myPalette(100)) +
  scale_x_discrete(expand = c(0, 0)) +
  scale_y_discrete(expand = c(0, 0)) +
  coord_equal() +
  theme_bw()
print(hm)