library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
housing.df <- read_csv("BostonHousing.csv", col_names = TRUE)
## Rows: 506 Columns: 14
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (14): crim, zn, indus, chas, nox, rm, age, dis, rad, tax, ptratio, b, ls...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
housing.df <- rename_with(housing.df,toupper)

head(housing.df, 9)
## # A tibble: 9 × 14
##      CRIM    ZN INDUS  CHAS   NOX    RM   AGE   DIS   RAD   TAX PTRATIO     B
##     <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>   <dbl> <dbl>
## 1 0.00632  18    2.31     0 0.538  6.58  65.2  4.09     1   296    15.3  397.
## 2 0.0273    0    7.07     0 0.469  6.42  78.9  4.97     2   242    17.8  397.
## 3 0.0273    0    7.07     0 0.469  7.18  61.1  4.97     2   242    17.8  393.
## 4 0.0324    0    2.18     0 0.458  7.00  45.8  6.06     3   222    18.7  395.
## 5 0.0690    0    2.18     0 0.458  7.15  54.2  6.06     3   222    18.7  397.
## 6 0.0298    0    2.18     0 0.458  6.43  58.7  6.06     3   222    18.7  394.
## 7 0.0883   12.5  7.87     0 0.524  6.01  66.6  5.56     5   311    15.2  396.
## 8 0.145    12.5  7.87     0 0.524  6.17  96.1  5.95     5   311    15.2  397.
## 9 0.211    12.5  7.87     0 0.524  5.63 100    6.08     5   311    15.2  387.
## # ℹ 2 more variables: LSTAT <dbl>, MEDV <dbl>
## line chart for the Amtrak data
Amtrak.df <- read.csv("Amtrak.csv")

use time series analysis

library(forecast)
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
ridership.ts <- ts(Amtrak.df$Ridership, start = c(1991, 1), end = c(2004, 3), freq = 12)
plot(ridership.ts, xlab = "Year", ylab = "Ridership (in 000s)", ylim = c(1300, 2300))

## scatter plot with axes names

plot(housing.df$MEDV ~ housing.df$LSTAT, xlab = "MDEV", ylab = "LSTAT")

# alternative plot with ggplot
library(ggplot2)
ggplot(housing.df) + geom_point(aes(x = LSTAT, y = MEDV), colour = "navy", alpha = 0.7)

## barchart of CHAS vs. mean MEDV # compute mean MEDV per CHAS = (0, 1)

data.for.plot <- aggregate(housing.df$MEDV, by = list(housing.df$CHAS), FUN = mean)
names(data.for.plot) <- c("CHAS", "MeanMEDV")
barplot(data.for.plot$MeanMED, names.arg = data.for.plot$CHAS, xlab = "CHAS", ylab = "Avg. MEDV")

# alternative plot with ggplot
ggplot(data.for.plot) + geom_bar(aes(x = CHAS, y = MeanMEDV), stat = "identity")

## barchart of CHAS vs. % CAT.MEDV

housing.df <- housing.df %>%
  mutate(CAT_MEDV = if_else(MEDV >30,1,0) )
head(housing.df)
## # A tibble: 6 × 15
##      CRIM    ZN INDUS  CHAS   NOX    RM   AGE   DIS   RAD   TAX PTRATIO     B
##     <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>   <dbl> <dbl>
## 1 0.00632    18  2.31     0 0.538  6.58  65.2  4.09     1   296    15.3  397.
## 2 0.0273      0  7.07     0 0.469  6.42  78.9  4.97     2   242    17.8  397.
## 3 0.0273      0  7.07     0 0.469  7.18  61.1  4.97     2   242    17.8  393.
## 4 0.0324      0  2.18     0 0.458  7.00  45.8  6.06     3   222    18.7  395.
## 5 0.0690      0  2.18     0 0.458  7.15  54.2  6.06     3   222    18.7  397.
## 6 0.0298      0  2.18     0 0.458  6.43  58.7  6.06     3   222    18.7  394.
## # ℹ 3 more variables: LSTAT <dbl>, MEDV <dbl>, CAT_MEDV <dbl>
data.for.plot<-  housing.df %>%
  select(CAT_MEDV,CHAS)%>%
  group_by(CHAS) %>%
  summarise(MeanCATMEDV = mean(CAT_MEDV))

data.for.plot2 <- aggregate(housing.df$CAT_MEDV, by = list(housing.df$CHAS), FUN = mean)
names(data.for.plot2) <- c("CHAS", "MeanCATMEDV")
barplot(data.for.plot2$MeanCATMEDV * 100, names.arg = data.for.plot2$CHAS, xlab = "CHAS", ylab = "% of CAT.MEDV")

Distribution Plots: Boxplots and Histograms

histogram of MEDV

hist(housing.df$MEDV, xlab = "MEDV")

# alternative plot with ggplot
library(ggplot2)
ggplot(housing.df) + geom_histogram(aes(x = MEDV), binwidth = 5)

boxplot of MEDV for different values of CHAS

boxplot(housing.df$MEDV ~ housing.df$CHAS, xlab = "CHAS", ylab = "MEDV")

# alternative plot with ggplot
ggplot(housing.df) + geom_boxplot(aes(x = as.factor(CHAS), y = MEDV)) + xlab("CHAS")

side-by-side boxplots

use par() to split the plots into panels

par(mfcol = c(1, 4))
boxplot(housing.df$NOX ~ housing.df$CAT_MEDV, xlab = "CAT.MEDV", ylab = "NOX")
boxplot(housing.df$LSTAT ~ housing.df$CAT_MEDV, xlab = "CAT.MEDV", ylab = "LSTAT")
boxplot(housing.df$PTRATIO ~ housing.df$CAT_MEDV, xlab = "CAT.MEDV", ylab = "PTRATIO")
boxplot(housing.df$INDUS ~ housing.df$CAT_MEDV, xlab = "CAT.MEDV", ylab = "INDUS")

# Heatmaps: Visualizing Correlations and Missing Values ## simple heatmap of correlations (without values)

heatmap(cor(housing.df), Rowv = NA, Colv = NA)

## heatmap with values

library(gplots)
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
heatmap.2(cor(housing.df), Rowv = FALSE, Colv = FALSE, dendrogram = "none",
cellnote = round(cor(housing.df),2),
notecol = "black", key = FALSE, trace = 'none', margins = c(10,10))

# alternative plot with ggplot

library(ggplot2)
library(reshape) # to generate input for the plot
## 
## Attaching package: 'reshape'
## The following object is masked from 'package:lubridate':
## 
##     stamp
## The following object is masked from 'package:dplyr':
## 
##     rename
## The following objects are masked from 'package:tidyr':
## 
##     expand, smiths
cor.mat <- round(cor(housing.df),2) # rounded correlation matrix
melted.cor.mat <- melt(cor.mat)
## Warning in type.convert.default(X[[i]], ...): 'as.is' should be specified by
## the caller; using TRUE

## Warning in type.convert.default(X[[i]], ...): 'as.is' should be specified by
## the caller; using TRUE
ggplot(melted.cor.mat, aes(x = X1, y = X2, fill = value)) +
geom_tile() +
geom_text(aes(x = X1, y = X2, label = value))

# replace dataFrame with your data. # is.na() returns a Boolean (TRUE/FALSE) output indicating the location of missing # values. # multiplying the Boolean value by 1 converts the output into binary (0/1).

data(diamonds)
sum(is.na(diamonds))
## [1] 0
heatmap(1 * is.na(diamonds), Rowv = NA, Colv = NA)
## Warning in min(x): no non-missing arguments to min; returning Inf
## Warning in max(x): no non-missing arguments to max; returning -Inf

3.4 Multidimensional Visualization

Adding Variables: Color, Size, Shape, Multiple Panels, and Animation

color plot

par(xpd=TRUE) # allow legend to be displayed outside of plot area
plot(housing.df$NOX ~ housing.df$LSTAT, ylab = "NOX", xlab = "LSTAT",
col = ifelse(housing.df$CAT_MEDV == 1, "black", "gray"))
# add legend outside of plotting area
# In legend() use argument inset = to control the location of the legend relative
# to the plot.
legend("topleft", inset=c(0, -0.2),
legend = c("CAT.MEDV = 1", "CAT.MEDV = 0"), col = c("black", "gray"),
pch = 1, cex = 0.5)

# alternative plot with ggplot
library(ggplot2)
ggplot(housing.df, aes(y = NOX, x = LSTAT, colour= CAT_MEDV)) +
geom_point(alpha = 0.6)

## panel plots # compute mean MEDV per RAD and CHAS # In aggregate() use argument drop = FALSE to include all combinations # (exiting and missing) of RAD X CHAS.

data.for.plot <- aggregate(housing.df$MEDV, by = list(housing.df$RAD, housing.df$CHAS),
FUN = mean, drop = FALSE)
names(data.for.plot) <- c("RAD", "CHAS", "meanMEDV")
# plot the data
par(mfcol = c(2,1))
barplot(height = data.for.plot$meanMEDV[data.for.plot$CHAS == 0],
names.arg = data.for.plot$RAD[data.for.plot$CHAS == 0],
xlab = "RAD", ylab = "Avg. MEDV", main = "CHAS = 0")
barplot(height = data.for.plot$meanMEDV[data.for.plot$CHAS == 1],
names.arg = data.for.plot$RAD[data.for.plot$CHAS == 1],
xlab = "RAD", ylab = "Avg. MEDV", main = "CHAS = 1")

# alternative plot with ggplot
ggplot(data.for.plot) +
geom_bar(aes(x = as.factor(RAD), y = `meanMEDV`), stat = "identity") +
xlab("RAD") + facet_grid(CHAS ~ .)
## Warning: Removed 3 rows containing missing values (`position_stack()`).

## simple plot # use plot() to generate a matrix of 4X4 panels with variable name on the diagonal, # and scatter plots in the remaining panels.

plot(housing.df[, c(1, 3, 13, 14)])

# alternative, nicer plot (displayed)
library(GGally)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
ggpairs(housing.df[, c(1, 3, 13, 14)])

# Manipulations: Rescaling, Aggregation and Hierarchies, Zooming, Filtering

Rescaling

options(scipen=999) # avoid scientific notation
## scatter plot: regular and log scale
plot(housing.df$MEDV ~ housing.df$CRIM, xlab = "CRIM", ylab = "MEDV")

# to use logarithmic scale set argument log = to either 'x', 'y', or 'xy'.
plot(housing.df$MEDV ~ housing.df$CRIM,
xlab = "CRIM", ylab = "MEDV", log = 'xy')

# alternative log-scale plot with ggplot
library(ggplot2)
ggplot(housing.df) + geom_point(aes(x = CRIM, y = MEDV)) +
scale_x_log10(breaks = 10^(-2:2),
labels = format(10^(-2:2), scientific = FALSE, drop0trailing = TRUE)) +
scale_y_log10(breaks = c(5, 10, 20, 40))

## boxplot: regular and log scale
boxplot(housing.df$CRIM ~ housing.df$CAT_MEDV,
xlab = "CAT.MEDV", ylab = "CRIM")

boxplot(housing.df$CRIM ~ housing.df$CAT_MEDV,
xlab = "CAT.MEDV", ylab = "CRIM", log = 'y')

## Aggregation and Hierarchies ## Time series forecasting

library(forecast)
Amtrak.df <- read.csv("Amtrak.csv")
ridership.ts <- ts(Amtrak.df$Ridership, start = c(1991, 1), end = c(2004, 3), freq = 12)

## fit curve
ridership.lm <- tslm(ridership.ts ~ trend + I(trend^2))
plot(ridership.ts, xlab = "Year", ylab = "Ridership (in 000s)", ylim = c(1300, 2300))
lines(ridership.lm$fitted, lwd = 2)

# alternative plot with ggplot
library(ggplot2)
ggplot(Amtrak.df, aes(y = Ridership, x = Month, group = 12)) +
geom_line() + geom_smooth(formula = y ~ poly(x, 2), method= "lm",
colour = "navy", se = FALSE, na.rm = TRUE)

## zoom in, monthly, and annual plots
ridership.2yrs <- window(ridership.ts, start = c(1991,1), end = c(1992,12))
plot(ridership.2yrs, xlab = "Year", ylab = "Ridership (in 000s)", ylim = c(1300, 2300))

monthly.ridership.ts <- tapply(ridership.ts, cycle(ridership.ts), mean)
plot(monthly.ridership.ts, xlab = "Month", ylab = "Average Ridership",
ylim = c(1300, 2300), type = "l", xaxt = 'n')


## set x labels
axis(1, at = c(1:12), labels = c("Jan","Feb","Mar", "Apr","May","Jun",
"Jul","Aug","Sep", "Oct","Nov","Dec"))

annual.ridership.ts <- aggregate(ridership.ts, FUN = mean)
plot(annual.ridership.ts, xlab = "Year", ylab = "Average Ridership",
ylim = c(1300, 2300))

Scaling up to Large Datasets

utilities.df <- read.csv("utilities.csv")
plot(utilities.df$Fuel_Cost ~ utilities.df$Sales,
xlab = "Sales", ylab = "Fuel Cost", xlim = c(2000, 20000))
text(x = utilities.df$Sales, y = utilities.df$Fuel_Cost,
labels = utilities.df$Company, pos = 4, cex = 0.8, srt = 20, offset = 0.2)

# alternative with ggplot
library(ggplot2)
ggplot(utilities.df, aes(y = Fuel_Cost, x = Sales)) + geom_point() +
geom_text(aes(label = paste(" ", Company)), size = 4, hjust = 0.0, angle = 15) +
ylim(0.25, 2.25) + xlim(3000, 18000)

Multivariate Plot: Parallel Coordinates Plot

use function alpha() in library scales to add transparent colors

library(scales) plot(jitter(universal.df\(CCAvg, 1) ~ jitter(universal.df\)Income, 1), col = alpha(ifelse(universal.df$Securities.Account == 0, “gray”, “black”), 0.4), pch = 20, log = ‘xy’, ylim = c(0.1, 10), xlab = “Income”, ylab = “CCAvg”) # alternative with ggplot library(ggplot2) ggplot(universal.df) + geom_jitter(aes(x = Income, y = CCAvg, colour = Securities.Account)) + scale_x_log10(breaks = c(10, 20, 40, 60, 100, 200)) + scale_y_log10(breaks = c(0.1, 0.2, 0.4, 0.6, 1.0, 2.0, 4.0, 6.0))

library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
par(mfcol = c(2,1))
parcoord(housing.df[housing.df$CAT_MEDV == 0, -14], main = "CAT.MEDV = 0")
parcoord(housing.df[housing.df$CAT_MEDV == 1, -14], main = "CAT.MEDV = 1")

3.5 Specialized Visualizations

library(igraph)
## 
## Attaching package: 'igraph'
## The following objects are masked from 'package:lubridate':
## 
##     %--%, union
## The following objects are masked from 'package:dplyr':
## 
##     as_data_frame, groups, union
## The following objects are masked from 'package:purrr':
## 
##     compose, simplify
## The following object is masked from 'package:tidyr':
## 
##     crossing
## The following object is masked from 'package:tibble':
## 
##     as_data_frame
## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum
## The following object is masked from 'package:base':
## 
##     union
ebay.df <- read.csv("eBayNetwork.csv")
# transform node ids to factors
ebay.df[,1] <- as.factor(ebay.df[,1])
ebay.df[,2] <- as.factor(ebay.df[,2])
graph.edges <- as.matrix(ebay.df[,1:2])
g <- graph.edgelist(graph.edges, directed = FALSE)
isBuyer <- V(g)$name %in% graph.edges[,2]
plot(g, vertex.label = NA, vertex.color = ifelse(isBuyer, "gray", "black"),
vertex.size = ifelse(isBuyer, 7, 10))

Visualizing Hierarchical Data: Treemaps

library(treemap)
tree.df <- read.csv("EbayTreemap.csv")
# add column for negative feedback
tree.df$negative.feedback <- 1* (tree.df$Seller.Feedback < 0)
# draw treemap
treemap(tree.df, index = c("Category","Sub.Category", "Brand"),
vSize = "High.Bid", vColor = "negative.feedback", fun.aggregate = "mean",
align.labels = list(c("left", "top"), c("right", "bottom"), c("center", "center")),
palette = rev(gray.colors(3)), type = "manual", title = "")

library(ggmap)
## The legacy packages maptools, rgdal, and rgeos, underpinning the sp package,
## which was just loaded, will retire in October 2023.
## Please refer to R-spatial evolution reports for details, especially
## https://r-spatial.org/r/2023/05/15/evolution4.html.
## It may be desirable to make the sf package available;
## package maintainers should consider adding sf to Suggests:.
## The sp package is now running under evolution status 2
##      (status 2 uses the sf package in place of rgdal)
## ℹ Google's Terms of Service: <https://mapsplatform.google.com>
## ℹ Please cite ggmap if you use it! Use `citation("ggmap")` for details.
SCstudents <- read.csv("SC-US-students-GPS-data-2016.csv")
pacman::p_load(ggmap, osmdata)
Map <- get_map(location = getbb("Denver"), zoom = 3, source ="stamen")
## ℹ Map tiles by Stamen Design, under CC BY 3.0. Data by OpenStreetMap, under ODbL.
ggmap(Map) + geom_point(aes(x = longitude, y = latitude), data = SCstudents,
alpha = 0.4, colour = "red", size = 0.5)
## Warning: Removed 1687 rows containing missing values (`geom_point()`).

library(mosaic)
## Registered S3 method overwritten by 'mosaic':
##   method                           from   
##   fortify.SpatialPolygonsDataFrame ggplot2
## 
## The 'mosaic' package masks several functions from core packages in order to add 
## additional features.  The original behavior of these functions should not be affected by this.
## 
## Attaching package: 'mosaic'
## The following object is masked from 'package:Matrix':
## 
##     mean
## The following object is masked from 'package:igraph':
## 
##     compare
## The following objects are masked from 'package:dplyr':
## 
##     count, do, tally
## The following object is masked from 'package:purrr':
## 
##     cross
## The following object is masked from 'package:ggplot2':
## 
##     stat
## The following objects are masked from 'package:stats':
## 
##     binom.test, cor, cor.test, cov, fivenum, IQR, median, prop.test,
##     quantile, sd, t.test, var
## The following objects are masked from 'package:base':
## 
##     max, mean, min, prod, range, sample, sum
gdp.df <- read.csv("gdp.csv", skip = 4, stringsAsFactors = FALSE)
names(gdp.df)[4] <- "GDP2015"
names(gdp.df)[1] <- "Country_Name"
#happiness.df <- read.csv("Veerhoven.csv")
# gdp map
mWorldMap(gdp.df, key = "Country_Name", fill = "GDP2015") + coord_map()
## Mapping API still under development and may change in future releases.
## Warning in standardName(x, countryAlternatives, ignore.case = ignore.case, :
## 2372 items were not translated