library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
housing.df <- read_csv("BostonHousing.csv", col_names = TRUE)
## Rows: 506 Columns: 14
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (14): crim, zn, indus, chas, nox, rm, age, dis, rad, tax, ptratio, b, ls...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
housing.df <- rename_with(housing.df,toupper)
head(housing.df, 9)
## # A tibble: 9 × 14
## CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO B
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.00632 18 2.31 0 0.538 6.58 65.2 4.09 1 296 15.3 397.
## 2 0.0273 0 7.07 0 0.469 6.42 78.9 4.97 2 242 17.8 397.
## 3 0.0273 0 7.07 0 0.469 7.18 61.1 4.97 2 242 17.8 393.
## 4 0.0324 0 2.18 0 0.458 7.00 45.8 6.06 3 222 18.7 395.
## 5 0.0690 0 2.18 0 0.458 7.15 54.2 6.06 3 222 18.7 397.
## 6 0.0298 0 2.18 0 0.458 6.43 58.7 6.06 3 222 18.7 394.
## 7 0.0883 12.5 7.87 0 0.524 6.01 66.6 5.56 5 311 15.2 396.
## 8 0.145 12.5 7.87 0 0.524 6.17 96.1 5.95 5 311 15.2 397.
## 9 0.211 12.5 7.87 0 0.524 5.63 100 6.08 5 311 15.2 387.
## # ℹ 2 more variables: LSTAT <dbl>, MEDV <dbl>
## line chart for the Amtrak data
Amtrak.df <- read.csv("Amtrak.csv")
library(forecast)
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
ridership.ts <- ts(Amtrak.df$Ridership, start = c(1991, 1), end = c(2004, 3), freq = 12)
plot(ridership.ts, xlab = "Year", ylab = "Ridership (in 000s)", ylim = c(1300, 2300))
## scatter plot with axes names
plot(housing.df$MEDV ~ housing.df$LSTAT, xlab = "MDEV", ylab = "LSTAT")
# alternative plot with ggplot
library(ggplot2)
ggplot(housing.df) + geom_point(aes(x = LSTAT, y = MEDV), colour = "navy", alpha = 0.7)
## barchart of CHAS vs. mean MEDV # compute mean MEDV per CHAS = (0,
1)
data.for.plot <- aggregate(housing.df$MEDV, by = list(housing.df$CHAS), FUN = mean)
names(data.for.plot) <- c("CHAS", "MeanMEDV")
barplot(data.for.plot$MeanMED, names.arg = data.for.plot$CHAS, xlab = "CHAS", ylab = "Avg. MEDV")
# alternative plot with ggplot
ggplot(data.for.plot) + geom_bar(aes(x = CHAS, y = MeanMEDV), stat = "identity")
## barchart of CHAS vs. % CAT.MEDV
housing.df <- housing.df %>%
mutate(CAT_MEDV = if_else(MEDV >30,1,0) )
head(housing.df)
## # A tibble: 6 × 15
## CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO B
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.00632 18 2.31 0 0.538 6.58 65.2 4.09 1 296 15.3 397.
## 2 0.0273 0 7.07 0 0.469 6.42 78.9 4.97 2 242 17.8 397.
## 3 0.0273 0 7.07 0 0.469 7.18 61.1 4.97 2 242 17.8 393.
## 4 0.0324 0 2.18 0 0.458 7.00 45.8 6.06 3 222 18.7 395.
## 5 0.0690 0 2.18 0 0.458 7.15 54.2 6.06 3 222 18.7 397.
## 6 0.0298 0 2.18 0 0.458 6.43 58.7 6.06 3 222 18.7 394.
## # ℹ 3 more variables: LSTAT <dbl>, MEDV <dbl>, CAT_MEDV <dbl>
data.for.plot<- housing.df %>%
select(CAT_MEDV,CHAS)%>%
group_by(CHAS) %>%
summarise(MeanCATMEDV = mean(CAT_MEDV))
data.for.plot2 <- aggregate(housing.df$CAT_MEDV, by = list(housing.df$CHAS), FUN = mean)
names(data.for.plot2) <- c("CHAS", "MeanCATMEDV")
barplot(data.for.plot2$MeanCATMEDV * 100, names.arg = data.for.plot2$CHAS, xlab = "CHAS", ylab = "% of CAT.MEDV")
hist(housing.df$MEDV, xlab = "MEDV")
# alternative plot with ggplot
library(ggplot2)
ggplot(housing.df) + geom_histogram(aes(x = MEDV), binwidth = 5)
boxplot(housing.df$MEDV ~ housing.df$CHAS, xlab = "CHAS", ylab = "MEDV")
# alternative plot with ggplot
ggplot(housing.df) + geom_boxplot(aes(x = as.factor(CHAS), y = MEDV)) + xlab("CHAS")
par(mfcol = c(1, 4))
boxplot(housing.df$NOX ~ housing.df$CAT_MEDV, xlab = "CAT.MEDV", ylab = "NOX")
boxplot(housing.df$LSTAT ~ housing.df$CAT_MEDV, xlab = "CAT.MEDV", ylab = "LSTAT")
boxplot(housing.df$PTRATIO ~ housing.df$CAT_MEDV, xlab = "CAT.MEDV", ylab = "PTRATIO")
boxplot(housing.df$INDUS ~ housing.df$CAT_MEDV, xlab = "CAT.MEDV", ylab = "INDUS")
# Heatmaps: Visualizing Correlations and Missing Values ## simple
heatmap of correlations (without values)
heatmap(cor(housing.df), Rowv = NA, Colv = NA)
## heatmap with values
library(gplots)
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
heatmap.2(cor(housing.df), Rowv = FALSE, Colv = FALSE, dendrogram = "none",
cellnote = round(cor(housing.df),2),
notecol = "black", key = FALSE, trace = 'none', margins = c(10,10))
# alternative plot with ggplot
library(ggplot2)
library(reshape) # to generate input for the plot
##
## Attaching package: 'reshape'
## The following object is masked from 'package:lubridate':
##
## stamp
## The following object is masked from 'package:dplyr':
##
## rename
## The following objects are masked from 'package:tidyr':
##
## expand, smiths
cor.mat <- round(cor(housing.df),2) # rounded correlation matrix
melted.cor.mat <- melt(cor.mat)
## Warning in type.convert.default(X[[i]], ...): 'as.is' should be specified by
## the caller; using TRUE
## Warning in type.convert.default(X[[i]], ...): 'as.is' should be specified by
## the caller; using TRUE
ggplot(melted.cor.mat, aes(x = X1, y = X2, fill = value)) +
geom_tile() +
geom_text(aes(x = X1, y = X2, label = value))
# replace dataFrame with your data. # is.na() returns a Boolean
(TRUE/FALSE) output indicating the location of missing # values. #
multiplying the Boolean value by 1 converts the output into binary
(0/1).
data(diamonds)
sum(is.na(diamonds))
## [1] 0
heatmap(1 * is.na(diamonds), Rowv = NA, Colv = NA)
## Warning in min(x): no non-missing arguments to min; returning Inf
## Warning in max(x): no non-missing arguments to max; returning -Inf
par(xpd=TRUE) # allow legend to be displayed outside of plot area
plot(housing.df$NOX ~ housing.df$LSTAT, ylab = "NOX", xlab = "LSTAT",
col = ifelse(housing.df$CAT_MEDV == 1, "black", "gray"))
# add legend outside of plotting area
# In legend() use argument inset = to control the location of the legend relative
# to the plot.
legend("topleft", inset=c(0, -0.2),
legend = c("CAT.MEDV = 1", "CAT.MEDV = 0"), col = c("black", "gray"),
pch = 1, cex = 0.5)
# alternative plot with ggplot
library(ggplot2)
ggplot(housing.df, aes(y = NOX, x = LSTAT, colour= CAT_MEDV)) +
geom_point(alpha = 0.6)
## panel plots # compute mean MEDV per RAD and CHAS # In aggregate() use
argument drop = FALSE to include all combinations # (exiting and
missing) of RAD X CHAS.
data.for.plot <- aggregate(housing.df$MEDV, by = list(housing.df$RAD, housing.df$CHAS),
FUN = mean, drop = FALSE)
names(data.for.plot) <- c("RAD", "CHAS", "meanMEDV")
# plot the data
par(mfcol = c(2,1))
barplot(height = data.for.plot$meanMEDV[data.for.plot$CHAS == 0],
names.arg = data.for.plot$RAD[data.for.plot$CHAS == 0],
xlab = "RAD", ylab = "Avg. MEDV", main = "CHAS = 0")
barplot(height = data.for.plot$meanMEDV[data.for.plot$CHAS == 1],
names.arg = data.for.plot$RAD[data.for.plot$CHAS == 1],
xlab = "RAD", ylab = "Avg. MEDV", main = "CHAS = 1")
# alternative plot with ggplot
ggplot(data.for.plot) +
geom_bar(aes(x = as.factor(RAD), y = `meanMEDV`), stat = "identity") +
xlab("RAD") + facet_grid(CHAS ~ .)
## Warning: Removed 3 rows containing missing values (`position_stack()`).
## simple plot # use plot() to generate a matrix of 4X4 panels with
variable name on the diagonal, # and scatter plots in the remaining
panels.
plot(housing.df[, c(1, 3, 13, 14)])
# alternative, nicer plot (displayed)
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
ggpairs(housing.df[, c(1, 3, 13, 14)])
# Manipulations: Rescaling, Aggregation and Hierarchies, Zooming,
Filtering
options(scipen=999) # avoid scientific notation
## scatter plot: regular and log scale
plot(housing.df$MEDV ~ housing.df$CRIM, xlab = "CRIM", ylab = "MEDV")
# to use logarithmic scale set argument log = to either 'x', 'y', or 'xy'.
plot(housing.df$MEDV ~ housing.df$CRIM,
xlab = "CRIM", ylab = "MEDV", log = 'xy')
# alternative log-scale plot with ggplot
library(ggplot2)
ggplot(housing.df) + geom_point(aes(x = CRIM, y = MEDV)) +
scale_x_log10(breaks = 10^(-2:2),
labels = format(10^(-2:2), scientific = FALSE, drop0trailing = TRUE)) +
scale_y_log10(breaks = c(5, 10, 20, 40))
## boxplot: regular and log scale
boxplot(housing.df$CRIM ~ housing.df$CAT_MEDV,
xlab = "CAT.MEDV", ylab = "CRIM")
boxplot(housing.df$CRIM ~ housing.df$CAT_MEDV,
xlab = "CAT.MEDV", ylab = "CRIM", log = 'y')
## Aggregation and Hierarchies ## Time series forecasting
library(forecast)
Amtrak.df <- read.csv("Amtrak.csv")
ridership.ts <- ts(Amtrak.df$Ridership, start = c(1991, 1), end = c(2004, 3), freq = 12)
## fit curve
ridership.lm <- tslm(ridership.ts ~ trend + I(trend^2))
plot(ridership.ts, xlab = "Year", ylab = "Ridership (in 000s)", ylim = c(1300, 2300))
lines(ridership.lm$fitted, lwd = 2)
# alternative plot with ggplot
library(ggplot2)
ggplot(Amtrak.df, aes(y = Ridership, x = Month, group = 12)) +
geom_line() + geom_smooth(formula = y ~ poly(x, 2), method= "lm",
colour = "navy", se = FALSE, na.rm = TRUE)
## zoom in, monthly, and annual plots
ridership.2yrs <- window(ridership.ts, start = c(1991,1), end = c(1992,12))
plot(ridership.2yrs, xlab = "Year", ylab = "Ridership (in 000s)", ylim = c(1300, 2300))
monthly.ridership.ts <- tapply(ridership.ts, cycle(ridership.ts), mean)
plot(monthly.ridership.ts, xlab = "Month", ylab = "Average Ridership",
ylim = c(1300, 2300), type = "l", xaxt = 'n')
## set x labels
axis(1, at = c(1:12), labels = c("Jan","Feb","Mar", "Apr","May","Jun",
"Jul","Aug","Sep", "Oct","Nov","Dec"))
annual.ridership.ts <- aggregate(ridership.ts, FUN = mean)
plot(annual.ridership.ts, xlab = "Year", ylab = "Average Ridership",
ylim = c(1300, 2300))
utilities.df <- read.csv("utilities.csv")
plot(utilities.df$Fuel_Cost ~ utilities.df$Sales,
xlab = "Sales", ylab = "Fuel Cost", xlim = c(2000, 20000))
text(x = utilities.df$Sales, y = utilities.df$Fuel_Cost,
labels = utilities.df$Company, pos = 4, cex = 0.8, srt = 20, offset = 0.2)
# alternative with ggplot
library(ggplot2)
ggplot(utilities.df, aes(y = Fuel_Cost, x = Sales)) + geom_point() +
geom_text(aes(label = paste(" ", Company)), size = 4, hjust = 0.0, angle = 15) +
ylim(0.25, 2.25) + xlim(3000, 18000)
Multivariate Plot: Parallel Coordinates Plot
library(scales) plot(jitter(universal.df\(CCAvg, 1) ~ jitter(universal.df\)Income, 1), col = alpha(ifelse(universal.df$Securities.Account == 0, “gray”, “black”), 0.4), pch = 20, log = ‘xy’, ylim = c(0.1, 10), xlab = “Income”, ylab = “CCAvg”) # alternative with ggplot library(ggplot2) ggplot(universal.df) + geom_jitter(aes(x = Income, y = CCAvg, colour = Securities.Account)) + scale_x_log10(breaks = c(10, 20, 40, 60, 100, 200)) + scale_y_log10(breaks = c(0.1, 0.2, 0.4, 0.6, 1.0, 2.0, 4.0, 6.0))
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
par(mfcol = c(2,1))
parcoord(housing.df[housing.df$CAT_MEDV == 0, -14], main = "CAT.MEDV = 0")
parcoord(housing.df[housing.df$CAT_MEDV == 1, -14], main = "CAT.MEDV = 1")
3.5 Specialized Visualizations
library(igraph)
##
## Attaching package: 'igraph'
## The following objects are masked from 'package:lubridate':
##
## %--%, union
## The following objects are masked from 'package:dplyr':
##
## as_data_frame, groups, union
## The following objects are masked from 'package:purrr':
##
## compose, simplify
## The following object is masked from 'package:tidyr':
##
## crossing
## The following object is masked from 'package:tibble':
##
## as_data_frame
## The following objects are masked from 'package:stats':
##
## decompose, spectrum
## The following object is masked from 'package:base':
##
## union
ebay.df <- read.csv("eBayNetwork.csv")
# transform node ids to factors
ebay.df[,1] <- as.factor(ebay.df[,1])
ebay.df[,2] <- as.factor(ebay.df[,2])
graph.edges <- as.matrix(ebay.df[,1:2])
g <- graph.edgelist(graph.edges, directed = FALSE)
isBuyer <- V(g)$name %in% graph.edges[,2]
plot(g, vertex.label = NA, vertex.color = ifelse(isBuyer, "gray", "black"),
vertex.size = ifelse(isBuyer, 7, 10))
Visualizing Hierarchical Data: Treemaps
library(treemap)
tree.df <- read.csv("EbayTreemap.csv")
# add column for negative feedback
tree.df$negative.feedback <- 1* (tree.df$Seller.Feedback < 0)
# draw treemap
treemap(tree.df, index = c("Category","Sub.Category", "Brand"),
vSize = "High.Bid", vColor = "negative.feedback", fun.aggregate = "mean",
align.labels = list(c("left", "top"), c("right", "bottom"), c("center", "center")),
palette = rev(gray.colors(3)), type = "manual", title = "")
library(ggmap)
## The legacy packages maptools, rgdal, and rgeos, underpinning the sp package,
## which was just loaded, will retire in October 2023.
## Please refer to R-spatial evolution reports for details, especially
## https://r-spatial.org/r/2023/05/15/evolution4.html.
## It may be desirable to make the sf package available;
## package maintainers should consider adding sf to Suggests:.
## The sp package is now running under evolution status 2
## (status 2 uses the sf package in place of rgdal)
## ℹ Google's Terms of Service: <https://mapsplatform.google.com>
## ℹ Please cite ggmap if you use it! Use `citation("ggmap")` for details.
SCstudents <- read.csv("SC-US-students-GPS-data-2016.csv")
pacman::p_load(ggmap, osmdata)
Map <- get_map(location = getbb("Denver"), zoom = 3, source ="stamen")
## ℹ Map tiles by Stamen Design, under CC BY 3.0. Data by OpenStreetMap, under ODbL.
ggmap(Map) + geom_point(aes(x = longitude, y = latitude), data = SCstudents,
alpha = 0.4, colour = "red", size = 0.5)
## Warning: Removed 1687 rows containing missing values (`geom_point()`).
library(mosaic)
## Registered S3 method overwritten by 'mosaic':
## method from
## fortify.SpatialPolygonsDataFrame ggplot2
##
## The 'mosaic' package masks several functions from core packages in order to add
## additional features. The original behavior of these functions should not be affected by this.
##
## Attaching package: 'mosaic'
## The following object is masked from 'package:Matrix':
##
## mean
## The following object is masked from 'package:igraph':
##
## compare
## The following objects are masked from 'package:dplyr':
##
## count, do, tally
## The following object is masked from 'package:purrr':
##
## cross
## The following object is masked from 'package:ggplot2':
##
## stat
## The following objects are masked from 'package:stats':
##
## binom.test, cor, cor.test, cov, fivenum, IQR, median, prop.test,
## quantile, sd, t.test, var
## The following objects are masked from 'package:base':
##
## max, mean, min, prod, range, sample, sum
gdp.df <- read.csv("gdp.csv", skip = 4, stringsAsFactors = FALSE)
names(gdp.df)[4] <- "GDP2015"
names(gdp.df)[1] <- "Country_Name"
#happiness.df <- read.csv("Veerhoven.csv")
# gdp map
mWorldMap(gdp.df, key = "Country_Name", fill = "GDP2015") + coord_map()
## Mapping API still under development and may change in future releases.
## Warning in standardName(x, countryAlternatives, ignore.case = ignore.case, :
## 2372 items were not translated