# Load MASS package
library(MASS)
# whiteside data:
# two winter heating seasons
# 3 variables:
# Temp:
# a measure of the outside temperature during one week
# Gas:
# the amount of heating gas consumed during that week
# Insul:
# a categorical variable with two values, indicating whether the measurements were made before or after an insulation upgrade was made to the house
# whiteside is in your workspace
# Inspect the structure of the whiteside data frame
str(object = whiteside)
## 'data.frame': 56 obs. of 3 variables:
## $ Insul: Factor w/ 2 levels "Before","After": 1 1 1 1 1 1 1 1 1 1 ...
## $ Temp : num -0.8 -0.7 0.4 2.5 2.9 3.2 3.6 3.9 4.2 4.3 ...
## $ Gas : num 7.2 6.9 6.4 6 5.8 5.8 5.6 4.7 5.8 5.2 ...
# Plot the whole data frame at once
plot(x = whiteside)
# Gas vs. Temp scatterplot with descriptive axis labels
with(
  whiteside,
  plot(Temp, Gas,
       ylab = "Heating gas consumption",
       xlab = "Outside temperature")
)
# plot() is a generic function: the results of applying it depend on the nature of the object to which it is applied
# Insul is a factor variable, so plot() produces a barplot here
# instead of a scatterplot
plot(x = whiteside$Insul)
# Load the MASS package
library(MASS)
head(Cars93)
## Manufacturer Model Type Min.Price Price Max.Price MPG.city
## 1 Acura Integra Small 12.9 15.9 18.8 25
## 2 Acura Legend Midsize 29.2 33.9 38.7 18
## 3 Audi 90 Compact 25.9 29.1 32.3 20
## 4 Audi 100 Midsize 30.8 37.7 44.6 19
## 5 BMW 535i Midsize 23.7 30.0 36.2 22
## 6 Buick Century Midsize 14.2 15.7 17.3 22
## MPG.highway AirBags DriveTrain Cylinders EngineSize
## 1 31 None Front 4 1.8
## 2 25 Driver & Passenger Front 6 3.2
## 3 26 Driver only Front 6 2.8
## 4 26 Driver & Passenger Front 6 2.8
## 5 30 Driver only Rear 4 3.5
## 6 31 Driver only Front 4 2.2
## Horsepower RPM Rev.per.mile Man.trans.avail Fuel.tank.capacity
## 1 140 6300 2890 Yes 13.2
## 2 200 5500 2335 Yes 18.0
## 3 172 5500 2280 Yes 16.9
## 4 172 5500 2535 Yes 21.1
## 5 208 5700 2545 Yes 21.1
## 6 110 5200 2565 No 16.4
## Passengers Length Wheelbase Width Turn.circle Rear.seat.room
## 1 5 177 102 68 37 26.5
## 2 5 195 115 71 38 30.0
## 3 5 180 102 67 37 28.0
## 4 6 193 106 70 37 31.0
## 5 4 186 109 69 39 27.0
## 6 6 189 105 69 41 28.0
## Luggage.room Weight Origin Make
## 1 11 2705 non-USA Acura Integra
## 2 15 3560 non-USA Acura Legend
## 3 14 3375 non-USA Audi 90
## 4 17 3405 non-USA Audi 100
## 5 13 3640 non-USA BMW 535i
## 6 16 2880 USA Buick Century
str(Cars93)
## 'data.frame': 93 obs. of 27 variables:
## $ Manufacturer : Factor w/ 32 levels "Acura","Audi",..: 1 1 2 2 3 4 4 4 4 5 ...
## $ Model : Factor w/ 93 levels "100","190E","240",..: 49 56 9 1 6 24 54 74 73 35 ...
## $ Type : Factor w/ 6 levels "Compact","Large",..: 4 3 1 3 3 3 2 2 3 2 ...
## $ Min.Price : num 12.9 29.2 25.9 30.8 23.7 14.2 19.9 22.6 26.3 33 ...
## $ Price : num 15.9 33.9 29.1 37.7 30 15.7 20.8 23.7 26.3 34.7 ...
## $ Max.Price : num 18.8 38.7 32.3 44.6 36.2 17.3 21.7 24.9 26.3 36.3 ...
## $ MPG.city : int 25 18 20 19 22 22 19 16 19 16 ...
## $ MPG.highway : int 31 25 26 26 30 31 28 25 27 25 ...
## $ AirBags : Factor w/ 3 levels "Driver & Passenger",..: 3 1 2 1 2 2 2 2 2 2 ...
## $ DriveTrain : Factor w/ 3 levels "4WD","Front",..: 2 2 2 2 3 2 2 3 2 2 ...
## $ Cylinders : Factor w/ 6 levels "3","4","5","6",..: 2 4 4 4 2 2 4 4 4 5 ...
## $ EngineSize : num 1.8 3.2 2.8 2.8 3.5 2.2 3.8 5.7 3.8 4.9 ...
## $ Horsepower : int 140 200 172 172 208 110 170 180 170 200 ...
## $ RPM : int 6300 5500 5500 5500 5700 5200 4800 4000 4800 4100 ...
## $ Rev.per.mile : int 2890 2335 2280 2535 2545 2565 1570 1320 1690 1510 ...
## $ Man.trans.avail : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 1 1 1 1 1 ...
## $ Fuel.tank.capacity: num 13.2 18 16.9 21.1 21.1 16.4 18 23 18.8 18 ...
## $ Passengers : int 5 5 5 6 4 6 6 6 5 6 ...
## $ Length : int 177 195 180 193 186 189 200 216 198 206 ...
## $ Wheelbase : int 102 115 102 106 109 105 111 116 108 114 ...
## $ Width : int 68 71 67 70 69 69 74 78 73 73 ...
## $ Turn.circle : int 37 38 37 37 39 41 42 45 41 43 ...
## $ Rear.seat.room : num 26.5 30 28 31 27 28 30.5 30.5 26.5 35 ...
## $ Luggage.room : int 11 15 14 17 13 16 17 21 14 18 ...
## $ Weight : int 2705 3560 3375 3405 3640 2880 3470 4105 3495 3620 ...
## $ Origin : Factor w/ 2 levels "USA","non-USA": 2 2 2 2 2 1 1 1 1 1 ...
## $ Make : Factor w/ 93 levels "Acura Integra",..: 1 2 4 3 5 6 7 9 8 10 ...
# plot(x, y, pch = , col = ):
# point shapes and colors help show how different variables
# or subsets of the data relate to each other.
# Max.Price vs. Price as solid red triangles (pch = 17)
plot(x = Cars93$Price, y = Cars93$Max.Price, col = "red", pch = 17)
# Overlay a second set of points on the existing scatterplot:
# Min.Price vs. Price as solid blue circles (pch = 16)
points(x = Cars93$Price, y = Cars93$Min.Price, col = "blue", pch = 16)
# Dashed equality reference line (y-intercept 0, slope 1)
# https://www.rdocumentation.org/packages/graphics/topics/abline
abline(coef = c(0, 1), lty = 2)
library(robustbase)
str(Animals2)
## 'data.frame': 65 obs. of 2 variables:
## $ body : num 1.35 465 36.33 27.66 1.04 ...
## $ brain: num 8.1 423 119.5 115 5.5 ...
head(Animals2)
## body brain
## Mountain beaver 1.35 8.1
## Cow 465.00 423.0
## Grey wolf 36.33 119.5
## Goat 27.66 115.0
## Guinea pig 1.04 5.5
## Dipliodocus 11700.00 50.0
# Set up a side-by-side plot array (1 row, 2 columns), keeping the previous
# graphics settings so they can be restored once both panels are drawn
op <- par(mfrow = c(1, 2))
# First plot: brain vs. body in its original form;
# the raw scale is not always the best one for visualization or analysis
plot(Animals2$body, Animals2$brain)
# Add the first title
title("Original representation")
# Second plot: brain vs. body with both the x and y variables in log scale,
# which makes the data easier to see and understand
plot(Animals2$body, Animals2$brain,
     log = "xy")
# Add the second title
title("Log-log plot")
# Restore the previous layout so later plots are not squeezed into this array
par(op)
# Pie charts are a very bad way of displaying information
library(insuranceData)
data(dataCar) # get the dataCar df
str(dataCar)
## 'data.frame': 67856 obs. of 11 variables:
## $ veh_value: num 1.06 1.03 3.26 4.14 0.72 2.01 1.6 1.47 0.52 0.38 ...
## $ exposure : num 0.304 0.649 0.569 0.318 0.649 ...
## $ clm : int 0 0 0 0 0 0 0 0 0 0 ...
## $ numclaims: int 0 0 0 0 0 0 0 0 0 0 ...
## $ claimcst0: num 0 0 0 0 0 0 0 0 0 0 ...
## $ veh_body : Factor w/ 13 levels "BUS","CONVT",..: 4 4 13 11 4 5 8 4 4 4 ...
## $ veh_age : int 3 2 2 2 4 3 3 2 4 4 ...
## $ gender : Factor w/ 2 levels "F","M": 1 1 1 1 1 2 2 2 1 1 ...
## $ area : Factor w/ 6 levels "A","B","C","D",..: 3 1 5 4 3 3 1 2 1 2 ...
## $ agecat : int 2 4 2 2 2 4 4 6 3 4 ...
## $ X_OBSTAT_: Factor w/ 1 level "01101 0 0 0": 1 1 1 1 1 1 1 1 1 1 ...
head(dataCar)
## veh_value exposure clm numclaims claimcst0 veh_body veh_age gender area
## 1 1.06 0.3039014 0 0 0 HBACK 3 F C
## 2 1.03 0.6488706 0 0 0 HBACK 2 F A
## 3 3.26 0.5694730 0 0 0 UTE 2 F E
## 4 4.14 0.3175907 0 0 0 STNWG 2 F D
## 5 0.72 0.6488706 0 0 0 HBACK 4 F C
## 6 2.01 0.8542094 0 0 0 HDTOP 3 M C
## agecat X_OBSTAT_
## 1 2 01101 0 0 0
## 2 4 01101 0 0 0
## 3 2 01101 0 0 0
## 4 2 01101 0 0 0
## 5 2 01101 0 0 0
## 6 4 01101 0 0 0
# Table of counts of the distinct levels of the veh_body variable
# in the dataCar data frame, sorted in decreasing order.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
tbl <- sort(table(dataCar$veh_body),
            decreasing = TRUE)
str(tbl)
## 'table' int [1:13(1d)] 22233 18915 16261 4586 1750 1579 780 752 717 127 ...
## - attr(*, "dimnames")=List of 1
## ..$ : chr [1:13] "SEDAN" "HBACK" "STNWG" "UTE" ...
head(tbl)
##
## SEDAN HBACK STNWG UTE TRUCK HDTOP
## 22233 18915 16261 4586 1750 1579
tbl
##
## SEDAN HBACK STNWG UTE TRUCK HDTOP COUPE PANVN MIBUS MCARA CONVT BUS
## 22233 18915 16261 4586 1750 1579 780 752 717 127 81 48
## RDSTR
## 27
# Side-by-side plot array with 1 row and 2 columns; the previous settings
# are kept so the layout can be restored afterwards
op <- par(mfrow = c(1, 2))
# Left panel: pie chart of the tbl counts
pie(tbl)
title("Pie chart")
# Right panel: bar chart of the same counts
barplot(tbl,
        las = 2,         # both sets of labels perpendicular to the axes
        cex.names = 0.5  # name labels half the default size
)
title("Bar chart")
# Restore the previous layout so later plots get a full-size figure
par(op)
# Histogram of counts from hist(), with a plot title
hist(x = Cars93$Horsepower, main = "hist() plot")
# Normalized histogram from truehist(), with a plot title
truehist(data = Cars93$Horsepower, main = "truehist() plot")
# Set up a side-by-side plot array, keeping the previous graphics settings
op <- par(mfrow = c(1, 2))
# Left panel: histogram of counts with hist()
hist(Cars93$Horsepower,
     main = "hist() plot" # title of plot
)
# Right panel: normalized histogram with truehist()
truehist(Cars93$Horsepower,
         main = "truehist() plot"
)
# Restore the previous layout so later plots are not drawn in this array
par(op)
# NB: the two histograms are on different scales
# Density estimates may be regarded as smoothed histograms
# ChickWeight contains a collection of chicks' weights
head(ChickWeight)
## weight Time Chick Diet
## 1 42 0 1 1
## 2 51 2 1 1
## 3 59 4 1 1
## 4 64 6 1 1
## 5 76 8 1 1
## 6 93 10 1 1
str(ChickWeight)
## Classes 'nfnGroupedData', 'nfGroupedData', 'groupedData' and 'data.frame': 578 obs. of 4 variables:
## $ weight: num 42 51 59 64 76 93 106 125 149 171 ...
## $ Time : num 0 2 4 6 8 10 12 14 16 18 ...
## $ Chick : Ord.factor w/ 50 levels "18"<"16"<"15"<..: 15 15 15 15 15 15 15 15 15 15 ...
## $ Diet : Factor w/ 4 levels "1","2","3","4": 1 1 1 1 1 1 1 1 1 1 ...
## - attr(*, "formula")=Class 'formula' language weight ~ Time | Chick
## .. ..- attr(*, ".Environment")=<environment: R_EmptyEnv>
## - attr(*, "outer")=Class 'formula' language ~Diet
## .. ..- attr(*, ".Environment")=<environment: R_EmptyEnv>
## - attr(*, "labels")=List of 2
## ..$ x: chr "Time"
## ..$ y: chr "Body weight"
## - attr(*, "units")=List of 2
## ..$ x: chr "(days)"
## ..$ y: chr "(gm)"
# which() returns the row positions of the ChickWeight records
# with Time equal to 16 (the data frame's units attribute gives Time in days)
index16 <- which(ChickWeight[["Time"]] == 16)
index16
## [1] 9 21 33 45 57 69 81 93 104 116 128 140 152 164 191 205 217
## [18] 229 241 253 265 277 289 301 313 325 337 349 361 373 385 397 409 421
## [35] 433 445 457 469 481 493 505 515 527 539 551 563 575
# Weights of the chicks measured at Time 16
weights <- ChickWeight[["weight"]][index16]
weights
## [1] 149 162 163 136 197 155 218 126 93 101 182 162 71 227 113 106 98
## [18] 275 131 145 72 197 169 144 184 187 143 170 221 151 235 287 198 135
## [35] 192 170 215 175 204 197 145 147 173 168 222 184 205
# Normalized histogram of the weights
truehist(data = weights)
# Overlay a density curve of the same values on the histogram.
# A density plot of this type of variable is often expected to conform
# approximately to the bell-shaped curve, aka the Gaussian distribution.
lines(x = density(x = weights))
# In a normal QQ-plot, look to see whether the points lie approximately on a straight line.
# Load the car package to make qqPlot() available
library(car)
## Loading required package: carData
# From Previous Exercise:
# index16 <- which(ChickWeight$Time == 16)
# weights <- ChickWeight$weight[index16]
# Normal QQ-plot of the chick weights
qqPlot(x = weights)
## [1] 32 18
# The tax variable in the Boston data frame is definitely not normally distributed
# Normal QQ-plot of the Boston$tax data.
# The Gaussian distribution is obviously a poor fit, and the plot also shows:
#   - repeated values (flat stretches in the plot)
#   - portions of the data range with no observations
#     (vertical "jumps" in the plot)
qqPlot(x = Boston$tax)
## [1] 489 490
# Set up a side-by-side plot array, keeping the previous graphics settings
op <- par(mfrow = c(1, 2))
# LEFT: the standard scatterplot
plot(Boston$zn, Boston$rad)
title("Standard scatterplot")
# RIGHT: the sunflower plot of the same two variables
sunflowerplot(Boston$zn, Boston$rad)
title("Sunflower plot")
# Restore the previous layout so later plots are not drawn in this array
par(op)
# Variable-width boxplot of crim by rad, with a log y-axis and horizontal
# axis labels (las = 1).
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
boxplot(crim ~ rad,
        data = Boston,
        varwidth = TRUE,
        log = "y",
        las = 1)
# Add a title
title("Crime rate vs. radial highway index")
# Mosaic plot of carb by cyl using the formula interface.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
mosaicplot(carb ~ cyl,
           data = mtcars,
           color = TRUE)
# Side-by-side array: boxplot summary on the left, bagplot on the right.
# The previous graphics settings are kept so they can be restored afterwards.
op <- par(mfrow = c(1, 2))
boxplot(Cars93$Min.Price, Cars93$Max.Price)
# Load aplpack to make the bagplot() function available
library(aplpack)
## Loading required package: tcltk
# Create a bagplot for the same two variables
bagplot(Cars93$Min.Price, Cars93$Max.Price,
        cex = 1.20)
# Add a dashed equality reference line (intercept 0, slope 1)
abline(0, 1, lty = 2)
# Restore the previous layout so later plots are not drawn in this array
par(op)
# Load the corrplot library for the corrplot() function
library(corrplot)
## corrplot 0.84 loaded
# Extract the numerical variables from UScereal
str(UScereal)
## 'data.frame': 65 obs. of 11 variables:
## $ mfr : Factor w/ 6 levels "G","K","N","P",..: 3 2 2 1 2 1 6 4 5 1 ...
## $ calories : num 212 212 100 147 110 ...
## $ protein : num 12.12 12.12 8 2.67 2 ...
## $ fat : num 3.03 3.03 0 2.67 0 ...
## $ sodium : num 394 788 280 240 125 ...
## $ fibre : num 30.3 27.3 28 2 1 ...
## $ carbo : num 15.2 21.2 16 14 11 ...
## $ sugars : num 18.2 15.2 0 13.3 14 ...
## $ shelf : int 3 3 3 1 2 3 1 3 2 1 ...
## $ potassium: num 848.5 969.7 660 93.3 30 ...
## $ vitamins : Factor w/ 3 levels "100%","enriched",..: 2 2 2 2 2 2 2 2 2 2 ...
# Flag the numeric columns of UScereal. vapply() is used instead of sapply()
# so the result is guaranteed to be a logical vector with one entry per column.
col_numeric <- vapply(UScereal, is.numeric, logical(1))
# Keep only the numeric columns
numericalVars <- UScereal[, col_numeric]
str(numericalVars)
## 'data.frame': 65 obs. of 9 variables:
## $ calories : num 212 212 100 147 110 ...
## $ protein : num 12.12 12.12 8 2.67 2 ...
## $ fat : num 3.03 3.03 0 2.67 0 ...
## $ sodium : num 394 788 280 240 125 ...
## $ fibre : num 30.3 27.3 28 2 1 ...
## $ carbo : num 15.2 21.2 16 14 11 ...
## $ sugars : num 18.2 15.2 0 13.3 14 ...
## $ shelf : int 3 3 3 1 2 3 1 3 2 1 ...
## $ potassium: num 848.5 969.7 660 93.3 30 ...
# Correlation matrix of the numeric UScereal variables
corrMat <- cor(x = numericalVars)
corrMat
## calories protein fat sodium fibre carbo
## calories 1.0000000 0.7060105 0.5901757 0.5286552 0.3882179 0.78872268
## protein 0.7060105 1.0000000 0.4112661 0.5727222 0.8096397 0.54709029
## fat 0.5901757 0.4112661 1.0000000 0.2595606 0.2260715 0.18285220
## sodium 0.5286552 0.5727222 0.2595606 1.0000000 0.4954831 0.42356172
## fibre 0.3882179 0.8096397 0.2260715 0.4954831 1.0000000 0.20307489
## carbo 0.7887227 0.5470903 0.1828522 0.4235617 0.2030749 1.00000000
## sugars 0.4952942 0.1848484 0.4156740 0.2112437 0.1489158 -0.04082599
## shelf 0.4263400 0.3963311 0.3256975 0.2341275 0.3578429 0.26045989
## potassium 0.4765955 0.8417540 0.3232754 0.5566426 0.9638662 0.24204848
## sugars shelf potassium
## calories 0.49529421 0.4263400 0.4765955
## protein 0.18484845 0.3963311 0.8417540
## fat 0.41567397 0.3256975 0.3232754
## sodium 0.21124365 0.2341275 0.5566426
## fibre 0.14891577 0.3578429 0.9638662
## carbo -0.04082599 0.2604599 0.2420485
## sugars 1.00000000 0.2900511 0.2718335
## shelf 0.29005112 1.0000000 0.4262529
## potassium 0.27183347 0.4262529 1.0000000
# Draw the correlation matrix as a correlation ellipse plot
corrplot(corr = corrMat, method = "ellipse")
# Load the rpart library
library(rpart)
# rpart model predicting medv from all other Boston variables
tree_model <- rpart(medv ~ ., data = Boston)
# Draw the structure of the fitted decision tree
plot(x = tree_model)
# Label the tree, with text at 70% of the default size
text(x = tree_model, cex = 0.7)
# Capture the full list of current graphics parameters returned by par()
plot_pars <- par(no.readonly = FALSE)
# Show the names of the elements in that list
names(x = plot_pars)
## [1] "xlog" "ylog" "adj" "ann" "ask"
## [6] "bg" "bty" "cex" "cex.axis" "cex.lab"
## [11] "cex.main" "cex.sub" "cin" "col" "col.axis"
## [16] "col.lab" "col.main" "col.sub" "cra" "crt"
## [21] "csi" "cxy" "din" "err" "family"
## [26] "fg" "fig" "fin" "font" "font.axis"
## [31] "font.lab" "font.main" "font.sub" "lab" "las"
## [36] "lend" "lheight" "ljoin" "lmitre" "lty"
## [41] "lwd" "mai" "mar" "mex" "mfcol"
## [46] "mfg" "mfrow" "mgp" "mkh" "new"
## [51] "oma" "omd" "omi" "page" "pch"
## [56] "pin" "plt" "ps" "pty" "smo"
## [61] "srt" "tck" "tcl" "usr" "xaxp"
## [66] "xaxs" "xaxt" "xpd" "yaxp" "yaxs"
## [71] "yaxt" "ylbias"
# Display the number of par() function list elements
length(plot_pars)
## [1] 72
# Set up a 2-by-2 plot array, keeping the previous graphics settings
# so they can be restored once all four panels are drawn
op <- par(mfrow = c(2, 2))
# Preview the Animals2 data
head(Animals2)
## body brain
## Mountain beaver 1.35 8.1
## Cow 465.00 423.0
## Grey wolf 36.33 119.5
## Goat 27.66 115.0
## Guinea pig 1.04 5.5
## Dipliodocus 11700.00 50.0
# Panel 1: the brain weight data as points
plot(Animals2$brain, type = "p")
title("points")
# Panel 2: the brain weights as lines
plot(Animals2$brain, type = "l")
title("lines")
# Panel 3: the brain weights as lines overlaid with points
plot(Animals2$brain, type = "o")
title("overlaid")
# Panel 4: the brain weights as steps
plot(Animals2$brain, type = "s")
title("steps")
# Restore the previous layout so later plots are not drawn in this array
par(op)
# Largest horsepower across both data sets (upper x-axis limit)
max_hp <- max(c(Cars93$Horsepower, mtcars$hp))
# Largest mileage across all three mileage variables (upper y-axis limit)
max_mpg <- max(c(Cars93$MPG.city, Cars93$MPG.highway, mtcars$mpg))
# Empty plot (type = "n") that only sets up axes, limits and labels
with(
  Cars93,
  plot(Horsepower, MPG.city,
       type = "n",
       ylim = c(0, max_mpg),
       xlim = c(0, max_hp),
       ylab = "Miles per gallon",
       xlab = "Horsepower")
)
# mtcars mileage as open circles
points(x = mtcars$hp, y = mtcars$mpg, pch = 1)
# Cars93 city mileage as solid squares
points(x = Cars93$Horsepower, y = Cars93$MPG.city, pch = 15)
# Cars93 highway mileage as open triangles
points(x = Cars93$Horsepower, y = Cars93$MPG.highway, pch = 6)
# Numerical vector x: 200 equally spaced values from 0 to 10.
# length.out is spelled out in full rather than relying on partial
# matching of the abbreviated argument name `length`.
x <- seq(0, 10, length.out = 200)
# Gaussian density for x with mean 2 and standard deviation 0.2
gauss1 <- dnorm(x, mean = 2, sd = 0.2)
# Gaussian density for x with mean 4 and standard deviation 0.5
gauss2 <- dnorm(x, mean = 4, sd = 0.5)
# Plot the first Gaussian density as a line
plot(x, gauss1,
     type = "l",
     ylab = "Gaussian probability density")
# Add the second Gaussian density as a heavier dashed line
lines(x, gauss2, lty = 2, lwd = 3)
# First plot: empty axes (type = "n"), then points whose plotting symbol
# is taken from the cyl value
with(mtcars, {
  plot(hp, mpg, type = "n", xlab = "Horsepower", ylab = "Gas mileage")
  points(hp, mpg, pch = cyl)
})
# Second plot: same empty axes, then points drawn as the cyl value's
# character
with(mtcars, {
  plot(hp, mpg, type = "n", xlab = "Horsepower", ylab = "Gas mileage")
  points(hp, mpg, pch = as.character(cyl))
})
library(MASS)
# Linear regression of Gas on Temp for the whiteside data
linear_model <- lm(Gas ~ Temp, data = whiteside)
# Gas vs. Temp scatterplot
plot(x = whiteside$Temp, y = whiteside$Gas)
# Overlay the fitted regression line as a dashed line
abline(reg = linear_model, lty = 2)
# MPG.city vs. Horsepower drawn with solid squares (pch = 15)
plot(x = Cars93$Horsepower, y = Cars93$MPG.city, pch = 15)
# Preview Cars93 before creating index3, which points to 3-cylinder cars
head(x = Cars93)
## Manufacturer Model Type Min.Price Price Max.Price MPG.city
## 1 Acura Integra Small 12.9 15.9 18.8 25
## 2 Acura Legend Midsize 29.2 33.9 38.7 18
## 3 Audi 90 Compact 25.9 29.1 32.3 20
## 4 Audi 100 Midsize 30.8 37.7 44.6 19
## 5 BMW 535i Midsize 23.7 30.0 36.2 22
## 6 Buick Century Midsize 14.2 15.7 17.3 22
## MPG.highway AirBags DriveTrain Cylinders EngineSize
## 1 31 None Front 4 1.8
## 2 25 Driver & Passenger Front 6 3.2
## 3 26 Driver only Front 6 2.8
## 4 26 Driver & Passenger Front 6 2.8
## 5 30 Driver only Rear 4 3.5
## 6 31 Driver only Front 4 2.2
## Horsepower RPM Rev.per.mile Man.trans.avail Fuel.tank.capacity
## 1 140 6300 2890 Yes 13.2
## 2 200 5500 2335 Yes 18.0
## 3 172 5500 2280 Yes 16.9
## 4 172 5500 2535 Yes 21.1
## 5 208 5700 2545 Yes 21.1
## 6 110 5200 2565 No 16.4
## Passengers Length Wheelbase Width Turn.circle Rear.seat.room
## 1 5 177 102 68 37 26.5
## 2 5 195 115 71 38 30.0
## 3 5 180 102 67 37 28.0
## 4 6 193 106 70 37 31.0
## 5 4 186 109 69 39 27.0
## 6 6 189 105 69 41 28.0
## Luggage.room Weight Origin Make
## 1 11 2705 non-USA Acura Integra
## 2 15 3560 non-USA Acura Legend
## 3 14 3375 non-USA Audi 90
## 4 17 3405 non-USA Audi 100
## 5 13 3640 non-USA BMW 535i
## 6 16 2880 USA Buick Century
# Row positions of the cars whose Cylinders level is "3"
index3 <- which(Cars93$Cylinders == "3")
index3
## [1] 39 80 83
# Label the 3-cylinder cars with their Make, starting at each data point
# (adj = 0)
text(labels = Cars93$Make[index3],
     x = Cars93$Horsepower[index3],
     y = Cars93$MPG.city[index3],
     adj = 0)
# MPG.city vs. Horsepower as open circles (pch = 1)
plot(x = Cars93$Horsepower, y = Cars93$MPG.city, pch = 1)
# index3 points to the 3-cylinder cars
index3 <- which(Cars93$Cylinders == "3")
# Redraw the 3-cylinder cars as solid circles (pch = 16) to highlight them
points(x = Cars93$Horsepower[index3], y = Cars93$MPG.city[index3], pch = 16)
# Add the car names, offset from the points (adj = -0.2),
# 20% larger (cex = 1.2) and in font 4
text(x = Cars93$Horsepower[index3],
     y = Cars93$MPG.city[index3],
     labels = Cars93$Make[index3],
     font = 4,
     cex = 1.2,
     adj = -0.2)
# Gas vs. Temp as solid triangles (pch = 17)
plot(x = whiteside$Temp, y = whiteside$Gas, pch = 17)
# indexB / indexA: row positions of the "Before" and "After" records
indexB <- which(whiteside[["Insul"]] == "Before")
indexA <- which(whiteside[["Insul"]] == "After")
# "Before" text in blue, rotated 30 degrees, 80% size
text(labels = "Before",
     x = whiteside$Temp[indexB], y = whiteside$Gas[indexB],
     cex = 0.8, srt = 30, col = "blue")
# "After" text in red, rotated -20 degrees, 80% size
text(labels = "After",
     x = whiteside$Temp[indexA], y = whiteside$Gas[indexA],
     cex = 0.8, srt = -20, col = "red")
# Empty, labelled Gas vs. Temp plot (type = "n" draws axes only)
with(
  whiteside,
  plot(Temp, Gas,
       type = "n",
       ylab = "Heating gas consumption",
       xlab = "Outside temperature")
)
# indexB / indexA: row positions of the "Before" and "After" records
indexB <- which(whiteside[["Insul"]] == "Before")
indexA <- which(whiteside[["Insul"]] == "After")
# "Before" data as solid triangles
points(x = whiteside$Temp[indexB], y = whiteside$Gas[indexB], pch = 17)
# "After" data as open circles
points(x = whiteside$Temp[indexA], y = whiteside$Gas[indexA], pch = 1)
# Legend in the top-right corner mapping each symbol to its group
legend("topright", legend = c("Before", "After"), pch = c(17, 1))
head(UScereal[,c("sugars", "shelf")],10)
## sugars shelf
## 100% Bran 18.181818 3
## All-Bran 15.151515 3
## All-Bran with Extra Fiber 0.000000 3
## Apple Cinnamon Cheerios 13.333333 1
## Apple Jacks 14.000000 2
## Basic 4 10.666667 3
## Bran Chex 8.955224 1
## Bran Flakes 7.462687 3
## Cap'n'Crunch 16.000000 2
## Cheerios 0.800000 1
# Boxplot of sugars by shelf value, drawn without axes so that custom axes
# can be added below.
# FALSE is spelled out because F is an ordinary variable that can be reassigned.
boxplot(sugars ~ shelf, data = UScereal, axes = FALSE)
# Add a default y-axis to the left of the boxplot
axis(side = 2)
# Add an x-axis below the plot, labelled 1, 2, and 3
axis(side = 1, at = c(1, 2, 3))
# Add a second x-axis above the plot with descriptive shelf names
axis(side = 3, at = c(1, 2, 3),
     labels = c("floor", "middle", "top"))
# Scatterplot of MPG.city vs. Horsepower
plot(x = Cars93$Horsepower, y = Cars93$MPG.city)
# First smooth trend curve from supsmu(), using the default bass
trend1 <- supsmu(x = Cars93$Horsepower, y = Cars93$MPG.city)
# Draw it on the plot
lines(x = trend1)
# Second trend curve from supsmu(), this time with bass = 10
trend2 <- supsmu(x = Cars93$Horsepower, y = Cars93$MPG.city, bass = 10)
# Draw it as a heavy, dotted line
lines(x = trend2, lwd = 2, lty = 3)
# Number of plots to be displayed: the square of the number of columns
length(Cars93)^2
## [1] 729
# Draw the array of scatterplots for the whole data frame
plot(x = Cars93)
# Names of the UScereal columns to keep
keep_vars <- c(
  "calories", "protein", "fat",
  "fibre", "carbo", "sugars"
)
# Subset UScereal to just those columns
df <- UScereal[keep_vars]
str(object = df)
## 'data.frame': 65 obs. of 6 variables:
## $ calories: num 212 212 100 147 110 ...
## $ protein : num 12.12 12.12 8 2.67 2 ...
## $ fat : num 3.03 3.03 0 2.67 0 ...
## $ fibre : num 30.3 27.3 28 2 1 ...
## $ carbo : num 15.2 21.2 16 14 11 ...
## $ sugars : num 18.2 15.2 0 13.3 14 ...
# Set up a two-by-two plot array, keeping the previous graphics settings
# so they can be restored once all four panels are drawn
op <- par(mfrow = c(2, 2))
# Panel 1: matplot() of protein and fat against calories
matplot(
  UScereal$calories,
  UScereal[, c("protein", "fat")],
  xlab = "calories",
  ylab = "")
title("Two scatterplots")
# Panel 2: protein, fat and fibre against calories
matplot(
  UScereal$calories,
  UScereal[, c("protein", "fat", "fibre")],
  xlab = "calories",
  ylab = "")
title("Three scatterplots")
# Panel 3: protein, fat, fibre and carbo against calories
matplot(
  UScereal$calories,
  UScereal[, c("protein", "fat", "fibre", "carbo")],
  xlab = "calories",
  ylab = "")
title("Four scatterplots")
# Panel 4: protein, fat, fibre, carbo and sugars against calories
matplot(
  UScereal$calories,
  UScereal[, c("protein", "fat", "fibre", "carbo", "sugars")],
  xlab = "calories",
  ylab = "")
title("Five scatterplots")
# Restore the previous layout so later plots are not drawn in this array
par(op)
library(wordcloud)
## Loading required package: RColorBrewer
# Frequency table of manufacturers
mfr_table <- table(Cars93$Manufacturer)
# Wordcloud of manufacturer names using the default minimum frequency
wordcloud(
  freq = as.numeric(mfr_table),
  words = names(mfr_table),
  scale = c(2, 0.25)
)
# Same wordcloud, but with the minimum word frequency set to 1
wordcloud(
  freq = as.numeric(mfr_table),
  words = names(mfr_table),
  min.freq = 1,
  scale = c(2, 0.25)
)
# Frequency table of model names
model_table <- table(Cars93$Model)
# Wordcloud of all model names, with smaller scaling
wordcloud(
  freq = as.numeric(model_table),
  words = names(model_table),
  min.freq = 1,
  scale = c(0.75, 0.25)
)
# Set up a two-by-two plot array, keeping the previous graphics settings
# so they can be restored once all four panels are drawn
op <- par(mfrow = c(2, 2))
# Plot y1 vs. x1
plot(anscombe$x1, anscombe$y1)
# Plot y2 vs. x2
plot(anscombe$x2, anscombe$y2)
# Plot y3 vs. x3
plot(anscombe$x3, anscombe$y3)
# Plot y4 vs. x4
plot(anscombe$x4, anscombe$y4)
# Restore the previous layout so later plots are not drawn in this array
par(op)
# Define common x and y limits for the four plots
xmin <- 4
xmax <- 19
ymin <- 3
ymax <- 13
# Set up a two-by-two plot array, keeping the previous graphics settings
op <- par(mfrow = c(2, 2))
# One title per anscombe x/y pair, in panel order
panel_titles <- c("First dataset", "Second dataset",
                  "Third dataset", "Fourth dataset")
# Plot y<i> vs. x<i> for each pair with the same limits and axis labels,
# replacing four copy-pasted plot() calls that differed only in the
# column suffix and the title
for (i in seq_along(panel_titles)) {
  plot(anscombe[[paste0("x", i)]], anscombe[[paste0("y", i)]],
       xlim = c(xmin, xmax),
       ylim = c(ymin, ymax),
       xlab = "x value", ylab = "y value",
       main = panel_titles[i])
}
# Restore the previous layout so later plots are not drawn in this array
par(op)
# Set up a two-by-two plot array, keeping the previous graphics settings
# so they can be restored once all four panels are drawn
op <- par(mfrow = c(2, 2))
# Plot the raw duration data
plot(geyser$duration, main = "Raw data")
# Plot the normalized histogram of the duration data
truehist(geyser$duration, main = "Histogram")
# Plot the density of the duration data
plot(density(geyser$duration), main = "Density")
# Construct the normal QQ-plot of the duration data
qqPlot(geyser$duration, main = "QQ-plot")
## [1] 149 12
# Restore the previous layout so later plots are not drawn in this array
par(op)
# Build a matrix with three rows and two columns, filled row by row.
# Each non-zero entry is the number of the plot that goes in that cell.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
layoutMatrix <- matrix(
  c(
    0, 1,
    2, 0,
    0, 3
  ),
  byrow = TRUE,
  nrow = 3
)
# Call the layout() function to set up the plot array
layout(layoutMatrix)
# Show where the three plots will go
layout.show(n = 3)
# Set up the plot array described by layoutMatrix
layout(layoutMatrix)
# Construct vectors indexB and indexA: positions of the "Before" and
# "After" records
indexB <- which(whiteside$Insul == "Before")
indexA <- which(whiteside$Insul == "After")
# Create plot 1 and add title (common y-range 0-8 in all three plots)
plot(whiteside$Temp[indexB], whiteside$Gas[indexB],
     ylim = c(0, 8))
title("Before data only")
# Create plot 2 and add title
plot(whiteside$Temp, whiteside$Gas,
     ylim = c(0, 8))
title("Complete dataset")
# Create plot 3 and add title
plot(whiteside$Temp[indexA], whiteside$Gas[indexA],
     ylim = c(0, 8))
title("After data only")
# Reset to a single-figure layout so later plots are not drawn in this array
layout(1)
# Create row1, row2, and layoutVector: plot 1 occupies the first cell of the
# first row; plot 2 occupies the last two cells of the second and third rows
row1 <- c(1, 0, 0)
row2 <- c(0, 2, 2)
layoutVector <- c(row1, row2, row2)
# Convert layoutVector into a 3-row layoutMatrix, filled row by row.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
layoutMatrix <- matrix(layoutVector, byrow = TRUE, nrow = 3)
# Set up the plot array
layout(layoutMatrix)
# Plot scatterplot
plot(Boston$rad, Boston$zn)
# Plot sunflower plot
sunflowerplot(Boston$rad, Boston$zn)
# Reset to a single-figure layout so later plots are not drawn in this array
layout(1)
# Create a table of Cylinders frequencies
tbl <- table(Cars93$Cylinders)
# Horizontal barplot of these frequencies with transparent bars and no
# axis names; barplot()'s return value is kept in mids and used below as
# the vertical position of each bar's labels.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
mids <- barplot(tbl,
                horiz = TRUE,
                col = "transparent",
                names.arg = "")
# Add names labels with text(), at x = 20
text(20, mids, names(tbl))
# Add count labels with text(), at x = 35
text(35, mids, as.numeric(tbl))
# Default bubbleplot: circles whose radii come from sqrt(Price)
symbols(x = Cars93$Horsepower, y = Cars93$MPG.city,
        circles = sqrt(Cars93$Price))
# Same bubbleplot with the inches argument specified as 0.1
symbols(x = Cars93$Horsepower, y = Cars93$MPG.city,
        inches = 0.1,
        circles = sqrt(Cars93$Price))
# Open a PNG graphics device writing to the file we want to create
png(filename = 'bubbleplot.png')
# Re-create the bubbleplot from the last exercise on that device
symbols(x = Cars93$Horsepower, y = Cars93$MPG.city,
        inches = 0.1,
        circles = sqrt(Cars93$Price))
# Close the device to save the file and return to the interactive session
dev.off()
## png
## 2
# Verify that we have created the file
list.files(pattern = "png")
## [1] "bubbleplot.png" "point_key.png"
# Iliinsky and Steele color name vector
IScolors <- c("red", "green", "yellow", "blue",
              "black", "white", "pink", "cyan",
              "gray", "orange", "brown", "purple")
# Bar lengths for the barplot: 2 for the first six colors, 1 for the last six
barWidths <- c(rep(2, 6), rep(1, 6))
# Horizontal barplot with one bar per color, filled with and labelled by
# that color. Both vectors are reversed so that the first color is drawn last.
# TRUE/FALSE are spelled out because T and F are ordinary variables that can
# be reassigned.
barplot(
  rev(barWidths),
  horiz = TRUE,
  col = rev(IScolors),
  axes = FALSE,
  names.arg = rev(IScolors),
  las = 1)
# Iliinsky and Steele color name vector
IScolors <- c("red", "green", "yellow", "blue",
              "black", "white", "pink", "cyan",
              "gray", "orange", "brown", "purple")
# cylinderLevel holds the integer level codes of the Cylinders factor
# (the position of each level, not the number of cylinders)
cylinderLevel <- as.numeric(Cars93$Cylinders)
# Colored bubbleplot: both the circle sizes and the fill colors come from
# cylinderLevel, so the conversion is done once instead of being repeated
symbols(
  Cars93$Horsepower, Cars93$MPG.city,
  circles = cylinderLevel,
  inches = 0.2,
  bg = IScolors[cylinderLevel])
# Two-way table: Cylinders in rows, Origin in columns
tbl <- table(Cars93$Cylinders, Cars93$Origin)
# Default stacked barplot of the table
barplot(height = tbl)
# Same stacked barplot with the bar segments colored from IScolors
barplot(height = tbl, col = IScolors)
# Load the insuranceData package
library(insuranceData)
# Use the data() function to load the dataCar data frame
data(dataCar)
head(dataCar)
## veh_value exposure clm numclaims claimcst0 veh_body veh_age gender area
## 1 1.06 0.3039014 0 0 0 HBACK 3 F C
## 2 1.03 0.6488706 0 0 0 HBACK 2 F A
## 3 3.26 0.5694730 0 0 0 UTE 2 F E
## 4 4.14 0.3175907 0 0 0 STNWG 2 F D
## 5 0.72 0.6488706 0 0 0 HBACK 4 F C
## 6 2.01 0.8542094 0 0 0 HDTOP 3 M C
## agecat X_OBSTAT_
## 1 2 01101 0 0 0
## 2 4 01101 0 0 0
## 3 2 01101 0 0 0
## 4 2 01101 0 0 0
## 5 2 01101 0 0 0
## 6 4 01101 0 0 0
# Attach tabplot with its startup messages suppressed
suppressPackageStartupMessages(
  library(tabplot)
)
# Default tableplot() display of the dataCar data frame
tableplot(dataCar)
# Load the lattice package
library(lattice)
# Conditional scatterplot: calories vs. sugars, conditioned on shelf
xyplot(x = calories ~ sugars | shelf, data = UScereal)
# Load the ggplot2 package
library(ggplot2)
# basePlot: Horsepower on x and MPG.city on y (nothing is drawn yet)
basePlot <- ggplot(data = Cars93,
                   mapping = aes(x = Horsepower, y = MPG.city))
# Basic scatterplot
basePlot + geom_point()
# Points colored by Cylinders value, using the IScolors entries
basePlot + geom_point(colour = IScolors[Cars93$Cylinders])
# Point sizes also vary with the Cylinders value
basePlot + geom_point(size = as.numeric(Cars93$Cylinders),
                      colour = IScolors[Cars93$Cylinders])