Consider the aircraft data (aircraft.csv) with variables Year, Period, Power, Span, Length, Weight, Speed and Range.
These data record six characteristics of aircraft designs which appeared during the twentieth century. The variables are: - Year: year of first manufacture - Period: a code to indicate one of three broad time periods - Power: total engine power (kW) - Span: wing span (m) - Length: length (m) - Weight: maximum take-off weight (kg) - Speed: maximum speed (km/h) - Range: range (km)
Source: The data were collected by P. Saviotti and are described in detail in Saviotti (1996), “Technological Evolution, Variety and Economy”, Edward Elgar: Cheltenham.
Read the data into R, and summarise it to understand the inputs. Are there any missing values? (Hint: read.csv, head and summary.) aircraft<-read.csv(“aircraft.csv”) head aircraft
Show separate histogram of Power and log10( Power ). Describe the shape of the histograms. (Hint: truehist (from MASS) or ggplot with geom_histogram.)
Construct smoothed histograms of Power and log10( Power ). (Hint: ggplot + geom_density.)
Create new variables logPower equal to log10( Power ) and similarly for the remaining variables, apart from Period and Year. Do this by creating a new dataframe with these variables plus Period and Year. Make Period a factor on this new dataframe. Why might the log transformation be a good idea to help in understanding these data? (Note that we have specified log10 rather than the natural log, because this is easier to interpret in the circumstances we have.) (Hint: Can be done directly but could use mutate from dplyr, which is part of tidyverse.)
Show pairwise plots of the logged variables. Which pair has the largest absolute correlation? (Hint: pairs or ggpairs.)
The code chunk below generates a 3D scatterplot of the variables logPower, logSpan and logLength. Comment on the scatterplots in relation to what it says about the variables, and discuss the shortcomings of this sort of display. Experiment with changing the viewing angles phi and theta.
The code chunk below makes a parallel coordinate plot of the 6 logged variables, coloured by the value of Period, and with boxplots and suitable transparency. It is arranged so that the values are not scaled. Comment on what this figure says about the data, and discuss any shortcomings of the plotting method.
Remove the “, eval = FALSE” before running knitting in RStudio.
library( plot3D )
scatter3D (aircraft$logWeight, aircraft$logSpan, aircraft$logLength,
phi = 20, theta = 80,
col = NULL, NAcol = "white", breaks = NULL,
colkey = NULL, panel.first = NULL,
clim = NULL, clab = NULL,
bty = "b", CI = NULL, surf = NULL,
add = FALSE, plot = TRUE)
Remove the “, eval = FALSE” before running knitting in RStudio.
library(GGally)
ggparcoord( aircraft,
groupColumn = "Period",
columns = 3:8,
alpha = 0.1,
scale = "globalminmax",
boxplot = TRUE ) +
theme( legend.position = "bottom")
Continue with the aircraft data of Q1 (using the logged variables).
The data are from three different periods.
(Hints for these: summary, xtabs, sapply, filter.)
# density plots
ggplot( aircraft, aes( logPower, group = Period, colour = Period ) ) +
geom_density( aes( fill = Period ), alpha = 0.4 ) +
ggtitle("All data"))
Continue with the aircraft data of Q2.
ggplot( aircraft, aes( logSpeed, logSpan ) ) +
geom_density_2d( ) +
geom_point( aes( colour = Period ), alpha = 0.6 ) +
theme( legend.position = "bottom")
library(tidyverse)
library(ggplot2)
library(MASS)
library(GGally)
library( plot3D )
library( ggplot2 )
aircraft0 = read.csv( "aircraft.csv" )
head( aircraft0 )
summary( aircraft0 )
## Year Period Power Span Length Weight
## Min. :14.00 Min. :1.00 Min. : 30 Min. : 6.68 Min. : 5.69 Min. : 481
## 1st Qu.:34.00 1st Qu.:1.00 1st Qu.: 421 1st Qu.:11.00 1st Qu.: 9.02 1st Qu.: 2386
## Median :42.00 Median :2.00 Median : 1014 Median :14.30 Median :11.96 Median : 5500
## Mean :46.27 Mean :2.02 Mean : 4062 Mean :18.13 Mean :15.51 Mean : 17236
## 3rd Qu.:61.00 3rd Qu.:3.00 3rd Qu.: 3080 3rd Qu.:22.10 3rd Qu.:17.84 3rd Qu.: 14000
## Max. :84.00 Max. :3.00 Max. :80000 Max. :70.10 Max. :75.54 Max. :365000
## Speed Range
## Min. : 105.0 Min. : 80
## 1st Qu.: 259.0 1st Qu.: 850
## Median : 408.0 Median : 1400
## Mean : 561.3 Mean : 2039
## 3rd Qu.: 655.0 3rd Qu.: 2400
## Max. :3219.0 Max. :20120
library( MASS )
truehist( aircraft0$Power ) # uses MASS::truehist
library( ggplot2 )
ggplot( aircraft0, aes( Power ) ) + geom_histogram() # example, from ggplot2
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot( aircraft0, aes( log10( Power) ) ) + geom_histogram() # example, from ggplot2
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot( aircraft0, aes( Power ) ) + geom_histogram() + scale_x_log10() # example, from ggplot2
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot( aircraft0, aes( Power ) ) + geom_density(fill = "blue")
ggplot( aircraft0, aes( log10( Power ) ) ) + geom_density(fill = "blue")
# Using dplyr
# names( aircraft0 )
# "Year" "Period" "Power" "Span" "Length" "Weight" "Speed" "Range"
aircraft = mutate( aircraft0,
Period = factor( Period),
logPower = log10( Power ),
logSpan = log10( Span ),
logLength = log10( Length ),
logWeight = log10( Weight ),
logSpeed = log10( Speed ),
logRange = log10( Range )
) %>%
dplyr::select( Year, Period, starts_with("log") )
summary( aircraft )
## Year Period logPower logSpan logLength logWeight logSpeed
## Min. :14.00 1:216 Min. :1.477 Min. :0.8248 Min. :0.7551 Min. :2.682 Min. :2.021
## 1st Qu.:34.00 2:263 1st Qu.:2.624 1st Qu.:1.0414 1st Qu.:0.9552 1st Qu.:3.378 1st Qu.:2.413
## Median :42.00 3:230 Median :3.006 Median :1.1553 Median :1.0777 Median :3.740 Median :2.611
## Mean :46.27 Mean :3.084 Mean :1.2038 Mean :1.1265 Mean :3.801 Mean :2.637
## 3rd Qu.:61.00 3rd Qu.:3.489 3rd Qu.:1.3444 3rd Qu.:1.2514 3rd Qu.:4.146 3rd Qu.:2.816
## Max. :84.00 Max. :4.903 Max. :1.8457 Max. :1.8782 Max. :5.562 Max. :3.508
## logRange
## Min. :1.903
## 1st Qu.:2.929
## Median :3.146
## Mean :3.172
## 3rd Qu.:3.380
## Max. :4.304
# alternatively (more typing)
aircraft = data.frame( Year = aircraft0$Year,
Period = factor( aircraft0$Period))
aircraft$logPower = log10( aircraft0$Power )
aircraft$logSpan = log10( aircraft0$Span )
aircraft$logLength = log10( aircraft0$Length )
aircraft$logWeight = log10( aircraft0$Weight )
aircraft$logSpeed = log10( aircraft0$Speed )
aircraft$logRange = log10( aircraft0$Range )
summary( aircraft )
## Year Period logPower logSpan logLength logWeight logSpeed
## Min. :14.00 1:216 Min. :1.477 Min. :0.8248 Min. :0.7551 Min. :2.682 Min. :2.021
## 1st Qu.:34.00 2:263 1st Qu.:2.624 1st Qu.:1.0414 1st Qu.:0.9552 1st Qu.:3.378 1st Qu.:2.413
## Median :42.00 3:230 Median :3.006 Median :1.1553 Median :1.0777 Median :3.740 Median :2.611
## Mean :46.27 Mean :3.084 Mean :1.2038 Mean :1.1265 Mean :3.801 Mean :2.637
## 3rd Qu.:61.00 3rd Qu.:3.489 3rd Qu.:1.3444 3rd Qu.:1.2514 3rd Qu.:4.146 3rd Qu.:2.816
## Max. :84.00 Max. :4.903 Max. :1.8457 Max. :1.8782 Max. :5.562 Max. :3.508
## logRange
## Min. :1.903
## 1st Qu.:2.929
## Median :3.146
## Mean :3.172
## 3rd Qu.:3.380
## Max. :4.304
pairs( aircraft[ , 3:8 ] )
cor( aircraft[ , 3:8 ])
## logPower logSpan logLength logWeight logSpeed logRange
## logPower 1.0000000 0.57436142 0.8556267 0.9477214 0.80368463 0.6835478
## logSpan 0.5743614 1.00000000 0.8496270 0.7699374 0.04056524 0.6200422
## logLength 0.8556267 0.84962700 1.0000000 0.9534497 0.46575591 0.7185541
## logWeight 0.9477214 0.76993744 0.9534497 1.0000000 0.63055999 0.7463050
## logSpeed 0.8036846 0.04056524 0.4657559 0.6305600 1.00000000 0.4645887
## logRange 0.6835478 0.62004216 0.7185541 0.7463050 0.46458873 1.0000000
library( GGally )
ggpairs( aircraft, columns = 3:8, progress = FALSE )
library( plot3D )
scatter3D (aircraft$logWeight, aircraft$logSpan, aircraft$logLength, phi = 20, theta = 80,
col = NULL, NAcol = "white", breaks = NULL,
colkey = NULL, panel.first = NULL,
clim = NULL, clab = NULL,
bty = "b", CI = NULL, surf = NULL,
add = FALSE, plot = TRUE)
library(GGally)
ggparcoord( aircraft, groupColumn = "Period", columns = 3:8, alpha = 0.1, scale = "globalminmax", boxplot = TRUE ) + theme( legend.position = "bottom")
summary( aircraft$Period )
## 1 2 3
## 216 263 230
xtabs( ~ Period + Year, data = aircraft )
## Year
## Period 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48
## 1 3 4 12 15 8 2 4 3 5 9 5 7 10 8 12 10 7 8 18 18 23 25 0 0 0 0 0 0 0 0 0 0 0 0 0
## 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 14 24 22 23 25 14 20 6 25 13 3 7 9
## 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## Year
## Period 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83
## 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 2 11 3 5 9 9 11 10 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 3 0 0 0 0 0 0 0 16 6 5 14 4 12 13 9 8 15 9 13 7 10 9 10 10 3 9 6 6 8 11 5 3 2 5 1
## Year
## Period 84
## 1 0
## 2 0
## 3 1
summary( aircraft )
## Year Period logPower logSpan logLength logWeight logSpeed
## Min. :14.00 1:216 Min. :1.477 Min. :0.8248 Min. :0.7551 Min. :2.682 Min. :2.021
## 1st Qu.:34.00 2:263 1st Qu.:2.624 1st Qu.:1.0414 1st Qu.:0.9552 1st Qu.:3.378 1st Qu.:2.413
## Median :42.00 3:230 Median :3.006 Median :1.1553 Median :1.0777 Median :3.740 Median :2.611
## Mean :46.27 Mean :3.084 Mean :1.2038 Mean :1.1265 Mean :3.801 Mean :2.637
## 3rd Qu.:61.00 3rd Qu.:3.489 3rd Qu.:1.3444 3rd Qu.:1.2514 3rd Qu.:4.146 3rd Qu.:2.816
## Max. :84.00 Max. :4.903 Max. :1.8457 Max. :1.8782 Max. :5.562 Max. :3.508
## logRange
## Min. :1.903
## 1st Qu.:2.929
## Median :3.146
## Mean :3.172
## 3rd Qu.:3.380
## Max. :4.304
sapply( aircraft[,-2], IQR )
## Year logPower logSpan logLength logWeight logSpeed logRange
## 27.0000000 0.8642686 0.3029996 0.2961883 0.7684576 0.4029415 0.4507923
lapply( aircraft[,-2], IQR )
## $Year
## [1] 27
##
## $logPower
## [1] 0.8642686
##
## $logSpan
## [1] 0.3029996
##
## $logLength
## [1] 0.2961883
##
## $logWeight
## [1] 0.7684576
##
## $logSpeed
## [1] 0.4029415
##
## $logRange
## [1] 0.4507923
sapply( aircraft[,-2], median )
## Year logPower logSpan logLength logWeight logSpeed logRange
## 42.000000 3.006038 1.155336 1.077731 3.740363 2.610660 3.146128
sapply( aircraft[,-2], mean) > sapply( aircraft[,-2], median )
## Year logPower logSpan logLength logWeight logSpeed logRange
## TRUE TRUE TRUE TRUE TRUE TRUE TRUE
sapply( aircraft[,-2], mean) > 1.04 * sapply( aircraft[,-2], median )
## Year logPower logSpan logLength logWeight logSpeed logRange
## TRUE FALSE TRUE TRUE FALSE FALSE FALSE
aircraft.per1 = filter( aircraft, Period == "1" )
aircraft.per2 = filter( aircraft, Period == "2" )
aircraft.per3 = filter( aircraft, Period == "3" )
ggplot(data = aircraft.per1, mapping = aes(x = logPower, y=..density..)) +
geom_histogram(binwidth=0.01) +
geom_density(colour = "blue", alpha=0.5 )+ ggtitle("Period 1")
ggplot(data = aircraft.per2, mapping = aes(x = logPower, y=..density..)) +
geom_histogram(binwidth=0.01) +
geom_density(colour = "blue", alpha=0.5 )+ ggtitle("Period 2")
ggplot(data = aircraft.per3, mapping = aes(x = logPower, y=..density..)) +
geom_histogram(binwidth=0.01) +
geom_density(colour = "blue", alpha=0.5 )+ ggtitle("Period 3")
ggplot(data = aircraft.per1, mapping = aes(x = logLength, y=..density..)) +
geom_histogram(binwidth=0.05) +
geom_density(colour = "blue", alpha=0.5 )+ ggtitle("Period 1")
ggplot(data = aircraft.per2, mapping = aes(x = logLength, y=..density..)) +
geom_histogram(binwidth=0.05) +
geom_density(colour = "blue", alpha=0.5 )+ ggtitle("Period 2")
ggplot(data = aircraft.per3, mapping = aes(x = logLength, y=..density..)) +
geom_histogram(binwidth=0.05) +
geom_density(colour = "blue", alpha=0.5 )+ ggtitle("Period 3")
aircraft.per1 = filter( aircraft, Period == "1" )
aircraft.per2 = filter( aircraft, Period == "2" )
aircraft.per3 = filter( aircraft, Period == "3" )
ggplot(data = aircraft, mapping = aes(x = logPower, y=..density..)) +
geom_histogram(binwidth=0.05) +
geom_density(colour = "blue", alpha=0.5 )
ggplot(data = aircraft.per1, mapping = aes(x = logPower, y=..density..)) +
geom_histogram(binwidth=0.1) +
geom_density(colour = "blue", alpha=0.5 )
ggplot(data = aircraft.per2, mapping = aes(x = logPower, y=..density..)) +
geom_histogram(binwidth=0.1) +
geom_density(colour = "blue", alpha=0.5 )
ggplot(data = aircraft.per3, mapping = aes(x = logPower, y=..density..)) +
geom_histogram(binwidth=0.1) +
geom_density(colour = "blue", alpha=0.5 )
# histograms
ggplot( aircraft, aes( logPower ) ) + geom_histogram() + ggtitle("All data")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot( aircraft.per1, aes( logPower ) ) + geom_histogram() + ggtitle("Period 1")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot( aircraft.per2, aes( logPower ) ) + geom_histogram() + ggtitle("Period 2")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot( aircraft.per3, aes( logPower ) ) + geom_histogram() + ggtitle("Period 3")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot( aircraft.per1, aes( logLength ) ) + geom_histogram() + ggtitle("Period 1")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot( aircraft.per2, aes( logLength ) ) + geom_histogram() + ggtitle("Period 2")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot( aircraft.per3, aes( logLength ) ) + geom_histogram() + ggtitle("Period 3")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# density plots
ggplot( aircraft, aes( logPower, group = Period, colour = Period ) ) + geom_density( aes( fill = Period ), alpha = 0.4 ) + ggtitle("All data")
# pairwise plots
pairs( aircraft.per1[ , 3:8 ] )
ggpairs( aircraft.per1, columns = 3:8, progress = FALSE )
pairs( aircraft.per2[ , 3:8 ] )
ggpairs( aircraft.per2, columns = 3:8, progress = FALSE )
pairs( aircraft.per3[ , 3:8 ] )
ggpairs( aircraft.per3, columns = 3:8, progress = FALSE )
#
scatter3D (aircraft.per1$logWeight, aircraft.per1$logSpan, aircraft.per1$logLength, phi = 20, theta = 80,
col = NULL, NAcol = "white", breaks = NULL,
colkey = NULL, panel.first = NULL,
clim = NULL, clab = NULL,
bty = "b", CI = NULL, surf = NULL,
add = FALSE, plot = TRUE)
#
# scatter3D (aircraft.per2$logWeight, aircraft.per2$logSpan, aircraft.per2$logLength, phi = 20, theta = 80,
# col = NULL, NAcol = "white", breaks = NULL,
# colkey = NULL, panel.first = NULL,
# clim = NULL, clab = NULL,
# bty = "b", CI = NULL, surf = NULL,
# add = FALSE, plot = TRUE)
#
# scatter3D (aircraft.per3$logWeight, aircraft.per3$logSpan, aircraft.per3$logLength, phi = 20, theta = 80,
# col = NULL, NAcol = "white", breaks = NULL,
# colkey = NULL, panel.first = NULL,
# clim = NULL, clab = NULL,
# bty = "b", CI = NULL, surf = NULL,
# add = FALSE, plot = TRUE)
ggparcoord( aircraft.per1, columns = c(1,3:8), alpha = 0.3, scale = "uniminmax" ) + ggtitle("Period 1")
ggparcoord( aircraft.per2, columns = c(1,3:8), alpha = 0.3, scale = "uniminmax" ) + ggtitle("Period 2")
# ggparcoord( aircraft.per3, columns = c(1,3:8), alpha = 0.3, scale = "uniminmax" ) + ggtitle("Period 3")
#
ggparcoord( aircraft.per1, columns = c(3:8), groupColumn = "Year", alpha = 0.3, scale = "globalminmax", boxplot = TRUE ) + ggtitle("Period 1")
ggparcoord( aircraft.per2, columns = c(3:8), groupColumn = "Year", alpha = 0.3, scale = "globalminmax", boxplot = TRUE ) + ggtitle("Period 2")
# ggparcoord( aircraft.per3, columns = c(3:8), groupColumn = "Year", alpha = 0.3, scale = "globalminmax", boxplot = TRUE ) + ggtitle("Period 3")
ggplot( aircraft, aes( logPower, logWeight )) + geom_density_2d( ) + geom_point( aes( colour = Period ), alpha = 0.6 ) + theme( legend.position = "bottom")
ggplot( aircraft, aes( logSpeed, logLength )) + geom_density_2d( )+ geom_point( aes( colour = Period ), alpha = 0.6 ) + theme( legend.position = "bottom")