Generate Dataset
library(ggplot2)
# The relationship between x and y
f <- function(x)
x^2/10+0.1
# Parameters of Weibull distributions
shape.1 <- 1.5
scale.1 <- 1
shape.2 <- 3
scale.2 <- 1.5
# Generate datasets
noise.sd <- 0.2
x.1 <- rweibull(100, shape=shape.1, scale=scale.1)
x.2 <- rweibull(100, shape=shape.2, scale=scale.2)
y.1 <- f(x.1) + rnorm(100, mean=0, sd=noise.sd)
y.2 <- f(x.2) + rnorm(100, mean=0, sd=noise.sd)
data.1 <- data.frame(x=x.1, y=y.1, dataset=rep('data.1', length(x.1)))
data.2 <- data.frame(x=x.2, y=y.2, dataset=rep('data.2', length(x.2)))
Test Models
# Return MSE of model trained on train.set and tested on test.set
CalculateMSE <- function(model, train.set, test.set) {
model <- lm(model, train.set)
y.hat <- predict(model, test.set)
mean((y.hat - test.set$y)^2)
}
# Specify models to try
models <- list(linear = y ~ x,
quadratic = y ~ poly(x,2),
cubic = y ~ poly(x,3))
datasets <- list(data.1, data.2)
Test models intra- data.1 and data.2
# Initialize matrix
intra.MSE <- matrix(nrow=length(models), ncol=3,
dimnames=list(names(models), c('data.1', 'data.2', 'avg')))
# Populate matrix
for (i in 1:length(models)) {
model <- models[[i]]
for (j in 1:2) {
dataset <- datasets[[j]]
half.length <- round(nrow(dataset)/2)
train.set <- dataset[1:half.length,]
test.set <- dataset[(half.length+1):nrow(dataset),]
intra.MSE[i,j] <- CalculateMSE(model, train.set, test.set)
}
# Calculate mean
intra.MSE[i, 'avg'] <- mean(intra.MSE[i, 1:2])
}
intra.MSE
## data.1 data.2 avg
## linear 0.03906327 0.04613189 0.04259758
## quadratic 0.03861710 0.05045887 0.04453798
## cubic 0.03859613 0.04973046 0.04416330
Try models inter- data.1 and data.2
# The title of a column is the training set
# Initialize matrix
inter.MSE <- matrix(nrow=length(models), ncol=3,
dimnames=list(names(models), c('data.1', 'data.2', 'avg')))
# Populate matrix
for (i in 1:length(models)) {
model <- models[[i]]
for (j in 1:2) {
train.set <- datasets[[j]]
test.set <- datasets[[3-j]]
inter.MSE[i,j] <- CalculateMSE(model, train.set, test.set)
}
# Calculate mean
inter.MSE[i, 'avg'] <- mean(intra.MSE[i, 1:2])
}
inter.MSE
## data.1 data.2 avg
## linear 0.05763976 0.04388241 0.04259758
## quadratic 0.05882447 0.04394334 0.04453798
## cubic 0.05859174 0.10625345 0.04416330
Plot
# Datasets plots
data <- rbind(data.1, data.2)
# Distribution plots
x <- seq(0,4, length.out=100)
for.plot.1 <- data.frame(x=x, y=dweibull(x, shape=shape.1, scale=scale.1),
dataset=rep("data.1", length(x)))
for.plot.2 <- data.frame(x=x, y=dweibull(x, shape=shape.2, scale=scale.2),
dataset=rep("data.2", length(x)))
for.plot <- rbind(for.plot.1, for.plot.2)
# Relationship plot
rel <- data.frame(x=x, y=f(x))
# Plot
ggplot() +
geom_line(data=for.plot, aes(x, y, colour=dataset)) +
geom_line(data=rel, aes(x, y), linetype="dashed") +
geom_point(data=data, aes(x, y, colour=dataset), alpha=0.25) +
xlim(0, 4) +
ylim(0, 2)
## Warning: Removed 31 rows containing missing values (geom_point).
