Generate Dataset

library(ggplot2)

# The relationship between x and y
f <- function(x)
  x^2/10+0.1

# Parameters of Weibull distributions
shape.1 <- 1.5
scale.1 <- 1
shape.2 <- 3
scale.2 <- 1.5

# Generate datasets
noise.sd <- 0.2
x.1 <- rweibull(100, shape=shape.1, scale=scale.1)
x.2 <- rweibull(100, shape=shape.2, scale=scale.2)
y.1 <- f(x.1) + rnorm(100, mean=0, sd=noise.sd)
y.2 <- f(x.2) + rnorm(100, mean=0, sd=noise.sd)
data.1 <- data.frame(x=x.1, y=y.1, dataset=rep('data.1', length(x.1)))
data.2 <- data.frame(x=x.2, y=y.2, dataset=rep('data.2', length(x.2)))

Test Models

# Return MSE of model trained on train.set and tested on test.set
CalculateMSE <- function(model, train.set, test.set) {
  model <- lm(model, train.set)
  y.hat <- predict(model, test.set)
  mean((y.hat - test.set$y)^2)
}

# Specify models to try
models <- list(linear = y ~ x,
               quadratic = y ~ poly(x,2),
               cubic = y ~ poly(x,3))

datasets <- list(data.1, data.2)

Test models intra- data.1 and data.2

# Initialize matrix
intra.MSE <- matrix(nrow=length(models), ncol=3, 
                     dimnames=list(names(models), c('data.1', 'data.2', 'avg')))
# Populate matrix
for (i in 1:length(models)) {
  model <- models[[i]]
  for (j in 1:2) {
    dataset <- datasets[[j]]
    half.length <- round(nrow(dataset)/2)
    train.set <- dataset[1:half.length,]
    test.set <- dataset[(half.length+1):nrow(dataset),]
    intra.MSE[i,j] <- CalculateMSE(model, train.set, test.set)
  }
  # Calculate mean
  intra.MSE[i, 'avg'] <- mean(intra.MSE[i, 1:2])
}

intra.MSE
##               data.1     data.2        avg
## linear    0.03906327 0.04613189 0.04259758
## quadratic 0.03861710 0.05045887 0.04453798
## cubic     0.03859613 0.04973046 0.04416330

Try models inter- data.1 and data.2

# The title of a column is the training set

# Initialize matrix
inter.MSE <- matrix(nrow=length(models), ncol=3, 
                     dimnames=list(names(models), c('data.1', 'data.2', 'avg')))
# Populate matrix
for (i in 1:length(models)) {
  model <- models[[i]]
  for (j in 1:2) {
    train.set <- datasets[[j]]
    test.set <- datasets[[3-j]]
    inter.MSE[i,j] <- CalculateMSE(model, train.set, test.set)
  }
  # Calculate mean
  inter.MSE[i, 'avg'] <- mean(intra.MSE[i, 1:2])
}

inter.MSE
##               data.1     data.2        avg
## linear    0.05763976 0.04388241 0.04259758
## quadratic 0.05882447 0.04394334 0.04453798
## cubic     0.05859174 0.10625345 0.04416330

Plot

# Datasets plots
data <- rbind(data.1, data.2)

# Distribution plots
x <- seq(0,4, length.out=100)
for.plot.1 <- data.frame(x=x, y=dweibull(x, shape=shape.1, scale=scale.1), 
                         dataset=rep("data.1", length(x)))
for.plot.2 <- data.frame(x=x, y=dweibull(x, shape=shape.2, scale=scale.2),
                         dataset=rep("data.2", length(x)))
for.plot <- rbind(for.plot.1, for.plot.2)

# Relationship plot
rel <- data.frame(x=x, y=f(x))

# Plot
ggplot() +
  geom_line(data=for.plot, aes(x, y, colour=dataset)) +
  geom_line(data=rel, aes(x, y), linetype="dashed") +
  geom_point(data=data, aes(x, y, colour=dataset), alpha=0.25) +
  xlim(0, 4) +
  ylim(0, 2)
## Warning: Removed 31 rows containing missing values (geom_point).