[Video]
# Plot every drink sample along a single axis: sugar content on x, y fixed
# at 0. Columns are referenced by bare name inside aes() so that ggplot2
# resolves them against the `data` argument (drink_samples) instead of the
# calling environment.
p <- ggplot(data = drink_samples) +
  geom_point(aes(x = sugar_content, y = 0))
# Label each point with its sugar content value. The label is mapped inside
# aes() so it stays aligned with the rows of the data; size, vjust and hjust
# control the text size and its placement relative to the point.
p <- p +
  geom_text(aes(x = sugar_content, y = 0, label = sugar_content),
            size = 2.5,
            vjust = 2,
            hjust = 0.5)
# Display plot
p
Decision boundaries
Let’s pick two points in the interval between the two clusters (8.8 to 10) as candidate boundaries: 9.1 and 9.7.
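Classification (decision) rules: for each candidate boundary, a sample is assigned to one class if its sugar content is below the boundary and to the other class if it is above.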
Let’s visualize them on the plot shown on the previous slide.
Decision boundaries - visualization code
Create a dataframe containing the two decision boundaries.
# Candidate decision boundaries, held in a one-column data frame (`sep`)
d_bounds <- data.frame(
  sep = c(9.1, 9.7)
)
Add to the plot using geom_point() and geom_text()
# Overlay the candidate decision boundaries on the existing plot as red
# points with red value labels. `sep` is referenced by bare name inside
# aes() so that it is resolved against each layer's own `data` (d_bounds).
p <- p +
  geom_point(data = d_bounds,
             aes(x = sep, y = 0),
             color = "red",
             size = 3) +
  geom_text(data = d_bounds,
            aes(x = sep, y = 0, label = sep),
            size = 2.5,
            vjust = 2,
            hjust = 0.5,
            color = "red")
# Display plot
p
Maximum margin separator
# Maximal margin separator: the midpoint of 8.8 and 10, held in a one-row
# data frame with a single column `sep`
mm_sep <- data.frame(
  sep = (8.8 + 10) / 2
)
# Add the maximal margin separator to the existing plot as a larger blue
# point. `sep` is referenced by bare name inside aes() so that it is
# resolved against the layer's own `data` (mm_sep).
p <- p +
  geom_point(data = mm_sep,
             aes(x = sep, y = 0),
             color = "blue",
             size = 4)
# Display plot
p
# Load ggplot2
library(ggplot2)
# Show the column names of df
colnames(df)
## [1] "sample." "sugar_content"
# One-dimensional scatterplot: sugar content on the x-axis, every point at y = 0
plot_df <- ggplot(df, aes(sugar_content, 0))
# Draw the points, then print each value just beneath its point
plot_df <- plot_df + geom_point()
plot_df <- plot_df +
  geom_text(aes(label = sugar_content), hjust = 0.5, vjust = 2, size = 2.5)
# Render the plot
plot_df
Based on the plot you created in the previous exercise (reproduced on the right), which of the following points is not a legitimate decision boundary?
# The maximal margin separator sits at the midpoint of the two extreme points
# of the clusters (8.9 and 10). Store it in a one-column data frame (`sep`).
separator <- data.frame(sep = (8.9 + 10) / 2)
# Keep the scalar value available under its own name as well
mm_separator <- separator[["sep"]]
# Add the separator to the sugar content scatterplot as a large blue point.
# The base plot is `plot_df` (the scatterplot assigned earlier in this file);
# the previous reference to `plot_` named an object that is never defined
# here. `sep` is read from the layer's own data frame (separator) rather
# than from a variable in the calling environment.
plot_sep <- plot_df +
  geom_point(data = separator, aes(x = sep, y = 0), color = "blue", size = 4)
# Display plot
plot_sep
[Video]
Overview of lesson
Generating a two-dimensional dataset using runif()
# Preliminaries
# Fix the RNG seed so the generated data are reproducible
set.seed(42)
# Number of data points to generate
n <- 200
# Data frame with two predictors drawn by runif(): x1 first, then x2
df <- data.frame(x1 = runif(n))
df$x2 <- runif(n)
Creating two classes: Create two classes, separated by the straight-line decision boundary x1 = x2. The line passes through (0,0) and makes a 45-degree angle with the horizontal. The class variable is y = -1 for points below the line and y = 1 for points above it.
# Label each point: -1 where x1 - x2 is positive, +1 otherwise. Stored as a
# factor with the explicit level order (-1, 1).
df[["y"]] <- factor(
  with(df, ifelse(x1 - x2 > 0, -1, 1)),
  levels = c(-1, 1)
)
Visualizing dataset using ggplot
#load ggplot2
library(ggplot2)
# Scatterplot of x2 against x1, coloured by class (red for -1, blue for 1),
# with the line x2 = x1 drawn on top
p <- ggplot(df, aes(x1, x2, color = y))
p <- p + geom_point()
p <- p + scale_color_manual(values = c("-1" = "red", "1" = "blue"))
p <- p + geom_abline(intercept = 0, slope = 1)
# Render it
p
Introducing a margin
# Size of the margin to carve out of the dataset
delta <- 0.05
# Keep only the rows whose |x1 - x2| exceeds delta (points outside the margin)
df1 <- df[with(df, abs(x1 - x2) > delta), ]
# Count the rows that remain
nrow(df1)
## [1] 180
# Replot the margin-filtered dataset (df1) with the same plot code as before.
# The colours are given as a named vector, matching the earlier plot, so each
# class level is tied to its colour by name rather than by position.
p <- ggplot(data = df1, aes(x = x1, y = x2, color = y)) +
  geom_point() +
  scale_color_manual(values = c("-1" = "red", "1" = "blue")) +
  geom_abline(slope = 1, intercept = 0)
# Display plot
p
Plotting the margin boundaries
The margin boundaries are parallel to the decision boundary (slope = 1) and located delta units on either side of it (delta = 0.05).
# Overlay the two margin boundaries as dashed lines of slope 1, with
# intercepts +delta and -delta
p <- p + geom_abline(intercept = delta, slope = 1, linetype = "dashed")
p <- p + geom_abline(intercept = -delta, slope = 1, linetype = "dashed")
# Render the plot
p
# Number of data points
n <- 600
# Seed the RNG for reproducibility
set.seed(42)
# Data frame with two uniformly distributed predictors drawn by runif():
# x1 first, then x2
df <- data.frame(x1 = runif(n))
df$x2 <- runif(n)
# Class label: -1 where x2 - 1.4 * x1 is negative, +1 otherwise. Stored as a
# factor with the explicit level order (-1, 1).
df[["y"]] <- factor(
  with(df, ifelse(x2 - 1.4 * x1 < 0, -1, 1)),
  levels = c(-1, 1)
)
# Margin size
delta <- 0.07
# Keep only the rows whose |1.4 * x1 - x2| exceeds delta (points outside the
# margin)
df1 <- df[with(df, abs(1.4 * x1 - x2) > delta), ]
# Scatterplot of the margin-filtered data (df1), coloured by class
plot_margins <- ggplot(df1, aes(x1, x2, color = y))
plot_margins <- plot_margins +
  geom_point() +
  scale_color_manual(values = c("red", "blue"))
# Decision boundary (solid) plus the two margin boundaries (dashed), all with
# slope 1.4; the dashed lines have intercepts +delta and -delta
plot_margins <- plot_margins +
  geom_abline(intercept = 0, slope = 1.4) +
  geom_abline(intercept = delta, slope = 1.4, linetype = "dashed") +
  geom_abline(intercept = -delta, slope = 1.4, linetype = "dashed")
# Render the plot
plot_margins
ggplot2 · plot() · tune.svm()
Michael is a hybrid thinker and doer—a byproduct of being a CliftonStrengths “Learner” over time. With 20+ years of engineering, design, and product experience, he helps organizations identify market needs, mobilize internal and external resources, and deliver delightful digital customer experiences that align with business goals. He has been entrusted with problem-solving for brands—ranging from Fortune 500 companies to early-stage startups to not-for-profit organizations.
Michael earned his BS in Computer Science from New York Institute of Technology and his MBA from the University of Maryland, College Park. He is also a candidate to receive his MS in Applied Analytics from Columbia University.
LinkedIn | Twitter | www.michaelmallari.com/data | www.columbia.edu/~mm5470