library(ISLR)
## Warning: package 'ISLR' was built under R version 3.3.3
str(Wage)
## 'data.frame': 3000 obs. of 12 variables:
## $ year : int 2006 2004 2003 2003 2005 2008 2009 2008 2006 2004 ...
## $ age : int 18 24 45 43 50 54 44 30 41 52 ...
## $ sex : Factor w/ 2 levels "1. Male","2. Female": 1 1 1 1 1 1 1 1 1 1 ...
## $ maritl : Factor w/ 5 levels "1. Never Married",..: 1 1 2 2 4 2 2 1 1 2 ...
## $ race : Factor w/ 4 levels "1. White","2. Black",..: 1 1 1 3 1 1 4 3 2 1 ...
## $ education : Factor w/ 5 levels "1. < HS Grad",..: 1 4 3 4 2 4 3 3 3 2 ...
## $ region : Factor w/ 9 levels "1. New England",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ jobclass : Factor w/ 2 levels "1. Industrial",..: 1 2 1 2 2 2 1 2 2 2 ...
## $ health : Factor w/ 2 levels "1. <=Good","2. >=Very Good": 1 2 1 2 1 2 2 1 2 2 ...
## $ health_ins: Factor w/ 2 levels "1. Yes","2. No": 2 2 1 1 1 1 1 1 1 1 ...
## $ logwage : num 4.32 4.26 4.88 5.04 4.32 ...
## $ wage : num 75 70.5 131 154.7 75 ...
# Build Linear Model: lm_wage
lm_wage <- lm(wage ~ age, data = Wage)
unseen <- data.frame(age = 60)
# Predict the wage of a 60-year-old worker
predict(lm_wage, unseen)
## 1
## 124.1413
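To see what predict() does here, the same number can be computed by hand from the fitted coefficients (a quick check using the model fit above):
# The prediction is simply intercept + slope * age
coef(lm_wage)
coef(lm_wage)["(Intercept)"] + coef(lm_wage)["age"] * 60  # matches predict(lm_wage, unseen)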
Classification predicts the category of a new observation:
Earlier observations -> (estimate) -> Classifier
Unseen data -> (Classifier) -> Class
Regression predicts a numeric response from predictors:
Predictors -> (Regression Function) -> Response
Example: is the relationship between height and weight linear? If so, a fitted regression function can predict height from weight.
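As a minimal sketch of that regression idea (the weight/height values below are invented for illustration, not taken from a real dataset):
# Hypothetical example: predict height (cm) from weight (kg)
weight <- c(60, 72, 80, 55, 90, 68)
height <- c(165, 175, 180, 160, 186, 172)
height_lm <- lm(height ~ weight)
predict(height_lm, data.frame(weight = 75))  # predicted height for a 75 kg person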
emails <- data.frame(
  avg_capital_seq = c(1.000, 2.112, 4.123, 1.863, 2.973, 1.687, 5.891,
                      3.167, 1.230, 2.441, 3.555, 3.250, 1.333),
  spam = c(0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1)
)
spam_classifier <- function(x) {
  prediction <- rep(NA, length(x))  # initialize prediction vector
  prediction[x > 4] <- 1
  prediction[x >= 3 & x <= 4] <- 0
  prediction[x >= 2.2 & x < 3] <- 1
  prediction[x >= 1.4 & x < 2.2] <- 0
  prediction[x > 1.25 & x < 1.4] <- 1
  prediction[x <= 1.25] <- 0
  return(prediction)  # prediction is either 0 (ham) or 1 (spam)
}
# Apply the classifier to the avg_capital_seq column: spam_pred
spam_pred <- spam_classifier(emails$avg_capital_seq)
# Compare spam_pred to emails$spam. Use ==
emails$spam == spam_pred
## [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
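Instead of eyeballing the logical vector, the confusion matrix and accuracy can be computed directly (a small sketch in base R):
# Confusion matrix: rows = truth, columns = prediction
table(truth = emails$spam, prediction = spam_pred)
# Accuracy: fraction of emails classified correctly
mean(spam_pred == emails$spam)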
linkedin <- c(5,7,4,9,11,10,14,17,13,11,18,17,21,21,24,23,28,35,21,27,23)
days <- 1:21
# Fit a linear model of LinkedIn views per day: linkedin_lm
linkedin_lm <- lm(linkedin ~ days)
# Predict the number of views for the next three days: linkedin_pred
future_days <- data.frame(days = 22:24)
linkedin_pred <- predict(linkedin_lm, future_days)
# Plot historical data and predictions
plot(linkedin ~ days, xlim = c(1, 24))
points(22:24, linkedin_pred, col = "green")
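To judge the fit visually, the regression line can be added to the plot above (assuming the plot is still the active graphics device):
# Overlay the fitted regression line
abline(linkedin_lm, col = "red")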
Clustering tries to group similar objects together, without any prior knowledge of what those groups could or should look like. Because no labels are involved, the notions of prior knowledge and unseen observations are less meaningful for clustering than for classification and regression.
set.seed(1)
my_iris <- iris[-5]
species <- iris$Species
# Perform k-means clustering on my_iris: kmeans_iris
kmeans_iris <- kmeans(my_iris, 3)
# Compare the actual Species to the clustering using table()
table(species, kmeans_iris$cluster)
##
## species 1 2 3
## setosa 50 0 0
## versicolor 0 2 48
## virginica 0 36 14
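Cluster numbers are arbitrary labels, so one rough way to quantify the agreement is to count, for each species, the observations that fall in its dominant cluster (a quick sketch):
# Rough cluster purity: (50 + 48 + 36) / 150
conf <- table(species, kmeans_iris$cluster)
sum(apply(conf, 1, max)) / sum(conf)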
# Plot Petal.Length against Petal.Width, colored by cluster
plot(Petal.Length ~ Petal.Width, data = my_iris, col = kmeans_iris$cluster)
library(rpart)
## Warning: package 'rpart' was built under R version 3.3.3
tree <- rpart(Species ~ Sepal.Length + Sepal.Width + Petal.Length + Petal.Width,
data = iris, method = "class")
# A dataframe containing unseen observations
unseen <- data.frame(Sepal.Length = c(5.3, 7.2),
Sepal.Width = c(2.9, 3.9),
Petal.Length = c(1.7, 5.4),
Petal.Width = c(0.8, 2.3))
# Predict the label of the unseen observations. Print out the result.
predict(tree, unseen, type="class")
## 1 2
## setosa virginica
## Levels: setosa versicolor virginica
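The fitted tree itself can be inspected with the base rpart plotting functions (a minimal sketch; packages such as rpart.plot give prettier output):
# Draw the decision tree and label its splits
plot(tree, margin = 0.1)
text(tree, use.n = TRUE)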
set.seed(1)
str(cars)
## 'data.frame': 50 obs. of 2 variables:
## $ speed: num 4 4 7 7 8 9 10 10 10 11 ...
## $ dist : num 2 10 4 22 16 10 18 26 34 17 ...
summary(cars)
## speed dist
## Min. : 4.0 Min. : 2.00
## 1st Qu.:12.0 1st Qu.: 26.00
## Median :15.0 Median : 36.00
## Mean :15.4 Mean : 42.98
## 3rd Qu.:19.0 3rd Qu.: 56.00
## Max. :25.0 Max. :120.00
# Group the dataset into two clusters: km_cars
km_cars <- kmeans(cars, 2)
km_cars$cluster
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 1 1 2 1 1 1 1 2 1 2 2 2
## [36] 1 1 2 1 1 2 2 2 2 2 2 2 2 2 2
plot(cars, col = km_cars$cluster)
km_cars$centers
## speed dist
## 1 12.84375 27.15625
## 2 19.94444 71.11111
points(km_cars$centers, pch = 22, bg = c(1, 2), cex = 2)
A centroid is the center of a cluster: the mean of all observations assigned to that cluster.
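This can be checked directly, since kmeans() reports the cluster means as its centers:
# Mean speed and dist of the cars in cluster 1 -- matches km_cars$centers[1, ]
colMeans(cars[km_cars$cluster == 1, ])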