### KNN!
#### We will explore how to implement the KNN algorithm for classification.
#### Application #1: Stock Market Data!
#### This data set consists of percentage returns for the S&P 500 stock index over 1,250 days, from the beginning of 2001 until the end of 2005.
#### For each date, the percentage returns for each of the five previous trading days, Lag1 through Lag5, are recorded.
#### Also recorded are Volume (the number of shares traded on the previous day, in billions), Today (the percentage return on the date in question) and Direction (whether the market was Up or Down on this date).
library("tidyverse")
## ── Attaching packages ───────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2 ✓ purrr 0.3.3
## ✓ tibble 2.1.3 ✓ dplyr 0.8.3
## ✓ tidyr 1.0.2 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.4.0
## ── Conflicts ──────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library("ggplot2")
library("dplyr")
library("magrittr")
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
##
## set_names
## The following object is masked from 'package:tidyr':
##
## extract
# Load ISLR, which supplies the Smarket data set, and list its columns.
library(ISLR)
names(Smarket)
## [1] "Year" "Lag1" "Lag2" "Lag3" "Lag4" "Lag5"
## [7] "Volume" "Today" "Direction"
# Put Smarket's columns on the search path. The code below refers to Year,
# Lag1, Lag2 and Direction without the `Smarket$` prefix and depends on this
# call. NOTE: attach() exposes a copy, so columns added to Smarket afterwards
# are only reachable through `Smarket$...`.
attach(Smarket)
# class provides knn(), used for the models below.
library(class)
# Preview the first rows.
head(Smarket)
## Year Lag1 Lag2 Lag3 Lag4 Lag5 Volume Today Direction
## 1 2001 0.381 -0.192 -2.624 -1.055 5.010 1.1913 0.959 Up
## 2 2001 0.959 0.381 -0.192 -2.624 -1.055 1.2965 1.032 Up
## 3 2001 1.032 0.959 0.381 -0.192 -2.624 1.4112 -0.623 Down
## 4 2001 -0.623 1.032 0.959 0.381 -0.192 1.2760 0.614 Up
## 5 2001 0.614 -0.623 1.032 0.959 0.381 1.2057 0.213 Up
## 6 2001 0.213 0.614 -0.623 1.032 0.959 1.3491 1.392 Up
# Separate out the 2005 data, the most recent year in the data set, as the
# test set; every earlier year is used for training. This mirrors how
# forecasting is done in practice: fit on the past, predict the future.
train <- Year < 2005
Smarket.2005 <- Smarket[!train, ]
dim(Smarket.2005)
## [1] 252 9
Direction.2005 <- Direction[!train]
# Flag each row as belonging to the 2005 test set ("Yes") or the training
# years ("No").
Smarket$Year05 <- ifelse(Smarket$Year %in% 2005, "Yes", "No")
# We only want to predict Direction from Lag1 and Lag2, so visualise those two
# predictors: colour shows the market direction, point shape shows whether the
# observation is from 2005.
# Remember that KNN only considers the closest neighbours (Euclidean distance).
library(tidyverse)
ggplot(
  data = Smarket,
  mapping = aes(x = Lag1, y = Lag2, colour = Direction, shape = Year05)
) +
  geom_jitter(alpha = 0.5) +
  theme_bw()
# Predictor matrices (Lag1, Lag2) for the training and test rows, plus the
# training labels.
train.X <- cbind(Lag1, Lag2)[train, ]
test.X <- cbind(Lag1, Lag2)[!train, ]
train.Direction <- Direction[train]
# Implement the KNN algorithm!
# Model for K = 1. knn() breaks ties at random, so fix the seed first to make
# the predictions reproducible.
set.seed(1)
knn.pred <- knn(train = train.X, test = test.X, cl = train.Direction, k = 1)
# Confusion matrix: rows are predictions, columns are the actual 2005 outcomes.
table(knn.pred, Direction.2005)
## Direction.2005
## knn.pred Down Up
## Down 43 58
## Up 68 83
# Model for K = 3, using the same training and test matrices as before.
knn.pred <- knn(train = train.X, test = test.X, cl = train.Direction, k = 3)
# Confusion matrix: rows are predictions, columns are the actual 2005 outcomes.
table(knn.pred, Direction.2005)
## Direction.2005
## knn.pred Down Up
## Down 48 54
## Up 63 87
# Overall accuracy: the share of 2005 days whose direction was predicted
# correctly.
mean(knn.pred == Direction.2005)
## [1] 0.5357143
# Finished with the stock-market example: take Smarket off the search path.
detach(Smarket)
#Application #2: Predicting Insurance Sales
#Caravan data set, which is part of the ISLR library.
#This data set includes 85 predictors that measure demographic characteristics for 5,822 individuals.
#The response variable is Purchase, which indicates whether or not a given individual purchases a caravan insurance policy.
#In this data set, only 6% of people purchased caravan insurance.
### Application: Caravan Insurance
library(ISLR)
data("Caravan")
dim(Caravan)
## [1] 5822 86
# The code below refers to Purchase without the `Caravan$` prefix and relies
# on this attach() call.
attach(Caravan)
summary(Purchase)
## No Yes
## 5474 348
# Percent that purchased. Computed from the data instead of typing in the
# count shown in the summary above, so it stays correct if the data changes.
mean(Purchase == "Yes")
## [1] 0.05977327
# KNN is built on a distance metric, so it's VERY important to standardize the
# variables first. For instance, consider variables such as salary and age:
# they are on completely different scales, the variability of salary is far
# larger than that of age, and the contribution of age to the distance could
# get lost. Standardizing gives every predictor mean 0 and variance 1.
#
# Drop the response (Purchase) before scaling. It is selected by name rather
# than by position so the code does not depend on Purchase being column 86.
standardized.X <- scale(Caravan[, names(Caravan) != "Purchase"])
# Unstandardized: the first two predictors have very different variances.
var(Caravan[, 1])
## [1] 165.0378
var(Caravan[, 2])
## [1] 0.1647078
# Standardized: every column now has variance 1.
var(standardized.X[, 1])
## [1] 1
var(standardized.X[, 2])
## [1] 1
# To predict insurance sales, split the data into a test set and a training
# set: the first 1,000 observations are held out for testing and the rest are
# used for training.
test <- seq_len(1000)
# Test set
test.X <- standardized.X[test, ]
test.Y <- Purchase[test]
# Training set
train.X <- standardized.X[-test, ]
train.Y <- Purchase[-test]
# Model for K = 1. knn() breaks ties at random, so fix the seed first to make
# the predictions reproducible.
set.seed(1)
knn.pred1 <- knn(train.X, test.X, train.Y, k = 1)
# Overall test error rate of the K = 1 model.
mean(test.Y != knn.pred1)
## [1] 0.118
# Baseline for comparison: the error rate obtained by always predicting "No",
# i.e. the share of actual purchasers in the test set.
mean(test.Y != "No")
## [1] 0.059
# Confusion matrix: rows are predictions, columns are the actual outcomes.
table(knn.pred1, test.Y)
## test.Y
## knn.pred1 No Yes
## No 873 50
## Yes 68 9
# Success rate among the customers predicted to purchase. This is computed
# from the predictions rather than from numbers copied out of the table above.
# At about 11.7% it is roughly double the 5.9% obtained by guessing at random.
mean(test.Y[knn.pred1 == "Yes"] == "Yes")
## [1] 0.1168831
# Model for K = 3
knn.pred3 <- knn(train.X, test.X, train.Y, k = 3)
# Confusion matrix: rows are predictions, columns are the actual outcomes.
table(knn.pred3, test.Y)
## test.Y
## knn.pred3 No Yes
## No 920 54
## Yes 21 5
# Success rate among the customers predicted to purchase, computed from the
# predictions rather than from numbers copied out of the table above.
mean(test.Y[knn.pred3 == "Yes"] == "Yes")
## [1] 0.1923077
# Model for K = 5. The predictions are stored under their own name so they no
# longer overwrite the K = 3 predictions in knn.pred3.
knn.pred5 <- knn(train.X, test.X, train.Y, k = 5)
# Confusion matrix: rows are predictions, columns are the actual outcomes.
table(knn.pred5, test.Y)
## test.Y
## knn.pred5 No Yes
## No 930 55
## Yes 11 4
# Success rate among the customers predicted to purchase, computed from the
# predictions rather than from numbers copied out of the table above.
mean(test.Y[knn.pred5 == "Yes"] == "Yes")
## [1] 0.2666667