### KNN!

We will explore how to implement the KNN algorithm for classification.

#### Application #1: Stock Market Data!

This data set consists of percentage returns for the S&P 500 stock index over 1,250 days, from the beginning of 2001 until the end of 2005. For each date, the percentage returns for each of the five previous trading days, Lag1 through Lag5, are recorded. Also recorded are Volume (the number of shares traded on the previous day, in billions), Today (the percentage return on the date in question), and Direction (whether the market was Up or Down on this date).

library("tidyverse")
## ── Attaching packages ───────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2     ✓ purrr   0.3.3
## ✓ tibble  2.1.3     ✓ dplyr   0.8.3
## ✓ tidyr   1.0.2     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.4.0
## ── Conflicts ──────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library("ggplot2")
library("dplyr")
library("magrittr")
## 
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
## 
##     set_names
## The following object is masked from 'package:tidyr':
## 
##     extract
library(ISLR)

names(Smarket)
## [1] "Year"      "Lag1"      "Lag2"      "Lag3"      "Lag4"      "Lag5"     
## [7] "Volume"    "Today"     "Direction"
attach(Smarket)

library(class)
head(Smarket)
##   Year   Lag1   Lag2   Lag3   Lag4   Lag5 Volume  Today Direction
## 1 2001  0.381 -0.192 -2.624 -1.055  5.010 1.1913  0.959        Up
## 2 2001  0.959  0.381 -0.192 -2.624 -1.055 1.2965  1.032        Up
## 3 2001  1.032  0.959  0.381 -0.192 -2.624 1.4112 -0.623      Down
## 4 2001 -0.623  1.032  0.959  0.381 -0.192 1.2760  0.614        Up
## 5 2001  0.614 -0.623  1.032  0.959  0.381 1.2057  0.213        Up
## 6 2001  0.213  0.614 -0.623  1.032  0.959 1.3491  1.392        Up
# Separate out the 2005 data, which is the most recent in the data set.
# This mirrors how forecasting is done in a business setting: we use past data to forecast the future.

# YEAR 2005
train = (Year < 2005)
Smarket.2005 = Smarket[!train, ]
dim(Smarket.2005)
## [1] 252   9
Direction.2005 = Direction[!train]

# Create a variable indicating whether each observation belongs to the training set or the test set
Smarket$Year05 <- "No"
Smarket$Year05[which(Smarket$Year == 2005)] <- "Yes"

# We only want to predict the stock market Direction based on Lag1 and Lag2. Let's create a plot to visualize this!
# Remember, KNN only considers the closest neighbors, measured by Euclidean distance
# (a small by-hand sketch of this appears after the training matrices below).

ggplot(Smarket, aes(Lag1, Lag2, color = Direction, pch = Year05)) +
  geom_jitter(alpha = 0.5) +
  theme_bw()

train.X = cbind(Lag1, Lag2)[train, ]
test.X = cbind(Lag1, Lag2)[!train, ]
train.Direction = Direction[train]
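
# A minimal sketch of what knn() does for a single test point, just to make the
# Euclidean-distance idea concrete (the names x0, dists, nearest, and k.sketch are
# illustrative assumptions, not part of the class package):
k.sketch <- 3
x0 <- test.X[1, ]                                  # one test day's (Lag1, Lag2)
dists <- sqrt((train.X[, 1] - x0[1])^2 +
              (train.X[, 2] - x0[2])^2)            # Euclidean distance to every training day
nearest <- order(dists)[1:k.sketch]                # indices of the k closest training days
table(train.Direction[nearest])                    # majority vote gives the predicted Direction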


# Implement the KNN algorithm!

# Model for K=1
set.seed(1)
knn.pred = knn(train.X, test.X, train.Direction, k = 1)

#Confusion matrix
table(knn.pred,Direction.2005)
##         Direction.2005
## knn.pred Down Up
##     Down   43 58
##     Up     68 83
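# Overall accuracy for K=1, read off the confusion matrix above: (43 + 83) / 252 = 0.5,
# i.e. no better than a coin flip.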


# Model for K=3
knn.pred = knn(train.X, test.X, train.Direction, k = 3)

#Confusion matrix
table(knn.pred,Direction.2005)
##         Direction.2005
## knn.pred Down Up
##     Down   48 54
##     Up     63 87
# Fraction of the 252 test days classified correctly (accuracy)
mean(knn.pred == Direction.2005)
## [1] 0.5357143
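# K=3 correctly classifies about 53.6% of the test days, a modest improvement over the 50% from K=1.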
detach(Smarket)
# Application #2: Predicting Insurance Sales
# We will use the Caravan data set, which is part of the ISLR library.
# This data set includes 85 predictors that measure demographic characteristics for 5,822 individuals.
# The response variable is Purchase, which indicates whether or not a given individual purchases a caravan insurance policy.
# In this data set, only 6% of people purchased caravan insurance.

### Application: Caravan Insurance
library(ISLR)
data("Caravan")
dim(Caravan)
## [1] 5822   86
attach(Caravan)
summary(Purchase)
##   No  Yes 
## 5474  348
# Percent that purchased 
348/dim(Caravan)[1]
## [1] 0.05977327
# Since the KNN algorithm uses a distance metric, it's very important to standardize the variables first.
# For instance, consider the variables salary and age.
# They are on completely different scales!
# Salary varies far more than age, so without standardization the importance of the age variable would get lost.
# So first, we standardize the variables.

# Drop the 86th column because it is the Purchase response variable
standardized.X = scale(Caravan[, -86])

# unstandardized 
var(Caravan[ ,1])
## [1] 165.0378
var(Caravan[ ,2])
## [1] 0.1647078
# standardized 
var(standardized.X[ ,1])
## [1] 1
var(standardized.X[ ,2])
## [1] 1
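
# A quick sketch of what scale() is doing to each column: subtract the column mean and
# divide by the column standard deviation (manual.col1 is an illustrative name, not
# something from the ISLR code):
manual.col1 <- (Caravan[, 1] - mean(Caravan[, 1])) / sd(Caravan[, 1])
all.equal(as.numeric(standardized.X[, 1]), manual.col1)  # should be TRUE
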
# Since we want to predict insurance sales, let's train and then test the model.

# Split data into test and train: the first 1,000 observations form the test set
test = 1:1000
train.X = standardized.X[-test, ]
test.X = standardized.X[test, ]
train.Y = Purchase[-test]
test.Y = Purchase[test]

# Model for K=1
set.seed(1)
knn.pred1 = knn(train.X, test.X, train.Y, k = 1)

# Test error rate for K=1
mean(test.Y != knn.pred1)
## [1] 0.118
# Error rate if we simply predicted "No" for everyone (only ~6% of customers buy)
mean(test.Y != "No")
## [1] 0.059
# confusion matrix
table(knn.pred1, test.Y)
##          test.Y
## knn.pred1  No Yes
##       No  873  50
##       Yes  68   9
# Success rate among the customers predicted to buy; much better than the ~6% base rate of random guessing
9 / (68 + 9)
## [1] 0.1168831
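# About 11.7% of the customers flagged by K=1 actually purchase insurance, roughly double the 6% base rate.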
# Model for K=3
knn.pred3 = knn(train.X, test.X, train.Y, k = 3)
table(knn.pred3, test.Y)
##          test.Y
## knn.pred3  No Yes
##       No  920  54
##       Yes  21   5
# Success rate among the customers predicted to buy for K=3
5 / (21 + 5)
## [1] 0.1923077
# Model for K=5
knn.pred5 = knn(train.X, test.X, train.Y, k = 5)
table(knn.pred5, test.Y)
##          test.Y
## knn.pred5  No Yes
##       No  930  55
##       Yes  11   4
# Success rate among the customers predicted to buy for K=5
4 / (11 + 4)
## [1] 0.2666667
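# As K grows, fewer customers are flagged as likely buyers, but the success rate among them
# improves: roughly 11.7% for K=1, 19.2% for K=3, and 26.7% for K=5, versus a 6% base rate.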