4.6.1 The Stock Market Data

### Load the ISLR library, which contains the Smarket data set.
library(ISLR)
### List the variable names 
names(Smarket)
[1] "Year"      "Lag1"      "Lag2"      "Lag3"     
[5] "Lag4"      "Lag5"      "Volume"    "Today"    
[9] "Direction"
### Dimensions of the data and summary statistics for each variable (counts for the qualitative Direction)
dim(Smarket)
[1] 1250    9
summary(Smarket)
      Year           Lag1          
 Min.   :2001   Min.   :-4.922000  
 1st Qu.:2002   1st Qu.:-0.639500  
 Median :2003   Median : 0.039000  
 Mean   :2003   Mean   : 0.003834  
 3rd Qu.:2004   3rd Qu.: 0.596750  
 Max.   :2005   Max.   : 5.733000  
      Lag2                Lag3          
 Min.   :-4.922000   Min.   :-4.922000  
 1st Qu.:-0.639500   1st Qu.:-0.640000  
 Median : 0.039000   Median : 0.038500  
 Mean   : 0.003919   Mean   : 0.001716  
 3rd Qu.: 0.596750   3rd Qu.: 0.596750  
 Max.   : 5.733000   Max.   : 5.733000  
      Lag4                Lag5         
 Min.   :-4.922000   Min.   :-4.92200  
 1st Qu.:-0.640000   1st Qu.:-0.64000  
 Median : 0.038500   Median : 0.03850  
 Mean   : 0.001636   Mean   : 0.00561  
 3rd Qu.: 0.596750   3rd Qu.: 0.59700  
 Max.   : 5.733000   Max.   : 5.73300  
     Volume           Today           Direction 
 Min.   :0.3561   Min.   :-4.922000   Down:602  
 1st Qu.:1.2574   1st Qu.:-0.639500   Up  :648  
 Median :1.4229   Median : 0.038500             
 Mean   :1.4783   Mean   : 0.003138             
 3rd Qu.:1.6417   3rd Qu.: 0.596750             
 Max.   :3.1525   Max.   : 5.733000             

### Scatterplot Matrix
pairs(Smarket)
# cor(Smarket) ### Errors: the Direction variable is qualitative, so it must be dropped first.
cor(Smarket[,-9])
             Year         Lag1         Lag2
Year   1.00000000  0.029699649  0.030596422
Lag1   0.02969965  1.000000000 -0.026294328
Lag2   0.03059642 -0.026294328  1.000000000
Lag3   0.03319458 -0.010803402 -0.025896670
Lag4   0.03568872 -0.002985911 -0.010853533
Lag5   0.02978799 -0.005674606 -0.003557949
Volume 0.53900647  0.040909908 -0.043383215
Today  0.03009523 -0.026155045 -0.010250033
               Lag3         Lag4         Lag5
Year    0.033194581  0.035688718  0.029787995
Lag1   -0.010803402 -0.002985911 -0.005674606
Lag2   -0.025896670 -0.010853533 -0.003557949
Lag3    1.000000000 -0.024051036 -0.018808338
Lag4   -0.024051036  1.000000000 -0.027083641
Lag5   -0.018808338 -0.027083641  1.000000000
Volume -0.041823686 -0.048414246 -0.022002315
Today  -0.002447647 -0.006899527 -0.034860083
            Volume        Today
Year    0.53900647  0.030095229
Lag1    0.04090991 -0.026155045
Lag2   -0.04338321 -0.010250033
Lag3   -0.04182369 -0.002447647
Lag4   -0.04841425 -0.006899527
Lag5   -0.02200231 -0.034860083
Volume  1.00000000  0.014591823
Today   0.01459182  1.000000000
### Attach Smarket and plot Volume against the observation index.
attach(Smarket)
plot(Volume)
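
The only sizable entry in the correlation matrix is between Year and Volume, which is what the upward drift in the plot reflects; a quick check of just that pair:

### Sketch: isolating the one substantial correlation
cor(Smarket$Year,Smarket$Volume)   ## about 0.539, matching the matrix above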

4.6.2 Logistic Regression

### Fit a logistic regression (binomial GLM) of Direction on the five lag variables and Volume.
glm.fit=glm(Direction~Lag1+Lag2+Lag3+Lag4+Lag5+Volume,data=Smarket,family=binomial)
### Summary of the fitted model: deviance residual quantiles, coefficient estimates, and their z tests.
summary(glm.fit)

Call:
glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + 
    Volume, family = binomial, data = Smarket)

Deviance Residuals: 
   Min      1Q  Median      3Q     Max  
-1.446  -1.203   1.065   1.145   1.326  

Coefficients:
             Estimate Std. Error z value Pr(>|z|)
(Intercept) -0.126000   0.240736  -0.523    0.601
Lag1        -0.073074   0.050167  -1.457    0.145
Lag2        -0.042301   0.050086  -0.845    0.398
Lag3         0.011085   0.049939   0.222    0.824
Lag4         0.009359   0.049974   0.187    0.851
Lag5         0.010313   0.049511   0.208    0.835
Volume       0.135441   0.158360   0.855    0.392

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 1731.2  on 1249  degrees of freedom
Residual deviance: 1727.6  on 1243  degrees of freedom
AIC: 1741.6

Number of Fisher Scoring iterations: 3
coef(glm.fit)
 (Intercept)         Lag1         Lag2         Lag3 
-0.126000257 -0.073073746 -0.042301344  0.011085108 
        Lag4         Lag5       Volume 
 0.009358938  0.010313068  0.135440659 
### Just the coefficients from the GLM fit
summary(glm.fit)$coef
                Estimate Std. Error    z value
(Intercept) -0.126000257 0.24073574 -0.5233966
Lag1        -0.073073746 0.05016739 -1.4565986
Lag2        -0.042301344 0.05008605 -0.8445733
Lag3         0.011085108 0.04993854  0.2219750
Lag4         0.009358938 0.04997413  0.1872757
Lag5         0.010313068 0.04951146  0.2082966
Volume       0.135440659 0.15835970  0.8552723
             Pr(>|z|)
(Intercept) 0.6006983
Lag1        0.1452272
Lag2        0.3983491
Lag3        0.8243333
Lag4        0.8514445
Lag5        0.8349974
Volume      0.3924004
### Only the 4th column of the coefficient table: the z-test p-values.
summary(glm.fit)$coef[,4]
(Intercept)        Lag1        Lag2        Lag3 
  0.6006983   0.1452272   0.3983491   0.8243333 
       Lag4        Lag5      Volume 
  0.8514445   0.8349974   0.3924004 
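
None of these p-values is small, so no predictor is significant here; still, the coefficients are on the log-odds scale, and exponentiating puts them on the odds-ratio scale. A minimal sketch using base R (confint.default() gives approximate Wald intervals, chosen here for illustration):

### Sketch: coefficients as odds ratios, with approximate Wald intervals
exp(coef(glm.fit))
exp(confint.default(glm.fit))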
### Generate fitted probabilities with the predict() function and type="response".
glm.probs=predict(glm.fit,type="response")
### The first 10 fitted probabilities; contrasts() confirms that R codes Up as 1.
glm.probs[1:10]
        1         2         3         4         5 
0.5070841 0.4814679 0.4811388 0.5152224 0.5107812 
        6         7         8         9        10 
0.5069565 0.4926509 0.5092292 0.5176135 0.4888378 
contrasts(Direction)
     Up
Down  0
Up    1
### Convert the 1250 fitted probabilities into class predictions: Up if the probability exceeds 0.5, Down otherwise. The table of predicted vs. actual direction is printed too.
glm.pred=rep("Down",1250)
glm.pred[glm.probs>.5]="Up"
table(glm.pred,Direction)
        Direction
glm.pred Down  Up
    Down  145 141
    Up    457 507
### Divide the correct predictions (the table's diagonal) by the total to get the fraction predicted correctly.
mean(glm.pred==Direction)
[1] 0.5216
(507+145)/1250
[1] 0.5216
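
The same summaries can be pulled out of the table programmatically; a small sketch (conf, accuracy, sensitivity and specificity are names introduced here for illustration):

### Sketch: accuracy, sensitivity and specificity from the confusion matrix
conf=table(glm.pred,Direction)
accuracy=sum(diag(conf))/sum(conf)                  ## (145+507)/1250
sensitivity=conf["Up","Up"]/sum(conf[,"Up"])        ## correct Up calls among true Up days
specificity=conf["Down","Down"]/sum(conf[,"Down"])  ## correct Down calls among true Down days
c(accuracy,sensitivity,specificity)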

Using the observations before 2005 as training data and the 2005 observations as test data

### The Boolean vector train flags all observations prior to 2005.
train=(Year<2005)
Smarket.2005=Smarket[!train,]
dim(Smarket.2005) ## Contains 252 obs and 9 variables
[1] 252   9
Direction.2005=Direction[!train]
### Logistic regression fit on the training data using all six predictors.
glm.fit=glm(Direction~Lag1+Lag2+Lag3+Lag4+Lag5+Volume,data=Smarket,family=binomial,subset=train)
glm.probs=predict(glm.fit,Smarket.2005,type="response")
### Predicted directions for 2005, with the confusion matrix, test accuracy, and test error rate.
glm.pred=rep("Down",252)
glm.pred[glm.probs>.5]="Up"
table(glm.pred,Direction.2005)
        Direction.2005
glm.pred Down Up
    Down   77 97
    Up     34 44
mean(glm.pred==Direction.2005)
[1] 0.4801587
mean(glm.pred!=Direction.2005)
[1] 0.5198413
### Logistic regression using only Lag1 and Lag2 as predictors, fit on the training data; generate test-set probabilities and predicted directions.
glm.fit=glm(Direction~Lag1+Lag2,data=Smarket,family=binomial,subset=train)
glm.probs=predict(glm.fit,Smarket.2005,type="response")
glm.pred=rep("Down",252)
glm.pred[glm.probs>.5]="Up"
### Table of actual vs. predicted directions 
table(glm.pred,Direction.2005)
        Direction.2005
glm.pred Down  Up
    Down   35  35
    Up     76 106
### Overall test accuracy, and the accuracy on just the days predicted to be Up.
mean(glm.pred==Direction.2005)
[1] 0.5595238
106/(106+76)
[1] 0.5824176
### Predicted probabilities from the same fitted model for two specific (Lag1, Lag2) settings.
predict(glm.fit,newdata=data.frame(Lag1=c(1.2,1.5),Lag2=c(1.1,-0.8)),type="response")
        1         2 
0.4791462 0.4960939 
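
These predictions can be reproduced by hand with the inverse logit, plogis() in base R; a sketch for the first new observation:

### Sketch: prediction 1 by hand via the inverse logit
b=coef(glm.fit)
plogis(b["(Intercept)"]+b["Lag1"]*1.2+b["Lag2"]*1.1)  ## should match 0.4791462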

4.6.3 Linear Discriminant Analysis

### Load the MASS library, which provides lda() and qda().
library(MASS)
### Fit a linear discriminant analysis model; the output shows the prior probabilities of the groups, the group means of the predictors, and the coefficients of the linear discriminants.
lda.fit=lda(Direction~Lag1+Lag2,data=Smarket,subset=train)
lda.fit
plot(lda.fit)
### Predictions using Linear Discriminant Analysis(LDA)
lda.pred=predict(lda.fit, Smarket.2005)
names(lda.pred)
[1] "class"     "posterior" "x"        
### predict() returns class (the predicted classes), posterior (a matrix of posterior probabilities), and x (the linear discriminants).
### Store the predicted classes and tabulate them against the actual 2005 directions.
lda.class=lda.pred$class
table(lda.class,Direction.2005)
         Direction.2005
lda.class Down  Up
     Down   35  35
     Up     76 106
### Test accuracy (the fraction of correct predictions); one minus this is the test error rate.
mean(lda.class==Direction.2005)
[1] 0.5595238
### Count the test observations whose posterior probability of Down is at least 50%, and those below 50%.
sum(lda.pred$posterior[,1]>=.5)
[1] 70
sum(lda.pred$posterior[,1]<.5)
[1] 182
### The posterior probabilities of Down for the first 20 test observations, with their predicted classes.
lda.pred$posterior[1:20,1]
      999      1000      1001      1002      1003 
0.4901792 0.4792185 0.4668185 0.4740011 0.4927877 
     1004      1005      1006      1007      1008 
0.4938562 0.4951016 0.4872861 0.4907013 0.4844026 
     1009      1010      1011      1012      1013 
0.4906963 0.5119988 0.4895152 0.4706761 0.4744593 
     1014      1015      1016      1017      1018 
0.4799583 0.4935775 0.5030894 0.4978806 0.4886331 
lda.class[1:20]
 [1] Up   Up   Up   Up   Up   Up   Up   Up   Up  
[10] Up   Up   Down Up   Up   Up   Up   Up   Down
[19] Up   Up  
Levels: Down Up
### Using a 90% posterior threshold for predicting a decrease instead:
sum(lda.pred$posterior[,1]>.9)
[1] 0
### No days in 2005 have a posterior probability of a decrease above 90%.
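
The predicted classes are just the 50% rule applied to the posterior matrix; a sketch confirming this (lda.manual is a name introduced here for illustration):

### Sketch: recreating the predicted classes from the posterior by hand
lda.manual=ifelse(lda.pred$posterior[,"Up"]>.5,"Up","Down")
table(lda.manual,lda.class)   ## off-diagonal cells should be 0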

4.6.4 Quadratic Discriminant Analysis (QDA)

### Fit a QDA model on the Smarket training data.
qda.fit=qda(Direction~Lag1+Lag2,data=Smarket,subset=train)
qda.fit
Call:
qda(Direction ~ Lag1 + Lag2, data = Smarket, subset = train)

Prior probabilities of groups:
    Down       Up 
0.491984 0.508016 

Group means:
            Lag1        Lag2
Down  0.04279022  0.03389409
Up   -0.03954635 -0.03132544
### Tabulate the QDA predicted classes (qda.class) against the actual 2005 directions.
qda.class=predict(qda.fit,Smarket.2005)$class
table(qda.class,Direction.2005)
         Direction.2005
qda.class Down  Up
     Down   30  20
     Up     81 121
### QDA test accuracy for 2005: about 59.9%.
mean(qda.class==Direction.2005)
[1] 0.5992063

4.6.5 K-Nearest Neighbors (KNN)

### Load the class library for the knn() function,
### then set up the training and test matrices and the training responses.
library(class)
train.X=cbind(Lag1,Lag2)[train,]
test.X=cbind(Lag1,Lag2)[!train,]
train.Direction=Direction[train]
set.seed(1)
### First try KNN with k=1.
knn.pred=knn(train.X,test.X,train.Direction,k=1)
table(knn.pred,Direction.2005)
        Direction.2005
knn.pred Down Up
    Down   43 58
    Up     68 83
### (43+83)/252: only 50% of the test observations are predicted correctly.
(83+43)/252
[1] 0.5
### Rerun KNN with k=3; the table and test accuracy (53.6%) follow.
knn.pred=knn(train.X,test.X,train.Direction,k=3)
table(knn.pred,Direction.2005)
        Direction.2005
knn.pred Down Up
    Down   48 54
    Up     63 87
mean(knn.pred==Direction.2005)
[1] 0.5357143
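
Rather than rerunning by hand, a range of k values can be scanned in one pass; a sketch (accs is a name introduced here; ties in knn() are broken at random, so the seed keeps it reproducible):

### Sketch: 2005 test accuracy for k = 1 through 10
set.seed(1)
accs=sapply(1:10,function(k)
  mean(knn(train.X,test.X,train.Direction,k=k)==Direction.2005))
round(accs,3)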

4.6.6 An Application to Caravan Insurance Data

### Next, the Caravan data; only about 6% of the 5822 people purchased caravan insurance.
dim(Caravan)
[1] 5822   86
attach(Caravan)
summary(Purchase)
  No  Yes 
5474  348 
348/5822
[1] 0.05977327
### Standardize every column except the 86th, the qualitative outcome Purchase; each standardized column then has variance 1.
standardized.X=scale(Caravan[,-86])
var(Caravan[,1])
[1] 165.0378
var(Caravan[,2])
[1] 0.1647078
var(standardized.X[,1])
[1] 1
var(standardized.X[,2])
[1] 1
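
scale() simply centers each column and divides by its standard deviation; a sketch verifying this for the first column:

### Sketch: what scale() does, checked on column 1
all.equal(standardized.X[,1],
          (Caravan[,1]-mean(Caravan[,1]))/sd(Caravan[,1]),
          check.attributes=FALSE)   ## should be TRUE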
### The first 1000 observations form the test set; the rest are the training set.
test=1:1000
train.X=standardized.X[-test,]
test.X=standardized.X[test,]
train.Y=Purchase[-test]
test.Y=Purchase[test]
### KNN models are again built, for k=1 and k=3.
set.seed(1)
knn.pred=knn(train.X,test.X,train.Y,k=1)
mean(test.Y!=knn.pred)
[1] 0.118
mean(test.Y!="No")
[1] 0.059
table(knn.pred,test.Y)
        test.Y
knn.pred  No Yes
     No  873  50
     Yes  68   9
9/(68+9)
[1] 0.1168831
### Among customers predicted to buy, 11.7% actually do with k=1; with k=3 this success rate rises to 19.2%.
knn.pred=knn(train.X,test.X,train.Y,k=3)
table(knn.pred,test.Y)
        test.Y
knn.pred  No Yes
     No  920  54
     Yes  21   5
5/26
[1] 0.1923077
### With k=5 the success rate among predicted purchasers rises to 26.7%.
knn.pred=knn(train.X,test.X,train.Y,k=5)
table(knn.pred,test.Y)
        test.Y
knn.pred  No Yes
     No  930  55
     Yes  11   4
4/15
[1] 0.2666667
### Fit a logistic regression model on the training data.
glm.fit=glm(Purchase~.,data=Caravan,family=binomial,subset=-test)
Warning message:
glm.fit: fitted probabilities numerically 0 or 1 occurred 
glm.probs=predict(glm.fit,Caravan[test,],type="response")
### With a 0.5 cutoff, none of the customers predicted to purchase actually do.
glm.pred=rep("No",1000)
glm.pred[glm.probs>.5]="Yes"
table(glm.pred,test.Y)
        test.Y
glm.pred  No Yes
     No  934  59
     Yes   7   0
### Lowering the cutoff to 0.25 yields 33 predicted purchasers, of whom 11 (33.3%) actually purchase.
glm.pred=rep("No",1000)
glm.pred[glm.probs>.25]="Yes"
table(glm.pred,test.Y)
        test.Y
glm.pred  No Yes
     No  919  48
     Yes  22  11
11/(22+11)
[1] 0.3333333
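
The same cutoff trade-off can be scanned in one pass; a sketch (cutoffs is a name introduced here for illustration):

### Sketch: success rate among predicted purchasers at a few cutoffs
cutoffs=c(.5,.25,.1)
sapply(cutoffs,function(p){
  pred=ifelse(glm.probs>p,"Yes","No")
  sum(pred=="Yes" & test.Y=="Yes")/sum(pred=="Yes")
})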
The R Notebook source for this lab follows.

---
title: "R Notebook for Lab 4, 0690 Adv. Topics in Biostat, Fall 2018"
output: html_notebook
author: Jayashree Sampath
date: "October 2, 2018 (due October 3, 2018)"
---
## 4.6.1 The Stock Market Data
```{r}
### Load the ISLR library, which contains the Smarket data set.
library(ISLR)
### List the variable names 
names(Smarket)
### Dimensions of the data and summary statistics for each variable (counts for the qualitative Direction)
dim(Smarket)
summary(Smarket)
```

```{r}
### Scatterplot Matrix
pairs(Smarket)
# cor(Smarket) ### Errors: the Direction variable is qualitative, so it must be dropped first.
cor(Smarket[,-9])
### Attach Smarket and plot Volume against the observation index.
attach(Smarket)
plot(Volume)
```
## 4.6.2 Logistic Regression
```{r}
### Fit a logistic regression (binomial GLM) of Direction on the five lag variables and Volume.
glm.fit=glm(Direction~Lag1+Lag2+Lag3+Lag4+Lag5+Volume,data=Smarket,family=binomial)
### Summary of the fitted model: deviance residual quantiles, coefficient estimates and their z tests, the null deviance (intercept-only model), and the residual deviance (full model).
summary(glm.fit)
coef(glm.fit)
```

```{r}
### Just the coefficients from the GLM fit
summary(glm.fit)$coef
### Only the 4th column of the coefficient table: the z-test p-values.
summary(glm.fit)$coef[,4]
```

```{r}
### Generate fitted probabilities with the predict() function and type="response".
glm.probs=predict(glm.fit,type="response")
### The first 10 fitted probabilities; contrasts() confirms that R codes Up as 1.
glm.probs[1:10]
contrasts(Direction)
```

```{r}
### Convert the 1250 fitted probabilities into class predictions: Up if the probability exceeds 0.5, Down otherwise. The table of predicted vs. actual direction is printed too.
glm.pred=rep("Down",1250)
glm.pred[glm.probs>.5]="Up"
table(glm.pred,Direction)
### Divide the correct predictions (the table's diagonal) by the total to get the fraction predicted correctly.
mean(glm.pred==Direction)
(507+145)/1250
```
## Using the observations before 2005 as training data and the 2005 observations as test data
```{r}
### The Boolean vector train flags all observations prior to 2005.
train=(Year<2005)
Smarket.2005=Smarket[!train,]
dim(Smarket.2005) ## Contains 252 obs and 9 variables
Direction.2005=Direction[!train]
### Logistic regression fit on the training data using all six predictors.
glm.fit=glm(Direction~Lag1+Lag2+Lag3+Lag4+Lag5+Volume,data=Smarket,family=binomial,subset=train)
glm.probs=predict(glm.fit,Smarket.2005,type="response")
### Predicted directions for 2005, with the confusion matrix, test accuracy, and test error rate.
glm.pred=rep("Down",252)
glm.pred[glm.probs>.5]="Up"
table(glm.pred,Direction.2005)
mean(glm.pred==Direction.2005)
mean(glm.pred!=Direction.2005)
```

```{r}
### Logistic regression using only Lag1 and Lag2 as predictors, fit on the training data; generate test-set probabilities and predicted directions.
glm.fit=glm(Direction~Lag1+Lag2,data=Smarket,family=binomial,subset=train)
glm.probs=predict(glm.fit,Smarket.2005,type="response")
glm.pred=rep("Down",252)
glm.pred[glm.probs>.5]="Up"
### Table of actual vs. predicted directions 
table(glm.pred,Direction.2005)
### Overall test accuracy, and the accuracy on just the days predicted to be Up.
mean(glm.pred==Direction.2005)
106/(106+76)
```

```{r}
### Predicted probabilities from the same fitted model for two specific (Lag1, Lag2) settings.
predict(glm.fit,newdata=data.frame(Lag1=c(1.2,1.5),Lag2=c(1.1,-0.8)),type="response")
```
## 4.6.3 Linear Discriminant Analysis
```{r}
### Load the MASS library, which provides lda() and qda().
library(MASS)
### Fit a linear discriminant analysis model; the output shows the prior probabilities of the groups, the group means of the predictors, and the coefficients of the linear discriminants.
lda.fit=lda(Direction~Lag1+Lag2,data=Smarket,subset=train)
lda.fit
plot(lda.fit)
```

```{r}
### Predictions using Linear Discriminant Analysis(LDA)
lda.pred=predict(lda.fit, Smarket.2005)
names(lda.pred)
### predict() returns class (the predicted classes), posterior (a matrix of posterior probabilities), and x (the linear discriminants).
```

```{r}
### Store the predicted classes and tabulate them against the actual 2005 directions.
lda.class=lda.pred$class
table(lda.class,Direction.2005)
### Test accuracy (the fraction of correct predictions); one minus this is the test error rate.
mean(lda.class==Direction.2005)
```

```{r}
### Count the test observations whose posterior probability of Down is at least 50%, and those below 50%.
sum(lda.pred$posterior[,1]>=.5)
sum(lda.pred$posterior[,1]<.5)
```

```{r}
### The posterior probabilities of Down for the first 20 test observations, with their predicted classes.
lda.pred$posterior[1:20,1]
lda.class[1:20]
### Using a 90% posterior threshold for predicting a decrease instead:
sum(lda.pred$posterior[,1]>.9)
### No days in 2005 have a posterior probability of a decrease above 90%.
```
## 4.6.4 Quadratic Discriminant Analysis (QDA)
```{r}
### Fit a QDA model on the Smarket training data.
qda.fit=qda(Direction~Lag1+Lag2,data=Smarket,subset=train)
qda.fit
```

```{r}
### Tabulate the QDA predicted classes (qda.class) against the actual 2005 directions.
qda.class=predict(qda.fit,Smarket.2005)$class
table(qda.class,Direction.2005)
### QDA test accuracy for 2005: about 59.9%.
mean(qda.class==Direction.2005)
```
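
For 2005 the three classifiers fitted so far can be compared side by side; a small sketch (`acc` is a name introduced here; `glm.pred` holds the Lag1+Lag2 logistic predictions from above):
```{r}
### Sketch: 2005 test accuracies of the three classifiers fitted above
acc=c(logistic=mean(glm.pred==Direction.2005),
      lda=mean(lda.class==Direction.2005),
      qda=mean(qda.class==Direction.2005))
round(acc,3)
```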
## 4.6.5 K-Nearest Neighbors (KNN)
```{r}
### Load the class library for the knn() function,
### then set up the training and test matrices and the training responses.
library(class)
train.X=cbind(Lag1,Lag2)[train,]
test.X=cbind(Lag1,Lag2)[!train,]
train.Direction=Direction[train]
set.seed(1)
### First try KNN with k=1.
knn.pred=knn(train.X,test.X,train.Direction,k=1)
table(knn.pred,Direction.2005)
### (43+83)/252: only 50% of the test observations are predicted correctly.
(83+43)/252
### Rerun KNN with k=3; the table and test accuracy (53.6%) follow.
knn.pred=knn(train.X,test.X,train.Direction,k=3)
table(knn.pred,Direction.2005)
mean(knn.pred==Direction.2005)
```
## 4.6.6 An Application to Caravan Insurance Data
```{r}
### Next, the Caravan data: 5822 observations on 86 variables. The outcome is Purchase (Yes/No); only about 6% (348/5822) purchase.
dim(Caravan)
attach(Caravan)
summary(Purchase)
348/5822
```
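
The same base rate can be computed without hard-coding the counts:
```{r}
### Sketch: the base purchase rate, computed directly
mean(Purchase=="Yes")   ## about 0.0598
```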

```{r}
### Standardize every column except the 86th, the qualitative outcome Purchase; each standardized column then has variance 1.
standardized.X=scale(Caravan[,-86])
var(Caravan[,1])
var(Caravan[,2])
var(standardized.X[,1])
var(standardized.X[,2])

```

```{r}
### The first 1000 observations form the test set and the remaining observations form the training set; the outcome Purchase is kept separate from the predictors.
test=1:1000
train.X=standardized.X[-test,]
test.X=standardized.X[test,]
train.Y=Purchase[-test]
test.Y=Purchase[test]
```
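
A quick sanity check on the split (a sketch; the dimensions follow from 5822 rows and 85 standardized predictors):
```{r}
### Sketch: dimensions of the split and the test-set outcome distribution
dim(train.X)   ## 4822 x 85
dim(test.X)    ## 1000 x 85
table(test.Y)
```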

```{r}
### KNN models are again built, for k=1 and k=3.
set.seed(1)
knn.pred=knn(train.X,test.X,train.Y,k=1)
mean(test.Y!=knn.pred)
mean(test.Y!="No")
table(knn.pred,test.Y)
9/(68+9)
### Among customers predicted to buy, 11.7% actually do with k=1; with k=3 this success rate rises to 19.2%.
knn.pred=knn(train.X,test.X,train.Y,k=3)
table(knn.pred,test.Y)
5/26
```

```{r}
### With k=5 the success rate among predicted purchasers rises to 26.7%.
knn.pred=knn(train.X,test.X,train.Y,k=5)
table(knn.pred,test.Y)
4/15
```

```{r}
### Fit a logistic regression model on the training data. (R warns that fitted probabilities numerically 0 or 1 occurred; expected here.)
glm.fit=glm(Purchase~.,data=Caravan,family=binomial,subset=-test)
glm.probs=predict(glm.fit,Caravan[test,],type="response")
### With a 0.5 cutoff, none of the customers predicted to purchase actually do.
glm.pred=rep("No",1000)
glm.pred[glm.probs>.5]="Yes"
table(glm.pred,test.Y)
### Lowering the cutoff to 0.25 yields 33 predicted purchasers, of whom 11 (33.3%) actually purchase.
glm.pred=rep("No",1000)
glm.pred[glm.probs>.25]="Yes"
table(glm.pred,test.Y)
11/(22+11)

```



