### 13
## a
library(MASS)
library(ISLR2)
##
## Attaching package: 'ISLR2'
## The following object is masked from 'package:MASS':
##
## Boston
summary(Weekly)
## Year Lag1 Lag2 Lag3
## Min. :1990 Min. :-18.1950 Min. :-18.1950 Min. :-18.1950
## 1st Qu.:1995 1st Qu.: -1.1540 1st Qu.: -1.1540 1st Qu.: -1.1580
## Median :2000 Median : 0.2410 Median : 0.2410 Median : 0.2410
## Mean :2000 Mean : 0.1506 Mean : 0.1511 Mean : 0.1472
## 3rd Qu.:2005 3rd Qu.: 1.4050 3rd Qu.: 1.4090 3rd Qu.: 1.4090
## Max. :2010 Max. : 12.0260 Max. : 12.0260 Max. : 12.0260
## Lag4 Lag5 Volume Today
## Min. :-18.1950 Min. :-18.1950 Min. :0.08747 Min. :-18.1950
## 1st Qu.: -1.1580 1st Qu.: -1.1660 1st Qu.:0.33202 1st Qu.: -1.1540
## Median : 0.2380 Median : 0.2340 Median :1.00268 Median : 0.2410
## Mean : 0.1458 Mean : 0.1399 Mean :1.57462 Mean : 0.1499
## 3rd Qu.: 1.4090 3rd Qu.: 1.4050 3rd Qu.:2.05373 3rd Qu.: 1.4050
## Max. : 12.0260 Max. : 12.0260 Max. :9.32821 Max. : 12.0260
## Direction
## Down:484
## Up :605
plot(Weekly)
The scatterplot matrix shows no strong pairwise patterns except between Year and Volume: trading volume grows over time.
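A quick numeric check (an addition, not in the original output) is the correlation matrix of the quantitative variables; only Year and Volume are strongly correlated (roughly 0.84):

```r
# Direction (column 9) is qualitative, so drop it before computing correlations
round(cor(Weekly[, -9]), 2)
```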
## b
glm.fits <- glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume,
                data = Weekly, family = binomial)
summary(glm.fits)
##
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 +
## Volume, family = "binomial", data = Weekly)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6949 -1.2565 0.9913 1.0849 1.4579
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.26686 0.08593 3.106 0.0019 **
## Lag1 -0.04127 0.02641 -1.563 0.1181
## Lag2 0.05844 0.02686 2.175 0.0296 *
## Lag3 -0.01606 0.02666 -0.602 0.5469
## Lag4 -0.02779 0.02646 -1.050 0.2937
## Lag5 -0.01447 0.02638 -0.549 0.5833
## Volume -0.02274 0.03690 -0.616 0.5377
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1496.2 on 1088 degrees of freedom
## Residual deviance: 1486.4 on 1082 degrees of freedom
## AIC: 1500.4
##
## Number of Fisher Scoring iterations: 4
Lag2 is the only statistically significant predictor (p ≈ 0.03); no other coefficient besides the intercept reaches the 0.05 level.
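To pull the p-values directly from the fit (a small addition; these are the same numbers shown in the coefficient table above):

```r
# Coefficient p-values from the fitted logistic regression
coef(summary(glm.fits))[, "Pr(>|z|)"]
```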
## c
glm.probs <- predict(glm.fits, type = "response")
glm.pred <- rep("Down", nrow(Weekly))
glm.pred[glm.probs > .5] <- "Up"
table(glm.pred, Weekly$Direction)
##
## glm.pred Down  Up
##     Down   54  48
##     Up    430 557
The model identifies 557 of the 605 true Up weeks (92.1%) but only 54 of the 484 true Down weeks (11.2%); overall accuracy is (54 + 557)/1089, about 56.1%.
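These rates can be read off the confusion matrix directly; a small sketch:

```r
conf <- table(glm.pred, Weekly$Direction)
sum(diag(conf)) / sum(conf)                 # overall accuracy, ~0.561
conf["Up", "Up"] / sum(conf[, "Up"])        # fraction of true Up weeks caught, ~0.921
conf["Down", "Down"] / sum(conf[, "Down"])  # fraction of true Down weeks caught, ~0.112
```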
## d
train <- (Weekly$Year < 2009)
Weekly.0910 <- Weekly[!train, ]
Direction.0910 <- Weekly$Direction[!train]
glm.fits <- glm(Direction ~ Lag2, data = Weekly, family = binomial, subset = train)
glm.probs <- predict(glm.fits, Weekly.0910, type = "response")
glm.pred <- rep("Down", nrow(Weekly.0910))
glm.pred[glm.probs > .5] <- "Up"
table(glm.pred, Direction.0910)
##         Direction.0910
## glm.pred Down Up
##     Down    9  5
##     Up     34 56
mean(glm.pred == Direction.0910)
## [1] 0.625
## e
lda.fit <- lda(Direction ~ Lag2, data = Weekly, subset = train)
lda.pred <- predict(lda.fit, Weekly.0910)
table(lda.pred$class, Direction.0910)
## Direction.0910
## Down Up
## Down 9 5
## Up 34 56
mean(lda.pred$class==Direction.0910)
## [1] 0.625
## f
qda.fit = qda(Direction ~ Lag2, data = Weekly, subset = train)
qda.pred = predict(qda.fit, Weekly.0910)
table(qda.pred$class, Direction.0910)
## Direction.0910
## Down Up
## Down 0 0
## Up 43 61
mean(qda.pred$class==Direction.0910)
## [1] 0.5865385
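Note that QDA predicts Up for every week in the test period, so its 58.7% accuracy is just the share of Up weeks in 2009-2010 (61/104); it never identifies a Down week.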
## g
library(class)
# Use Lag2 only, matching parts (d)-(f); columns must be pulled from Weekly explicitly
train.X <- as.matrix(Weekly$Lag2[train])
test.X <- as.matrix(Weekly$Lag2[!train])
train.Direction <- Weekly$Direction[train]
set.seed(1)
knn.pred <- knn(train.X, test.X, train.Direction, k = 1)
table(knn.pred, Direction.0910)
##         Direction.0910
## knn.pred Down Up
##     Down   21 30
##     Up     22 31
mean(knn.pred == Direction.0910)
## [1] 0.5
## h
## i Both logistic regression and LDA achieve the highest test accuracy (62.5%) on the 2009-2010 data; QDA and KNN do worse.
## j
### 14
## a
attach(Auto)
summary(Auto)
## mpg cylinders displacement horsepower weight
## Min. : 9.00 Min. :3.000 Min. : 68.0 Min. : 46.0 Min. :1613
## 1st Qu.:17.00 1st Qu.:4.000 1st Qu.:105.0 1st Qu.: 75.0 1st Qu.:2225
## Median :22.75 Median :4.000 Median :151.0 Median : 93.5 Median :2804
## Mean :23.45 Mean :5.472 Mean :194.4 Mean :104.5 Mean :2978
## 3rd Qu.:29.00 3rd Qu.:8.000 3rd Qu.:275.8 3rd Qu.:126.0 3rd Qu.:3615
## Max. :46.60 Max. :8.000 Max. :455.0 Max. :230.0 Max. :5140
##
## acceleration year origin name
## Min. : 8.00 Min. :70.00 Min. :1.000 amc matador : 5
## 1st Qu.:13.78 1st Qu.:73.00 1st Qu.:1.000 ford pinto : 5
## Median :15.50 Median :76.00 Median :1.000 toyota corolla : 5
## Mean :15.54 Mean :75.98 Mean :1.577 amc gremlin : 4
## 3rd Qu.:17.02 3rd Qu.:79.00 3rd Qu.:2.000 amc hornet : 4
## Max. :24.80 Max. :82.00 Max. :3.000 chevrolet chevette: 4
## (Other) :365
mpg01 <- rep(0, length(mpg))
mpg01[mpg > median(mpg)] <- 1
Auto <- data.frame(Auto, mpg01)
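Since mpg01 splits at the median, the two classes should come out balanced; a quick sanity check (an addition, not in the original):

```r
table(Auto$mpg01)  # expect roughly equal counts of 0 and 1
```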
## b
plot(Auto)
The variables most strongly associated with mpg01 are weight, displacement, and cylinders.
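One way to quantify these associations (a small addition beyond the scatterplot matrix) is to correlate each numeric predictor with mpg01:

```r
# Drop the qualitative name column (9) before computing correlations
round(cor(Auto[, -9])[, "mpg01"], 2)
```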
## c
# The earlier `train` vector belongs to the Weekly data, so draw a fresh 70/30 split of Auto
set.seed(1)
train <- sample(1:nrow(Auto), nrow(Auto) * 0.7)
train.auto <- Auto[train, ]
test.auto <- Auto[-train, ]
## d
lda_m <- lda(mpg01~displacement+horsepower+weight+year+cylinders+origin, data = train.auto)
lda_p <- predict(lda_m, test.auto)
table(lda_p$class, test.auto$mpg01)
##
## 0 1
## 0 166 7
## 1 29 189
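The corresponding test error rate can be computed directly from the predictions (a small addition; the exact value depends on the train/test split):

```r
mean(lda_p$class != test.auto$mpg01)  # LDA test error rate
```

The same one-liner with `qda_p$class` gives the QDA test error in part (e).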
## e
qda_m <- qda(mpg01~displacement+horsepower+weight+year+cylinders+origin, data = train.auto)
qda_p <- predict(qda_m, test.auto)
table(qda_p$class, test.auto$mpg01)
##
## 0 1
## 0 175 14
## 1 20 182
## f
glm_m <- glm(mpg01 ~ cylinders + weight + displacement + horsepower, data = train.auto, family = binomial)
summary(glm_m)
##
## Call:
## glm(formula = mpg01 ~ cylinders + weight + displacement + horsepower,
## family = binomial, data = train.auto)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.4400 -0.1910 0.0476 0.3572 3.3790
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 11.7966002 1.7091082 6.902 5.12e-12 ***
## cylinders -0.0129303 0.3457347 -0.037 0.97017
## weight -0.0019458 0.0006918 -2.812 0.00492 **
## displacement -0.0129820 0.0082203 -1.579 0.11428
## horsepower -0.0421321 0.0139763 -3.015 0.00257 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 543.43 on 391 degrees of freedom
## Residual deviance: 207.27 on 387 degrees of freedom
## (593 observations deleted due to missingness)
## AIC: 217.27
##
## Number of Fisher Scoring iterations: 7
probs <- predict(glm_m, test.auto, type = "response")
pred.glm <- rep(0, length(probs))
pred.glm[probs > 0.5] <- 1
table(pred.glm, test.auto$mpg01)
mean(pred.glm != test.auto$mpg01)  # logistic regression test error rate
## g
## h
### 16
data(Boston)
crim_1 <- rep(0, length(Boston$crim))
crim_1[Boston$crim > median(Boston$crim)] <- 1
Boston <- data.frame(Boston, crim_1)
summary(Boston)
## crim zn indus chas
## Min. : 0.00632 Min. : 0.00 Min. : 0.46 Min. :0.00000
## 1st Qu.: 0.08205 1st Qu.: 0.00 1st Qu.: 5.19 1st Qu.:0.00000
## Median : 0.25651 Median : 0.00 Median : 9.69 Median :0.00000
## Mean : 3.61352 Mean : 11.36 Mean :11.14 Mean :0.06917
## 3rd Qu.: 3.67708 3rd Qu.: 12.50 3rd Qu.:18.10 3rd Qu.:0.00000
## Max. :88.97620 Max. :100.00 Max. :27.74 Max. :1.00000
## nox rm age dis
## Min. :0.3850 Min. :3.561 Min. : 2.90 Min. : 1.130
## 1st Qu.:0.4490 1st Qu.:5.886 1st Qu.: 45.02 1st Qu.: 2.100
## Median :0.5380 Median :6.208 Median : 77.50 Median : 3.207
## Mean :0.5547 Mean :6.285 Mean : 68.57 Mean : 3.795
## 3rd Qu.:0.6240 3rd Qu.:6.623 3rd Qu.: 94.08 3rd Qu.: 5.188
## Max. :0.8710 Max. :8.780 Max. :100.00 Max. :12.127
## rad tax ptratio lstat
## Min. : 1.000 Min. :187.0 Min. :12.60 Min. : 1.73
## 1st Qu.: 4.000 1st Qu.:279.0 1st Qu.:17.40 1st Qu.: 6.95
## Median : 5.000 Median :330.0 Median :19.05 Median :11.36
## Mean : 9.549 Mean :408.2 Mean :18.46 Mean :12.65
## 3rd Qu.:24.000 3rd Qu.:666.0 3rd Qu.:20.20 3rd Qu.:16.95
## Max. :24.000 Max. :711.0 Max. :22.00 Max. :37.97
## medv crim_1
## Min. : 5.00 Min. :0.0
## 1st Qu.:17.02 1st Qu.:0.0
## Median :21.20 Median :0.5
## Mean :22.53 Mean :0.5
## 3rd Qu.:25.00 3rd Qu.:1.0
## Max. :50.00 Max. :1.0
set.seed(1337)
train <- sample(1:nrow(Boston), nrow(Boston) * 0.7, replace = FALSE)
test <- -train
Boston.train <- Boston[train, ]
Boston.test <- Boston[test, ]
crim_1.test <- crim_1[test]
## Logistic Regression
# Fit on the training set only; fitting on all of Boston would leak the test data
fit.glm <- glm(crim_1 ~ nox + indus + age + rad, data = Boston.train, family = binomial)
probs <- predict(fit.glm, Boston.test, type = "response")
pred.glm <- rep(0, length(probs))
pred.glm[probs > 0.5] <- 1
table(pred.glm, crim_1.test)
## crim_1.test
## pred.glm 0 1
## 0 63 16
## 1 7 66
mean(pred.glm != crim_1.test)
## [1] 0.1513158
The test error rate for logistic regression is about 15.1%.
## Linear Discriminant Analysis
fit.lda <- lda(crim_1 ~ nox + indus + age + rad, data = Boston.train)
pred.lda <- predict(fit.lda, Boston.test)
table(pred.lda$class, crim_1.test)
## crim_1.test
## 0 1
## 0 67 22
## 1 3 60
mean(pred.lda$class != crim_1.test)
## [1] 0.1644737
The test error rate for the LDA model is about 16.4%.
## KNN
# Standardize the predictors, excluding crim and the response crim_1 by name
# (the ISLR2 Boston data has 14 columns here, so the old -c(1, 15) index was off)
data <- scale(Boston[, setdiff(names(Boston), c("crim", "crim_1"))])
set.seed(1234)
train <- sample(1:nrow(Boston), nrow(Boston) * 0.7, replace = FALSE)
test <- -train
training_data <- data[train, c("nox", "indus", "age", "rad")]
testing_data <- data[test, c("nox", "indus", "age", "rad")]
train.crime_1 <- Boston$crim_1[train]
test.crime_1 <- Boston$crim_1[test]
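The section stops before KNN is actually run; a minimal sketch to complete it (k = 1 as a starting point; the resulting error rate depends on k and the seed, so no value is quoted here):

```r
library(class)
set.seed(1234)
# KNN with k = 1 on the standardized predictors
knn.pred <- knn(training_data, testing_data, as.factor(train.crime_1), k = 1)
table(knn.pred, test.crime_1)
mean(knn.pred != test.crime_1)  # KNN test error rate
```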