Please submit your answers by 5:59 pm on Monday, March 4, 2019.
We are going to perform prediction on a voting dataset (Files->Assignments->Assignment_4). The dataset contains the party affiliation of 435 congressional members along with their voting record on 16 issues that were put to a vote in a single year. The party affiliation is indicated as a binary variable: 1 if the congress-person was a member of the 'A' party and 0 otherwise. The vote is indicated as 1 (if the member voted yes) and 0 (if the member voted no).
Class distribution: 0 (168 members), 1 (267 members)
Using caret, create a rough 80-20 split of the dataset into training and testing sets. In other words, 80% of the data should comprise the training dataset and 20% should comprise the testing dataset. Ensure that class membership is preserved (in other words, the proportions of 1 and 0 in training and testing should be approximately the same).
NOTE: Set the seed to 476
# Insert your code below
rm(list=ls())
library(plyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
data_votes <- read.csv("~/Desktop/data_votes.csv", header = TRUE)
data_votes <- data_votes %>%
  mutate_if(is.numeric, as.factor)
#change party_A to factor
#data_votes$party_A <- as.factor(data_votes$party_A)
set.seed(476)
#to create the 80-20 split
data_index <- createDataPartition(data_votes$party_A, p=.8, list = FALSE, times=1)
head(data_index) #to double check
## Resample1
## [1,] 1
## [2,] 2
## [3,] 3
## [4,] 4
## [5,] 5
## [6,] 8
data_votes.train <- data_votes[data_index,]
data_votes.test <- data_votes[-data_index,]
summary(as.factor(data_votes.train$party_A))
## 0 1
## 135 214
summary(as.factor(data_votes.test$party_A))
## 0 1
## 33 53
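As a quick check that the split is stratified, the class proportions in each set can be compared directly (a sketch; not required by the assignment):
#Sanity check (sketch): class proportions should be similar across the split
prop.table(table(data_votes.train$party_A)) #~0.39 / 0.61
prop.table(table(data_votes.test$party_A))  #~0.38 / 0.62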
# Insert your code below
#Logistic regression on dataset
data_votes.train.log <- glm(party_A~., data=data_votes.train, family="binomial")
#Perform prediction on test dataset
data_votes.test$pred_party_A <- predict.glm(data_votes.train.log, newdata = data_votes.test, type="response")
# Insert your code below
#Note: we store the thresholded labels in a new variable, pred_party_A1, so the predicted probabilities from b) are not overwritten in c)
data_votes.test$pred_party_A1 <- ifelse(data_votes.test$pred_party_A >= 0.5,1,0)
#True positive TP
tp <- data_votes.test %>%
  filter(party_A == 1 & pred_party_A1 == 1) %>% nrow()
print(tp)
## [1] 50
cat("True positive TP:", tp,"\n")
## True positive TP: 50
#True negative TN
tn <- data_votes.test %>%
  filter(party_A == 0 & pred_party_A1 == 0) %>% nrow()
print(tn)
## [1] 33
cat("True negative TN:", tn,"\n")
## True negative TN: 33
#False positive FP
fp <- data_votes.test %>%
  filter(party_A == 0 & pred_party_A1 == 1) %>% nrow()
print(fp)
## [1] 0
cat("False positive FP:", fp,"\n")
## False positive FP: 0
#False negative FN
fn <- data_votes.test %>%
  filter(party_A == 1 & pred_party_A1 == 0) %>% nrow()
print(fn)
## [1] 3
cat("False negative FN:", fn,"\n")
## False negative FN: 3
| Table | Actual positive | Actual negative |
|---|---|---|
| Predicted positive | 50 (TP) | 0 (FP) |
| Predicted negative | 3 (FN) | 33 (TN) |
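A reconstruction of the metric calculations from the TP/TN/FP/FN counts above; the formulas are standard, and the variable names are my assumption. This reproduces the output that follows:
#Compute performance metrics from the confusion-matrix counts
sensitivity <- tp/(tp + fn) #true positive rate
print(sensitivity)
cat("Sensitivity TPR:", sensitivity, "\n")
specificity <- tn/(tn + fp) #equals 1 - FPR
print(specificity)
cat("Specificity (1-FPR):", specificity, "\n")
ppv <- tp/(tp + fp)
print(ppv)
cat("Positive Predictive Value PPV:", ppv, "\n")
npv <- tn/(tn + fn)
print(npv)
cat("Negative Predictive Value NPV:", npv, "\n")
fpr <- fp/(fp + tn)
print(fpr)
cat("False Positive Rate FPR:", fpr, "\n")
accuracy <- (tp + tn)/(tp + tn + fp + fn)
print(accuracy)
cat("Accuracy:", accuracy, "\n")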
## [1] 0.9433962
## Sensitivity TPR: 0.9433962
## [1] 1
## Specificity (1-FPR): 1
## [1] 1
## Positive Predictive Value PPV: 1
## [1] 0.9166667
## Negative Predictive Value NPV: 0.9166667
## [1] 0
## False Positive Rate FPR: 0
## [1] 0.9651163
## Accuracy: 0.9651163
# Insert code below
#First create an ROC object from the predicted probabilities
pred <- roc(response = data_votes.test$party_A, predictor = data_votes.test$pred_party_A, direction = "<")
#Get AUC performance
auc_perf <- auc(pred)
cat("AUC: ", auc_perf, "\n")
## AUC: 0.9914237
#AUC 95% CI (lower bound, AUC, upper bound)
ci_auc_perf <- ci.auc(pred)
cat("95% CI: ", ci_auc_perf, "\n")
## 95% CI: 0.9761996 0.9914237 1
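The ROC curve itself can also be plotted from the same pROC object; a sketch for reference:
#Sketch: visualize the ROC curve and print the AUC on the plot
plot(pred, main = "ROC: logistic regression on test set", print.auc = TRUE)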
Write a program that will perform 3-fold cross-validation (using the caret package) on the training dataset above. Calculate the AUC (with 95% CI) on the test dataset.
NOTE: Set your seed to 156
# Insert code here
library(plyr)
library(dplyr)
library(caret)
library(pROC)
set.seed(156)
training_params <- trainControl(method="cv", number=3)
#Training
data_votes.glm <- train(as.factor(party_A) ~., data=data_votes.train, method = "glm", trControl = training_params)
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
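#The warnings above indicate (quasi-)complete separation: some vote combinations predict party perfectly, so fitted probabilities reach 0 or 1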
#summary(data_votes.glm)
#Prediction using caret
votes_yhat_glm <- predict(data_votes.glm, newdata = data_votes.test, type = "prob")
#Create an ROC object
votes_glm.pred <- roc(predictor = votes_yhat_glm[,2],
                      response = data_votes.test$party_A, direction = "<")
# Get performance
AUC <- auc(votes_glm.pred)
CI_AUC <- ci.auc(votes_glm.pred)
cat("AUC:",AUC,"\n")
## AUC: 0.9914237
cat("95%CI:", CI_AUC, "\n")
## 95%CI: 0.9761996 0.9914237 1
We are going to use the USArrests dataset. Load it using the following commands:
rm(list=ls())
library(datasets)
d.in <- data.frame(USArrests)
#to remove missing values (if any), use the following:
#d.in <- na.omit(d.in)
Ans.
# Insert code here
# Using Euclidean distance
USArrest_in <- dist(d.in, method = "euclidean")
USArrest_temp_matrix <- as.matrix(USArrest_in)
print(USArrest_temp_matrix[1:6,1:6])
## Alabama Alaska Arizona Arkansas California Colorado
## Alabama 0.00000 37.17701 63.00833 46.92814 55.52477 41.93256
## Alaska 37.17701 0.00000 46.59249 77.19741 45.10222 66.47594
## Arizona 63.00833 46.59249 0.00000 108.85192 23.19418 90.35115
## Arkansas 46.92814 77.19741 108.85192 0.00000 97.58202 36.73486
## California 55.52477 45.10222 23.19418 97.58202 0.00000 73.19713
## Colorado 41.93256 66.47594 90.35115 36.73486 73.19713 0.00000
#Euclidean distance plot
plot(USArrest_in, main = "Euclidean Distance")
#Average linkage dendrogram
USArrest_average.h_in <- hclust(USArrest_in, method = "average")
plot(USArrest_average.h_in, main = "Average Linkage Cluster Dendrogram")
# Insert code here
d.in.scale <- as.data.frame(scale(d.in))
summary(d.in.scale)
## Murder Assault UrbanPop Rape
## Min. :-1.6044 Min. :-1.5090 Min. :-2.31714 Min. :-1.4874
## 1st Qu.:-0.8525 1st Qu.:-0.7411 1st Qu.:-0.76271 1st Qu.:-0.6574
## Median :-0.1235 Median :-0.1411 Median : 0.03178 Median :-0.1209
## Mean : 0.0000 Mean : 0.0000 Mean : 0.00000 Mean : 0.0000
## 3rd Qu.: 0.7949 3rd Qu.: 0.9388 3rd Qu.: 0.84354 3rd Qu.: 0.5277
## Max. : 2.2069 Max. : 1.9948 Max. : 1.75892 Max. : 2.6444
USArrest_in_scale <- dist(d.in.scale, method="euclidean")
USArrest_temp_matrix_scale <- as.matrix(USArrest_in_scale)
print(USArrest_temp_matrix_scale[1:6,1:6])
## Alabama Alaska Arizona Arkansas California Colorado
## Alabama 0.000000 2.703754 2.293520 1.289810 3.263110 2.651067
## Alaska 2.703754 0.000000 2.700643 2.826039 3.012541 2.326519
## Arizona 2.293520 2.700643 0.000000 2.717758 1.310484 1.365031
## Arkansas 1.289810 2.826039 2.717758 0.000000 3.763641 2.831051
## California 3.263110 3.012541 1.310484 3.763641 0.000000 1.287619
## Colorado 2.651067 2.326519 1.365031 2.831051 1.287619 0.000000
#sd(d.in.scale$Assault) confirms the standard deviation is 1
#Euclidean distance plot, scaled data
plot(USArrest_in_scale, main = "Scaled Euclidean Distance")
#Average linkage dendrogram, scaled data
USArrest_average.h_in_scale <- hclust(USArrest_in_scale, method = "average")
plot(USArrest_average.h_in_scale, main = "Scaled Average Linkage Cluster Dendrogram")
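To go from a dendrogram to actual cluster assignments, the tree can be cut at a chosen number of clusters. A minimal sketch; the choice of k = 4 is an assumption for illustration:
#Sketch: cut the scaled dendrogram into k = 4 clusters
clusters <- cutree(USArrest_average.h_in_scale, k = 4)
table(clusters) #number of states in each cluster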
Download the dataset kmeans_data.csv (Files->Assignments->Assignment_4). The dataset contains 100 randomly generated observations of 2 variables, X1 and X2.
# Insert code
rm(list=ls())
kmeans_data <- read.csv("~/Desktop/kmeans_data.csv", header = TRUE)
plot(x=kmeans_data$X1, y=kmeans_data$X2, type = "p", col="black", pch=1, cex=2, xlab="X1", ylab="X2", main="Plot X1 vs X2")
#pch sets the point shape; cex sets its size
# Insert code
set.seed(451)
kmeans_data.in <- kmeans(kmeans_data,centers=3)
plot(kmeans_data$X1, kmeans_data$X2, col=kmeans_data.in$cluster, pch=1, cex=1, main="K-means clustering (K=3)")
text(kmeans_data$X1, kmeans_data$X2, labels=kmeans_data.in$cluster,pos=4)
points(kmeans_data.in$centers, col=1:3, pch=3, cex=3, lwd=3)
kmeans_data.in$tot.withinss #total within-cluster sum of squares; lower values indicate tighter clusters
## [1] 219.1554
kmeans_data.in$betweenss #between-cluster sum of squares; higher values indicate better-separated clusters
## [1] 1771.599
# Insert code
set.seed(451)
kmeans_data.in.2 <- kmeans(kmeans_data,centers=4)
plot(kmeans_data$X1, kmeans_data$X2, col=kmeans_data.in.2$cluster, pch=1, cex=1, main="K-means clustering (K=4)")
text(kmeans_data$X1, kmeans_data$X2, labels=kmeans_data.in.2$cluster,pos=4)
points(kmeans_data.in.2$centers, col=1:4, pch=3, cex=4, lwd=4)
kmeans_data.in.2$tot.withinss #total within-cluster sum of squares; lower values indicate tighter clusters
## [1] 197.9767
kmeans_data.in.2$betweenss #between-cluster sum of squares; higher values indicate better-separated clusters
## [1] 1792.778
| K | tot.withinss | betweenss |
|---|---|---|
| 3 | 219.1554 | 1771.599 |
| 4 | 197.9767 | 1792.778 |
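Note that tot.withinss always decreases as K grows, so the raw comparison above will favor larger K. A common way to choose K is to look for an elbow in tot.withinss across a range of K. A minimal sketch; the range 1 to 8 and nstart = 10 are assumptions:
#Sketch: elbow plot of total within-cluster SS for K = 1..8
set.seed(451)
wss <- sapply(1:8, function(k) kmeans(kmeans_data, centers = k, nstart = 10)$tot.withinss)
plot(1:8, wss, type = "b", xlab = "K", ylab = "Total within-cluster SS", main = "Elbow plot")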
You are given the following distance matrix, which describes the Euclidean distance between cities.
| Table | BOS | NY | DC | CHI |
|---|---|---|---|---|
| BOS | 0 | 206 | 429 | 963 |
| NY | 206 | 0 | 233 | 802 |
| DC | 429 | 233 | 0 | 671 |
| CHI | 963 | 802 | 671 | 0 |
You are asked to perform a hierarchical clustering using single linkage.
The nearest pair of cities is BOS and NY, at distance 206.
Ans. Calculations:
d((BOS/NY), DC) = min[d(BOS,DC), d(NY,DC)] = min[429, 233] = 233
d((BOS/NY), CHI) = min[d(BOS,CHI), d(NY,CHI)] = min[963, 802] = 802
I updated these values and created the table shown below:
| Table | (BOS/NY) | DC | CHI |
|---|---|---|---|
| (BOS/NY) | 0 | 233 | 802 |
| DC | 233 | 0 | 671 |
| CHI | 802 | 671 | 0 |
Ans. Calculation: d((BOS/NY/DC), CHI) = min[d((BOS/NY),CHI), d(DC,CHI)] = min[802, 671] = 671
I updated the value and created the table shown below, leaving two clusters:
| Table | (BOS/NY/DC) | CHI |
|---|---|---|
| (BOS/NY/DC) | 0 | 671 |
| CHI | 671 | 0 |
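These hand calculations can be checked in R with hclust; a minimal sketch that rebuilds the distance matrix (the variable names are mine):
#Sketch: verify the single-linkage merges in R
cities <- c("BOS", "NY", "DC", "CHI")
m <- matrix(c(0, 206, 429, 963,
              206, 0, 233, 802,
              429, 233, 0, 671,
              963, 802, 671, 0),
            nrow = 4, dimnames = list(cities, cities))
hc <- hclust(as.dist(m), method = "single")
hc$height #merge heights: 206 233 671, matching the calculations above
plot(hc, main = "Single Linkage Dendrogram of Cities")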