library(RCurl)
## Warning: package 'RCurl' was built under R version 3.2.4
## Loading required package: bitops
library(foreign)
url <- "https://raw.githubusercontent.com/gopakumargeetha/edxTheAnalyticsEdge/master/finalExam/ebay.csv"
ebay.data <- getURL(url)
ebay.data <- read.csv(textConnection(ebay.data))
table(ebay.data$sold)
##
## 0 1
## 2997 799
sort(table(ebay.data$size))
##
## 4 12 11.5 4.5 11 5 10.5 5.5 10 6 9.5 6.5 8.5 9 7
## 14 14 21 28 79 87 139 154 235 264 298 356 385 397 402
## 7.5 8
## 413 442
CONVERTING VARIABLES TO FACTORS
ebay.data$sold <- as.factor(ebay.data$sold)
ebay.data$condition <- as.factor(ebay.data$condition)
ebay.data$heel <- as.factor(ebay.data$heel)
ebay.data$style <- as.factor(ebay.data$style)
ebay.data$color <- as.factor(ebay.data$color)
ebay.data$material <- as.factor(ebay.data$material)
Split the data
library(caTools)
set.seed(144)
spl <- sample.split(ebay.data$sold, SplitRatio = 0.7)
training <- subset(ebay.data, spl == T)
testing <- subset(ebay.data, spl == F)
TRAINING A LOGISTIC REGRESSION MODEL
LR <- glm(sold ~ biddable + startprice + condition + heel + style + color + material, data = training, family = binomial)
summary(LR)
##
## Call:
## glm(formula = sold ~ biddable + startprice + condition + heel +
## style + color + material, family = binomial, data = training)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.5805 -0.7022 -0.5002 -0.2166 5.9322
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.5990788 0.3285428 1.823 0.068236 .
## biddable 0.0113984 0.1114610 0.102 0.918547
## startprice -0.0044423 0.0003041 -14.607 < 2e-16 ***
## conditionNew with defects -0.2451855 0.3727966 -0.658 0.510736
## conditionNew without box -0.2145965 0.2290351 -0.937 0.348780
## conditionPre-owned -0.4952981 0.1374505 -3.603 0.000314 ***
## heelFlat 0.1431346 0.6387994 0.224 0.822704
## heelHigh 0.1224260 0.1340119 0.914 0.360955
## heelLow -2.5549302 1.0411255 -2.454 0.014127 *
## heelMedium -0.5830418 0.2674958 -2.180 0.029285 *
## styleOther/Missing 0.5268920 0.2127852 2.476 0.013280 *
## stylePlatform -0.1712048 0.2102085 -0.814 0.415386
## stylePump 0.4683107 0.1817995 2.576 0.009996 **
## styleSlingback -0.2294999 0.2535765 -0.905 0.365438
## styleStiletto 0.8325406 0.2606786 3.194 0.001404 **
## colorBlack 0.2226547 0.1766847 1.260 0.207604
## colorBrown -0.5252811 0.2982060 -1.761 0.078159 .
## colorOther/Missing -0.2051389 0.1793759 -1.144 0.252779
## colorRed -0.1261035 0.2705234 -0.466 0.641111
## materialOther/Missing -0.2192565 0.1531385 -1.432 0.152214
## materialPatent Leather 0.0809572 0.1431549 0.566 0.571719
## materialSatin -1.1078098 0.3153264 -3.513 0.000443 ***
## materialSnakeskin 0.1562727 0.3444677 0.454 0.650070
## materialSuede -0.0713244 0.1789439 -0.399 0.690199
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2733.9 on 2656 degrees of freedom
## Residual deviance: 2372.7 on 2633 degrees of freedom
## AIC: 2420.7
##
## Number of Fisher Scoring iterations: 5
PREDICTING USING A LOGISTIC REGRESSION MODEL
Consider a shoe that is not for auction (biddable=0), that has start price $100, that is in condition “Pre-owned”, that has “High” heels, that has style “Open Toe”, that has color “Black”, and that has material “Satin”. What is the predicted probability that this shoe will be sold according to the logistic regression model?
LRpredict <- predict(LR, newdata = data.frame(biddable = 0, startprice = 100, condition = "Pre-owned", heel = "High", style = "Open Toe", color = "Black", material = "Satin"), type = "response")
LRpredict # ans 0.2491433
## 1
## 0.2491443
What is the meaning of the coefficient labeled “styleStiletto” in the logistic regression summary output? The coefficients of the model are the log odds associated with that variable; so we see that the odds of being sold are exp(0.8325406)=2.299153 those of an otherwise identical shoe in the baseline category for the style variable (which is “Open Toe”). This means the stiletto is predicted to have 129.9% higher odds of being sold. #styleStiletto 0.8325406 0.2606786 3.194 0.001404 ** 1-exp(0.8325406)
OBTAINING TEST SET PREDICTIONS
LRpred <- predict(LR, newdata = testing, type = "response")
table(LRpred >= 0.5)
##
## FALSE TRUE
## 1059 80
table(training$sold)
##
## 0 1
## 2098 559
library(ROCR)
## Loading required package: gplots
## Warning: package 'gplots' was built under R version 3.2.4
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
ROCRpred <- prediction(LRpred, testing$sold)
as.numeric(performance(ROCRpred, "auc")@y.values)
## [1] 0.7444244
library(RCurl)
library(foreign)
url <- "https://raw.githubusercontent.com/gopakumargeetha/edxTheAnalyticsEdge/master/finalExam/HubwayTrips.csv"
Hubway.data <- getURL(url)
Hubway.data <- read.csv(textConnection(Hubway.data))
Understanding Customers of Hubway
Problem 1 - Reading in the Data How many observations are in this dataset?
str(Hubway.data) #Ans 185190
## 'data.frame': 185190 obs. of 7 variables:
## $ Duration : int 212 229 259 273 279 291 295 298 298 303 ...
## $ Morning : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Afternoon: int 1 1 0 1 1 1 1 1 1 1 ...
## $ Evening : int 0 0 1 0 0 0 0 0 0 0 ...
## $ Weekday : int 1 1 0 1 1 1 1 1 1 1 ...
## $ Male : int 1 1 0 1 0 1 1 0 0 1 ...
## $ Age : int 17 17 17 17 17 17 17 17 17 17 ...
Problem 2 - Average Duration What is the average duration (in seconds) of all trips in this dataset? What is the average duration (in seconds) of trips taken on the weekdays? What is the average duration (in seconds) of trips taken on the weekends?
summary(Hubway.data$Duration) #Ans 721.6
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 180.0 377.0 562.0 721.6 860.0 85040.0
tapply(Hubway.data$Duration, Hubway.data$Weekday, mean) #Ans 700.0921 # 1
## 0 1
## 826.2457 700.0921
tapply(Hubway.data$Duration, Hubway.data$Weekday, mean) #Ans 826.2457 # 0
## 0 1
## 826.2457 700.0921
Problem 3 - Time of Day How many trips were taken in the morning? How many trips were taken in the afternoon? How many trips were taken in the evening?
table(Hubway.data$Morning) #Ans 60399
##
## 0 1
## 124791 60399
table(Hubway.data$Afternoon) #Ans 74021
##
## 0 1
## 111169 74021
table(Hubway.data$Evening) #Ans 46264
##
## 0 1
## 138926 46264
Problem 4 - Gender Distribution In this dataset, what proportion of trips are taken by male users?
table(Hubway.data$Male)
##
## 0 1
## 48685 136505
136505/(136505+48685) #Ans 0.7371078
## [1] 0.7371078
Problem 5 - Importance of Normalizing which variable would you expect to dominate in the distance calculations? Duration
Problem 6 - Normalizing the Data What is the maximum value of Duration in the normalized dataset? # 67.465 What is the maximum value of Age in the normalized dataset? # 3.8770
library(caret)
## Warning: package 'caret' was built under R version 3.2.5
## Loading required package: lattice
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.2.4
preproc = preProcess(Hubway.data)
HubwayNorm = predict(preproc, Hubway.data)
summary(HubwayNorm)
## Duration Morning Afternoon Evening
## Min. :-0.4333 Min. :-0.6957 Min. :-0.816 Min. :-0.5771
## 1st Qu.:-0.2757 1st Qu.:-0.6957 1st Qu.:-0.816 1st Qu.:-0.5771
## Median :-0.1277 Median :-0.6957 Median :-0.816 Median :-0.5771
## Mean : 0.0000 Mean : 0.0000 Mean : 0.000 Mean : 0.0000
## 3rd Qu.: 0.1108 3rd Qu.: 1.4374 3rd Qu.: 1.225 3rd Qu.:-0.5771
## Max. :67.4650 Max. : 1.4374 Max. : 1.225 Max. : 1.7329
## Weekday Male Age
## Min. :-2.2088 Min. :-1.6745 Min. :-1.6711
## 1st Qu.: 0.4527 1st Qu.:-1.6745 1st Qu.:-0.7616
## Median : 0.4527 Median : 0.5972 Median :-0.3068
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.4527 3rd Qu.: 0.5972 3rd Qu.: 0.6027
## Max. : 0.4527 Max. : 0.5972 Max. : 3.8770
Problem 7 - Hierarchical Clustering Why do you think hierarchical clustering might have a problem with this dataset? Ans: We might have too many observations in our dataset for Hierarchical clustering to handle
Problem 8 - K-Means Clustering How many observations are in the smallest cluster?
set.seed(5000)
HubwayKmeans = kmeans(HubwayNorm, 10)
HubwayCluster = HubwayKmeans$cluster
table(HubwayCluster)
## HubwayCluster
## 1 2 3 4 5 6 7 8 9 10
## 28473 36409 20256 10819 9720 22931 16067 14981 15635 9899
Problem 9 - Understanding the Clusters Which cluster best fits the description “trips taken by female users on weekday evenings”? Ans: cluster 10
HubwayCluster1 = subset(HubwayNorm, HubwayCluster==1)
HubwayCluster2 = subset(HubwayNorm, HubwayCluster==2)
HubwayCluster3 = subset(HubwayNorm, HubwayCluster==3)
HubwayCluster4 = subset(HubwayNorm, HubwayCluster==4)
HubwayCluster5 = subset(HubwayNorm, HubwayCluster==5)
HubwayCluster6 = subset(HubwayNorm, HubwayCluster==6)
HubwayCluster7 = subset(HubwayNorm, HubwayCluster==7)
HubwayCluster8 = subset(HubwayNorm, HubwayCluster==8)
HubwayCluster9 = subset(HubwayNorm, HubwayCluster==9)
HubwayCluster10 = subset(HubwayNorm, HubwayCluster==10)
colMeans(HubwayCluster1)
## Duration Morning Afternoon Evening Weekday Male
## -0.03423716 -0.69570011 -0.81598928 1.73288205 0.45273035 0.59720321
## Age
## -0.21374647
colMeans(HubwayCluster2)
## Duration Morning Afternoon Evening Weekday Male
## -0.03164463 -0.69570011 1.22544361 -0.57707020 0.45273035 -0.11039559
## Age
## -0.54633125
colMeans(HubwayCluster3)
## Duration Morning Afternoon Evening Weekday
## -0.075974588 1.274694288 -0.815989277 -0.577070205 0.002308007
## Male Age
## 0.597203212 -0.751562117
colMeans(HubwayCluster4)
## Duration Morning Afternoon Evening Weekday Male
## -0.03339589 1.43128118 -0.81598928 -0.57664319 0.21189048 0.53694181
## Age
## 1.70549223
colMeans(HubwayCluster5)
## Duration Morning Afternoon Evening Weekday Male
## 0.01065871 -0.69570011 -0.81598928 1.31652029 -2.20880842 0.02391135
## Age
## -0.43695957
colMeans(HubwayCluster6)
## Duration Morning Afternoon Evening Weekday Male
## -0.02970026 -0.69570011 1.20769419 -0.57707020 0.44425743 0.20767851
## Age
## 1.30751246
colMeans(HubwayCluster7)
## Duration Morning Afternoon Evening Weekday
## 0.009374313 1.361320350 -0.815989277 -0.577070205 0.071729589
## Male Age
## -1.674462863 -0.074165535
colMeans(HubwayCluster8)
## Duration Morning Afternoon Evening Weekday Male
## 0.1050650 -0.6952729 1.2098284 -0.5770702 -2.2088084 -0.1314101
## Age
## -0.1854647
colMeans(HubwayCluster9)
## Duration Morning Afternoon Evening Weekday Male
## -0.1109532 1.4020577 -0.8159893 -0.5770702 0.4527303 0.5972032
## Age
## 0.2622499
colMeans(HubwayCluster10)
## Duration Morning Afternoon Evening Weekday Male
## 0.4661932 -0.6898820 -0.8126896 1.7146806 0.4473530 -1.6517439
## Age
## -0.3148030
Problem 10 - Understanding the Clusters Which cluster best fits the description “leisurely (longer than average) afternoon trips taken on the weekends”? HubwayKmeans$centers Ans: Cluster 8 0.105065026 -0.6952729 1.2098284 -0.5770702 -2.208808422 -0.13141006 -0.18546466
Problem 11 - Understanding the Clusters Which cluster best fits the description “morning trips taken by older male users”? HubwayKmeans$centers Ans: cluster 4 4 -0.033395894 1.4312812 -0.8159893 -0.5766432 0.211890485 0.53694181 1.70549223