library(RCurl)
## Warning: package 'RCurl' was built under R version 3.2.4
## Loading required package: bitops
library(foreign)

url <- "https://raw.githubusercontent.com/gopakumargeetha/edxTheAnalyticsEdge/master/finalExam/ebay.csv"
ebay.data <- getURL(url)                
ebay.data <- read.csv(textConnection(ebay.data))
table(ebay.data$sold)
## 
##    0    1 
## 2997  799
sort(table(ebay.data$size))
## 
##    4   12 11.5  4.5   11    5 10.5  5.5   10    6  9.5  6.5  8.5    9    7 
##   14   14   21   28   79   87  139  154  235  264  298  356  385  397  402 
##  7.5    8 
##  413  442

CONVERTING VARIABLES TO FACTORS

ebay.data$sold <- as.factor(ebay.data$sold)
ebay.data$condition <- as.factor(ebay.data$condition)
ebay.data$heel <- as.factor(ebay.data$heel)
ebay.data$style <- as.factor(ebay.data$style)
ebay.data$color <- as.factor(ebay.data$color)
ebay.data$material <- as.factor(ebay.data$material)

Split the data

library(caTools)
set.seed(144)
spl <- sample.split(ebay.data$sold, SplitRatio = 0.7)
training <- subset(ebay.data, spl == T)
testing <- subset(ebay.data, spl == F)

TRAINING A LOGISTIC REGRESSION MODEL

LR <- glm(sold ~ biddable + startprice + condition + heel + style + color + material, data = training, family = binomial)
summary(LR)
## 
## Call:
## glm(formula = sold ~ biddable + startprice + condition + heel + 
##     style + color + material, family = binomial, data = training)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.5805  -0.7022  -0.5002  -0.2166   5.9322  
## 
## Coefficients:
##                             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                0.5990788  0.3285428   1.823 0.068236 .  
## biddable                   0.0113984  0.1114610   0.102 0.918547    
## startprice                -0.0044423  0.0003041 -14.607  < 2e-16 ***
## conditionNew with defects -0.2451855  0.3727966  -0.658 0.510736    
## conditionNew without box  -0.2145965  0.2290351  -0.937 0.348780    
## conditionPre-owned        -0.4952981  0.1374505  -3.603 0.000314 ***
## heelFlat                   0.1431346  0.6387994   0.224 0.822704    
## heelHigh                   0.1224260  0.1340119   0.914 0.360955    
## heelLow                   -2.5549302  1.0411255  -2.454 0.014127 *  
## heelMedium                -0.5830418  0.2674958  -2.180 0.029285 *  
## styleOther/Missing         0.5268920  0.2127852   2.476 0.013280 *  
## stylePlatform             -0.1712048  0.2102085  -0.814 0.415386    
## stylePump                  0.4683107  0.1817995   2.576 0.009996 ** 
## styleSlingback            -0.2294999  0.2535765  -0.905 0.365438    
## styleStiletto              0.8325406  0.2606786   3.194 0.001404 ** 
## colorBlack                 0.2226547  0.1766847   1.260 0.207604    
## colorBrown                -0.5252811  0.2982060  -1.761 0.078159 .  
## colorOther/Missing        -0.2051389  0.1793759  -1.144 0.252779    
## colorRed                  -0.1261035  0.2705234  -0.466 0.641111    
## materialOther/Missing     -0.2192565  0.1531385  -1.432 0.152214    
## materialPatent Leather     0.0809572  0.1431549   0.566 0.571719    
## materialSatin             -1.1078098  0.3153264  -3.513 0.000443 ***
## materialSnakeskin          0.1562727  0.3444677   0.454 0.650070    
## materialSuede             -0.0713244  0.1789439  -0.399 0.690199    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2733.9  on 2656  degrees of freedom
## Residual deviance: 2372.7  on 2633  degrees of freedom
## AIC: 2420.7
## 
## Number of Fisher Scoring iterations: 5

PREDICTING USING A LOGISTIC REGRESSION MODEL

Consider a shoe that is not for auction (biddable=0), that has start price $100, that is in condition “Pre-owned”, that has “High” heels, that has style “Open Toe”, that has color “Black”, and that has material “Satin”. What is the predicted probability that this shoe will be sold according to the logistic regression model?

LRpredict <- predict(LR, newdata = data.frame(biddable = 0, startprice = 100, condition = "Pre-owned", heel = "High", style = "Open Toe", color = "Black", material = "Satin"), type = "response")
LRpredict  # ans 0.2491433
##         1 
## 0.2491443

What is the meaning of the coefficient labeled “styleStiletto” in the logistic regression summary output? The coefficients of the model are the log odds associated with that variable; so we see that the odds of being sold are exp(0.8325406)=2.299153 those of an otherwise identical shoe in the baseline category for the style variable (which is “Open Toe”). This means the stiletto is predicted to have 129.9% higher odds of being sold. #styleStiletto 0.8325406 0.2606786 3.194 0.001404 ** 1-exp(0.8325406)

OBTAINING TEST SET PREDICTIONS

LRpred <- predict(LR, newdata = testing, type = "response")
table(LRpred >= 0.5)
## 
## FALSE  TRUE 
##  1059    80
table(training$sold)
## 
##    0    1 
## 2098  559
library(ROCR)
## Loading required package: gplots
## Warning: package 'gplots' was built under R version 3.2.4
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
ROCRpred <- prediction(LRpred, testing$sold)
as.numeric(performance(ROCRpred, "auc")@y.values)
## [1] 0.7444244
library(RCurl)
library(foreign)
url <- "https://raw.githubusercontent.com/gopakumargeetha/edxTheAnalyticsEdge/master/finalExam/HubwayTrips.csv"
Hubway.data <- getURL(url)                
Hubway.data <- read.csv(textConnection(Hubway.data))

Understanding Customers of Hubway

Problem 1 - Reading in the Data How many observations are in this dataset?

str(Hubway.data)  #Ans 185190
## 'data.frame':    185190 obs. of  7 variables:
##  $ Duration : int  212 229 259 273 279 291 295 298 298 303 ...
##  $ Morning  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Afternoon: int  1 1 0 1 1 1 1 1 1 1 ...
##  $ Evening  : int  0 0 1 0 0 0 0 0 0 0 ...
##  $ Weekday  : int  1 1 0 1 1 1 1 1 1 1 ...
##  $ Male     : int  1 1 0 1 0 1 1 0 0 1 ...
##  $ Age      : int  17 17 17 17 17 17 17 17 17 17 ...

Problem 2 - Average Duration What is the average duration (in seconds) of all trips in this dataset? What is the average duration (in seconds) of trips taken on the weekdays? What is the average duration (in seconds) of trips taken on the weekends?

summary(Hubway.data$Duration)  #Ans 721.6
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   180.0   377.0   562.0   721.6   860.0 85040.0
tapply(Hubway.data$Duration, Hubway.data$Weekday, mean)  #Ans 700.0921  # 1
##        0        1 
## 826.2457 700.0921
tapply(Hubway.data$Duration, Hubway.data$Weekday, mean)  #Ans 826.2457  # 0
##        0        1 
## 826.2457 700.0921

Problem 3 - Time of Day How many trips were taken in the morning? How many trips were taken in the afternoon? How many trips were taken in the evening?

table(Hubway.data$Morning)  #Ans 60399
## 
##      0      1 
## 124791  60399
table(Hubway.data$Afternoon)  #Ans 74021
## 
##      0      1 
## 111169  74021
table(Hubway.data$Evening)    #Ans 46264
## 
##      0      1 
## 138926  46264

Problem 4 - Gender Distribution In this dataset, what proportion of trips are taken by male users?

table(Hubway.data$Male)
## 
##      0      1 
##  48685 136505
136505/(136505+48685)  #Ans 0.7371078
## [1] 0.7371078

Problem 5 - Importance of Normalizing which variable would you expect to dominate in the distance calculations? Duration

Problem 6 - Normalizing the Data What is the maximum value of Duration in the normalized dataset? # 67.465 What is the maximum value of Age in the normalized dataset? # 3.8770

library(caret)
## Warning: package 'caret' was built under R version 3.2.5
## Loading required package: lattice
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.2.4
preproc = preProcess(Hubway.data)
HubwayNorm = predict(preproc, Hubway.data)
summary(HubwayNorm)
##     Duration          Morning          Afternoon         Evening       
##  Min.   :-0.4333   Min.   :-0.6957   Min.   :-0.816   Min.   :-0.5771  
##  1st Qu.:-0.2757   1st Qu.:-0.6957   1st Qu.:-0.816   1st Qu.:-0.5771  
##  Median :-0.1277   Median :-0.6957   Median :-0.816   Median :-0.5771  
##  Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.000   Mean   : 0.0000  
##  3rd Qu.: 0.1108   3rd Qu.: 1.4374   3rd Qu.: 1.225   3rd Qu.:-0.5771  
##  Max.   :67.4650   Max.   : 1.4374   Max.   : 1.225   Max.   : 1.7329  
##     Weekday             Male              Age         
##  Min.   :-2.2088   Min.   :-1.6745   Min.   :-1.6711  
##  1st Qu.: 0.4527   1st Qu.:-1.6745   1st Qu.:-0.7616  
##  Median : 0.4527   Median : 0.5972   Median :-0.3068  
##  Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.: 0.4527   3rd Qu.: 0.5972   3rd Qu.: 0.6027  
##  Max.   : 0.4527   Max.   : 0.5972   Max.   : 3.8770

Problem 7 - Hierarchical Clustering Why do you think hierarchical clustering might have a problem with this dataset? Ans: We might have too many observations in our dataset for Hierarchical clustering to handle

Problem 8 - K-Means Clustering How many observations are in the smallest cluster?

set.seed(5000)
HubwayKmeans = kmeans(HubwayNorm, 10)
HubwayCluster = HubwayKmeans$cluster
table(HubwayCluster)
## HubwayCluster
##     1     2     3     4     5     6     7     8     9    10 
## 28473 36409 20256 10819  9720 22931 16067 14981 15635  9899

Problem 9 - Understanding the Clusters Which cluster best fits the description “trips taken by female users on weekday evenings”? Ans: cluster 10

HubwayCluster1 = subset(HubwayNorm, HubwayCluster==1)
HubwayCluster2 = subset(HubwayNorm, HubwayCluster==2)
HubwayCluster3 = subset(HubwayNorm, HubwayCluster==3)
HubwayCluster4 = subset(HubwayNorm, HubwayCluster==4)
HubwayCluster5 = subset(HubwayNorm, HubwayCluster==5)
HubwayCluster6 = subset(HubwayNorm, HubwayCluster==6)
HubwayCluster7 = subset(HubwayNorm, HubwayCluster==7)
HubwayCluster8 = subset(HubwayNorm, HubwayCluster==8)
HubwayCluster9 = subset(HubwayNorm, HubwayCluster==9)
HubwayCluster10 = subset(HubwayNorm, HubwayCluster==10)
colMeans(HubwayCluster1)
##    Duration     Morning   Afternoon     Evening     Weekday        Male 
## -0.03423716 -0.69570011 -0.81598928  1.73288205  0.45273035  0.59720321 
##         Age 
## -0.21374647
colMeans(HubwayCluster2)
##    Duration     Morning   Afternoon     Evening     Weekday        Male 
## -0.03164463 -0.69570011  1.22544361 -0.57707020  0.45273035 -0.11039559 
##         Age 
## -0.54633125
colMeans(HubwayCluster3)
##     Duration      Morning    Afternoon      Evening      Weekday 
## -0.075974588  1.274694288 -0.815989277 -0.577070205  0.002308007 
##         Male          Age 
##  0.597203212 -0.751562117
colMeans(HubwayCluster4)
##    Duration     Morning   Afternoon     Evening     Weekday        Male 
## -0.03339589  1.43128118 -0.81598928 -0.57664319  0.21189048  0.53694181 
##         Age 
##  1.70549223
colMeans(HubwayCluster5)
##    Duration     Morning   Afternoon     Evening     Weekday        Male 
##  0.01065871 -0.69570011 -0.81598928  1.31652029 -2.20880842  0.02391135 
##         Age 
## -0.43695957
colMeans(HubwayCluster6)
##    Duration     Morning   Afternoon     Evening     Weekday        Male 
## -0.02970026 -0.69570011  1.20769419 -0.57707020  0.44425743  0.20767851 
##         Age 
##  1.30751246
colMeans(HubwayCluster7)
##     Duration      Morning    Afternoon      Evening      Weekday 
##  0.009374313  1.361320350 -0.815989277 -0.577070205  0.071729589 
##         Male          Age 
## -1.674462863 -0.074165535
colMeans(HubwayCluster8)
##   Duration    Morning  Afternoon    Evening    Weekday       Male 
##  0.1050650 -0.6952729  1.2098284 -0.5770702 -2.2088084 -0.1314101 
##        Age 
## -0.1854647
colMeans(HubwayCluster9)
##   Duration    Morning  Afternoon    Evening    Weekday       Male 
## -0.1109532  1.4020577 -0.8159893 -0.5770702  0.4527303  0.5972032 
##        Age 
##  0.2622499
colMeans(HubwayCluster10)
##   Duration    Morning  Afternoon    Evening    Weekday       Male 
##  0.4661932 -0.6898820 -0.8126896  1.7146806  0.4473530 -1.6517439 
##        Age 
## -0.3148030

Problem 10 - Understanding the Clusters Which cluster best fits the description “leisurely (longer than average) afternoon trips taken on the weekends”? HubwayKmeans$centers Ans: Cluster 8 0.105065026 -0.6952729 1.2098284 -0.5770702 -2.208808422 -0.13141006 -0.18546466

Problem 11 - Understanding the Clusters Which cluster best fits the description “morning trips taken by older male users”? HubwayKmeans$centers Ans: cluster 4 4 -0.033395894 1.4312812 -0.8159893 -0.5766432 0.211890485 0.53694181 1.70549223