Collinearity check
x=cor(travel[,c("NumberofResponses","InFoNumber","FoFoNumber")])
x
## NumberofResponses InFoNumber FoFoNumber
## NumberofResponses 1.0000000 0.8712372 0.7815957
## InFoNumber 0.8712372 1.0000000 0.4716184
## FoFoNumber 0.7815957 0.4716184 1.0000000
Create a few variables
#####################################################################################
#Create a field called Response rate to be used as a dependent variable
travel[,"ResponseRate"]=0; travel[,"ResponseRate"]=travel$NumberofResponses/travel$ViewNumber
x=quantile(travel$ResponseRate) #50 percentile is 0.0123%
midResponseRate=x[3]
#Create a categorical variable: ResponseRateLevel
travel[(travel$ResponseRate<=midResponseRate),"ResponseRateLevel"]=0
travel[(travel$ResponseRate>midResponseRate),"ResponseRateLevel"]=1
tabResponseRateLevel=table(travel$ResponseRateLevel)
#Convert NumberofResponses into two categories by creating a separate variable "ResponseCategory"
#ResponseCategory=1 implies High level of responses; O = Low level of responses
y=quantile(travel$NumberofResponses) #50 percentile is 8
midNumberofResponses=y[3]
travel[(travel$NumberofResponses<=midNumberofResponses),"ResponseCategory"]=0 #The value of 8 is the 50 percentile in its distribution
travel[(travel$NumberofResponses>midNumberofResponses),"ResponseCategory"]=1
tabResponseCategory=table(travel$ResponseCategory)
#Create a variable called Engagement=InFoNumber/NumberofResponses
travel[,"Engagement"]=travel$InFoNumber/travel$NumberofResponses
summary(travel$Engagement)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.1667 0.2022 0.3077 1.0000
midEngage=quantile(travel$Engagement)[3]
travel[(travel$Engagement<=midEngage),"EngageLevel"]=0
travel[(travel$Engagement>midEngage),"EngageLevel"]=1
aggregate(logNumResponses~EngageLevel,data=travel,mean)
## EngageLevel logNumResponses
## 1 0 0.8831148
## 2 1 1.0288536
xtab=xtabs(~Male+EngageLevel,travel);xtab
## EngageLevel
## Male 0 1
## 0 10770 9632
## 1 5201 4819
chisq.test(xtab)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: xtab
## X-squared = 2.0646, df = 1, p-value = 0.1508
Perform Logistic Regressions (ResponseCategory as a dependent var)
##SOME LOGISTIC REGRESSIONS
#The response variable should be binomial (0, 1) or quasibinomial (0-1)
###What determines ResponseCategory?
#Model 1: It is random; depends on nothing, Null model
m1=glm(ResponseCategory~1, data=travel,family=binomial()); summary(m1)
##
## Call:
## glm(formula = ResponseCategory ~ 1, family = binomial(), data = travel)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.138 -1.138 -1.138 1.218 1.218
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.09395 0.01148 -8.184 2.74e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 42107 on 30421 degrees of freedom
## Residual deviance: 42107 on 30421 degrees of freedom
## AIC: 42109
##
## Number of Fisher Scoring iterations: 3
#The output shows that Null deviance is 42107. Smaller values of Null deviance indicate better fit.
#Model 2: The posted object itself, nothing else
m2=glm(ResponseCategory~logNote+logPicture+Exotic+Natural+Heritage+Urban, data=travel,family=binomial()); summary(m2)
##
## Call:
## glm(formula = ResponseCategory ~ logNote + logPicture + Exotic +
## Natural + Heritage + Urban, family = binomial(), data = travel)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.8396 -1.0929 -0.5089 1.1104 2.3418
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.67536 0.05474 -48.873 <2e-16 ***
## logNote 0.59206 0.01583 37.410 <2e-16 ***
## logPicture 0.43794 0.01866 23.470 <2e-16 ***
## Exotic 0.27967 0.02793 10.013 <2e-16 ***
## Natural 0.31757 0.02841 11.178 <2e-16 ***
## Heritage -0.02111 0.03679 -0.574 0.566
## Urban NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 42107 on 30421 degrees of freedom
## Residual deviance: 38950 on 30416 degrees of freedom
## AIC: 38962
##
## Number of Fisher Scoring iterations: 4
#The output shows that logNote,logPicture, Exotic, Natural are significant and positively associated.
#Model 2 seems better than the null model (Constant only-m1) because Residual deviance<Null deviance.
#However, the large value of the deviance is still a concern.
#Therefore, we need to test the significance of this new model as well. The chisq probability for 1649.931 (43539-41889) with 5 (31556-31551) degrees of freedom is close to zero suggesting that the model 1 is significant.
#Model fit test
modelChi= m2$null.deviance-m2$deviance
df=m2$df.null-m2$df.residual
chisq.prob=1-pchisq(modelChi,df); chisq.prob
## [1] 0
#Chi square probability of zero indicates that the model 2 is significant in explaining the variation.
#Model 3: The posted object itself PLUS Poster's characteristics
m3=glm(ResponseCategory~Male+logNote+logPicture+Exotic+Natural+Heritage+Urban, data=travel,family=binomial()); summary(m3)
##
## Call:
## glm(formula = ResponseCategory ~ Male + logNote + logPicture +
## Exotic + Natural + Heritage + Urban, family = binomial(),
## data = travel)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.8451 -1.0930 -0.5086 1.1104 2.3463
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.68668 0.05659 -47.475 <2e-16 ***
## Male 0.02088 0.02626 0.795 0.427
## logNote 0.59384 0.01599 37.139 <2e-16 ***
## logPicture 0.43746 0.01867 23.433 <2e-16 ***
## Exotic 0.28137 0.02801 10.045 <2e-16 ***
## Natural 0.31631 0.02845 11.116 <2e-16 ***
## Heritage -0.02357 0.03691 -0.639 0.523
## Urban NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 42107 on 30421 degrees of freedom
## Residual deviance: 38949 on 30415 degrees of freedom
## AIC: 38963
##
## Number of Fisher Scoring iterations: 4
# The Gender does not seem to make the difference
#Model 4: Posted Object PLUS Poster's Characteristics PLUS Prestige and Social Interaction
m4=glm(ResponseCategory~Male+logFollowing+logFollowed+logNote+logPicture+Exotic+Natural+Heritage, data=travel,family=binomial()); summary(m4)
##
## Call:
## glm(formula = ResponseCategory ~ Male + logFollowing + logFollowed +
## logNote + logPicture + Exotic + Natural + Heritage, family = binomial(),
## data = travel)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.5072 -0.9674 -0.3532 0.9885 2.8139
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.939690 0.072887 -54.052 < 2e-16 ***
## Male -0.195276 0.028728 -6.798 1.06e-11 ***
## logFollowing -0.007082 0.031394 -0.226 0.821522
## logFollowed 0.945674 0.022642 41.766 < 2e-16 ***
## logNote 0.724261 0.018150 39.904 < 2e-16 ***
## logPicture 0.320825 0.019975 16.062 < 2e-16 ***
## Exotic 0.329928 0.029975 11.007 < 2e-16 ***
## Natural 0.422721 0.030452 13.881 < 2e-16 ***
## Heritage -0.141623 0.039401 -3.594 0.000325 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 42107 on 30421 degrees of freedom
## Residual deviance: 35153 on 30413 degrees of freedom
## AIC: 35171
##
## Number of Fisher Scoring iterations: 4
Perform Logistic Regressions (ResponseRate as a dependent var)
########################################
#What determines ResponseRate?
#Model 1a: It is random; depends on nothing, Null model
m1a=glm(ResponseRate~1, data=travel,family=quasibinomial()); summary(m1a)
##
## Call:
## glm(formula = ResponseRate ~ 1, family = quasibinomial(), data = travel)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.17360 -0.07822 -0.02776 0.03795 0.81910
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4.144890 0.004749 -872.8 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for quasibinomial family taken to be 0.01053517)
##
## Null deviance: 264 on 30421 degrees of freedom
## Residual deviance: 264 on 30421 degrees of freedom
## AIC: NA
##
## Number of Fisher Scoring iterations: 7
#The output shows that Null deviance is 264. Smaller values of Null deviance indicate better fit.
#Model 2a: The posted object itself, nothing else
m2a=glm(ResponseRate~logNote+logPicture+Exotic+Natural+Heritage+Urban, data=travel,family=quasibinomial()); summary(m2a)
##
## Call:
## glm(formula = ResponseRate ~ logNote + logPicture + Exotic +
## Natural + Heritage + Urban, family = quasibinomial(), data = travel)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.19615 -0.07481 -0.02559 0.03600 0.82647
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4.477511 0.018108 -247.272 < 2e-16 ***
## logNote 0.038947 0.005193 7.500 6.54e-14 ***
## logPicture 0.214742 0.007565 28.387 < 2e-16 ***
## Exotic -0.258705 0.011108 -23.291 < 2e-16 ***
## Natural -0.053398 0.011022 -4.845 1.27e-06 ***
## Heritage 0.116986 0.013771 8.495 < 2e-16 ***
## Urban NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for quasibinomial family taken to be 0.009944538)
##
## Null deviance: 264.00 on 30421 degrees of freedom
## Residual deviance: 248.05 on 30416 degrees of freedom
## AIC: NA
##
## Number of Fisher Scoring iterations: 7
#The output shows that logNote,logPicture, Exotic, Natural are significant and positively associated.
#Model 2 seems better than the null model (Constant only-m1) because Residual deviance<Null deviance.
#However, the large value of the deviance is still a concern.
#Therefore, we need to test the significance of this new model as well. The chisq probability for 1649.931 (43539-41889) with 5 (31556-31551) degrees of freedom is close to zero suggesting that the model 1 is significant.
#Model fit test
modelChi= m2a$null.deviance-m2a$deviance
df=m2a$df.null-m2a$df.residual
chisq.prob=1-pchisq(modelChi,df); chisq.prob
## [1] 0.007001015
#Chi square probability of 0.007 indicates that the model 2a is significant in explaining the variation.
#Model 3a: The posted object itself PLUS Poster's characteristics
m3a=glm(ResponseRate~Male+logNote+logPicture+Exotic+Natural+Heritage+Urban, data=travel,family=quasibinomial()); summary(m3a)
##
## Call:
## glm(formula = ResponseRate ~ Male + logNote + logPicture + Exotic +
## Natural + Heritage + Urban, family = quasibinomial(), data = travel)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.19765 -0.07477 -0.02543 0.03616 0.82814
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4.487492 0.018821 -238.434 < 2e-16 ***
## Male 0.019384 0.009948 1.949 0.0514 .
## logNote 0.040483 0.005251 7.709 1.31e-14 ***
## logPicture 0.214191 0.007566 28.311 < 2e-16 ***
## Exotic -0.256928 0.011138 -23.068 < 2e-16 ***
## Natural -0.054800 0.011039 -4.964 6.94e-07 ***
## Heritage 0.114492 0.013823 8.283 < 2e-16 ***
## Urban NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for quasibinomial family taken to be 0.009937192)
##
## Null deviance: 264.00 on 30421 degrees of freedom
## Residual deviance: 248.02 on 30415 degrees of freedom
## AIC: NA
##
## Number of Fisher Scoring iterations: 7
#Model 4: Posted Object PLUS Poster's Characteristics PLUS Prestige and Social Interaction
m4a=glm(ResponseRate~Male+logFollowing+logFollowed+logNote+logPicture+Exotic+Natural+Heritage, data=travel,family=quasibinomial()); summary(m4a)
##
## Call:
## glm(formula = ResponseRate ~ Male + logFollowing + logFollowed +
## logNote + logPicture + Exotic + Natural + Heritage, family = quasibinomial(),
## data = travel)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.20255 -0.07389 -0.02487 0.03681 0.86579
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4.676944 0.021309 -219.486 < 2e-16 ***
## Male -0.021969 0.010078 -2.180 0.0293 *
## logFollowing 0.144431 0.010895 13.257 < 2e-16 ***
## logFollowed 0.005709 0.007461 0.765 0.4441
## logNote 0.047029 0.005239 8.976 < 2e-16 ***
## logPicture 0.199843 0.007539 26.508 < 2e-16 ***
## Exotic -0.240891 0.011046 -21.808 < 2e-16 ***
## Natural -0.051623 0.010914 -4.730 2.26e-06 ***
## Heritage 0.093721 0.013687 6.847 7.67e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for quasibinomial family taken to be 0.009706847)
##
## Null deviance: 264.00 on 30421 degrees of freedom
## Residual deviance: 244.07 on 30413 degrees of freedom
## AIC: NA
##
## Number of Fisher Scoring iterations: 7
Perform Logistic Regressions (EngageLevel as a dependent var)
#Model 5: Posted Object PLUS Poster's Characteristics PLUS Prestige and Social Interaction
m5=glm(EngageLevel~Male+logFollowing+logFollowed+logNote+logPicture+Exotic+Natural+Heritage, data=travel,family=quasibinomial()); summary(m5)
##
## Call:
## glm(formula = EngageLevel ~ Male + logFollowing + logFollowed +
## logNote + logPicture + Exotic + Natural + Heritage, family = quasibinomial(),
## data = travel)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.4953 -1.1275 -0.9527 1.2061 1.5050
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.648226 0.052303 -12.394 < 2e-16 ***
## Male -0.032333 0.025580 -1.264 0.206
## logFollowing 0.058489 0.027434 2.132 0.033 *
## logFollowed 0.178465 0.018767 9.510 < 2e-16 ***
## logNote 0.012038 0.012709 0.947 0.344
## logPicture 0.213648 0.018120 11.791 < 2e-16 ***
## Exotic -0.110933 0.027105 -4.093 4.27e-05 ***
## Natural -0.019991 0.027337 -0.731 0.465
## Heritage 0.007749 0.035471 0.218 0.827
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for quasibinomial family taken to be 1.000233)
##
## Null deviance: 42098 on 30421 degrees of freedom
## Residual deviance: 41624 on 30413 degrees of freedom
## AIC: NA
##
## Number of Fisher Scoring iterations: 4
#Model fit test
modelChi= m5$null.deviance-m5$deviance
df=m5$df.null-m5$df.residual
chisq.prob=1-pchisq(modelChi,df); chisq.prob
## [1] 0
#Chi square probability of zero indicates that the model 5 is significant in explaining the variation.
########################################################
##LINEAR DISCRIMINANT ANALYSIS ON ResponseCategory
library(MASS)
r=nrow(travel)
train=sample(1:r,round(r*0.8))
ld=lda(ResponseCategory~Male+logFollowing+logFollowed+logNote+logPicture+Exotic+Natural+Heritage, data=travel,subset=train)
train.data=travel[train,]
test.data=travel[-train,]
#Predict training data
pTrain=predict(ld,travel[train,])$class
mean(pTrain==train.data$ResponseCategory)
## [1] 0.6955378
#Predict test data
pTest=predict(ld,travel[-train,])$class
mean(pTest==test.data$ResponseCategory)
## [1] 0.6924721
########################################################
##LINEAR DISCRIMINANT ANALYSIS ON EngageLevel
library(MASS)
r=nrow(travel)
train=sample(1:r,round(r*0.8))
train.data=travel[train,]
test.data=travel[-train,]
ld2=lda(EngageLevel~Male+logFollowing+logFollowed+logNote+logPicture+Exotic+Natural+Heritage, data=travel,subset=train)
#Predict training data
pTrain=predict(ld2,travel[train,])$class
mean(pTrain==train.data$ResponseCategory)
## [1] 0.650341
#Predict test data
pTest=predict(ld2,travel[-train,])$class
mean(pTest==test.data$ResponseCategory)
## [1] 0.6518738
Conclusion
- It seems that the factors such as Male+logFollowing+logFollowed+logNote+logPicture+Exotic+Natural+Heritage predict whether a post is going to invite a high level of response or a low level response. The LDA model’s prediction success rate is about 70%.