DATA 607 Final Project - Twitter API

Twitter API
Pull Verified Users Tweets
Pull User Profile Data
- All User Data
- Logistic Regression
Testing Hosmer-Lemeshow Goodness of Fit (GOF)

Twitter API

# Twitter API credentials.
# SECURITY: secrets must never be committed to source control, so they are
# read from environment variables (for example, set in ~/.Renviron). Any keys
# that were previously hard-coded in this file should be treated as
# compromised and regenerated.
api_key <- Sys.getenv("TWITTER_API_KEY")

secret_key <- Sys.getenv("TWITTER_API_SECRET")

access_token <- Sys.getenv("TWITTER_ACCESS_TOKEN")

access_secret <- Sys.getenv("TWITTER_ACCESS_SECRET")

# Sys.getenv() returns "" for unset variables; fail early with a clear message
# instead of letting the OAuth handshake fail later with an opaque error.
if (!all(nzchar(c(api_key, secret_key, access_token, access_secret)))) {
  stop(
    "Twitter credentials are missing: set TWITTER_API_KEY, ",
    "TWITTER_API_SECRET, TWITTER_ACCESS_TOKEN and TWITTER_ACCESS_SECRET.",
    call. = FALSE
  )
}

# Authenticate this session with the Twitter API.
setup_twitter_oauth(
  consumer_key = api_key,
  consumer_secret = secret_key,
  access_token = access_token,
  access_secret = access_secret
)

## [1] "Using direct authentication"

Pull Verified Users Tweets

Pull User Profile Data

# Rebuilding this data through API calls takes a long time (the loop sleeps 10 seconds per user), so this section is commented out; a prepared CSV is loaded further below instead.
 
#user_objs <- list()

#for(i in v_users$screen_name){
#  user_objs <- list.append(user_objs,getUser(i))
#  Sys.sleep(10)
#}
#
#id <- vector()

#name <- vector()

#screen_name <- vector()

#statuses_count <- vector()

#followers_count <- vector()

#friends_count <- vector()

#favourites_count <- vector()

#listed_count <- vector()

#created_at <- vector()

#lang <- vector()

#location <- vector()

#default_profile <- vector()

#default_profile_image <- vector()

#profile_image_url <- vector()

#protected <- vector()

#verified <- vector()

#description <- vector()

#for(i in user_objs){
  
#id <- c(id,i$id)

#name <- c(name,i$name)

#screen_name <- c(screen_name,i$screenName)

#statuses_count <- c(statuses_count,i$statusesCount)

#followers_count <- c(followers_count,i$followersCount)

#friends_count <- c(friends_count,i$friendsCount)

#favourites_count <- c(favourites_count,i$favoritesCount)

#listed_count <- c(listed_count,i$getListedCount())

#created_at <- c(created_at,i$created)

#lang <- c(lang,i$lang)

#location <- c(location,i$location)

#default_profile <- c(default_profile,0)

#default_profile_image <- c(default_profile_image,0)

#profile_image_url <- c(profile_image_url,i$getProfileImageUrl())

#protected <- c(protected,i$protected)

#verified <- c(verified,i$verified)

#description <- c(description,i$description)
#}


#users_df <- data.frame(id,name,screen_name,statuses_count,followers_count,friends_count,favourites_count,listed_count,created_at,lang,location,default_profile,default_profile_image,profile_image_url,protected,verified,description)

All User Data

# The full user data set is also published as a CSV; load it from there.
all_user_data <- read.csv(
  file = "https://raw.githubusercontent.com/murphystout/data-607/master/all_users_data.csv"
)

#all_user_data <- users_df

# Keep only the numeric count columns plus the `fake` label. The column names
# are spelled out so they match the ones data.frame() derives from
# `all_user_data$<column>` arguments.
all_user_data_numeric <- with(
  all_user_data,
  data.frame(
    all_user_data.statuses_count = statuses_count,
    all_user_data.followers_count = followers_count,
    all_user_data.friends_count = friends_count,
    all_user_data.favourites_count = favourites_count,
    all_user_data.listed_count = listed_count,
    all_user_data.fake = fake
  )
)

Logistic Regression

We fit a generalized linear model, intended as a binary logistic regression, to try to predict whether a user profile is fake based on its profile metadata.

# Fit a binary logistic regression of `fake` on the profile count columns.
# `family = binomial` is required: glm() defaults to the gaussian family,
# which fits an ordinary linear model rather than a logistic regression.
# Assumes `fake` is coded 0/1 -- TODO confirm against the CSV.
# NOTE(review): the printed summary that follows in this document came from
# the earlier gaussian fit and needs to be regenerated.
model <- glm(
  fake ~ statuses_count + followers_count + friends_count +
    favourites_count + listed_count,
  family = binomial,
  data = all_user_data
)

summary(model)

## 
## Call:
## glm(formula = all_user_data$fake ~ all_user_data$statuses_count + 
##     all_user_data$followers_count + all_user_data$friends_count + 
##     all_user_data$favourites_count + all_user_data$listed_count)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.9165   0.0833   0.0834   0.0836   3.3677  
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                     9.168e-01  4.491e-03 204.144  < 2e-16 ***
## all_user_data$statuses_count   -1.148e-05  4.173e-07 -27.508  < 2e-16 ***
## all_user_data$followers_count  -1.084e-08  1.872e-09  -5.790 7.59e-09 ***
## all_user_data$friends_count     2.464e-07  2.207e-07   1.117 0.264196    
## all_user_data$favourites_count -7.522e-06  9.055e-07  -8.307  < 2e-16 ***
## all_user_data$listed_count     -2.208e-06  6.502e-07  -3.395 0.000692 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.07362741)
## 
##     Null deviance: 418.33  on 3828  degrees of freedom
## Residual deviance: 281.48  on 3823  degrees of freedom
## AIC: 885.37
## 
## Number of Fisher Scoring iterations: 2

# Refit without `friends_count`, which had a high p-value in the full model.
# `family = binomial` is required: glm() defaults to the gaussian family,
# which fits an ordinary linear model rather than a logistic regression.
# Assumes `fake` is coded 0/1 -- TODO confirm against the CSV.
# NOTE(review): the printed summary that follows in this document came from
# the earlier gaussian fit and needs to be regenerated.
model <- glm(
  fake ~ statuses_count + followers_count + favourites_count + listed_count,
  family = binomial,
  data = all_user_data
)

summary(model)

## 
## Call:
## glm(formula = all_user_data$fake ~ all_user_data$statuses_count + 
##     all_user_data$followers_count + all_user_data$favourites_count + 
##     all_user_data$listed_count)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.9165   0.0832   0.0834   0.0836   3.3497  
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                     9.168e-01  4.491e-03 204.167  < 2e-16 ***
## all_user_data$statuses_count   -1.147e-05  4.173e-07 -27.494  < 2e-16 ***
## all_user_data$followers_count  -1.068e-08  1.866e-09  -5.722 1.14e-08 ***
## all_user_data$favourites_count -7.452e-06  9.034e-07  -8.249  < 2e-16 ***
## all_user_data$listed_count     -2.092e-06  6.419e-07  -3.259  0.00113 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.07363217)
## 
##     Null deviance: 418.33  on 3828  degrees of freedom
## Residual deviance: 281.57  on 3824  degrees of freedom
## AIC: 884.62
## 
## Number of Fisher Scoring iterations: 2

The model provided low p-values across its predictor variables, but ultimately an R-squared of ~30% (1 − residual deviance / null deviance = 1 − 281.57 / 418.33 ≈ 0.33) isn’t very good.

We proceed to test goodness of fit via the Hosmer-Lemeshow test.

Testing Hosmer-Lemeshow Goodness of Fit (GOF)

# Hosmer-Lemeshow goodness-of-fit test comparing the observed `fake` labels
# with the model's fitted values, using g = 10 groups (the function default).
hoslem.test(x = all_user_data$fake, y = fitted(model), g = 10)

## 
##  Hosmer and Lemeshow goodness of fit (GOF) test
## 
## data:  all_user_data$fake, fitted(model)
## X-squared = 781.36, df = 8, p-value < 2.2e-16

With the Hosmer-Lemeshow test, a low p-value is bad: it indicates that the model's fitted values differ significantly from the observed outcomes, i.e. the model fits the data poorly.