DATA 607 Final Project - Twitter API
Twitter API
# Twitter API credentials are read from environment variables (for example
# set in ~/.Renviron) instead of being hard-coded, so the secrets never end up
# in the source file or in version control.
# NOTE(review): the keys that were previously committed in this file should be
# treated as compromised and be revoked/regenerated.
credential_vars <- c(
  "TWITTER_API_KEY",
  "TWITTER_API_SECRET",
  "TWITTER_ACCESS_TOKEN",
  "TWITTER_ACCESS_SECRET"
)

# Fail early with a readable message rather than sending empty credentials.
missing_vars <- credential_vars[!nzchar(Sys.getenv(credential_vars))]
if (length(missing_vars) > 0) {
  stop(
    "Missing Twitter credentials; set environment variable(s): ",
    paste(missing_vars, collapse = ", "),
    call. = FALSE
  )
}

api_key <- Sys.getenv("TWITTER_API_KEY")
secret_key <- Sys.getenv("TWITTER_API_SECRET")
access_token <- Sys.getenv("TWITTER_ACCESS_TOKEN")
access_secret <- Sys.getenv("TWITTER_ACCESS_SECRET")

# Authenticate this session against the Twitter API with the four credentials.
setup_twitter_oauth(
  consumer_key = api_key,
  consumer_secret = secret_key,
  access_token = access_token,
  access_secret = access_secret
)
## [1] "Using direct authentication"
Pull Verified Users Tweets
Pull User Profile Data
#Compiling with API calls takes an extended period of time. Commenting this out.
#user_objs <- list()
#for(i in v_users$screen_name){
# user_objs <- list.append(user_objs,getUser(i))
# Sys.sleep(10)
#}
#
#id <- vector()
#name <- vector()
#screen_name <- vector()
#statuses_count <- vector()
#followers_count <- vector()
#friends_count <- vector()
#favourites_count <- vector()
#listed_count <- vector()
#created_at <- vector()
#lang <- vector()
#location <- vector()
#default_profile <- vector()
#default_profile_image <- vector()
#profile_image_url <- vector()
#protected <- vector()
#verified <- vector()
#description <- vector()
#for(i in user_objs){
#id <- c(id,i$id)
#name <- c(name,i$name)
#screen_name <- c(screen_name,i$screenName)
#statuses_count <- c(statuses_count,i$statusesCount)
#followers_count <- c(followers_count,i$followersCount)
#friends_count <- c(friends_count,i$friendsCount)
#favourites_count <- c(favourites_count,i$favoritesCount)
#listed_count <- c(listed_count,i$getListedCount())
#created_at <- c(created_at,i$created)
#lang <- c(lang,i$lang)
#location <- c(location,i$location)
#default_profile <- c(default_profile,0)
#default_profile_image <- c(default_profile_image,0)
#profile_image_url <- c(profile_image_url,i$getProfileImageUrl())
#protected <- c(protected,i$protected)
#verified <- c(verified,i$verified)
#description <- c(description,i$description)
#}
#users_df <- data.frame(id,name,screen_name,statuses_count,followers_count,friends_count,favourites_count,listed_count,created_at,lang,location,default_profile,default_profile_image,profile_image_url,protected,verified,description)
All User Data
# The complete user data set is also published as a CSV file, so it is loaded
# from there rather than rebuilt through the slow API calls above.
all_user_data <- read.csv(
  file = "https://raw.githubusercontent.com/murphystout/data-607/master/all_users_data.csv"
)
#all_user_data <- users_df

# Keep only the count columns plus the `fake` column. The "all_user_data."
# prefix reproduces the column names that data.frame() derives when it is
# handed `all_user_data$<column>` arguments.
all_user_data_numeric <- local({
  cols <- c(
    "statuses_count",
    "followers_count",
    "friends_count",
    "favourites_count",
    "listed_count",
    "fake"
  )
  setNames(all_user_data[cols], paste0("all_user_data.", cols))
})
Logistic Regression
We use a generalized linear model, i.e. binary logistic regression, to try and predict a fake user profile based on its profile meta data.
# Binary logistic regression of the `fake` flag on the profile count columns.
# `family = binomial` is required: glm() defaults to the gaussian family, which
# fits an ordinary linear model rather than a logistic one. Passing `data =`
# keeps the coefficient names short and allows predict() on new data.
# NOTE(review): the summary output recorded below this chunk was produced by
# the earlier gaussian fit and has to be regenerated after re-running it.
model <- glm(
  fake ~ statuses_count + followers_count + friends_count +
    favourites_count + listed_count,
  data = all_user_data,
  family = binomial
)
summary(model)##
## Call:
## glm(formula = all_user_data$fake ~ all_user_data$statuses_count +
## all_user_data$followers_count + all_user_data$friends_count +
## all_user_data$favourites_count + all_user_data$listed_count)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.9165 0.0833 0.0834 0.0836 3.3677
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.168e-01 4.491e-03 204.144 < 2e-16 ***
## all_user_data$statuses_count -1.148e-05 4.173e-07 -27.508 < 2e-16 ***
## all_user_data$followers_count -1.084e-08 1.872e-09 -5.790 7.59e-09 ***
## all_user_data$friends_count 2.464e-07 2.207e-07 1.117 0.264196
## all_user_data$favourites_count -7.522e-06 9.055e-07 -8.307 < 2e-16 ***
## all_user_data$listed_count -2.208e-06 6.502e-07 -3.395 0.000692 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.07362741)
##
## Null deviance: 418.33 on 3828 degrees of freedom
## Residual deviance: 281.48 on 3823 degrees of freedom
## AIC: 885.37
##
## Number of Fisher Scoring iterations: 2
# Refit without `friends_count`, whose p-value was high in the previous fit.
# `family = binomial` is required: glm() defaults to the gaussian family, which
# fits an ordinary linear model rather than a logistic one. Passing `data =`
# keeps the coefficient names short and allows predict() on new data.
# NOTE(review): the summary output recorded below this chunk was produced by
# the earlier gaussian fit and has to be regenerated after re-running it.
model <- glm(
  fake ~ statuses_count + followers_count + favourites_count + listed_count,
  data = all_user_data,
  family = binomial
)
summary(model)##
## Call:
## glm(formula = all_user_data$fake ~ all_user_data$statuses_count +
## all_user_data$followers_count + all_user_data$favourites_count +
## all_user_data$listed_count)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.9165 0.0832 0.0834 0.0836 3.3497
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.168e-01 4.491e-03 204.167 < 2e-16 ***
## all_user_data$statuses_count -1.147e-05 4.173e-07 -27.494 < 2e-16 ***
## all_user_data$followers_count -1.068e-08 1.866e-09 -5.722 1.14e-08 ***
## all_user_data$favourites_count -7.452e-06 9.034e-07 -8.249 < 2e-16 ***
## all_user_data$listed_count -2.092e-06 6.419e-07 -3.259 0.00113 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.07363217)
##
## Null deviance: 418.33 on 3828 degrees of freedom
## Residual deviance: 281.57 on 3824 degrees of freedom
## AIC: 884.62
##
## Number of Fisher Scoring iterations: 2
The model yields low p-values for all of its remaining predictor variables, but ultimately an R-squared of roughly 30% (1 − residual deviance / null deviance = 1 − 281.57 / 418.33 ≈ 0.33) isn’t very good.
We proceed to test goodness of fit via the Hosmer-Lemeshow test.
Testing Hosmer-Lemeshow Goodness of Fit (GOF)
# Hosmer-Lemeshow goodness-of-fit test: compares the observed `fake` values
# with the fitted values of `model`, grouped into g = 10 bins (the default).
hoslem.test(
  x = all_user_data$fake,
  y = fitted(model),
  g = 10
)##
## Hosmer and Lemeshow goodness of fit (GOF) test
##
## data: all_user_data$fake, fitted(model)
## X-squared = 781.36, df = 8, p-value < 2.2e-16
With the Hosmer-Lemeshow test, a low p-value is bad: it indicates that the values predicted by the model do not match the observed outcomes, i.e. the model fits the data poorly.