1. HU Follower Twitter Data Analysis
- 1.1 Introduction
- 1.2 Data pre-processing
2 Twitter Data Analysis
- 2.1 HU Friend Distributions

1. HU Follower Twitter Data Analysis

1.1 Introduction

We register a Twitter App and use its API keys and tokens to access the Twitter data and retrieve information about HU's followers. There are 142 HU followers in total.

# Twitter API credentials are read from environment variables (set them in
# ~/.Renviron or in the shell) so that secrets are never stored in the report.
# The keys that used to be hard-coded here were published with this document
# and should be revoked and regenerated in the Twitter developer console.
consumer_key <- Sys.getenv("TWITTER_CONSUMER_KEY")
consumer_secret <- Sys.getenv("TWITTER_CONSUMER_SECRET")
access_token <- Sys.getenv("TWITTER_ACCESS_TOKEN")
access_secret <- Sys.getenv("TWITTER_ACCESS_SECRET")

# Fail early with a clear message instead of an opaque OAuth error.
twitter_credentials <- c(
  consumer_key, consumer_secret, access_token, access_secret
)
if (!all(nzchar(twitter_credentials))) {
  stop(
    "Twitter credentials are missing: set TWITTER_CONSUMER_KEY, ",
    "TWITTER_CONSUMER_SECRET, TWITTER_ACCESS_TOKEN and TWITTER_ACCESS_SECRET.",
    call. = FALSE
  )
}

setup_twitter_oauth(consumer_key, consumer_secret, access_token, access_secret)

## [1] "Using direct authentication"

# Read data into R: look up the "Harrisburgu" account, fetch the accounts
# returned by its getFriends() method, flatten that list into a data frame and
# save the data frame to hu_friends.RData in the working directory.
# NOTE(review): the surrounding text speaks of "followers", but the method
# called here is getFriends() -- confirm which list is intended.

user <- getUser("Harrisburgu")
friends <- user$getFriends()
friends_df <- twListToDF(friends)
save(friends_df, file = "hu_friends.RData")

1.2 Data pre-processing

In this step we perform the data cleaning. First, we clean the location field by grouping all the Harrisburg-area variants together and marking unknown locations as missing. Then we categorize the Twitter users into 3 groups by number of posts: fewer than 5000 posts is passive, from 5000 to 15000 is normal, and more than 15000 is active.

#clean region distribution

# Canonical label -> raw Twitter location strings that are folded into it.
# The groups are disjoint and listed in the order the original nested
# ifelse() chain tested them.
location_groups <- list(
  "Harrisburg,Pa" = c(
    "Harrisburg, Pa.", "Harrisburg, PA", "Harrisburg, Pa", "Pennsylvania, USA",
    "Pennsylvania", "Harrisburg Pennsylvania", "Harrisburg", "Harrisburg PA",
    "Harrisburg, PA | Nashville, TN", "Central PA", "Harrisburg, Pennsylvania",
    "Harrisburg,Pa.", "Harrisburg, Pennsylvania, USA",
    "Harrisburg, PA (not Philly)", "Downtown Harrisburg, PA",
    "Harrisburg midtown arts center", "Harrisburg Pa", "Harrisburg, PA, USA",
    "Hershey, PA", "South Central Pennsylvania", "Londonderry Twp. PA",
    "United States", "East Coast, United States", "Central Pennsylvania"
  ),
  "Camp Hill, PA" = c("Camp Hill, PA", "Camp Hill, Pa."),
  "Lancaster, PA" = c("Lancaster, PA", "Lancaster, Pennsylvania, USA"),
  "New York, NY" = c(
    "New York", "New York City, NY, USA", "New York, New York", "New York, NY"
  ),
  "York, PA" = c("York PA", "York, PA"),
  "Washington, DC" = c("Washington, D.C. Area", "Washington, DC"),
  "Dauphin, Pa" = c("Cumberland, Dauphin & Perry PA", "Dauphin County | PA"),
  "missing" = c("India | UAE", "Where do you want me?", "Worldwide")
)

# Map raw location strings to their canonical label.
#
# location: vector of raw location strings (NA allowed).
# groups:   named list, canonical label -> character vector of raw variants.
# Returns a character vector the same length as `location`; values that match
# no variant (including NA) are returned unchanged.
#
# Matching is done on the whitespace-trimmed value. The original list held a
# variant "   <newline>East Coast, United States" (a literal broken across two
# lines); trimming covers that case and any other stray leading or trailing
# blanks. Unmatched values are deliberately NOT trimmed, because later code
# compares `loca` against the literal " ".
clean_location <- function(location, groups = location_groups) {
  location <- as.character(location)
  raw_variants <- unlist(groups, use.names = FALSE)
  labels <- rep(names(groups), lengths(groups))

  hit <- match(trimws(location), raw_variants)
  matched <- !is.na(hit)

  cleaned <- location
  cleaned[matched] <- labels[hit[matched]]
  cleaned
}

friends_df$loca <- clean_location(friends_df$location)

# Language: label accounts whose `lang` is "en" as English, all others as Other.
is_english <- friends_df$lang == "en"
friends_df$language <- ifelse(is_english, "English", "Other")

# Activity group code by number of posts (statusesCount):
#   1 = fewer than 5000, 2 = 5000 to 15000 (the default), 3 = more than 15000.
post_count <- friends_df$statusesCount
friends_df$sgp <- 2
friends_df$sgp[which(post_count < 5000)] <- 1
friends_df$sgp[which(post_count > 15000)] <- 3

# Human-readable label for each activity group code (1, 2, 3).
group_labels <- c("Passive", "Normal", "Active")
friends_df$stats <- group_labels[friends_df$sgp]

2 Twitter Data Analysis

We will perform several analyses on the clean dataset, including finding out what language HU followers speak and what their verified status is.

2.1 HU Friend Distributions

2.1.1 HU Friends Languages distribution?

As we can see from the bar plot, all 142 HU followers speak English.

#Language Distribution
# One row per language with the number of accounts in `count`.
lang1 <- group_by(friends_df, language)
lang2 <- summarise(lang1, count = n())

# lang2 is already aggregated, so the bar heights are computed from its
# `count` column. Letting geom_bar() count rows instead (the previous
# ..count.. form) sees exactly one row per language and therefore draws every
# bar at 1 / (number of languages) rather than the real share of accounts.
g2 <- ggplot(lang2, aes(x = language)) +
  geom_bar(
    stat = "identity",
    aes(y = count / sum(count), fill = language),
    width = 0.3
  ) +
  scale_y_continuous(labels = percent) +
  # The y axis is a percentage, so label it as a share rather than a count.
  labs(
    x = "Languages",
    y = "Share of Users per Language",
    title = "HU Twitter Follower Language Distribution"
  )


g2

2.1.2 Location Distributions

From the location data, we can see that the majority of HU followers are in PA and, more generally, on the East Coast. However, there are a couple of friends in Texas, California, Washington and Colorado.

# Number of accounts per cleaned location.
loca <- group_by(friends_df, loca)
loca2 <- summarise(loca, count = n())

# NOTE(review): this drops every location whose group size is exactly 13 --
# presumably meant to remove one specific group; confirm and filter on the
# location value instead of the count.
loca3 <- filter(loca2, count != 13)

# Locations recoded as "missing" are left out of the chart.
loca4 <- filter(loca3, loca != "missing")

# Bar chart of user counts, locations ordered by ascending count, with tilted
# x-axis labels and no legend.
location_theme <- theme(
  axis.text.x = element_text(angle = 60, hjust = 1),
  legend.position = "none"
)
location_labels <- labs(
  x = "HU followers Location",
  y = "User count",
  title = "HU Twitter Location Distribution"
)
pl1 <- ggplot(data = loca4, aes(x = reorder(loca, count), y = count)) +
  geom_bar(stat = "identity", aes(fill = loca)) +
  location_theme +
  location_labels
pl1

# Derive a state code from the cleaned location: split each `loca` value on
# literal commas, row-bind the pieces into a data frame, and treat the second
# piece (column X2) as the state.
# NOTE(review): values without a comma yield a single piece; rbind presumably
# recycles it into X2 (which would explain the "MISSING" entry below) -- confirm.
# NOTE(review): the pieces are not whitespace-trimmed, so " PA" and "PA" would
# be counted as different states -- confirm whether trimws() is wanted.
foo <- data.frame(do.call('rbind', strsplit(as.character(friends_df$loca),',',fixed=TRUE)))
foo$st <- toupper(foo$X2)


# Recode: MISSING / TIMBUKTU -> "UN", USA -> "MO", CALIFORNIA -> "CA";
# every other value is kept as is.
foo$st1 <- ifelse(foo$st %in% c("MISSING" , "TIMBUKTU" ), "UN",
                 ifelse(foo$st=="USA", "MO",  
                  ifelse(foo$st %in% c("CALIFORNIA"), "CA",
                         foo$st)))
# Drop the rows recoded to "UN".
foo1<- foo[foo$st1 !="UN",]

# Number of rows per state code.
foo2 <- group_by(foo1, st1)
foo2 <- summarise(foo2,  count=n())
# "USA" was already recoded to "MO" above, so no st1 value can equal "USA"
# here and this filter appears to remove nothing.
foo3<- filter(foo2, foo2$st1 !="USA")


# State outline data returned by map_data("state").
map <- map_data("state")

# Per-state user counts keyed by lower-case state name.
# NOTE(review): these counts are typed in by hand rather than computed from
# friends_df -- confirm they still match the data.
maptest <- data.frame(
  "region" = c(
    "pennsylvania", "new york", "california", "colorado",
    "district of columbia", "maryland", "texas", "virginia", "washington"
  ),
  "count" = c(91, 4, 2, 1, 6, 1, 1, 1, 1)
)

# merge() has no `y.by` argument (it was silently ignored); the join column is
# now named explicitly. "region" is the column both data frames are used with.
total <- merge(map, maptest, by = "region")

data("fifty_states")

# Choropleth of user counts per state. A single `fill` aesthetic needs an
# ordinary continuous fill scale; scale_fill_colorplane() requires both `fill`
# and `fill2` and only produced a "No valid colorplane mapping found" message.
pt1 <- ggplot(maptest, aes(map_id = region)) +
  # map points to the fifty_states shape data
  geom_map(aes(fill = count), map = fifty_states) +
  expand_limits(x = fifty_states$long, y = fifty_states$lat) +
  coord_map() +
  scale_x_continuous(breaks = NULL) +
  scale_y_continuous(breaks = NULL) +
  labs(x = "", y = "", title = "HU followers' Geographic Distribution") +
  theme(
    legend.position = "bottom",
    panel.background = element_blank()
  ) +
  scale_fill_gradient(low = "lightblue", high = "darkblue")

pt1

## No valid colorplane mapping found. scale_fill_colorplane requires both fill and fill2 aesthetics to be mapped.

2.1.3 Verified Status

We would like to investigate the distribution of the verified status of HU followers. In addition, we want to see whether there is any association between verified status and activity level. As we can see from the mosaic graph, active users tend to have a verified status of “TRUE”, while most passive users have not verified their status.

# Number of accounts per verified status.
vstat <- group_by(friends_df, verified)
vstat1 <- summarise(vstat, count = n())

# Bar chart of account counts by verified status.
pm1 <- ggplot(data = friends_df, aes(x = verified)) +
  geom_bar(aes(fill = verified))
pm1 <- pm1 +
  labs(
    x = "Verified Status",
    y = "User count",
    title = "HU Twitter Follower Verified Status"
  )
pm1

# Counts for each verified status x activity group combination.
vstat2 <- group_by(friends_df, verified, stats)
vstat3 <- summarise(vstat2, count = n())

# Log-linear model with main effects only (no stats:verified interaction),
# fitted to the combination counts.
v.lm <- loglm(count ~ stats + verified, data = vstat3)
summary(v.lm)

## Formula:
## count ~ stats + verified
## attr(,"variables")
## list(count, stats, verified)
## attr(,"factors")
##          stats verified
## count        0        0
## stats        1        0
## verified     0        1
## attr(,"term.labels")
## [1] "stats"    "verified"
## attr(,"order")
## [1] 1 1
## attr(,"intercept")
## [1] 1
## attr(,"response")
## [1] 1
## attr(,".Environment")
## <environment: R_GlobalEnv>
## attr(,"predvars")
## list(count, stats, verified)
## attr(,"dataClasses")
##       count       stats    verified 
##   "numeric" "character"   "logical" 
## 
## Statistics:
##                       X^2 df    P(> X^2)
## Likelihood Ratio 10.09077  2 0.006438989
## Pearson          10.06984  2 0.006506734

# Mosaic plot of the fitted log-linear model, without clipping; the
# `interpolate` values are forwarded to the shading through gp_args.
v.m1 <- mosaic(
  v.lm,
  clip = FALSE,
  gp_args = list(interpolate = c(1, 1.8))
)

2.1.4 Status correlation

We will investigate the relationships among the continuous variables, namely statusesCount, followersCount, favoritesCount and friendsCount. From the correlation plot we can see that almost all pairs are positively correlated (only followersCount and friendsCount are slightly negative, about -0.03); statusesCount and favoritesCount are the most strongly correlated, at approximately 0.3. From the correlation map, we can see that the number of followers and the number of tweets have a correlation of about 0.1, which is insignificant in our opinion. From the graph, we can see that the majority of the followers are Passive, followed by the Active group; the Normal group has the smallest number of people.

# Select the four count columns by name rather than by position (2:5), so the
# correlation keeps working if the column order of friends_df ever changes.
count_columns <- c(
  "statusesCount", "followersCount", "favoritesCount", "friendsCount"
)
stat11 <- friends_df[, count_columns]


# Pairwise correlation matrix of the four counts (cor() defaults).
cvalue <- cor(stat11)
cvalue

##                statusesCount followersCount favoritesCount friendsCount
## statusesCount     1.00000000     0.09665634     0.28341575   0.07369182
## followersCount    0.09665634     1.00000000     0.04664671  -0.02557733
## favoritesCount    0.28341575     0.04664671     1.00000000   0.17721953
## friendsCount      0.07369182    -0.02557733     0.17721953   1.00000000

# Visualise the correlation matrix.
corrplot(cvalue, title = "Correlation Plot of Twitter Counts")

# grouped results

# Scatter plot of posts against favorites, coloured by activity group.
s1 <- ggplot(
  friends_df,
  aes(x = favoritesCount, y = statusesCount, color = stats)
)
s1 <- s1 +
  geom_point() +
  labs(title = "HU Followers statusesCount vs favoritesCount")
s1

# Posts per activity group: semi-transparent points with a box plot drawn on
# top of them.
s2 <- ggplot(
  friends_df,
  aes(x = stats, y = statusesCount, group = stats, color = stats)
)
s2 <- s2 +
  geom_point(alpha = 0.4) +
  geom_boxplot() +
  labs(
    x = "HU Followers Twitter Status",
    y = "Twitter Number of Posts",
    title = "HU Followers Twitter Status"
  )
s2

# Number of accounts in each activity group.
stat1 <- group_by(friends_df, stats)
stat2 <- summarise(stat1, count = n())

# Bar chart of the group sizes, ordered by ascending count, without a legend.
s3 <- ggplot(data = stat2, aes(x = reorder(stats, count), y = count)) +
  geom_bar(stat = "identity", aes(fill = stats), width = 0.4) +
  theme(legend.position = "none") +
  labs(
    x = "HU followers Status",
    y = "Users count",
    title = "HU Twitter Followers Status Distribution"
  )
s3

2.1.5 Biggest Network

In this section we look at who has the most followers and who has the largest number of tweets. NASA has the most followers, and we discovered that all the accounts with the most followers are science-related accounts. In addition, apart from the most-followers list, the top 10 lists for most favorites, most friends and most tweets are location related: PA dominates the majority of those top 10 lists, especially Harrisburg. We then picked out the candidates that were in the top 10 for at least two categories. Since the raw status counts were very large, we weighted the status count by 0.6, the followers count by 0.7 and the favorites count by 1.4, and added these to the friends count to get a total score and our top 4.

# who has the most followers
# Columns used by the ranking plots below, selected by name instead of by
# position so that a change in the column order of friends_df cannot silently
# pick the wrong fields.
big1 <- friends_df[, c(
  "statusesCount", "followersCount", "favoritesCount", "friendsCount",
  "name", "loca", "verified"
)]
# Sort by follower count, largest first.
big2 <- big1[order(-big1$followersCount), ]
# Exclude accounts whose cleaned location is a single blank.
big3 <- big2[(big2$loca != " "), ]
# head() returns at most 10 rows; `[1:10, ]` would pad with all-NA rows when
# fewer than 10 accounts remain.
big4 <- head(big3, 10)


# Bar chart of the ten accounts with the most followers, coloured by location.
b1 <- ggplot(
  data = big4,
  aes(x = reorder(name, followersCount), y = followersCount)
) +
  geom_bar(stat = "identity", aes(fill = loca), width = 0.4) +
  theme(
    axis.text.x = element_text(angle = 60, hjust = 1),
    legend.position = "bottom"
  ) +
  labs(
    x = "Name",
    y = "followers count",
    title = " Top 10 Twitter Account with the Most Followers"
  )
b1

# who has the most likes
# Sort by favorites count, largest first, and keep at most the first ten rows
# (head() does not pad with NA rows when fewer than ten exist).
like2 <- big1[order(-big1$favoritesCount), ]
like3 <- head(like2, 10)

# Bar chart of the ten accounts with the most favorites, coloured by location.
# The y axis shows favoritesCount, so it is labelled "favorites count" (it was
# mislabelled "followers count").
b2 <- ggplot(
  data = like3,
  aes(x = reorder(name, favoritesCount), y = favoritesCount)
) +
  geom_bar(stat = "identity", aes(fill = loca), width = 0.4) +
  theme(
    axis.text.x = element_text(angle = 60, hjust = 1),
    legend.position = "bottom"
  ) +
  labs(
    x = "Name",
    y = "favorites count",
    title = " Top 10 Twitter accounts with the Most Favorites Tweets"
  )
b2

# top 10 account has the most Friends

# Sort by friends count, largest first, and keep at most the first ten rows
# (head() does not pad with NA rows when fewer than ten exist).
friend2 <- big1[order(-big1$friendsCount), ]
friend3 <- head(friend2, 10)

# Bar chart of the ten accounts with the most friends, coloured by location.
# The y-axis label typo "friendss count" is corrected.
b3 <- ggplot(
  data = friend3,
  aes(x = reorder(name, friendsCount), y = friendsCount)
) +
  geom_bar(stat = "identity", aes(fill = loca), width = 0.4) +
  theme(
    axis.text.x = element_text(angle = 60, hjust = 1),
    legend.position = "bottom"
  ) +
  labs(
    x = "Name",
    y = "friends count",
    title = "Most Friends Twitter Account Top 10"
  )
b3

# top 10 tweets
# Rank the accounts by number of posts, largest first, and take the first ten.
tweet_rank <- order(-big1$statusesCount)
tweet2 <- big1[tweet_rank, ]
tweet3 <- tweet2[seq_len(10), ]

# Bar chart of the ten most active posters, coloured by location.
tweet_theme <- theme(
  axis.text.x = element_text(angle = 60, hjust = 1),
  legend.position = "bottom"
)
b4 <- ggplot(
  data = tweet3,
  aes(x = reorder(name, statusesCount), y = statusesCount)
) +
  geom_bar(stat = "identity", aes(fill = loca), width = 0.4) +
  tweet_theme +
  labs(
    x = "Name",
    y = "tweets count",
    title = "Account Tweets the most Top 10"
  )
b4

# Accounts that appear in two top-10 lists at once. merge() is called without
# a `by` argument on purpose: the tables share all their columns, so the
# default join on every common column returns the rows present in both lists.
# (The former `x.by = "name"` is not a merge() argument and was ignored.)

#winner1: most friends AND most followers
win1 <- merge(friend3, big4)

#winner2: most followers AND most favorites
win2 <- merge(big4, like3)

#winner3: most friends AND most favorites
win3 <- merge(friend3, like3)

#winner4: most tweets AND most friends
win4 <- merge(tweet3, friend3)

# An account found by several pairings would appear once per pairing, and its
# bars would then be stacked on top of each other in the chart below, inflating
# its total. Keep each account once.
win <- unique(rbind(win1, win2, win3, win4))

# Weighted popularity score:
# 0.6 * statuses + 0.7 * followers + 1.4 * favorites + 1 * friends.
win$total <- win$statusesCount * 0.6 +
  win$followersCount * 0.7 +
  win$favoritesCount * 1.4 +
  win$friendsCount

# Bar chart of the weighted score per account, coloured by location.
w1 <- ggplot(data = win, aes(x = reorder(name, total), y = total)) +
  geom_bar(stat = "identity", aes(fill = loca), width = 0.4) +
  theme(
    axis.text.x = element_text(angle = 60, hjust = 1),
    legend.position = "bottom"
  ) +
  labs(
    x = "Name",
    y = "Weighted Total counts",
    title = "The Winner of the Most Popular HU followers"
  )
w1

# Heatmap of the weighted score by account name and location.
# The y-axis label is now passed as `y =`; it was previously an unnamed
# argument to labs() and therefore never attached to the y axis.
ggplot(win, aes(name, loca, fill = total)) +
  geom_tile(colour = "white") +
  scale_fill_gradient(low = "green", high = "red") +
  labs(
    x = "name ",
    y = "Location",
    title = "HU Most Popular Followers' Heatmap",
    subtitle = "Location",
    fill = "Count"
  )

# Heatmap of the weighted score by account name and verified status.
ggplot(win, aes(name, verified, fill = total)) +
  geom_tile(colour = "white") +
  scale_fill_gradient(low = "green", high = "red") +
  labs(
    x = "name",
    title = "HU Most Popular Followers' Heatmap",
    subtitle = " verified status",
    fill = "Count"
  )

2.1.6 Twitter Text

In this section, we used the wordcloud package to check which words are most commonly used. As the graph shows, the words are very positive; they are about community, news, reports and media. We can sense that the prosperity of HU is coming.

# Text to analyse: the first column of friends_df.
# NOTE(review): presumably the profile description column -- confirm, and
# consider selecting it by name.
tws <- friends_df[, c(1)]
r_stats <- Corpus(VectorSource(tws))

# Custom text functions are wrapped in content_transformer() (as tolower
# already was) so that tm_map() applies them to the document content and
# returns a valid corpus for every corpus class, not only for simple corpora.
to_utf8 <- content_transformer(function(x) iconv(enc2utf8(x), sub = "byte"))
r_stats_text_corpus <- tm_map(r_stats, to_utf8)
r_stats_text_corpus <- tm_map(r_stats_text_corpus, content_transformer(tolower))
r_stats_text_corpus <- tm_map(r_stats_text_corpus, removePunctuation)
# removeWords is itself a tm transformation; the stop-word list is passed on
# through tm_map().
r_stats_text_corpus <- tm_map(r_stats_text_corpus, removeWords, stopwords())

# Fixed seed so that the word-cloud layout is reproducible.
set.seed(1234)
wordcloud(
  r_stats_text_corpus,
  max.words = 200,
  random.order = FALSE,
  rot.per = 0.35,
  colors = brewer.pal(8, "Dark2")
)

a545_lab2

Fangya Tan

February 24, 2018