======================================================

Introduction

In this assignment, I scraped Twitter data for all 30 Major League Baseball teams and gave each tweet a positive or negative rating based on Hu & Liu’s opinion lexicon. For example, a tweet with 5 positive words and 3 negative words is given a sentiment score of +2. Each team is searched for on Twitter using a hashtag followed by the team name. Teams with common names, or with the same name as a team from another sport, were included with an @ rather than a #.

Loading and Processing

Install/load R packages

library(devtools)
install_github("twitteR", username="geoffjentry")
## Warning: Username parameter is deprecated. Please use geoffjentry/twitteR
## Downloading github repo geoffjentry/twitteR@master
## Installing twitteR
## '/Library/Frameworks/R.framework/Resources/bin/R' --vanilla CMD INSTALL  \
##   '/private/var/folders/qs/rhsf0ft94qxfyqr9sl988_yh0000gn/T/Rtmp0o9Xdx/devtools22a93afd8743/geoffjentry-twitteR-74c168a'  \
##   --library='/Library/Frameworks/R.framework/Versions/3.1/Resources/library'  \
##   --install-tests
library(twitteR)
library(rjson)
library(httr)
library(stringr)
library(ggplot2)
library(RColorBrewer)
library(plyr)
## 
## Attaching package: 'plyr'
## 
## The following object is masked from 'package:twitteR':
## 
##     id
library(bit64)
## Loading required package: bit
## Attaching package bit
## package:bit (c) 2008-2012 Jens Oehlschlaegel (GPL-2)
## creators: bit bitwhich
## coercion: as.logical as.integer as.bit as.bitwhich which
## operator: ! & | xor != ==
## querying: print length any all min max range sum summary
## bit access: length<- [ [<- [[ [[<-
## for more help type ?bit
## 
## Attaching package: 'bit'
## 
## The following object is masked from 'package:base':
## 
##     xor
## 
## Attaching package bit64
## package:bit64 (c) 2011-2012 Jens Oehlschlaegel (GPL-2 with commercial restrictions)
## creators: integer64 seq :
## coercion: as.integer64 as.vector as.logical as.integer as.double as.character as.bin
## logical operator: ! & | xor != == < <= >= >
## arithmetic operator: + - * / %/% %% ^
## math: sign abs sqrt log log2 log10
## math: floor ceiling trunc round
## querying: is.integer64 is.vector [is.atomic} [length] is.na format print
## aggregation: any all min max range sum prod
## cumulation: diff cummin cummax cumsum cumprod
## access: length<- [ [<- [[ [[<-
## combine: c rep cbind rbind as.data.frame
## for more help type ?bit64
## 
## Attaching package: 'bit64'
## 
## The following object is masked from 'package:bit':
## 
##     still.identical
## 
## The following objects are masked from 'package:base':
## 
##     :, %in%, is.double, match, order, rank

Access Twitter’s API

#api_key <- "####"
#api_secret <- "####"
#access_token <- "####"
#access_token_secret <- "####"
#setup_twitter_oauth(api_key,api_secret,access_token,access_token_secret)
## [1] "Using direct authentication"

Data Processing

Load in Hu & Liu’s opinion lexicon of positive and negative words

# Read the positive and negative word lists as character vectors.
# Anything following a ';' in the files is treated as a comment and ignored.
pos.words <- scan(
    file = '/Users/Malter/Twitter/positive-words.txt',
    what = 'character',
    comment.char = ';'
)
neg.words <- scan(
    file = '/Users/Malter/Twitter/negative-words.txt',
    what = 'character',
    comment.char = ';'
)

Twitter Score Sentiment
Use a score sentiment function created by Jeff Breen

# Score the sentiment of each sentence by simple lexicon lookup.
#
# Arguments:
#   sentences  character vector of texts to score (here: tweet texts)
#   pos.words  character vector of positive lexicon words
#   neg.words  character vector of negative lexicon words
#   .progress  progress-bar name handed straight through to plyr::laply()
#
# Returns a data.frame with one row per sentence and two columns:
#   score  count of words found in pos.words minus count found in neg.words
#   text   the sentence exactly as it was supplied
score.sentiment <- function(sentences, pos.words, neg.words, .progress='none')
{
    # Fail fast with a clear message if a dependency is missing. require()
    # only returns FALSE in that case, which would surface later as a
    # confusing "could not find function" error.
    for (pkg in c('plyr', 'stringr')) {
        if (!requireNamespace(pkg, quietly = TRUE)) {
            stop("score.sentiment() requires the '", pkg, "' package",
                 call. = FALSE)
        }
    }

    scores <- plyr::laply(sentences, function(sentence, pos.words, neg.words) {
        # Normalise: drop punctuation, control characters and digits, then
        # lower-case so the lookup is case-insensitive.
        sentence <- gsub('[[:punct:]]', '', sentence)
        sentence <- gsub('[[:cntrl:]]', '', sentence)
        sentence <- gsub('\\d+', '', sentence)
        sentence <- tolower(sentence)

        # Tokenise on runs of whitespace.
        words <- unlist(stringr::str_split(sentence, '\\s+'))

        # Every occurrence of a lexicon word counts, so repeated words are
        # counted repeatedly.
        sum(words %in% pos.words) - sum(words %in% neg.words)
    }, pos.words, neg.words, .progress=.progress)

    data.frame(score=scores, text=sentences)
}

Scrape Twitter for each MLB team using the team’s official hashtag. Team names that coincide with other sports or common terms were included with an @ rather than a #.
- Example code for AL East

# Search Twitter for each AL East team's hashtag, requesting 500
# English-language tweets per team.
orioles.tweets  <- searchTwitter(searchString = '#orioles',  n = 500, lang = "en")
yankees.tweets  <- searchTwitter(searchString = '#yankees',  n = 500, lang = "en")
bluejays.tweets <- searchTwitter(searchString = '#bluejays', n = 500, lang = "en")
rays.tweets     <- searchTwitter(searchString = '#rays',     n = 500, lang = "en")
redsox.tweets   <- searchTwitter(searchString = '#redsox',   n = 500, lang = "en")

Create an array of the tweet text for each team.
- Example code for AL East

# Pull the text out of every tweet object by calling its getText() method.
orioles.text  <- laply(orioles.tweets,  function(tweet) tweet$getText())
yankees.text  <- laply(yankees.tweets,  function(tweet) tweet$getText())
bluejays.text <- laply(bluejays.tweets, function(tweet) tweet$getText())
rays.text     <- laply(rays.tweets,     function(tweet) tweet$getText())
redsox.text   <- laply(redsox.tweets,   function(tweet) tweet$getText())

Strip out funny characters, such as emoticons.
- Example code for AL East

# Remove every character that is neither alphanumeric nor whitespace
# (emoticons, symbols, punctuation).
# Inside a bracket expression '|' and '^' are literal characters, not
# alternation/negation, so the earlier pattern "[^[:alnum:]|^[:space:]]"
# left pipes and carets in the text. The two POSIX classes simply sit side
# by side inside one negated bracket expression.
orioles.text  <- gsub("[^[:alnum:][:space:]]", "", orioles.text)
yankees.text  <- gsub("[^[:alnum:][:space:]]", "", yankees.text)
bluejays.text <- gsub("[^[:alnum:][:space:]]", "", bluejays.text)
rays.text     <- gsub("[^[:alnum:][:space:]]", "", rays.text)
redsox.text   <- gsub("[^[:alnum:][:space:]]", "", redsox.text)

Give a sentiment score to each recorded tweet for every team.

# Score each team's cleaned tweet text against the opinion lexicon,
# showing a text progress bar while scoring.
orioles.scores <- score.sentiment(
    sentences = orioles.text,
    pos.words = pos.words,
    neg.words = neg.words,
    .progress = 'text'
)
yankees.scores <- score.sentiment(
    sentences = yankees.text,
    pos.words = pos.words,
    neg.words = neg.words,
    .progress = 'text'
)
bluejays.scores <- score.sentiment(
    sentences = bluejays.text,
    pos.words = pos.words,
    neg.words = neg.words,
    .progress = 'text'
)
rays.scores <- score.sentiment(
    sentences = rays.text,
    pos.words = pos.words,
    neg.words = neg.words,
    .progress = 'text'
)
redsox.scores <- score.sentiment(
    sentences = redsox.text,
    pos.words = pos.words,
    neg.words = neg.words,
    .progress = 'text'
)

Give a name and code to each team
- Example code for AL East

# Tag every scored tweet with its team's display name ...
orioles.scores$team  <- 'Orioles'
yankees.scores$team  <- 'Yankees'
bluejays.scores$team <- 'Blue Jays'
rays.scores$team     <- 'Rays'
redsox.scores$team   <- 'Red Sox'

# ... and with the team's short code.
orioles.scores$code  <- 'BAL'
yankees.scores$code  <- 'NYA'
bluejays.scores$code <- 'TOR'
rays.scores$code     <- 'TBA'
redsox.scores$code   <- 'BOS'

Use rbind to put together tweets from each division

# Stack the per-team score tables into one data frame per division.
aleast.scores <- rbind(
    orioles.scores, yankees.scores, bluejays.scores,
    rays.scores, redsox.scores
)
alcentral.scores <- rbind(
    whitesox.scores, tigers.scores, twins.scores,
    royals.scores, indians.scores
)
alwest.scores <- rbind(
    angels.scores, athletics.scores, mariners.scores,
    astros.scores, rangers.scores
)
nleast.scores <- rbind(
    nationals.scores, mets.scores, braves.scores,
    marlins.scores, phillies.scores
)
nlcentral.scores <- rbind(
    cardinals.scores, pirates.scores, brewers.scores,
    reds.scores, cubs.scores
)
nlwest.scores <- rbind(
    dodgers.scores, giants.scores, padres.scores,
    rockies.scores, diamondbacks.scores
)

Results

Create a bar plot for each division
- Example code for AL East

# Distribution of tweet sentiment scores, one facet row per AL East team.
# binwidth is a binning parameter, so geom_histogram() is the layer that
# takes it; newer ggplot2 releases no longer accept binwidth on geom_bar()
# and point to geom_histogram() instead.
# Only the `fill` aesthetic is mapped, so the Brewer palette has to be
# applied with scale_fill_brewer(); scale_color_brewer() had no effect here.
ggplot(data=aleast.scores) +
    geom_histogram(mapping=aes(x=score, fill=team), binwidth=1) +
    facet_grid(team~.) +
    theme_bw() + scale_fill_brewer() +
    labs(title="AL East Sentiment")

Create a box plot for each division
- Example code for AL East

# Box plot of sentiment score per AL East team, with the individual tweets
# overlaid as semi-transparent jittered points.
ggplot(
    data = aleast.scores,
    mapping = aes(x = team, y = score, group = team)
) +
    geom_boxplot(mapping = aes(fill = team)) +
    geom_jitter(
        colour = "gray40",
        alpha = 0.3,
        position = position_jitter(width = 0.2)
    ) +
    ggtitle("Boxplot - AL East's Sentiment Scores")