# Install required packages ----
# install.packages("readr")
# install.packages("plyr")
# install.packages("stringr")
# install.packages("stringi")
# install.packages("magrittr")
# install.packages("dplyr")
# install.packages("plotly")
library(readr)
library("plyr")
library(plotly)
## Loading required package: ggplot2
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following objects are masked from 'package:plyr':
##
## arrange, mutate, rename, summarise
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
# User Gender Distribution by Country ----
# Load the tweet data set. The path is relative to the current working
# directory, so the script must be run from the folder holding the CSV.
trump_csv <- "Trump.csv"
Trump <- read_csv(file = trump_csv)
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
## X1 = col_integer(),
## MESSAGE_BODY = col_character(),
## MESSAGE_COUNTRY = col_character(),
## MESSAGE_FAVORITES_COUNT = col_integer(),
## MESSAGE_LOCATION = col_character(),
## MESSAGE_LOCATION_DISPLAY_NAME = col_character(),
## MESSAGE_POSTED_TIME = col_datetime(format = ""),
## MESSAGE_RETWEET_COUNT = col_integer(),
## USER_CITY = col_character(),
## USER_COUNTRY = col_character(),
## USER_DISPLAY_NAME = col_character(),
## USER_FOLLOWERS_COUNT = col_integer(),
## USER_FRIENDS_COUNT = col_integer(),
## USER_GENDER = col_character(),
## USER_LOCATION_DISPLAY_NAME = col_character(),
## USER_SCREEN_NAME = col_character()
## )
# Lower-case the user country so the comparison below is case-insensitive.
Trump$country <- tolower(Trump$USER_COUNTRY)

# Restrict the data to the three countries being compared.
countries_of_interest <- c("united kingdom", "canada", "italy")
subdf <- subset(Trump, country %in% countries_of_interest)

# Tabulate users by country and gender in long form
# (one row per country/gender pair, with the count in `Freq`).
gender_cols <- c("country", "USER_GENDER")
genderCountry <- data.frame(table(subdf[, gender_cols]))

# Print the same counts as a country x gender contingency table.
xtabs(Freq ~ country + USER_GENDER, data = genderCountry)
## USER_GENDER
## country female male unknown
## canada 10 32 28
## italy 0 2 2
## united kingdom 13 55 52
# Pivot the long country/gender counts to one row per country with a
# Freq.<gender> column for each gender value.
tab <- reshape(
  genderCountry,
  idvar = "country",
  timevar = "USER_GENDER",
  direction = "wide"
)

# Grouped bar chart: one bar per gender within each country.
# Only the female and male counts are drawn.
p <- plot_ly(tab, x = ~country, y = ~Freq.female, type = 'bar', name = 'Female')
p <- add_trace(p, y = ~Freq.male, name = 'Male')
p <- layout(p, yaxis = list(title = 'Count'), barmode = 'group')
p
# User Posting Time by Gender Based on Weekdays ----
# Weekday on which each tweet was posted, taken from MESSAGE_POSTED_TIME.
# weekdays() returns names in the session locale, so the Freq.<Day> columns
# used by the plot below would not exist outside an English locale. The
# numeric POSIXlt `wday` field (0 = Sunday ... 6 = Saturday) does not depend
# on the locale, so it is mapped to fixed English names instead.
day_names <- c(
  "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"
)
wday_labels <- c("Sunday", day_names[1:6])
Trump$days <- wday_labels[as.POSIXlt(Trump$MESSAGE_POSTED_TIME)$wday + 1L]

# Count tweets per gender and weekday. Tabulating over a factor with all
# seven levels keeps a (zero) count for weekdays that have no tweets, so every
# Freq.<Day> column referenced by the plot is guaranteed to exist.
dfrm <- data.frame(table(
  USER_GENDER = Trump$USER_GENDER,
  days = factor(Trump$days, levels = day_names)
))

# Pivot to one row per gender with a Freq.<Day> column for each weekday.
genderDays <- reshape(
  dfrm,
  idvar = "USER_GENDER",
  timevar = "days",
  direction = "wide"
)

# Grouped bar chart: one bar per weekday within each gender.
p <- plot_ly(genderDays, x = ~USER_GENDER, y = ~Freq.Monday, type = 'bar', name = 'Monday') %>%
  add_trace(y = ~Freq.Tuesday, name = 'Tuesday') %>%
  add_trace(y = ~Freq.Wednesday, name = 'Wednesday') %>%
  add_trace(y = ~Freq.Thursday, name = 'Thursday') %>%
  add_trace(y = ~Freq.Friday, name = 'Friday') %>%
  add_trace(y = ~Freq.Saturday, name = 'Saturday') %>%
  add_trace(y = ~Freq.Sunday, name = 'Sunday') %>%
  layout(yaxis = list(title = 'Count'), barmode = 'group')
p