This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code.

Try executing this chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Cmd+Shift+Enter.

Install required packages

# install.packages("readr")
# install.packages("plyr")
# install.packages("stringr")
# install.packages("stringi")
# install.packages("magrittr")
# install.packages("dplyr")
# install.packages("plotly")

library(readr)
library("plyr")
library(plotly)
## Warning: package 'plotly' was built under R version 3.4.4
## Loading required package: ggplot2
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following objects are masked from 'package:plyr':
## 
##     arrange, mutate, rename, summarise
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout

User gender distribution by country

# Read the tweet export into `Trump`.
# NOTE(review): the path is absolute and machine-specific -- adjust it before
# running this notebook on another machine.
Trump <- readr::read_csv(file = "C:/Users/Mounika/Trump.csv")
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
##   X1 = col_integer(),
##   MESSAGE_BODY = col_character(),
##   MESSAGE_COUNTRY = col_character(),
##   MESSAGE_FAVORITES_COUNT = col_integer(),
##   MESSAGE_LOCATION = col_character(),
##   MESSAGE_LOCATION_DISPLAY_NAME = col_character(),
##   MESSAGE_POSTED_TIME = col_datetime(format = ""),
##   MESSAGE_RETWEET_COUNT = col_integer(),
##   USER_CITY = col_character(),
##   USER_COUNTRY = col_character(),
##   USER_DISPLAY_NAME = col_character(),
##   USER_FOLLOWERS_COUNT = col_integer(),
##   USER_FRIENDS_COUNT = col_integer(),
##   USER_GENDER = col_character(),
##   USER_LOCATION_DISPLAY_NAME = col_character(),
##   USER_SCREEN_NAME = col_character()
## )
# Normalise the country names to lower case so the comparison below is
# case-insensitive.
Trump$country <- tolower(Trump$USER_COUNTRY)

# Restrict the data to the six countries we want to explore.
subdf <- subset(
  Trump,
  country %in% c(
    "united states", "india", "canada",
    "australia", "united kingdom", "china"
  )
)

# Count users per (country, gender) pair, in long format: one row per pair
# with the count in `Freq`.
genderCountry <- data.frame(table(subdf[c("country", "USER_GENDER")]))

# Print the counts as a country x gender contingency table.
xtabs(Freq ~ country + USER_GENDER, data = genderCountry)
##                 USER_GENDER
## country          female male unknown
##   australia           7   10      13
##   canada             10   32      28
##   china               0    0       1
##   india               2    7       1
##   united kingdom     13   55      52
##   united states     487 1011    1166
# Pivot the long counts to one row per country with one column per gender
# (Freq.female, Freq.male, Freq.unknown).
tab <- reshape(
  genderCountry,
  idvar = "country",
  timevar = "USER_GENDER",
  direction = "wide"
)

# Grouped bar chart: one cluster per country, one bar per gender.
p <- plot_ly(tab, x = ~country, y = ~Freq.female, type = "bar", name = "Female")
p <- add_trace(p, y = ~Freq.male, name = "Male")
p <- add_trace(p, y = ~Freq.unknown, name = "Unknown")
p <- layout(p, yaxis = list(title = "Count"), barmode = "group")

p

User posting count by country and day of the week

# Tabulate the number of posts per country and day of the week.
#
# The plotting code looks columns up by their English names (Freq.Monday, ...,
# Freq.Sunday). weekdays() returns names in the session locale, so the day
# name is derived from a fixed English lookup instead.
# POSIXlt's `wday` field counts from 0 = Sunday, hence the Sunday-first order.
weekday_names <- c(
  "Sunday", "Monday", "Tuesday", "Wednesday",
  "Thursday", "Friday", "Saturday"
)
Trump$days <- weekday_names[as.POSIXlt(Trump$MESSAGE_POSTED_TIME)$wday + 1L]

# Re-derived here so this section can be run on its own.
Trump$country <- tolower(Trump$USER_COUNTRY)

# Restrict the data to the six countries we want to explore.
subdf <- subset(
  Trump,
  country %in% c(
    "united states", "india", "canada",
    "australia", "united kingdom", "china"
  )
)

# Tabulate with explicit weekday levels (Monday first) so that a weekday with
# no posts still gets a zero-count entry; without this the matching Freq.<day>
# column would be absent after reshape() and the plot would fail.
dfrm <- data.frame(
  table(
    country = subdf$country,
    days = factor(subdf$days, levels = weekday_names[c(2:7, 1)])
  )
)

# Pivot to one row per country with one Freq.<day> column per weekday.
genderDays <- reshape(
  dfrm,
  idvar = "country",
  timevar = "days",
  direction = "wide"
)

# Grouped bar chart: one cluster per country, one bar per day of the week.
p <- plot_ly(
  genderDays,
  x = ~country, y = ~Freq.Monday,
  type = "bar", name = "Monday"
)
p <- add_trace(p, y = ~Freq.Tuesday, name = "Tuesday")
p <- add_trace(p, y = ~Freq.Wednesday, name = "Wednesday")
p <- add_trace(p, y = ~Freq.Thursday, name = "Thursday")
p <- add_trace(p, y = ~Freq.Friday, name = "Friday")
p <- add_trace(p, y = ~Freq.Saturday, name = "Saturday")
p <- add_trace(p, y = ~Freq.Sunday, name = "Sunday")
p <- layout(p, yaxis = list(title = "Count"), barmode = "group")

p