Install required packages
# install.packages("readr")
# install.packages("plyr")
# install.packages("stringr")
# install.packages("stringi")
# install.packages("magrittr")
# install.packages("dplyr")
# install.packages("plotly")
library(readr)
## Warning: package 'readr' was built under R version 3.4.3
library("plyr")
## Warning: package 'plyr' was built under R version 3.4.3
library(plotly)
## Warning: package 'plotly' was built under R version 3.4.3
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.4.3
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following objects are masked from 'package:plyr':
##
## arrange, mutate, rename, summarise
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
User gender distribution by user location
# Load the tweet export into a tibble. The first CSV column has no header, so
# readr fills in the name 'X1' for it (see the warning echoed below).
Trump <- readr::read_csv(file = "D:\\New folder (7)\\Trump(1).csv")
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
## X1 = col_integer(),
## MESSAGE_BODY = col_character(),
## MESSAGE_COUNTRY = col_character(),
## MESSAGE_FAVORITES_COUNT = col_integer(),
## MESSAGE_LOCATION = col_character(),
## MESSAGE_LOCATION_DISPLAY_NAME = col_character(),
## MESSAGE_POSTED_TIME = col_datetime(format = ""),
## MESSAGE_RETWEET_COUNT = col_integer(),
## USER_CITY = col_character(),
## USER_COUNTRY = col_character(),
## USER_DISPLAY_NAME = col_character(),
## USER_FOLLOWERS_COUNT = col_integer(),
## USER_FRIENDS_COUNT = col_integer(),
## USER_GENDER = col_character(),
## USER_LOCATION_DISPLAY_NAME = col_character(),
## USER_SCREEN_NAME = col_character()
## )
# Print a per-column overview of the loaded data: quartiles and mean for the
# numeric and datetime columns, length/class/mode for the character columns.
summary(Trump)
## X1 MESSAGE_BODY MESSAGE_COUNTRY
## Min. : 1 Length:4955 Length:4955
## 1st Qu.:1240 Class :character Class :character
## Median :2478 Mode :character Mode :character
## Mean :2478
## 3rd Qu.:3716
## Max. :4955
## MESSAGE_FAVORITES_COUNT MESSAGE_LOCATION MESSAGE_LOCATION_DISPLAY_NAME
## Min. : 0.000 Length:4955 Length:4955
## 1st Qu.: 0.000 Class :character Class :character
## Median : 0.000 Mode :character Mode :character
## Mean : 2.442
## 3rd Qu.: 0.000
## Max. :1924.000
## MESSAGE_POSTED_TIME MESSAGE_RETWEET_COUNT USER_CITY
## Min. :2014-11-05 13:42:05 Min. : 0.00 Length:4955
## 1st Qu.:2015-09-17 20:30:05 1st Qu.: 0.00 Class :character
## Median :2016-02-08 07:47:24 Median : 1.00 Mode :character
## Mean :2015-12-21 06:49:03 Mean : 41.34
## 3rd Qu.:2016-03-04 02:39:23 3rd Qu.: 13.00
## Max. :2016-03-30 02:45:13 Max. :871.00
## USER_COUNTRY USER_DISPLAY_NAME USER_FOLLOWERS_COUNT
## Length:4955 Length:4955 Min. : 0
## Class :character Class :character 1st Qu.: 130
## Mode :character Mode :character Median : 534
## Mean : 12514
## 3rd Qu.: 2046
## Max. :4714925
## USER_FRIENDS_COUNT USER_GENDER USER_LOCATION_DISPLAY_NAME
## Min. : 0 Length:4955 Length:4955
## 1st Qu.: 184 Class :character Class :character
## Median : 592 Mode :character Mode :character
## Mean : 2088
## 3rd Qu.: 1714
## Max. :282971
## USER_SCREEN_NAME
## Length:4955
## Class :character
## Mode :character
##
##
##
# Gender breakdown for two user-reported locations, printed as a contingency
# table and drawn as a grouped bar chart of the female and male counts.
# Lower-casing first makes the location match case-insensitive.
Trump$location <- tolower(Trump$USER_LOCATION_DISPLAY_NAME)
location <- subset(Trump, location %in% c("united states", "brooklyn, ny"))
genderlocation <- data.frame(table(location[c("location", "USER_GENDER")]))
xtabs(Freq ~ location + USER_GENDER, data = genderlocation)
## USER_GENDER
## location female male unknown
## brooklyn, ny 0 4 6
## united states 10 18 92
# Widen to one row per location with a Freq.<gender> column per gender.
tab <- reshape(
  genderlocation,
  idvar = "location",
  timevar = "USER_GENDER",
  direction = "wide"
)
p <- plot_ly(
  tab,
  x = ~location,
  y = ~Freq.female,
  type = "bar",
  name = "Female"
)
p <- add_trace(p, y = ~Freq.male, name = "Male")
p <- layout(p, yaxis = list(title = "Count"), barmode = "group")
p
## Warning: package 'bindrcpp' was built under R version 3.4.3
User gender distribution by city
# Gender breakdown for two user cities, printed as a contingency table and
# drawn as a grouped bar chart of the female and male counts.
# Lower-casing first makes the city match case-insensitive.
Trump$city <- tolower(Trump$USER_CITY)
subdf <- subset(Trump, city %in% c("new york city", "brooklyn"))
genderCity <- data.frame(table(subdf[c("city", "USER_GENDER")]))
xtabs(Freq ~ city + USER_GENDER, data = genderCity)
## USER_GENDER
## city female male unknown
## brooklyn 0 10 6
## new york city 8 45 44
# Widen to one row per city with a Freq.<gender> column per gender.
tab <- reshape(
  genderCity,
  idvar = "city",
  timevar = "USER_GENDER",
  direction = "wide"
)
p <- plot_ly(
  tab,
  x = ~city,
  y = ~Freq.female,
  type = "bar",
  name = "Female"
)
p <- add_trace(p, y = ~Freq.male, name = "Male")
p <- layout(p, yaxis = list(title = "Count"), barmode = "group")
p
User posting day of the week by gender
# Count tweets per weekday for each gender and draw them as grouped bars
# (one bar group per gender, one bar per weekday).
#
# The weekday name is looked up from the numeric `wday` field of POSIXlt
# (0 = Sunday) rather than taken from weekdays(), whose labels follow the
# session locale. The plot below refers to English column names
# (Freq.Monday, ...), so the labels have to be English in every locale.
day_levels <- c(
  "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"
)
wday_index <- as.POSIXlt(Trump$MESSAGE_POSTED_TIME)$wday
Trump$days <- c("Sunday", day_levels[-7])[wday_index + 1]

# Tabulate on a factor carrying all seven levels so that a weekday without
# any tweets still produces a zero-count Freq.<day> column after reshaping;
# otherwise the matching add_trace() call would fail on a missing column.
dfrm <- data.frame(table(
  USER_GENDER = Trump$USER_GENDER,
  days = factor(Trump$days, levels = day_levels)
))
# Widen to one row per gender with a Freq.<day> column per weekday.
genderDays <- reshape(
  dfrm,
  direction = "wide",
  timevar = "days",
  idvar = "USER_GENDER"
)
p <- plot_ly(
  genderDays,
  x = ~USER_GENDER,
  y = ~Freq.Monday,
  type = "bar",
  name = "Monday"
) %>%
  add_trace(y = ~Freq.Tuesday, name = "Tuesday") %>%
  add_trace(y = ~Freq.Wednesday, name = "Wednesday") %>%
  add_trace(y = ~Freq.Thursday, name = "Thursday") %>%
  add_trace(y = ~Freq.Friday, name = "Friday") %>%
  add_trace(y = ~Freq.Saturday, name = "Saturday") %>%
  add_trace(y = ~Freq.Sunday, name = "Sunday") %>%
  layout(yaxis = list(title = "Count"), barmode = "group")
p