library(RCurl)
## Loading required package: bitops
library(ggplot2)
library(data.table)
# Data are from the UCI "Facebook metrics" dataset
# (https://archive.ics.uci.edu/ml/datasets/Facebook+metrics#); a copy is stored
# on GitHub at the URL below.
url <- "https://raw.githubusercontent.com/ann2014/Assignments/6f46098cfc79ba78e4a58ed49d5f0df6f7c0a698/FacebookMetrics.csv"
# Download the raw CSV text, then parse it from the in-memory string.
data <- getURL(url)
# `TRUE` is spelled out: `T` is an ordinary variable that can be reassigned.
df <- read.csv(text = data, header = TRUE, sep = ",", stringsAsFactors = FALSE)
# Inspect the column names and types of the loaded data frame.
str(df)
## 'data.frame': 500 obs. of 19 variables:
## $ Page.total.likes : int 139441 139441 139441 139441 139441 139441 139441 139441 139441 139441 ...
## $ Type : chr "Photo" "Status" "Photo" "Photo" ...
## $ Category : int 2 2 3 2 2 2 3 3 2 3 ...
## $ Post.Month : int 12 12 12 12 12 12 12 12 12 12 ...
## $ Post.Weekday : int 4 3 3 2 2 1 1 7 7 6 ...
## $ Post.Hour : int 3 10 3 10 3 9 3 9 3 10 ...
## $ Paid : int 0 0 0 1 0 0 1 1 0 0 ...
## $ Lifetime.Post.Total.Reach : int 2752 10460 2413 50128 7244 10472 11692 13720 11844 4694 ...
## $ Lifetime.Post.Total.Impressions : int 5091 19057 4373 87991 13594 20849 19479 24137 22538 8668 ...
## $ Lifetime.Engaged.Users : int 178 1457 177 2211 671 1191 481 537 1530 280 ...
## $ Lifetime.Post.Consumers : int 109 1361 113 790 410 1073 265 232 1407 183 ...
## $ Lifetime.Post.Consumptions : int 159 1674 154 1119 580 1389 364 305 1692 250 ...
## $ Lifetime.Post.Impressions.by.people.who.have.liked.your.Page : int 3078 11710 2812 61027 6228 16034 15432 19728 15220 4309 ...
## $ Lifetime.Post.reach.by.people.who.like.your.Page : int 1640 6112 1503 32048 3200 7852 9328 11056 7912 2324 ...
## $ Lifetime.People.who.have.liked.your.Page.and.engaged.with.your.post: int 119 1108 132 1386 396 1016 379 422 1250 199 ...
## $ comment : int 4 5 0 58 19 1 3 0 0 3 ...
## $ like : int 79 130 66 1572 325 152 249 325 161 113 ...
## $ share : int 17 29 14 147 49 33 27 14 31 26 ...
## $ Total.Interactions : int 100 164 80 1777 393 186 279 339 192 142 ...
# Subsetting: keep only the columns used in the rest of the script.
# Columns are selected by name rather than by position (1, 2, 5, 7, 19) so the
# subset stays correct if the column order of the CSV changes, and so a missing
# column raises an "undefined columns selected" error instead of silently
# picking a different variable.
df <- df[, c("Page.total.likes", "Type", "Post.Weekday", "Paid",
             "Total.Interactions")]
# Show the first row as a quick check of the selection.
head(df, 1)
## Page.total.likes Type Post.Weekday Paid Total.Interactions
## 1 139441 Photo 4 0 100
# rename columns
# data.table::setnames() renames by reference and returns its input. Calling it
# directly on `df` would therefore rename `df` as well and make `df2` just
# another name for the same object. Renaming a copy keeps `df` untouched.
df2 <- setnames(
  copy(df),
  old = c("Page.total.likes", "Total.Interactions"),
  new = c("likes", "interactions")
)
# replace values
# Recode the Paid flag into text labels:
#   missing -> "UNKNOWN", 0 -> "NotPaid", 1 -> "Paid", any other value -> "NA"
df2$Paid <- local({
  flag <- df2$Paid
  # Labels for the non-missing flags (positions with a missing flag stay NA
  # here and are overwritten by "UNKNOWN" in the next step).
  known <- ifelse(flag == 0, 'NotPaid', ifelse(flag == 1, 'Paid', 'NA'))
  ifelse(is.na(flag), 'UNKNOWN', known)
})
# Derive account.type by joining the Type and Paid columns with a single
# space, giving values of the form "<Type> <Paid>".
df2[["account.type"]] <- paste0(df2[["Type"]], ' ', df2[["Paid"]])
# substitute values and order them
# Map the weekday codes 1..7 to day names in one step: factor() with explicit
# `levels` and `labels` both recodes the values and fixes the Monday-to-Sunday
# level order. Missing codes and codes outside 1..7 become NA.
# NOTE(review): the 1 = Monday convention is taken as given here; confirm it
# against the dataset documentation.
df2$Post.Weekday <- factor(
  df2$Post.Weekday,
  levels = 1:7,
  labels = c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday",
             "Saturday", "Sunday")
)
# Per-column summary of the cleaned data frame (the output is shown below).
summary(df2)
## likes Type Post.Weekday Paid
## Min. : 81370 Length:500 Monday :68 Length:500
## 1st Qu.:112676 Class :character Tuesday :66 Class :character
## Median :129600 Mode :character Wednesday:64 Mode :character
## Mean :123194 Thursday :72
## 3rd Qu.:136393 Friday :67
## Max. :139441 Saturday :81
## Sunday :82
## interactions account.type
## Min. : 0.0 Length:500
## 1st Qu.: 71.0 Class :character
## Median : 123.5 Mode :character
## Mean : 212.1
## 3rd Qu.: 228.5
## Max. :6334.0
##
# check on the shapes of the data
# Histograms of the two integer columns, likes and interactions.
hist(df2$likes)
hist(df2$interactions)
# Cross-tabulate account type (rows) against posting weekday (columns).
table(df2$account.type, df2$Post.Weekday)
##
## Monday Tuesday Wednesday Thursday Friday Saturday Sunday
## Link NotPaid 2 3 2 2 2 3 2
## Link Paid 0 0 0 2 2 2 0
## Photo NotPaid 50 40 37 35 37 53 54
## Photo Paid 12 18 15 27 16 14 17
## Photo UNKNOWN 0 0 0 1 0 0 0
## Status NotPaid 4 2 6 2 7 5 9
## Status Paid 0 1 2 2 2 3 0
## Video NotPaid 0 2 1 0 0 0 0
## Video Paid 0 0 1 1 1 1 0
# Number of posts per Paid label (NotPaid / Paid / UNKNOWN).
table(df2$Paid)
##
## NotPaid Paid UNKNOWN
## 360 139 1
# Number of posts per post Type.
table(df2$Type)
##
## Link Photo Status Video
## 22 426 45 7
# Scatterplots of interactions against likes, each with one straight-line
# (lm) fit per colour group, drawn without a confidence band and extended over
# the full x range.

# Coloured by Type: the fitted lines for Status and Video slope downwards,
# i.e. a negative relationship between likes and interactions.
ggplot(df2, aes(x = likes, y = interactions, colour = Type)) +
  geom_point() +
  stat_smooth(method = lm, se = FALSE, fullrange = TRUE)

# Coloured by Paid: for paid posts, likes and interactions have a slightly
# positive relationship.
ggplot(df2, aes(x = likes, y = interactions, colour = Paid)) +
  geom_point() +
  stat_smooth(method = lm, se = FALSE, fullrange = TRUE)

# Coloured by account.type: used to identify outliers.
ggplot(df2, aes(x = likes, y = interactions, colour = account.type)) +
  geom_point() +
  stat_smooth(method = lm, se = FALSE, fullrange = TRUE)
# Boxplots by posting weekday: first the distribution of interactions, then
# the distribution of likes.
ggplot(df2, aes(x = Post.Weekday, y = interactions)) +
  geom_boxplot()
ggplot(df2, aes(x = Post.Weekday, y = likes)) +
  geom_boxplot()
# Relationship between post Type and likes.
# `ds` holds the mean and standard deviation of likes for each Type.
ds <- plyr::ddply(
  df2, "Type", plyr::summarise,
  mean = mean(likes),
  sd = sd(likes)
)
# Pink points are the individual posts; the larger red points mark the
# per-Type means taken from `ds`.
ggplot(df2, aes(x = Type, y = likes)) +
  geom_point(colour = "pink", size = 2, show.legend = TRUE) +
  geom_point(data = ds, mapping = aes(y = mean), colour = 'red', size = 3)
# Relationship between post Type and interactions.
# `ds` is overwritten with the mean and standard deviation of interactions
# for each Type.
ds <- plyr::ddply(
  df2, "Type", plyr::summarise,
  mean = mean(interactions),
  sd = sd(interactions)
)
# Pink points are the individual posts; the larger red points mark the
# per-Type means taken from `ds`.
ggplot(df2, aes(x = Type, y = interactions)) +
  geom_point(colour = "pink", size = 2, show.legend = TRUE) +
  geom_point(data = ds, mapping = aes(y = mean), colour = 'red', size = 3)
Conclusion: 1. The Video type is associated with the highest mean of likes, with Status second, followed by Photo and Link. 2. The Video type is also associated with the highest mean of interactions; Status and Photo are close to each other, and Link is associated with the lowest mean of interactions. 3. The Video type strongly indicates a negative relationship between likes and interactions, while the Link type is flat. 4. The Paid Video account type has the strongest negative correlation between interactions and likes, followed by Video NotPaid and Status Paid. 5. In the boxplots, Wednesday shows the highest median of interactions and the lowest median of likes; Sunday has the highest likes.