The goal of this script is to show how to create a relational database in SQLite with data in XML format downloaded using Facebook’s Graph API and Twitter’s API. https://developers.facebook.com/docs/graph-api.
The data downloaded from Facebook includes the posts made by online real estate companies on their fan pages. Each post includes a permanent link, the number of likes it received, the number of shares, number of comments, comment type, a Unix Timestamp. We changed the Unix Timestamp to a variable containing date and time, and we also created a variable including the type of post it was (video or text only post).
We downloaded information from the fan pages of four companies: zillow, redfin, trulia, and realtor.com. The goal is to create three tables:
1. FB_POSTS
2. TWITTER_POSTS
3. COMPANY_INFO
Each row in table FB_POSTS will be a post and all of its information. Each row in table TWITTER_POSTS will be a tweet and all of its information. Finally, each row in COMPANY_INFO will be a company and all of its info.
Each row in FB_POSTS and TWITTER_POSTS will be assign a company id attribute. That attribute will be the key used to join these tables with the table COMPANY_INFO.
suppressPackageStartupMessages(library(XML))
suppressPackageStartupMessages(library(dplyr))
suppressPackageStartupMessages(library(RSQLite))
###########
# Zillow #
###########
file_path = "~/Documents/cool_R_code_snippets/rel_database_project/fql.query_zillow.xml"
doc <- xmlInternalTreeParse(file_path)
doc_list <- xmlToList(doc)
Zillow <- xmlToDataFrame(doc)
Zillow$comment_info <- as.character(Zillow$comment_info)
Zillow$like_info <- as.integer(paste(Zillow$like_info))
for (i in 1:nrow(Zillow)){
Zillow$comment_number[i] <- as.integer(
gsub("[^0-9]", "", unlist(Zillow$comment_info[i]), ""))
}
Zillow$comment_number <- as.integer(Zillow$comment_number)
Zillow$share_info <- as.integer(paste(Zillow$share_info))
# Changes dates to
Zillow$created_time <- as.character(Zillow$created_time)
Zillow$created_time <- as.integer(Zillow$created_time)
Zillow$created_time <- as.POSIXlt(x =Zillow$created_time, origin="1970-01-01")
# Zillow$created_datex <- strftime(Zillow$created_time, format = "%Y-%m-%d")
# Zillow$created_datex <- ymd(Zillow$created_datex)
#
# Zillow$created_timex <- strftime(Zillow$created_time, format="%H:%M:%S")
# Zillow$created_timex <- as.Time(Zillow$created_timex)
for (i in 1:nrow(Zillow)){
Zillow$comment_type[i] <- gsub("[^aA-zZ]", "", unlist(Zillow$comment_info[i]), "")
}
Zillow$comment_info <- NULL
for (i in 1:nrow(Zillow)){
if (grepl("post", Zillow$permalink[i]) == TRUE){
Zillow$post_type[i] <- "post"
} else if (grepl("video", Zillow$permalink[i]) == TRUE){
Zillow$post_type[i] <- "video"
} else if (grepl("event", Zillow$permalink[i]) == TRUE){
Zillow$post_type[i] <- "event"
} else if (grepl("photo", Zillow$permalink[i]) == TRUE){
Zillow$post_type[i] <- "photo"
} else {
Zillow$post_type[i] <- NA
}
}
###########
# Redfin #
###########
file_path = "~/Documents/cool_R_code_snippets/rel_database_project/fql.query_redfin.xml"
doc <- xmlInternalTreeParse(file_path)
doc_list <- xmlToList(doc)
redfin <- xmlToDataFrame(doc)
redfin$comment_info <- as.character(redfin$comment_info)
redfin$like_info <- as.integer(paste(redfin$like_info))
for (i in 1:nrow(redfin)){
redfin$comment_number[i] <- as.integer(
gsub("[^0-9]", "", unlist(redfin$comment_info[i]), ""))
}
redfin$comment_number <- as.integer(redfin$comment_number)
redfin$share_info <- as.integer(paste(redfin$share_info))
# Changes dates to
redfin$created_time <- as.character(redfin$created_time)
redfin$created_time <- as.integer(redfin$created_time)
redfin$created_time <- as.POSIXlt(x =redfin$created_time, origin="1970-01-01")
for (i in 1:nrow(redfin)){
redfin$comment_type[i] <- gsub("[^aA-zZ]", "", unlist(redfin$comment_info[i]), "")
}
redfin$comment_info <- NULL
for (i in 1:nrow(redfin)){
if (grepl("post", redfin$permalink[i]) == TRUE){
redfin$post_type[i] <- "post"
} else if (grepl("video", redfin$permalink[i]) == TRUE){
redfin$post_type[i] <- "video"
} else if (grepl("event", redfin$permalink[i]) == TRUE){
redfin$post_type[i] <- "event"
} else if (grepl("photo", redfin$permalink[i]) == TRUE){
redfin$post_type[i] <- "photo"
} else {
redfin$post_type[i] <- NA
}
}
###########
# trulia #
###########
file_path = "~/Documents/cool_R_code_snippets/rel_database_project/fql.query_trulia.xml"
doc <- xmlInternalTreeParse(file_path)
doc_list <- xmlToList(doc)
trulia <- xmlToDataFrame(doc)
trulia$comment_info <- as.character(trulia$comment_info)
trulia$like_info <- as.integer(paste(trulia$like_info))
for (i in 1:nrow(trulia)){
trulia$comment_number[i] <- as.integer(
gsub("[^0-9]", "", unlist(trulia$comment_info[i]), ""))
}
trulia$comment_number <- as.integer(trulia$comment_number)
trulia$share_info <- as.integer(paste(trulia$share_info))
# Changes dates to
trulia$created_time <- as.character(trulia$created_time)
trulia$created_time <- as.integer(trulia$created_time)
trulia$created_time <- as.POSIXlt(x =trulia$created_time, origin="1970-01-01")
for (i in 1:nrow(trulia)){
trulia$comment_type[i] <- gsub("[^aA-zZ]", "", unlist(trulia$comment_info[i]), "")
}
trulia$comment_info <- NULL
for (i in 1:nrow(trulia)){
if (grepl("post", trulia$permalink[i]) == TRUE){
trulia$post_type[i] <- "post"
} else if (grepl("video", trulia$permalink[i]) == TRUE){
trulia$post_type[i] <- "video"
} else if (grepl("event", trulia$permalink[i]) == TRUE){
trulia$post_type[i] <- "event"
} else if (grepl("photo", trulia$permalink[i]) == TRUE){
trulia$post_type[i] <- "photo"
} else {
trulia$post_type[i] <- NA
}
}
###########
# realtor.com #
###########
file_path = "~/Documents/cool_R_code_snippets/rel_database_project/fql.query_realtor.com.xml"
doc <- xmlInternalTreeParse(file_path)
doc_list <- xmlToList(doc)
realtor.com <- xmlToDataFrame(doc)
realtor.com$comment_info <- as.character(realtor.com$comment_info)
realtor.com$like_info <- as.integer(paste(realtor.com$like_info))
for (i in 1:nrow(realtor.com)){
realtor.com$comment_number[i] <- as.integer(
gsub("[^0-9]", "", unlist(realtor.com$comment_info[i]), ""))
}
realtor.com$comment_number <- as.integer(realtor.com$comment_number)
realtor.com$share_info <- as.integer(paste(realtor.com$share_info))
# Changes dates to
realtor.com$created_time <- as.character(realtor.com$created_time)
realtor.com$created_time <- as.integer(realtor.com$created_time)
realtor.com$created_time <- as.POSIXlt(x =realtor.com$created_time, origin="1970-01-01")
for (i in 1:nrow(realtor.com)){
realtor.com$comment_type[i] <- gsub("[^aA-zZ]", "", unlist(realtor.com$comment_info[i]), "")
}
realtor.com$comment_info <- NULL
for (i in 1:nrow(realtor.com)){
if (grepl("post", realtor.com$permalink[i]) == TRUE){
realtor.com$post_type[i] <- "post"
} else if (grepl("video", realtor.com$permalink[i]) == TRUE){
realtor.com$post_type[i] <- "video"
} else if (grepl("event", realtor.com$permalink[i]) == TRUE){
realtor.com$post_type[i] <- "event"
} else if (grepl("photo", realtor.com$permalink[i]) == TRUE){
realtor.com$post_type[i] <- "photo"
} else {
realtor.com$post_type[i] <- NA
}
}
Zillow$company_id <- 1
redfin$company_id <- 2
trulia$company_id <- 3
realtor.com$company_id <- 4
FB_posts_data <- rbind(Zillow, redfin, trulia, realtor.com)
We then wrote the newly created fb.posts.data R data frame as csv file. The idea is to then use the csv file to populate the first SQLITE table. But before we start loading the table, we need to create the table and its variables in SQLite. This can be done directly in SQLite or via R. I chose the latter.
# Writes out the data frame of FB posts as a csv file
# write.csv(x = FB_posts_data, file = "~/Documents/cool_R_code_snippets/rel_database_project/FB_posts_data.csv")
# Connects to the SQLITE database POSTS.DB
con <- dbConnect(SQLite(), dbname = "~/Documents/cool_R_code_snippets/rel_database_project/posts.db")
# Creates the structure of the table for the FB posts
# dbSendQuery(con,
# "CREATE TABLE FB_POSTS(ID INTEGER, PERMALINK VARCHAR(75), LIKES INTEGER,
# SHARES INTEGER, CREATED_TIME DATETIME, COMMENTS INTEGER, COMMENT_TYPE VARCHAR(10),
# POST_TYPE VARCHAR(10), COMPANY_ID INTEGER);")
Once we create the table, we need to populate it with the csv file. This step has to be done via SQLite. However, I’m including the code below:
# sqlite> .mode csv
# sqlite> .import FB_posts_data.csv FB_POSTS
We then create the table COMPANY_INFO. For this table, we will hard code the information, since it isn’t too much.
# dbSendQuery(con, "
# CREATE TABLE REL_COMPANY(ID INTEGER,
# NAME VARCHAR(20));")
#
# dbGetQuery(con, "INSERT INTO REL_COMPANY (ID,NAME)
# VALUES (1, 'Zillow');")
#
# dbGetQuery(con, "INSERT INTO REL_COMPANY (ID,NAME)
# VALUES (2, 'Redfin');")
#
# dbGetQuery(con, "INSERT INTO REL_COMPANY (ID,NAME)
# VALUES (3, 'Trulia');")
#
# dbGetQuery(con, "INSERT INTO REL_COMPANY (ID,NAME)
# VALUES (4, 'realtor.com');")
So far we’ve created two of the three tables. Before we continue, let’s take a look at the tables!
dbGetQuery(con, "SELECT * FROM FB_POSTS LIMIT 5;")
## ID PERMALINK LIKES
## 1 1 https://www.facebook.com/Zillow/posts/10154046817623594:0 1740
## 2 2 https://www.facebook.com/Zillow/videos/10154033844373594/ 1330
## 3 3 https://www.facebook.com/Zillow/videos/10154033882648594/ 16380
## 4 4 https://www.facebook.com/Zillow/posts/10154036440543594 1930
## 5 5 https://www.facebook.com/Zillow/posts/10154033870168594:0 121800
## SHARES CREATED_TIME COMMENTS COMMENT_TYPE POST_TYPE COMPANY_ID
## 1 11 2016-03-22 11:47:01 1117 ranked post 1
## 2 118 2016-03-21 08:16:00 110 ranked video 1
## 3 1398 2016-03-20 07:31:00 115 ranked video 1
## 4 136 2016-03-19 08:02:02 113 ranked post 1
## 5 1158 2016-03-18 09:52:35 1688 ranked post 1
dbGetQuery(con, "SELECT * FROM REL_COMPANY;")
## ID NAME
## 1 1 Zillow
## 2 2 Redfin
## 3 3 Trulia
## 4 4 realtor.com
Let’s do something more involved. Let’s take a look at the average likes and average shares of posts per company per type of post.
dbGetQuery(con, "SELECT ROUND(AVG(X.LIKES)) AS AVERAGE_LIKES,
ROUND(AVG(X.SHARES)) AS AVERAGE_SHARES,
X.POST_TYPE AS POST_TYPE,
Y.NAME AS COMPANY_NAME
FROM FB_POSTS AS X
INNER JOIN REL_COMPANY AS Y
ON X.COMPANY_ID = Y.ID
GROUP BY X.COMPANY_ID, POST_TYPE
;")
## AVERAGE_LIKES AVERAGE_SHARES POST_TYPE COMPANY_NAME
## 1 6991 209 NA Zillow
## 2 14723 206 post Zillow
## 3 36319 4103 video Zillow
## 4 13150 156 NA Redfin
## 5 6258 126 post Redfin
## 6 2537 94 video Redfin
## 7 937 48 NA Trulia
## 8 4695 85 post Trulia
## 9 1412 59 video Trulia
## 10 2317 323 post realtor.com
## 11 1165 133 video realtor.com
To be continued…