The goal of this script is to show how to build a relational database in SQLite from data in XML format, downloaded using Facebook’s Graph API (https://developers.facebook.com/docs/graph-api) and Twitter’s API.

The data downloaded from Facebook consists of the posts made by online real estate companies on their fan pages. Each post includes a permanent link, the number of likes it received, the number of shares, the number of comments, the comment type, and a Unix timestamp. We converted the Unix timestamp into a date-time variable, and we also created a variable indicating the type of post (e.g., text-only post or video).
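
As a quick illustration of the timestamp conversion, here is a minimal example with a made-up Unix timestamp (not a value taken from the data):

example_ts <- 1458661621  # seconds elapsed since 1970-01-01 00:00:00 UTC
as.POSIXct(example_ts, origin = "1970-01-01", tz = "UTC")
## [1] "2016-03-22 15:47:01 UTC"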

We downloaded information from the fan pages of four companies: Zillow, Redfin, Trulia, and realtor.com. The goal is to create three tables:
1. FB_POSTS
2. TWITTER_POSTS
3. COMPANY_INFO

Each row in table FB_POSTS will be a post and all of its information. Each row in table TWITTER_POSTS will be a tweet and all of its information. Finally, each row in COMPANY_INFO will be a company and all of its info.

Each row in FB_POSTS and TWITTER_POSTS will be assigned a company id attribute. That attribute is the key used to join these tables with the COMPANY_INFO table.
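
In SQLite terms, the company id plays the role of a foreign key. Here is a sketch of the intended relationship (illustration only; the actual CREATE statements used later in this script do not declare the constraint, and SQLite only enforces it when PRAGMA foreign_keys = ON):

-- Illustration: COMPANY_ID in a post table references the company table
CREATE TABLE COMPANY_INFO(ID INTEGER PRIMARY KEY, NAME VARCHAR(20));
CREATE TABLE FB_POSTS(
  ID INTEGER,
  PERMALINK VARCHAR(75),
  COMPANY_ID INTEGER,
  FOREIGN KEY (COMPANY_ID) REFERENCES COMPANY_INFO(ID)
);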

Reading and converting the XML Facebook data into an R data frame

suppressPackageStartupMessages(library(XML))
suppressPackageStartupMessages(library(dplyr))
suppressPackageStartupMessages(library(RSQLite))

###########
# Zillow #
###########
file_path = "~/Documents/cool_R_code_snippets/rel_database_project/fql.query_zillow.xml"
doc <- xmlInternalTreeParse(file_path)
doc_list <- xmlToList(doc)
Zillow <- xmlToDataFrame(doc)


Zillow$comment_info <- as.character(Zillow$comment_info)
Zillow$like_info <- as.integer(as.character(Zillow$like_info))

for (i in 1:nrow(Zillow)){
  Zillow$comment_number[i] <- as.integer(
    gsub("[^0-9]", "", unlist(Zillow$comment_info[i])))
}
Zillow$comment_number <- as.integer(Zillow$comment_number)

Zillow$share_info <- as.integer(as.character(Zillow$share_info))

# Converts the Unix timestamp to a date-time variable
Zillow$created_time <- as.character(Zillow$created_time)
Zillow$created_time <- as.integer(Zillow$created_time)
Zillow$created_time <- as.POSIXct(x = Zillow$created_time, origin = "1970-01-01")


# Zillow$created_datex <- strftime(Zillow$created_time, format = "%Y-%m-%d")
# Zillow$created_datex <- ymd(Zillow$created_datex)
# 
# Zillow$created_timex <- strftime(Zillow$created_time, format="%H:%M:%S")
# Zillow$created_timex <- as.Time(Zillow$created_timex)


for (i in 1:nrow(Zillow)){
  Zillow$comment_type[i] <- gsub("[^A-Za-z]", "", unlist(Zillow$comment_info[i]))
}

Zillow$comment_info <- NULL

for (i in 1:nrow(Zillow)){
  if (grepl("post", Zillow$permalink[i])){
    Zillow$post_type[i] <- "post"
  } else if (grepl("video", Zillow$permalink[i])){
    Zillow$post_type[i] <- "video"
  } else if (grepl("event", Zillow$permalink[i])){
    Zillow$post_type[i] <- "event"
  } else if (grepl("photo", Zillow$permalink[i])){
    Zillow$post_type[i] <- "photo"
  } else {
    Zillow$post_type[i] <- NA
  }
}

###########
# Redfin #
###########
file_path = "~/Documents/cool_R_code_snippets/rel_database_project/fql.query_redfin.xml"
doc <- xmlInternalTreeParse(file_path)
doc_list <- xmlToList(doc)
redfin <- xmlToDataFrame(doc)


redfin$comment_info <- as.character(redfin$comment_info)
redfin$like_info <- as.integer(as.character(redfin$like_info))

for (i in 1:nrow(redfin)){
  redfin$comment_number[i] <- as.integer(
    gsub("[^0-9]", "", unlist(redfin$comment_info[i])))
}
redfin$comment_number <- as.integer(redfin$comment_number)

redfin$share_info <- as.integer(as.character(redfin$share_info))

# Converts the Unix timestamp to a date-time variable
redfin$created_time <- as.character(redfin$created_time)
redfin$created_time <- as.integer(redfin$created_time)
redfin$created_time <- as.POSIXct(x = redfin$created_time, origin = "1970-01-01")

for (i in 1:nrow(redfin)){
  redfin$comment_type[i] <- gsub("[^A-Za-z]", "", unlist(redfin$comment_info[i]))
}

redfin$comment_info <- NULL

for (i in 1:nrow(redfin)){
  if (grepl("post", redfin$permalink[i])){
    redfin$post_type[i] <- "post"
  } else if (grepl("video", redfin$permalink[i])){
    redfin$post_type[i] <- "video"
  } else if (grepl("event", redfin$permalink[i])){
    redfin$post_type[i] <- "event"
  } else if (grepl("photo", redfin$permalink[i])){
    redfin$post_type[i] <- "photo"
  } else {
    redfin$post_type[i] <- NA
  }
}

###########
# trulia #
###########
file_path = "~/Documents/cool_R_code_snippets/rel_database_project/fql.query_trulia.xml"
doc <- xmlInternalTreeParse(file_path)
doc_list <- xmlToList(doc)
trulia <- xmlToDataFrame(doc)


trulia$comment_info <- as.character(trulia$comment_info)
trulia$like_info <- as.integer(as.character(trulia$like_info))

for (i in 1:nrow(trulia)){
  trulia$comment_number[i] <- as.integer(
    gsub("[^0-9]", "", unlist(trulia$comment_info[i])))
}
trulia$comment_number <- as.integer(trulia$comment_number)

trulia$share_info <- as.integer(as.character(trulia$share_info))

# Converts the Unix timestamp to a date-time variable
trulia$created_time <- as.character(trulia$created_time)
trulia$created_time <- as.integer(trulia$created_time)
trulia$created_time <- as.POSIXct(x = trulia$created_time, origin = "1970-01-01")

for (i in 1:nrow(trulia)){
  trulia$comment_type[i] <- gsub("[^A-Za-z]", "", unlist(trulia$comment_info[i]))
}

trulia$comment_info <- NULL

for (i in 1:nrow(trulia)){
  if (grepl("post", trulia$permalink[i])){
    trulia$post_type[i] <- "post"
  } else if (grepl("video", trulia$permalink[i])){
    trulia$post_type[i] <- "video"
  } else if (grepl("event", trulia$permalink[i])){
    trulia$post_type[i] <- "event"
  } else if (grepl("photo", trulia$permalink[i])){
    trulia$post_type[i] <- "photo"
  } else {
    trulia$post_type[i] <- NA
  }
}

###############
# realtor.com #
###############
file_path <- "~/Documents/cool_R_code_snippets/rel_database_project/fql.query_realtor.com.xml"
doc <- xmlInternalTreeParse(file_path)
doc_list <- xmlToList(doc)
realtor.com <- xmlToDataFrame(doc)


realtor.com$comment_info <- as.character(realtor.com$comment_info)
realtor.com$like_info <- as.integer(as.character(realtor.com$like_info))

for (i in 1:nrow(realtor.com)){
  realtor.com$comment_number[i] <- as.integer(
    gsub("[^0-9]", "", unlist(realtor.com$comment_info[i])))
}
realtor.com$comment_number <- as.integer(realtor.com$comment_number)

realtor.com$share_info <- as.integer(as.character(realtor.com$share_info))

# Converts the Unix timestamp to a date-time variable
realtor.com$created_time <- as.character(realtor.com$created_time)
realtor.com$created_time <- as.integer(realtor.com$created_time)
realtor.com$created_time <- as.POSIXct(x = realtor.com$created_time, origin = "1970-01-01")

for (i in 1:nrow(realtor.com)){
  realtor.com$comment_type[i] <- gsub("[^A-Za-z]", "", unlist(realtor.com$comment_info[i]))
}

realtor.com$comment_info <- NULL

for (i in 1:nrow(realtor.com)){
  if (grepl("post", realtor.com$permalink[i])){
    realtor.com$post_type[i] <- "post"
  } else if (grepl("video", realtor.com$permalink[i])){
    realtor.com$post_type[i] <- "video"
  } else if (grepl("event", realtor.com$permalink[i])){
    realtor.com$post_type[i] <- "event"
  } else if (grepl("photo", realtor.com$permalink[i])){
    realtor.com$post_type[i] <- "photo"
  } else {
    realtor.com$post_type[i] <- NA
  }
}

Zillow$company_id <- 1
redfin$company_id <- 2
trulia$company_id <- 3
realtor.com$company_id <- 4

FB_posts_data <- rbind(Zillow, redfin, trulia, realtor.com)
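
The four per-company blocks above are identical except for the file path and the data frame name. As an aside, the same cleaning could be wrapped in a single function; below is a sketch (clean_fb_xml is a hypothetical helper, not part of the original workflow):

# Sketch: one cleaning function applied to each company's XML file
clean_fb_xml <- function(file_path, company_id){
  df <- xmlToDataFrame(xmlInternalTreeParse(file_path))
  df$comment_info <- as.character(df$comment_info)
  df$like_info <- as.integer(as.character(df$like_info))
  df$share_info <- as.integer(as.character(df$share_info))
  # The comment count and comment type both come out of comment_info
  df$comment_number <- as.integer(gsub("[^0-9]", "", df$comment_info))
  df$comment_type <- gsub("[^A-Za-z]", "", df$comment_info)
  df$comment_info <- NULL
  # Unix timestamp -> date-time
  df$created_time <- as.POSIXct(as.integer(as.character(df$created_time)),
                                origin = "1970-01-01")
  # Post type from the permalink; later matches overwrite earlier ones,
  # so "post" wins, matching the priority of the if/else chains above
  df$post_type <- NA
  for (type in c("photo", "event", "video", "post")){
    df$post_type[grepl(type, df$permalink)] <- type
  }
  df$company_id <- company_id
  df
}

# Usage:
# Zillow <- clean_fb_xml("~/Documents/cool_R_code_snippets/rel_database_project/fql.query_zillow.xml", 1)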

Migrating the data to SQLite

We then wrote the newly created FB_posts_data R data frame out as a CSV file. The idea is to use that CSV file to populate the first SQLite table. But before we can load the data, we need to create the table and its columns in SQLite. This can be done directly in SQLite or via R; I chose the latter.

# Writes out the data frame of FB posts as a csv file
# write.csv(x = FB_posts_data, file = "~/Documents/cool_R_code_snippets/rel_database_project/FB_posts_data.csv")
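# Note: by default, write.csv() also writes the data frame's row names as
# an unnamed first column; on import, that column lines up with the ID
# field of FB_POSTS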

# Connects to the SQLITE database POSTS.DB
con <- dbConnect(SQLite(), dbname = "~/Documents/cool_R_code_snippets/rel_database_project/posts.db")

# Creates the structure of the table for the FB posts
# dbSendQuery(con, 
# "CREATE TABLE FB_POSTS(ID INTEGER, PERMALINK VARCHAR(75), LIKES INTEGER, 
# SHARES INTEGER, CREATED_TIME DATETIME, COMMENTS INTEGER, COMMENT_TYPE VARCHAR(10),
# POST_TYPE VARCHAR(10), COMPANY_ID INTEGER);")
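
Once the table has been created, a couple of standard DBI helpers provide a quick sanity check:

# Lists the tables in the database and the columns of FB_POSTS
dbListTables(con)
dbListFields(con, "FB_POSTS")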

Once we create the table, we need to populate it with the CSV file. This step has to be done from the SQLite command-line shell. However, I’m including the code below:

# sqlite> .mode csv
# sqlite> .import FB_posts_data.csv FB_POSTS
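
One caveat: when importing into a table that already exists, older versions of the sqlite3 shell also load the CSV header row as a data row. It can be deleted afterwards (or skipped at import time with --skip 1 in sqlite3 3.32+); the statement below assumes the header cell for the permalink column reads 'permalink':

# sqlite> DELETE FROM FB_POSTS WHERE PERMALINK = 'permalink';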

We then create the table COMPANY_INFO, which is named REL_COMPANY in the database. For this table we hard-code the information, since there isn’t much of it.

# dbSendQuery(con, "
# CREATE TABLE REL_COMPANY(ID INTEGER,
# NAME VARCHAR(20));")
# 
# dbGetQuery(con, "INSERT INTO REL_COMPANY (ID,NAME)
#            VALUES (1, 'Zillow');")
# 
# dbGetQuery(con, "INSERT INTO REL_COMPANY (ID,NAME)
#            VALUES (2, 'Redfin');")
# 
# dbGetQuery(con, "INSERT INTO REL_COMPANY (ID,NAME)
#            VALUES (3, 'Trulia');")
# 
# dbGetQuery(con, "INSERT INTO REL_COMPANY (ID,NAME)
#            VALUES (4, 'realtor.com');")
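
Alternatively, the same four rows could be loaded from R in a single call with DBI’s dbWriteTable(). A sketch, commented out like the inserts above since re-running it would duplicate the rows:

# Builds the company table from a data frame instead of individual INSERTs
# company_info <- data.frame(ID = 1:4,
#                            NAME = c("Zillow", "Redfin", "Trulia", "realtor.com"))
# dbWriteTable(con, "REL_COMPANY", company_info, append = TRUE)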

So far we’ve created two of the three tables. Before we continue, let’s take a look at the tables!

A peek at FB_POSTS

dbGetQuery(con, "SELECT * FROM FB_POSTS LIMIT 5;")
##   ID                                                 PERMALINK  LIKES
## 1  1 https://www.facebook.com/Zillow/posts/10154046817623594:0   1740
## 2  2 https://www.facebook.com/Zillow/videos/10154033844373594/   1330
## 3  3 https://www.facebook.com/Zillow/videos/10154033882648594/  16380
## 4  4   https://www.facebook.com/Zillow/posts/10154036440543594   1930
## 5  5 https://www.facebook.com/Zillow/posts/10154033870168594:0 121800
##   SHARES        CREATED_TIME COMMENTS COMMENT_TYPE POST_TYPE COMPANY_ID
## 1     11 2016-03-22 11:47:01     1117       ranked      post          1
## 2    118 2016-03-21 08:16:00      110       ranked     video          1
## 3   1398 2016-03-20 07:31:00      115       ranked     video          1
## 4    136 2016-03-19 08:02:02      113       ranked      post          1
## 5   1158 2016-03-18 09:52:35     1688       ranked      post          1

A peek at COMPANY_INFO (stored as REL_COMPANY)

dbGetQuery(con, "SELECT * FROM REL_COMPANY;")
##   ID        NAME
## 1  1      Zillow
## 2  2      Redfin
## 3  3      Trulia
## 4  4 realtor.com

Let’s do something more involved: the average likes and average shares per company and per type of post.

dbGetQuery(con, "SELECT ROUND(AVG(X.LIKES)) AS AVERAGE_LIKES, 
           ROUND(AVG(X.SHARES)) AS AVERAGE_SHARES,
           X.POST_TYPE AS POST_TYPE,
           Y.NAME AS COMPANY_NAME
           FROM FB_POSTS AS X 
           INNER JOIN REL_COMPANY AS Y
           ON X.COMPANY_ID = Y.ID
           GROUP BY X.COMPANY_ID, POST_TYPE
           ;")
##    AVERAGE_LIKES AVERAGE_SHARES POST_TYPE COMPANY_NAME
## 1           6991            209        NA       Zillow
## 2          14723            206      post       Zillow
## 3          36319           4103     video       Zillow
## 4          13150            156        NA       Redfin
## 5           6258            126      post       Redfin
## 6           2537             94     video       Redfin
## 7            937             48        NA       Trulia
## 8           4695             85      post       Trulia
## 9           1412             59     video       Trulia
## 10          2317            323      post  realtor.com
## 11          1165            133     video  realtor.com

To be continued…