The goal of this script is to show how to create a relational database in SQLite from XML data downloaded through Facebook’s Graph API (https://developers.facebook.com/docs/graph-api) and Twitter’s API.
The data downloaded from Facebook consists of the posts made by online real estate companies on their fan pages. Each post includes a permanent link, the number of likes it received, the number of shares, the number of comments, the comment type, and a Unix timestamp. We converted the Unix timestamp into a date-time variable, and we also created a variable recording the type of post (post, video, event, or photo).
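As a quick illustration of the timestamp conversion (the epoch value below is just an example, not taken from the data):
# A Unix timestamp counts seconds since 1970-01-01 UTC
as.POSIXct(1451606400, origin = "1970-01-01", tz = "UTC")
# [1] "2016-01-01 UTC"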
We downloaded information from the fan pages of four companies: Zillow, Redfin, Trulia, and realtor.com. The goal is to create three tables: 1. FB_POSTS, 2. TWITTER_POSTS, 3. COMPANY_INFO.
Each row in FB_POSTS will be a post and all of its information, each row in TWITTER_POSTS will be a tweet and all of its information, and each row in COMPANY_INFO will be a company and all of its information.
Each row in FB_POSTS and TWITTER_POSTS will be assigned a company id attribute. That attribute is the key used to join these tables with COMPANY_INFO.
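Once the tables are loaded, that key supports joins like the one sketched below (a sketch only; the company table is created under the name REL_COMPANY later in this script):
# dbGetQuery(con, "SELECT FB_POSTS.PERMALINK, REL_COMPANY.NAME
#            FROM FB_POSTS
#            JOIN REL_COMPANY ON FB_POSTS.COMPANY_ID = REL_COMPANY.ID;")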
Reading and converting the XML Facebook data into an R data frame
suppressPackageStartupMessages(library(XML))
suppressPackageStartupMessages(library(dplyr))
suppressPackageStartupMessages(library(RSQLite))
###########
# Zillow #
###########
file_path = "~/Level_Documents/RealEstate_Competitor_Scraping/zillow.query.xml"
doc <- xmlInternalTreeParse(file_path)
doc_list <- xmlToList(doc)
Zillow <- xmlToDataFrame(doc)
Zillow$comment_info <- as.character(Zillow$comment_info)
# xmlToDataFrame returns factor columns; convert via character before as.integer
Zillow$like_info <- as.integer(as.character(Zillow$like_info))
# Keep only the digits in comment_info to get the number of comments
Zillow$comment_number <- as.integer(gsub("[^0-9]", "", Zillow$comment_info))
Zillow$share_info <- as.integer(as.character(Zillow$share_info))
# Converts the Unix timestamp into an R date-time (POSIXct, which stores
# cleanly in a data frame, unlike POSIXlt)
Zillow$created_time <- as.integer(as.character(Zillow$created_time))
Zillow$created_time <- as.POSIXct(Zillow$created_time, origin = "1970-01-01")
# Zillow$created_datex <- strftime(Zillow$created_time, format = "%Y-%m-%d")
# Zillow$created_datex <- ymd(Zillow$created_datex)
#
# Zillow$created_timex <- strftime(Zillow$created_time, format="%H:%M:%S")
# Zillow$created_timex <- as.Time(Zillow$created_timex)
# Strip the digits from comment_info, leaving only the comment-type label
Zillow$comment_type <- gsub("[^A-Za-z]", "", Zillow$comment_info)
Zillow$comment_info <- NULL
# Classify each post by the kind of URL in its permalink
for (i in 1:nrow(Zillow)){
  if (grepl("post", Zillow$permalink[i])){
    Zillow$post_type[i] <- "post"
  } else if (grepl("video", Zillow$permalink[i])){
    Zillow$post_type[i] <- "video"
  } else if (grepl("event", Zillow$permalink[i])){
    Zillow$post_type[i] <- "event"
  } else if (grepl("photo", Zillow$permalink[i])){
    Zillow$post_type[i] <- "photo"
  } else {
    Zillow$post_type[i] <- NA
  }
}
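The same cleaning steps are repeated verbatim for Redfin, Trulia, and realtor.com below. They could equally be wrapped in a helper function; a minimal sketch, assuming every company's XML has the same layout (clean_fb_frame is a name introduced here, not part of the original script):
clean_fb_frame <- function(file_path, company_id) {
  df <- xmlToDataFrame(xmlInternalTreeParse(file_path))
  df$comment_info <- as.character(df$comment_info)
  df$like_info <- as.integer(as.character(df$like_info))
  df$share_info <- as.integer(as.character(df$share_info))
  df$comment_number <- as.integer(gsub("[^0-9]", "", df$comment_info))
  df$comment_type <- gsub("[^A-Za-z]", "", df$comment_info)
  df$comment_info <- NULL
  df$created_time <- as.POSIXct(as.integer(as.character(df$created_time)),
                                origin = "1970-01-01")
  # Apply lowest-priority patterns first so that, as in the if/else
  # chain above, "post" wins when several patterns match
  df$post_type <- NA_character_
  for (t in rev(c("post", "video", "event", "photo"))) {
    df$post_type[grepl(t, df$permalink)] <- t
  }
  df$company_id <- company_id
  df
}
With such a helper, each company section below would reduce to a single call, e.g. Zillow <- clean_fb_frame(file_path, 1).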
###########
# Redfin #
###########
file_path = "~/Level_Documents/RealEstate_Competitor_Scraping/Redfin.query.xml"
doc <- xmlInternalTreeParse(file_path)
doc_list <- xmlToList(doc)
redfin <- xmlToDataFrame(doc)
redfin$comment_info <- as.character(redfin$comment_info)
# xmlToDataFrame returns factor columns; convert via character before as.integer
redfin$like_info <- as.integer(as.character(redfin$like_info))
# Keep only the digits in comment_info to get the number of comments
redfin$comment_number <- as.integer(gsub("[^0-9]", "", redfin$comment_info))
redfin$share_info <- as.integer(as.character(redfin$share_info))
# Converts the Unix timestamp into an R date-time
redfin$created_time <- as.integer(as.character(redfin$created_time))
redfin$created_time <- as.POSIXct(redfin$created_time, origin = "1970-01-01")
# Strip the digits from comment_info, leaving only the comment-type label
redfin$comment_type <- gsub("[^A-Za-z]", "", redfin$comment_info)
redfin$comment_info <- NULL
# Classify each post by the kind of URL in its permalink
for (i in 1:nrow(redfin)){
  if (grepl("post", redfin$permalink[i])){
    redfin$post_type[i] <- "post"
  } else if (grepl("video", redfin$permalink[i])){
    redfin$post_type[i] <- "video"
  } else if (grepl("event", redfin$permalink[i])){
    redfin$post_type[i] <- "event"
  } else if (grepl("photo", redfin$permalink[i])){
    redfin$post_type[i] <- "photo"
  } else {
    redfin$post_type[i] <- NA
  }
}
###########
# Trulia #
###########
file_path = "~/Level_Documents/RealEstate_Competitor_Scraping/Trulia.query.xml"
doc <- xmlInternalTreeParse(file_path)
doc_list <- xmlToList(doc)
trulia <- xmlToDataFrame(doc)
trulia$comment_info <- as.character(trulia$comment_info)
# xmlToDataFrame returns factor columns; convert via character before as.integer
trulia$like_info <- as.integer(as.character(trulia$like_info))
# Keep only the digits in comment_info to get the number of comments
trulia$comment_number <- as.integer(gsub("[^0-9]", "", trulia$comment_info))
trulia$share_info <- as.integer(as.character(trulia$share_info))
# Converts the Unix timestamp into an R date-time
trulia$created_time <- as.integer(as.character(trulia$created_time))
trulia$created_time <- as.POSIXct(trulia$created_time, origin = "1970-01-01")
# Strip the digits from comment_info, leaving only the comment-type label
trulia$comment_type <- gsub("[^A-Za-z]", "", trulia$comment_info)
trulia$comment_info <- NULL
# Classify each post by the kind of URL in its permalink
for (i in 1:nrow(trulia)){
  if (grepl("post", trulia$permalink[i])){
    trulia$post_type[i] <- "post"
  } else if (grepl("video", trulia$permalink[i])){
    trulia$post_type[i] <- "video"
  } else if (grepl("event", trulia$permalink[i])){
    trulia$post_type[i] <- "event"
  } else if (grepl("photo", trulia$permalink[i])){
    trulia$post_type[i] <- "photo"
  } else {
    trulia$post_type[i] <- NA
  }
}
###############
# realtor.com #
###############
file_path = "~/Level_Documents/RealEstate_Competitor_Scraping/Realtor.com.query.xml"
doc <- xmlInternalTreeParse(file_path)
doc_list <- xmlToList(doc)
realtor.com <- xmlToDataFrame(doc)
realtor.com$comment_info <- as.character(realtor.com$comment_info)
# xmlToDataFrame returns factor columns; convert via character before as.integer
realtor.com$like_info <- as.integer(as.character(realtor.com$like_info))
# Keep only the digits in comment_info to get the number of comments
realtor.com$comment_number <- as.integer(gsub("[^0-9]", "", realtor.com$comment_info))
realtor.com$share_info <- as.integer(as.character(realtor.com$share_info))
# Converts the Unix timestamp into an R date-time
realtor.com$created_time <- as.integer(as.character(realtor.com$created_time))
realtor.com$created_time <- as.POSIXct(realtor.com$created_time, origin = "1970-01-01")
# Strip the digits from comment_info, leaving only the comment-type label
realtor.com$comment_type <- gsub("[^A-Za-z]", "", realtor.com$comment_info)
realtor.com$comment_info <- NULL
# Classify each post by the kind of URL in its permalink
for (i in 1:nrow(realtor.com)){
  if (grepl("post", realtor.com$permalink[i])){
    realtor.com$post_type[i] <- "post"
  } else if (grepl("video", realtor.com$permalink[i])){
    realtor.com$post_type[i] <- "video"
  } else if (grepl("event", realtor.com$permalink[i])){
    realtor.com$post_type[i] <- "event"
  } else if (grepl("photo", realtor.com$permalink[i])){
    realtor.com$post_type[i] <- "photo"
  } else {
    realtor.com$post_type[i] <- NA
  }
}
Zillow$company_id <- 1
redfin$company_id <- 2
trulia$company_id <- 3
realtor.com$company_id <- 4
FB_posts_data <- rbind(Zillow, redfin, trulia, realtor.com)
Migrating the data to SQLite
We then wrote the newly created FB_posts_data R data frame out as a CSV file. The idea is to use that CSV file to populate the first SQLite table. Before we can load the data, however, we need to create the table and its columns in SQLite. This can be done directly in SQLite or via R; I chose the latter.
# Writes out the data frame of FB posts as a csv file
# write.csv(x = FB_posts_data, file = "~/Level_Documents/RealEstate_Competitor_Scraping/FB_posts_data.csv")
# Connects to the SQLITE database POSTS.DB
# con <- dbConnect(SQLite(), dbname = "~/Level_Documents/RealEstate_Competitor_Scraping/posts.db")
# Creates the structure of the table for the FB posts (dbExecute is the
# DBI call for statements that return no rows; dbSendQuery would leave an
# open result set behind)
# dbExecute(con,
#           "CREATE TABLE FB_POSTS(ID INTEGER, PERMALINK VARCHAR(75),
#            LIKES INTEGER, SHARES INTEGER, CREATED_TIME DATETIME,
#            COMMENTS INTEGER, COMMENT_TYPE VARCHAR(10),
#            POST_TYPE VARCHAR(10), COMPANY_ID INTEGER);")
Once the table exists, we need to populate it with the CSV file. This step has to be done from the SQLite command line, but I’m including the commands below:
# sqlite> .mode csv
# sqlite> .import FB_posts_data.csv FB_POSTS
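One caveat: when .import targets a table that already exists, the sqlite3 shell reads the CSV header line as an ordinary data row. Newer versions of the shell (3.32 and later) accept a --skip option for exactly this; on older versions, delete the header row after importing. Note also that write.csv includes row names as an unnamed first column by default, which here lines up with the ID column.
# sqlite> .import --skip 1 FB_posts_data.csv FB_POSTS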
We then create the company table, named REL_COMPANY in the database (this is the COMPANY_INFO table from the plan above). For this table we hard-code the values, since there are only four rows.
# dbGetQuery is meant for queries that return rows; for DDL and INSERTs,
# dbExecute is the appropriate DBI call
# dbExecute(con, "
#           CREATE TABLE REL_COMPANY(ID INTEGER,
#           NAME VARCHAR(20));")
#
# dbExecute(con, "INSERT INTO REL_COMPANY (ID, NAME)
#           VALUES (1, 'Zillow');")
#
# dbExecute(con, "INSERT INTO REL_COMPANY (ID, NAME)
#           VALUES (2, 'Redfin');")
#
# dbExecute(con, "INSERT INTO REL_COMPANY (ID, NAME)
#           VALUES (3, 'Trulia');")
#
# dbExecute(con, "INSERT INTO REL_COMPANY (ID, NAME)
#           VALUES (4, 'realtor.com');")
So far we’ve created two of the three tables. Before we continue, let’s take a look at the tables!
# dbGetQuery(con, "SELECT * FROM FB_POSTS LIMIT 5;")