1 - importing-data-from-databases

# importing-data-in-r-part-2
# https://rpubs.com/potentialwjy/ImportDataIntoR05

# importing-data-from-databases

##############################################
# cat("\014") # clear screen before next section
##############################################

# import data from a SQL database

# Load the DBI package
# install.packages("RMySQL")
library(DBI)

# dbConnect() call to connect to the MySQL database
con <- dbConnect(RMySQL::MySQL(), 
                 dbname = "tweater", 
                 host = "courses.csrrinzqubik.us-east-1.rds.amazonaws.com", 
                 port = 3306,
                 user = "student",
                 password = "datacamp")

# you need different packages depending on the database you want to connect to

# dbConnect() = Connect to a DBMS going through the appropriate authorization procedure. 

# RMySQL::MySQL() = DBIdriver object = specifies how connections are made 
                                      # and how data is mapped 
                                      # between R and the database

# dbConnect() here produces an object of class MySQLConnection

##############################################
# cat("\014") # clear screen before next section
##############################################

con
## <MySQLConnection:0,0>
# <MySQLConnection:0,2>

dbListTables(con) # outputs a character vector with the table names.
## [1] "comments" "tweats"   "users"

# Build a vector of table names: tables
tables <- dbListTables(con) 

# Display structure of tables
str(tables)
##  chr [1:3] "comments" "tweats" "users"

##############################################
# cat("\014") # clear screen before next section
##############################################

# Import the users table from tweater: users
users <- dbReadTable(con, "users")

# Print users
users
##   id      name     login
## 1  1 elisabeth  elismith
## 2  2      mike     mikey
## 3  3      thea   teatime
## 4  4    thomas tomatotom
## 5  5    oliver olivander
## 6  6      kate  katebenn
## 7  7    anjali    lianja

##############################################
# cat("\014") # clear screen before next section
##############################################

# Get table names
table_names <- dbListTables(con)
# containing the names of all the tables in the database.

# Import all tables
tables <- lapply(table_names, dbReadTable, conn = con)

# Print out tables
tables
## [[1]]
##      id tweat_id user_id            message
## 1  1022       87       7              nice!
## 2  1000       77       7             great!
## 3  1011       49       5            love it
## 4  1012       87       1   awesome! thanks!
## 5  1010       88       6              yuck!
## 6  1026       77       4      not my thing!
## 7  1004       49       1  this is fabulous!
## 8  1030       75       6           so easy!
## 9  1025       88       2             oh yes
## 10 1007       49       3           serious?
## 11 1020       77       1 couldn't be better
## 12 1014       77       1       saved my day
## 
## [[2]]
##   id user_id
## 1 75       3
## 2 88       4
## 3 77       6
## 4 87       5
## 5 49       1
## 6 24       7
##                                                                  post
## 1                                       break egg. bake egg. eat egg.
## 2                           wash strawberries. add ice. blend. enjoy.
## 3                       2 slices of bread. add cheese. grill. heaven.
## 4               open and crush avocado. add shrimps. perfect starter.
## 5 nachos. add tomato sauce, minced meat and cheese. oven for 10 mins.
## 6                              just eat an apple. simply and healthy.
##         date
## 1 2015-09-05
## 2 2015-09-14
## 3 2015-09-21
## 4 2015-09-22
## 5 2015-09-22
## 6 2015-09-24
## 
## [[3]]
##   id      name     login
## 1  1 elisabeth  elismith
## 2  2      mike     mikey
## 3  3      thea   teatime
## 4  4    thomas tomatotom
## 5  5    oliver olivander
## 6  6      kate  katebenn
## 7  7    anjali    lianja

##############################################
# cat("\014") # clear screen before next section
##############################################

# see that the tweats table, for example, contains a column user_id. 

# issue with Rstudio?
# tables$tweats does not work
# so
# tables[[2]] = tables$tweats
tables[[2]]
##   id user_id
## 1 75       3
## 2 88       4
## 3 77       6
## 4 87       5
## 5 49       1
## 6 24       7
##                                                                  post
## 1                                       break egg. bake egg. eat egg.
## 2                           wash strawberries. add ice. blend. enjoy.
## 3                       2 slices of bread. add cheese. grill. heaven.
## 4               open and crush avocado. add shrimps. perfect starter.
## 5 nachos. add tomato sauce, minced meat and cheese. oven for 10 mins.
## 6                              just eat an apple. simply and healthy.
##         date
## 1 2015-09-05
## 2 2015-09-14
## 3 2015-09-21
## 4 2015-09-22
## 5 2015-09-22
## 6 2015-09-24
colnames(tables[[2]])
## [1] "id"      "user_id" "post"    "date"

# ids in the column refer to the users that have posted the tweat. 
# comments contain both a user_id and a tweat_id column. 
# i.e. which user posted a comment on which tweat.

# Q: 
# who posted the tweat on which somebody commented "awesome! thanks!" (comment 1012)?

tweat_id_required <- tables[[1]][,2][tables[[1]][,4] == "awesome! thanks!"]
tweat_id_required
## [1] 87
tweat_id_required <- tables[[1]][,"tweat_id"][tables[[1]][,4] == "awesome! thanks!"]
tweat_id_required
## [1] 87

user_id_required <- tables[[2]][,"user_id"][tables[[2]][,"id"] == tweat_id_required]
user_id_required
## [1] 5

user_name <- tables[[3]][,"name"][tables[[3]][,"id"] == user_id_required]
user_name
## [1] "oliver"

# A:
# user with user_id 5, so Oliver.

##############################################
# cat("\014") # clear screen before next section
##############################################

con <- dbConnect(RMySQL::MySQL(), 
                 dbname = "company", 
                 host = "courses.csrrinzqubik.us-east-1.rds.amazonaws.com", 
                 port = 3306,
                 user = "student",
                 password = "datacamp")

##############################################
# cat("\014") # clear screen before next section
##############################################

# Selective Importing

# SQL Queries from inside R: example 1

# import entire table
# then subset in R

employees <- dbReadTable(con, "employees")

subset(employees,
       subset = started_at > "2012-09-01",
       select = name)
##      name
## 3   Julie
## 4 Heather
## 5    John

# alternatively,

# send SQL to db
# SQL run on db
# only results imported
# i.e. more efficient

dbGetQuery(con, "SELECT name FROM employees
                    WHERE started_at > \"2012-09-01\"")
##      name
## 1   Julie
## 2 Heather
## 3    John

# virtual function from the DBI package
# but is actually implemented by the RMySQL package

##############################################
# cat("\014") # clear screen before next section
##############################################

# SQL Queries from inside R: example 2

products <- dbReadTable(con, "products")

subset(products, subset = contract == 1)
##   id          name contract
## 2  2     Call Plus        1
## 4  9 Biz Unlimited        1

dbGetQuery(con, "SELECT * FROM products
                                 WHERE contract = 1")
##   id          name contract
## 1  2     Call Plus        1
## 2  9 Biz Unlimited        1

##############################################
# cat("\014") # clear screen before next section
##############################################

con <- dbConnect(RMySQL::MySQL(),
                 dbname = "tweater",
                 host = "courses.csrrinzqubik.us-east-1.rds.amazonaws.com",
                 port = 3306,
                 user = "student",
                 password = "datacamp")

# Import tweat_id column of comments where user_id is 1: elisabeth
elisabeth <- dbGetQuery(con, "SELECT tweat_id FROM comments
                                 WHERE user_id = 1")

# Print elisabeth
elisabeth
##   tweat_id
## 1       87
## 2       49
## 3       77
## 4       77

##############################################
# cat("\014") # clear screen before next section
##############################################

# Import post column of tweats where date is higher than '2015-09-21': latest
latest <- dbGetQuery(con, "SELECT post FROM tweats
                                 WHERE date > \"2015-09-21\"")

# Print latest
latest
##                                                                  post
## 1               open and crush avocado. add shrimps. perfect starter.
## 2 nachos. add tomato sauce, minced meat and cheese. oven for 10 mins.
## 3                              just eat an apple. simply and healthy.

##############################################
# cat("\014") # clear screen before next section
##############################################

# Create an R data frame, specific, 
# that selects the message column 
# from the comments table 
# where the tweat_id is 77 
# and the user_id is greater than 4.
specific <- dbGetQuery(con, "SELECT message FROM comments
                                 WHERE tweat_id = 77 AND user_id > 4")

# Print specific
specific
##   message
## 1  great!

##############################################
# cat("\014") # clear screen before next section
##############################################

# Create a data frame, short, 
# that selects the id and name columns 
# from the users table 
# where the number of characters in the name is strictly less than 5.
short <- dbGetQuery(con, "SELECT id, name FROM users
                                 WHERE CHAR_lENGTH(name) < 5")

# Print short
short
##   id name
## 1  2 mike
## 2  3 thea
## 3  6 kate

##############################################
# cat("\014") # clear screen before next section
##############################################

question <- dbGetQuery(con, "SELECT post, message
                             FROM tweats INNER JOIN comments on tweats.id = tweat_id
                             WHERE tweat_id = 77")

# answer
question
##                                            post            message
## 1 2 slices of bread. add cheese. grill. heaven.             great!
## 2 2 slices of bread. add cheese. grill. heaven.      not my thing!
## 3 2 slices of bread. add cheese. grill. heaven. couldn't be better
## 4 2 slices of bread. add cheese. grill. heaven.       saved my day

##############################################
# cat("\014") # clear screen before next section
##############################################

# DBI Internals

# dbGetQuery()
# Behind the scenes, the following steps are performed:
# Sending the specified query with dbSendQuery();
# Fetching the result of executing the query on the database with dbFetch();
# Clearing the result with dbClearResult().

# Send query to the database
res <- dbSendQuery(con, "SELECT * FROM comments WHERE user_id > 4")

dbGetInfo(res) 
## $statement
## [1] "SELECT * FROM comments WHERE user_id > 4"
## 
## $isSelect
## [1] 1
## 
## $rowsAffected
## [1] -1
## 
## $rowCount
## [1] 0
## 
## $completed
## [1] 0
## 
## $fieldDescription
## $fieldDescription[[1]]
## NULL

    # $statement
    # [1] "SELECT * FROM comments WHERE user_id > 4"
    # 
    # $isSelect
    # [1] 1
    # 
    # $rowsAffected
    # [1] -1
    # 
    # $rowCount
    # [1] 0
    # 
    # $completed
    # [1] 0
    # 
    # $fieldDescription
    # $fieldDescription[[1]]
    # NULL


# Use dbFetch() twice
dbFetch(res, n = 2) # import only two records of the query result
##     id tweat_id user_id message
## 1 1022       87       7   nice!
## 2 1000       77       7  great!
dbFetch(res) # import all remaining queries (don't specify n).
##     id tweat_id user_id  message
## 1 1011       49       5  love it
## 2 1010       88       6    yuck!
## 3 1030       75       6 so easy!

# Clear res
dbClearResult(res)
## [1] TRUE

##############################################
# cat("\014") # clear screen before next section
##############################################

# Create the data frame  long_tweats
# selects the post and date columns
# from the observations in tweats 
# where the character length of the post variable exceeds 40.
long_tweats <- dbGetQuery(con, "SELECT post, date FROM tweats
                                 WHERE CHAR_lENGTH(post) > 40")

  
  
# Print long_tweats
print(long_tweats)
##                                                                  post
## 1                           wash strawberries. add ice. blend. enjoy.
## 2                       2 slices of bread. add cheese. grill. heaven.
## 3               open and crush avocado. add shrimps. perfect starter.
## 4 nachos. add tomato sauce, minced meat and cheese. oven for 10 mins.
##         date
## 1 2015-09-14
## 2 2015-09-21
## 3 2015-09-22
## 4 2015-09-22

# Disconnect from the database
dbDisconnect(con)
## [1] TRUE